From ff9a88ea03382df48404312dcaf8567b9a59dcd3 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:22:25 -0400 Subject: [PATCH 1/5] fix small bug --- R/colocboost_output.R | 9 ++-- tests/testthat/test_inference.R | 29 ++++++++++++- vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 52 ++++++++++++++++++++--- vignettes/announcements.Rmd | 6 +++ 4 files changed, 84 insertions(+), 12 deletions(-) diff --git a/R/colocboost_output.R b/R/colocboost_output.R index e1d2490..050e510 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -517,8 +517,7 @@ get_robust_ucos <- function(cb_output, } if (npc_outcome_cutoff == 0 && is.null(pvalue_cutoff)) { - message("All possible uncolocalized events are reported regardless of their relative evidence (npc_outcome_cutoff = 0).") - return(cb_output) + message("All possible uncolocalized events with positive relative evidence are reported (npc_outcome_cutoff = 0).") } else { if (is.null(pvalue_cutoff)) { message(paste0( @@ -580,7 +579,7 @@ get_robust_ucos <- function(cb_output, ucolocset_names <- ucos_min_npc_outcome <- c() for (i in 1:length(ucos_details$ucos$ucos_index)) { npc_outcome <- ucos_details$ucos_outcomes_npc$npc_outcome[i] - pos_pass <- which(npc_outcome >= npc_outcome_cutoff) + pos_pass <- which(npc_outcome >= npc_outcome_cutoff & npc_outcome > 0) if (!is.null(pvalue_cutoff)) { ucos_tmp <- ucos_details$ucos$ucos_index[[i]] ucos_trait <- ucos_details$ucos_outcomes$outcome_index[[i]] @@ -592,8 +591,6 @@ get_robust_ucos <- function(cb_output, pos_pass_pvalue <- which(minPV <= pvalue_cutoff) if (length(pos_pass_pvalue) == 0) { pos_pass <- NULL - } else { - pos_pass <- 1 } } if (length(pos_pass) == 0) { @@ -1472,4 +1469,4 @@ merge_ucos_details <- function(ucos_details, ucos_from_cos) { ), "ucos_outcomes_npc" = rbind(ucos_details$ucos_outcomes_npc, ucos_from_cos$ucos_outcomes_npc) ) -} \ No newline at end of file +} diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 74c11c6..d397dcc 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -598,7 +598,7 @@ test_that("get_robust_ucos handles npc_outcome_cutoff = 0 correctly", { # With npc_outcome_cutoff = 0 and no pvalue_cutoff, should return unchanged expect_message( result <- get_robust_ucos(cb_res, npc_outcome_cutoff = 0), - "All possible uncolocalized events are reported" + "positive relative evidence" ) # Should be essentially unchanged @@ -608,6 +608,33 @@ test_that("get_robust_ucos handles npc_outcome_cutoff = 0 correctly", { ) }) +test_that("get_robust_ucos removes zero npc_outcome even with zero cutoff", { + + # Generate test data + cb_res <- generate_ucos_test_data(output_level = 2) + + # Skip if no ucos were detected + skip_if(is.null(cb_res$ucos_details), "No ucos detected in test data") + + n_ucos_original <- length(cb_res$ucos_details$ucos$ucos_index) + cb_res$ucos_details$ucos_outcomes_npc$npc_outcome[1] <- 0 + + expect_message( + result <- get_robust_ucos(cb_res, npc_outcome_cutoff = 0), + "positive relative evidence" + ) + + if (n_ucos_original == 1) { + expect_null(result$ucos_details) + } else { + expect_equal( + length(result$ucos_details$ucos$ucos_index), + n_ucos_original - 1 + ) + expect_false(any(result$ucos_details$ucos_outcomes_npc$npc_outcome == 0)) + } +}) + test_that("get_robust_ucos handles missing ucos_details", { # Generate test data diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd index 9b26e9a..b634f5e 100644 --- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -23,7 +23,49 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost Acknowledgment: Thanks to Kate (Kathryn) Lawrence (GitHub:@kal26) for her contributions to this vignette. -# 1. Loading Data using `colocboost_pipeline` function +# 1. ColocBoost analysis with basic QC steps + +The `colocboost_analysis()` function from `pecotmr` runs ColocBoost with optional basic QC before model fitting. It uses conventional ColocBoost inputs, such as `X`, `Y`, `sumstat`, and `LD`, while adding QC parameters for common data-cleaning steps. + +The QC parameters are optional and can be set according to the input data: + +- **`missing_rate_thresh`** removes variants with high genotype missingness. +- **`maf_cutoff`** removes variants with low minor allele frequency. +- **`xvar_cutoff`** removes variants with low genotype variance. +- **`ld_reference_meta_file`** filters individual-level variants against a reference variant list. +- **`pip_cutoff_to_skip_ind`** skips weak individual-level contexts based on single-effect PIP screening; use `0` to skip this screening. +- **`qc_method`** controls summary-statistics QC. Use `"none"` for basic allele/variant harmonization only, or `"slalom"` / `"dentist"` for LD-mismatch outlier detection. +- **`keep_indel`** controls whether insertion/deletion variants are retained during harmonization. +- **`pip_cutoff_to_skip_sumstat`** skips weak summary-statistics studies based on single-effect PIP screening; use `0` to skip this screening. +- **`impute`** runs RAISS imputation after QC when set to `TRUE`. +- **`impute_opts`** sets RAISS imputation options, including `rcond`, `R2_threshold`, `minimum_ld`, and `lamb`. +- **`LD_reference_info`** provides extra reference metadata for QC when `LD` or `X_ref` variant names are not sufficient. +- **`variant_convention`** specifies allele order in ColocBoost-style variant IDs, either `"A2_A1"` or `"A1_A2"`. + +Example: + +```{r colocboost-analysis-basic-qc, eval = FALSE} +fit <- colocboost_analysis( + X = X, + Y = Y, + dict_YX = dict_YX, + sumstat = sumstat, + X_ref = X_ref, + dict_sumstatLD = dict_sumstatLD, + outcome_names = outcome_names, + missing_rate_thresh = 0.1, + maf_cutoff = 0.0005, + xvar_cutoff = 0, + pip_cutoff_to_skip_ind = 0, + qc_method = "none", + keep_indel = TRUE, + pip_cutoff_to_skip_sumstat = 0, + impute = FALSE, + variant_convention = "A2_A1" +) +``` + +# 2. Loading Data using `colocboost_pipeline` function This function harmonizes the input data and prepares it for colocalization analysis. @@ -37,7 +79,7 @@ This list is then passed to the `colocboost_pipeline` function for the colocaliz Below are the input parameters for this function for loading individual-level data: -## 1.1. Loading individual-level data from multiple cohorts +## 2.1. Loading individual-level data from multiple cohorts Inputs: @@ -109,7 +151,7 @@ region_data_individual <- load_multitask_regional_data( -## 1.2. Loading summary statistics from multiple cohorts or datasets +## 2.2. Loading summary statistics from multiple cohorts or datasets Inputs: @@ -199,7 +241,7 @@ The LD metadata file is a tab-separated file with the following columns: ``` -# 2. Perform ColocBoost using `colocboost_pipeline` function +# 3. Perform ColocBoost using `colocboost_pipeline` function In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_pipeline` function. The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data): @@ -284,4 +326,4 @@ colocboost_plot(colocboost_results$joint_gwas) for (i in 1:length(colocboost_results$separate_gwas)) { colocboost_plot(colocboost_results$separate_gwas[[i]]) } -``` \ No newline at end of file +``` diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd index 40330b8..63155cd 100644 --- a/vignettes/announcements.Rmd +++ b/vignettes/announcements.Rmd @@ -14,6 +14,12 @@ vignette: > - *May 2, 2025*: `colocboost` R package is available on [CRAN](https://CRAN.R-project.org/package=colocboost). ## Software updates +- `v1.0.8` (**Upcoming release**) Improvements to summary-statistics workflows, trait-specific result filtering, and computational efficiency. + - Added `X_ref` support as a memory-efficient alternative to precomputed LD matrices for large summary-statistics analyses. + - Added `get_robust_ucos` to recalibrate and summarize robust trait-specific, uncolocalized events. + - Improved computational efficiency for repeated matrix products in large reference-panel workflows. + - Improved plotting robustness for extreme association signals and coefficient or z-score displays. + - Minor robustness fixes for summary-statistics analyses with reference-panel or LD-free inputs. - `v1.0.7` (**Important update**) Improvements to ColocBoost (check out the full details in [PR](https://github.com/StatFunGen/colocboost/pull/116) and [PR](https://github.com/StatFunGen/colocboost/pull/121)). - Enhanced `colocboost` main function with post-filtering and only keep the robust colocalization events. - Enhanced `colocboost_plot` function with flexible highlighting options and new visualization styles. From f5ee5a03700e6cd2e2ffa661c46eb72484bc8402 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 6 Jun 2026 09:02:32 -0400 Subject: [PATCH 2/5] Update colocboost_output.R --- R/colocboost_output.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/colocboost_output.R b/R/colocboost_output.R index 050e510..f98953b 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -255,7 +255,7 @@ get_robust_colocalization <- function(cb_output, for (i in 1:length(cos_details$cos$cos_index)) { cos_npc_config <- cos_details$cos_outcomes_npc[[i]] npc_outcome <- cos_npc_config$npc_outcome - pos_pass <- which(npc_outcome >= npc_outcome_cutoff) + pos_pass <- which(npc_outcome >= npc_outcome_cutoff & npc_outcome > 0) if (!is.null(pvalue_cutoff)) { cos_tmp <- cos_details$cos$cos_index[[i]] cos_trait <- cos_details$cos_outcomes$outcome_index[[i]] From d0851ca305ff6b153a6628cf9dccba325b639320 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 6 Jun 2026 10:40:20 -0400 Subject: [PATCH 3/5] pre-submission fix --- DESCRIPTION | 4 +-- R/colocboost.R | 5 ++-- cran-comments.md | 28 +++++++++++++++---- inst/WORDLIST | 11 +++++++- man/colocboost_validate_input_data.Rd | 2 +- vignettes/FineBoost_Special_Case.Rmd | 4 +-- .../Summary_Statistics_Colocalization.Rmd | 14 +++++++++- vignettes/announcements.Rmd | 4 +-- 8 files changed, 54 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d932e84..f3bf847 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: colocboost Type: Package -Date: 2025-11-22 +Date: 2026-06-06 Title: Multi-Context Colocalization Analysis for QTL and GWAS Studies -Version: 1.0.7 +Version: 1.0.8 Authors@R: c( person(given = "Xuewei", family = "Cao", email = "xc2270@cumc.columbia.edu", role = c("cre", "aut", "cph")), person(given = "Haochen", family = "Sun", email = "hs3393@cumc.columbia.edu", role = c("aut", "cph")), diff --git a/R/colocboost.R b/R/colocboost.R index 542103c..2712cfe 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -424,7 +424,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' \item{sumstat}{Processed list of summary statistics data.frames} #' \item{LD}{Processed list of LD matrices} #' \item{X_ref}{Processed list of reference genotype matrices} -#' \item{ref_label}{Style of reference matrics} +#' \item{ref_label}{Style of reference matrices} #' \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} #' \item{keep_variable_sumstat}{List of variant names for each sumstat} #' \item{Z}{List of z-scores for each outcome} @@ -514,7 +514,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, }) keep_variable_individual <- lapply(X, colnames) if (!is.list(X) & !is.list(Y)) { - warning("Error: Input X and Y must be the list containing genotype matrics and all phenotype vectors!") + warning("Error: Input X and Y must be the list containing genotype matrices and all phenotype vectors!") return(NULL) } else { if (length(X) == 1) { @@ -968,4 +968,3 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, npc_outcome_cutoff = npc_outcome_cutoff_updated )) } - diff --git a/cran-comments.md b/cran-comments.md index e63cf89..d782b4a 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,10 +1,26 @@ +## colocboost 1.0.8 release comments -## New release comments +This is an update to colocboost 1.0.7. -* This release includes three major improvements: - - Enhanced main function by adding post-filtering step. - - Enhanced plot function with flexible highlighting options. - - Optimized performance and computational efficiency. +This release includes: + +* Added X_ref support as a memory-efficient alternative to precomputed LD matrices for summary-statistics workflows. +* Added and refined robust post-filtering for colocalization and trait-specific uncolocalized events. +* Improved computational efficiency for repeated matrix products in reference-panel workflows. +* Improved plotting robustness for extreme association signals and coefficient or z-score displays. +* Updated documentation and vignettes, including the bioinformatics pipeline vignette. + +## R CMD check results + +There is one NOTE about installed package size: + +* checking installed package size ... NOTE + installed size is 5.0 MB + sub-directories of 1Mb or more: + data 2.0 MB + doc 1.9 MB + +This NOTE is expected. The installed size is mainly due to reduced example datasets and rendered vignettes with figures. These files are kept to make the tutorials reproducible and self-contained for multi-trait colocalization workflows. No external data are downloaded during examples or vignette rendering. ## Previous comments @@ -15,4 +31,4 @@ - Fixed reset users' options issues - Added proper COPYRIGHT HOLDER and ORGANIZATION to LICENSE - Added explanation of acronyms used in this package to inst/WORDLIST -* The examples and vignettes use small datasets to avoid long check times \ No newline at end of file +* The examples and vignettes use small datasets to avoid long check times diff --git a/inst/WORDLIST b/inst/WORDLIST index 072822b..a11345d 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -7,6 +7,7 @@ HyPrColoc # Hypothesis Prioritization in multi-trait Colocalization method SuSiE # Sum of Single Effects regression model # Statistical and Genetic Terms +bim # PLINK BIM variant information file eQTL # Expression Quantitative Trait Loci GWAS # Genome-Wide Association Study INDELs # Insertions and Deletions @@ -31,11 +32,14 @@ GTEx # Genotype-Tissue Expression project Micromamba # Lightweight Conda implementation Nealelab # Lab developing genetics GWAS summary statistics PLINK # Whole genome association analysis toolset +RAISS # Summary-statistics imputation method tabix # Tool for indexing genomic data UKBB # UK Biobank dataset +YAML # Human-readable configuration file format medRxiv # A preprint resource pixi # An environment manager conda # Package and environment management system +xz # Compression format # Researcher Names Jager # Philip L. de Jager @@ -48,18 +52,23 @@ et # and more al # and more # Technical Terms +changelog # Release change log cis # Referring to nearby location of a regulatory element chrom # Chromosome decayrate # Decay rate, an input parameter in our package doi # Digital Object Identifier +func # Function abbreviation used in parameter names grey # One color name in R iteratively # Performed through iterations +loglik # Log-likelihood abbreviation used in parameter names lfsr # Local False Sign Rate lth # Lower threshold modularity # Property of network structure omics # Collective biological data fields phenotypes # Observable traits pos # Position in genome +precomputed # Computed in advance +precomputes # Computes in advance probabilistically # Based on probability theory qc # Quality Control rcond # Reciprocal condition number @@ -89,4 +98,4 @@ npc # Normalization probability of colocalization in our proposed Co Pre # Before pre # Before jk # Index used in ColocBoost -nd # Second \ No newline at end of file +nd # Second diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd index ec7e40a..16e235f 100644 --- a/man/colocboost_validate_input_data.Rd +++ b/man/colocboost_validate_input_data.Rd @@ -68,7 +68,7 @@ A list containing: \item{sumstat}{Processed list of summary statistics data.frames} \item{LD}{Processed list of LD matrices} \item{X_ref}{Processed list of reference genotype matrices} -\item{ref_label}{Style of reference matrics} +\item{ref_label}{Style of reference matrices} \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} \item{keep_variable_sumstat}{List of variant names for each sumstat} \item{Z}{List of z-scores for each outcome} diff --git a/vignettes/FineBoost_Special_Case.Rmd b/vignettes/FineBoost_Special_Case.Rmd index 547d136..aded0a0 100644 --- a/vignettes/FineBoost_Special_Case.Rmd +++ b/vignettes/FineBoost_Special_Case.Rmd @@ -70,6 +70,6 @@ colocboost_plot(res) **Note**: Weak learners SEL in FineBoost may capture noise as putative signals, potentially introducing false positives to our findings. -To identify and filter spurious signals, we discard fine-tunned the threshold of $\Delta L_l$ using extensive simulations to balance sensitivity and specificity. +To identify and filter spurious signals, we use a fine-tuned threshold of $\Delta L_l$ based on extensive simulations to balance sensitivity and specificity. This threshold is set to 0.025 by default for ColocBoost when detect the colocalization, but we suggested a less conservative threshold of 0.015 for FineBoost -when performing single-trait fine-mapping analysis (`check_null_max = 0.015` as we suggested). \ No newline at end of file +when performing single-trait fine-mapping analysis (`check_null_max = 0.015` as we suggested). diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 2dc127a..b8ff0aa 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -90,6 +90,19 @@ res$cos_details$cos$cos_index colocboost_plot(res) ``` +Alternatively, you can provide the reference panel genotype matrix directly through `X_ref`, which avoids storing the full LD matrix: + +```{r one-X-ref} +# Use reference genotype directly instead of precomputing LD +X_ref <- Ind_5traits$X[[1]] + +# Run colocboost +res <- colocboost(sumstat = Sumstat_5traits$sumstat, X_ref = X_ref) + +# Identified CoS +res$cos_details$cos$cos_index +``` + ### Results Interpretation @@ -234,4 +247,3 @@ res$cos_details$cos$cos_index See more details about data format to implement LD-free ColocBoost and LD-mismatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html)). - diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd index 63155cd..7508ff2 100644 --- a/vignettes/announcements.Rmd +++ b/vignettes/announcements.Rmd @@ -14,13 +14,13 @@ vignette: > - *May 2, 2025*: `colocboost` R package is available on [CRAN](https://CRAN.R-project.org/package=colocboost). ## Software updates -- `v1.0.8` (**Upcoming release**) Improvements to summary-statistics workflows, trait-specific result filtering, and computational efficiency. +- `v1.0.8` Improvements to summary-statistics workflows, trait-specific result filtering, and computational efficiency. - Added `X_ref` support as a memory-efficient alternative to precomputed LD matrices for large summary-statistics analyses. - Added `get_robust_ucos` to recalibrate and summarize robust trait-specific, uncolocalized events. - Improved computational efficiency for repeated matrix products in large reference-panel workflows. - Improved plotting robustness for extreme association signals and coefficient or z-score displays. - Minor robustness fixes for summary-statistics analyses with reference-panel or LD-free inputs. -- `v1.0.7` (**Important update**) Improvements to ColocBoost (check out the full details in [PR](https://github.com/StatFunGen/colocboost/pull/116) and [PR](https://github.com/StatFunGen/colocboost/pull/121)). +- `v1.0.7` Improvements to ColocBoost (check out the full details in [PR](https://github.com/StatFunGen/colocboost/pull/116) and [PR](https://github.com/StatFunGen/colocboost/pull/121)). - Enhanced `colocboost` main function with post-filtering and only keep the robust colocalization events. - Enhanced `colocboost_plot` function with flexible highlighting options and new visualization styles. - Optimized performance and computational efficiency From a4f2738b329c9659b2d3d6b8b620172138b5d21b Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 6 Jun 2026 11:52:42 -0400 Subject: [PATCH 4/5] adding unit test --- tests/testthat/test_inference.R | 56 ++++++++++ tests/testthat/test_sumstats.R | 182 ++++++++++++++++++++++++++++++++ 2 files changed, 238 insertions(+) diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index d397dcc..9b36923 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -125,6 +125,62 @@ test_that("get_robust_colocalization filters results correctly", { expect_error(suppressWarnings(get_robust_colocalization(cb_res, pvalue_cutoff = 0.05)), NA) }) +test_that("get_robust_colocalization handles validation and early return branches", { + + cb_res <- generate_test_result() + + expect_error( + get_robust_colocalization("not_a_colocboost_object"), + "colocboost object" + ) + + no_cos <- cb_res + no_cos$cos_details <- NULL + expect_message( + no_cos_result <- get_robust_colocalization(no_cos), + "No colocalization results" + ) + expect_null(no_cos_result$cos_details) + + expect_warning( + bad_pvalue_result <- get_robust_colocalization(cb_res, pvalue_cutoff = 1.5), + "pvalue cutoff" + ) + expect_equal(bad_pvalue_result, cb_res) + + expect_message( + all_events_result <- get_robust_colocalization( + cb_res, + cos_npc_cutoff = 0, + npc_outcome_cutoff = 0 + ), + "All possible colocalization events" + ) + expect_equal(all_events_result, cb_res) +}) + +test_that("get_robust_colocalization removes CoS with zero npc_outcome", { + + cb_res <- generate_test_result() + skip_if(is.null(cb_res$cos_details), "No CoS detected in test data") + + cb_res$cos_details$cos_outcomes_npc[[1]]$npc_outcome <- 0 + + expect_message( + filtered <- get_robust_colocalization( + cb_res, + cos_npc_cutoff = 0.2, + npc_outcome_cutoff = 0 + ), + "Extracting colocalization results" + ) + + expect_s3_class(filtered, "colocboost") + expect_null(filtered$cos_details) + expect_true("ucos_details" %in% names(filtered)) + expect_false(is.null(filtered$ucos_details)) +}) + # Test for get_hierarchical_clusters test_that("get_hierarchical_clusters functions correctly", { # Test case 1: Simple 2x2 correlation matrix with high correlation diff --git a/tests/testthat/test_sumstats.R b/tests/testthat/test_sumstats.R index ef896c3..5341c81 100644 --- a/tests/testthat/test_sumstats.R +++ b/tests/testthat/test_sumstats.R @@ -99,6 +99,26 @@ generate_sumstat_test_data <- function(n = 100, p = 20, L = 2, seed = 42) { # Create summary statistics test data test_sumstat_data <- generate_sumstat_test_data() +make_validation_x <- function(n = 6, p = 4) { + X <- matrix(seq_len(n * p), nrow = n, ncol = p) + colnames(X) <- paste0("SNP", seq_len(p)) + X +} + +make_validation_ld <- function(p = 4) { + LD <- diag(p) + colnames(LD) <- rownames(LD) <- paste0("SNP", seq_len(p)) + LD +} + +make_validation_sumstat <- function(p = 4) { + data.frame( + z = seq(0.1, 0.4, length.out = p), + n = 100, + variant = paste0("SNP", seq_len(p)) + ) +} + # Test 1: Basic summary statistics input test_that("colocboost runs with basic summary statistics format", { # Run colocboost with sumstat and single LD matrix @@ -355,3 +375,165 @@ test_that("colocboost errors with no common variants", { expect_true(any(grepl("is empty after filtering", warnings))) }) +test_that("colocboost_validate_input_data covers individual input validation edge cases", { + X <- make_validation_x() + y <- seq_len(nrow(X)) + Y3 <- list(y, y + 1, y + 2) + + X_unequal_1 <- matrix(seq_len(24), nrow = 6, ncol = 4) + X_unequal_2 <- matrix(seq_len(30), nrow = 6, ncol = 5) + expect_warning( + expect_null(colocboost_validate_input_data( + X = list(X_unequal_1, X_unequal_2), + Y = list(y, y + 1) + )), + "same number of variables" + ) + + X_dup <- X + colnames(X_dup) <- c("SNP1", "SNP1", "SNP2", "SNP3") + expect_message( + validated_dup <- colocboost_validate_input_data(X = X_dup, Y = y), + "Removed duplicate columns from X matrix" + ) + expect_equal(validated_dup$keep_variable_individual[[1]], c("SNP1", "SNP2", "SNP3")) + + expect_warning( + expect_null(colocboost_validate_input_data( + X = list(X, X), + Y = Y3 + )), + "dict_YX" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + X = list(X, X), + Y = Y3, + dict_YX = matrix(c(1, 1, 2, 2), ncol = 2, byrow = TRUE) + )), + "matched X for outcome 3" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + X = list(X, X), + Y = Y3, + dict_YX = matrix(c(1, 1, 1, 2, 2, 2, 3, 1), ncol = 2, byrow = TRUE) + )), + "different matched X for outcome 1" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + X = list(X, X), + Y = Y3, + dict_YX = matrix(c(1, 1, 2, 2, 3, 3), ncol = 2, byrow = TRUE) + )), + "enough X matrices" + ) + + X_na <- X + X_na[1, 1] <- NA + expect_warning( + expect_null(colocboost_validate_input_data(X = X_na, Y = y)), + "Input X must not contain missing values" + ) +}) + +test_that("colocboost_validate_input_data covers summary reference mapping validation", { + LD <- make_validation_ld() + sumstat3 <- rep(list(make_validation_sumstat()), 3) + LD2 <- list(LD, LD) + + expect_warning( + expect_null(colocboost_validate_input_data(sumstat = sumstat3, LD = LD2)), + "dict_sumstatLD" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = sumstat3, + LD = LD2, + dict_sumstatLD = matrix(c(1, 1, 2, 2), ncol = 2, byrow = TRUE) + )), + "matched.*sumstat" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = sumstat3, + LD = LD2, + dict_sumstatLD = matrix(c(1, 1, 1, 2, 2, 2, 3, 1), ncol = 2, byrow = TRUE) + )), + "multiple matched" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = sumstat3, + LD = LD2, + dict_sumstatLD = matrix(c(1, 1, 2, 2, 3, 3), ncol = 2, byrow = TRUE) + )), + "enough.*matrices" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = list(make_validation_sumstat()), + LD = diag(3) + )), + "has no variant names" + ) +}) + +test_that("colocboost_validate_input_data covers summary statistic value validation", { + LD <- make_validation_ld() + variants <- paste0("SNP", 1:4) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = list(data.frame(beta = 1:4, n = 100, variant = variants)), + LD = LD + )), + "either z" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = list(data.frame(beta = c(1, NA, 3, 4), sebeta = 1, n = 100, variant = variants)), + LD = LD + )), + "cannot have missing values" + ) + + expect_warning( + expect_null(colocboost_validate_input_data( + sumstat = list(data.frame(beta = 1:4, sebeta = c(1, 1, 0, 1), n = 100, variant = variants)), + LD = LD + )), + "zero or negative" + ) + + sumstat_na_z <- make_validation_sumstat() + sumstat_na_z$z[1] <- NA + expect_warning( + validated_na_z <- colocboost_validate_input_data(sumstat = list(sumstat_na_z), LD = LD), + "contains NA values" + ) + expect_false(anyNA(validated_na_z$Z[[1]])) + + sumstat_bad_n <- make_validation_sumstat() + sumstat_bad_n$n[1] <- 1 + expect_warning( + expect_null(colocboost_validate_input_data(sumstat = list(sumstat_bad_n), LD = LD)), + "Sample size N" + ) + + sumstat_bad_var_y <- make_validation_sumstat() + sumstat_bad_var_y$var_y <- -1 + expect_warning( + expect_null(colocboost_validate_input_data(sumstat = list(sumstat_bad_var_y), LD = LD)), + "var_y" + ) +}) From 36593ca88d7b93669031a609722cb8cb061ca0ac Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 6 Jun 2026 13:54:41 -0400 Subject: [PATCH 5/5] Update test_inference.R --- tests/testthat/test_inference.R | 41 +++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 9b36923..631d9b1 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -107,6 +107,38 @@ generate_ucos_test_data <- function(n = 500, p = 60, L = 3, seed = 42, output_le return(result) } +generate_cos_test_result <- function(n = 250, p = 30, L = 3, seed = 20260606) { + set.seed(seed) + + sigma <- 0.9^abs(outer(seq_len(p), seq_len(p), "-")) + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", seq_len(p)) + + true_beta <- matrix(0, p, L) + true_beta[8, 1] <- 1.2 + true_beta[8, 2] <- 1.1 + true_beta[22, 3] <- 1.2 + + Y <- matrix(0, n, L) + for (l in seq_len(L)) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 0.6) + } + + suppressWarnings({ + result <- colocboost( + X = replicate(L, X, simplify = FALSE), + Y = lapply(seq_len(L), function(l) Y[, l]), + M = 80, + output_level = 3, + cos_npc_cutoff = 0, + npc_outcome_cutoff = 0, + pvalue_cutoff = NULL + ) + }) + + return(result) +} + # Test for get_strong_colocalization @@ -127,7 +159,8 @@ test_that("get_robust_colocalization filters results correctly", { test_that("get_robust_colocalization handles validation and early return branches", { - cb_res <- generate_test_result() + cb_res <- generate_cos_test_result() + expect_false(is.null(cb_res$cos_details)) expect_error( get_robust_colocalization("not_a_colocboost_object"), @@ -161,8 +194,8 @@ test_that("get_robust_colocalization handles validation and early return branche test_that("get_robust_colocalization removes CoS with zero npc_outcome", { - cb_res <- generate_test_result() - skip_if(is.null(cb_res$cos_details), "No CoS detected in test data") + cb_res <- generate_cos_test_result() + expect_false(is.null(cb_res$cos_details)) cb_res$cos_details$cos_outcomes_npc[[1]]$npc_outcome <- 0 @@ -177,8 +210,6 @@ test_that("get_robust_colocalization removes CoS with zero npc_outcome", { expect_s3_class(filtered, "colocboost") expect_null(filtered$cos_details) - expect_true("ucos_details" %in% names(filtered)) - expect_false(is.null(filtered$ucos_details)) }) # Test for get_hierarchical_clusters