From ff9a88ea03382df48404312dcaf8567b9a59dcd3 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 4 Jun 2026 16:22:25 -0400
Subject: [PATCH 1/5] fix small bug

---
 R/colocboost_output.R                     |  9 ++--
 tests/testthat/test_inference.R           | 29 ++++++++++++-
 vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 52 ++++++++++++++++++++---
 vignettes/announcements.Rmd               |  6 +++
 4 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/R/colocboost_output.R b/R/colocboost_output.R
index e1d2490..050e510 100644
--- a/R/colocboost_output.R
+++ b/R/colocboost_output.R
@@ -517,8 +517,7 @@ get_robust_ucos <- function(cb_output,
   }
 
   if (npc_outcome_cutoff == 0 && is.null(pvalue_cutoff)) {
-    message("All possible uncolocalized events are reported regardless of their relative evidence (npc_outcome_cutoff = 0).")
-    return(cb_output)
+    message("All possible uncolocalized events with positive relative evidence are reported (npc_outcome_cutoff = 0).")
   } else {
     if (is.null(pvalue_cutoff)) {
       message(paste0(
@@ -580,7 +579,7 @@ get_robust_ucos <- function(cb_output,
   ucolocset_names <- ucos_min_npc_outcome <- c()
   for (i in 1:length(ucos_details$ucos$ucos_index)) {
     npc_outcome <- ucos_details$ucos_outcomes_npc$npc_outcome[i]
-    pos_pass <- which(npc_outcome >= npc_outcome_cutoff)
+    pos_pass <- which(npc_outcome >= npc_outcome_cutoff & npc_outcome > 0)
     if (!is.null(pvalue_cutoff)) {
       ucos_tmp <- ucos_details$ucos$ucos_index[[i]]
       ucos_trait <- ucos_details$ucos_outcomes$outcome_index[[i]]
@@ -592,8 +591,6 @@ get_robust_ucos <- function(cb_output,
       pos_pass_pvalue <- which(minPV <= pvalue_cutoff)
       if (length(pos_pass_pvalue) == 0) {
         pos_pass <- NULL
-      } else {
-        pos_pass <- 1
       }
     }
     if (length(pos_pass) == 0) {
@@ -1472,4 +1469,4 @@ merge_ucos_details <- function(ucos_details, ucos_from_cos) {
     ),
     "ucos_outcomes_npc" = rbind(ucos_details$ucos_outcomes_npc, ucos_from_cos$ucos_outcomes_npc)
   )
-}
\ No newline at end of file
+}
diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R
index 74c11c6..d397dcc 100644
--- a/tests/testthat/test_inference.R
+++ b/tests/testthat/test_inference.R
@@ -598,7 +598,7 @@ test_that("get_robust_ucos handles npc_outcome_cutoff = 0 correctly", {
   # With npc_outcome_cutoff = 0 and no pvalue_cutoff, should return unchanged
   expect_message(
     result <- get_robust_ucos(cb_res, npc_outcome_cutoff = 0),
-    "All possible uncolocalized events are reported"
+    "positive relative evidence"
   )
   
   # Should be essentially unchanged
@@ -608,6 +608,33 @@ test_that("get_robust_ucos handles npc_outcome_cutoff = 0 correctly", {
   )
 })
 
+test_that("get_robust_ucos removes zero npc_outcome even with zero cutoff", {
+
+  # Generate test data
+  cb_res <- generate_ucos_test_data(output_level = 2)
+
+  # Skip if no ucos were detected
+  skip_if(is.null(cb_res$ucos_details), "No ucos detected in test data")
+
+  n_ucos_original <- length(cb_res$ucos_details$ucos$ucos_index)
+  cb_res$ucos_details$ucos_outcomes_npc$npc_outcome[1] <- 0
+
+  expect_message(
+    result <- get_robust_ucos(cb_res, npc_outcome_cutoff = 0),
+    "positive relative evidence"
+  )
+
+  if (n_ucos_original == 1) {
+    expect_null(result$ucos_details)
+  } else {
+    expect_equal(
+      length(result$ucos_details$ucos$ucos_index),
+      n_ucos_original - 1
+    )
+    expect_false(any(result$ucos_details$ucos_outcomes_npc$npc_outcome == 0))
+  }
+})
+
 test_that("get_robust_ucos handles missing ucos_details", {
   
   # Generate test data
diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
index 9b26e9a..b634f5e 100644
--- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
+++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -23,7 +23,49 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost
 
 Acknowledgment: Thanks to Kate (Kathryn) Lawrence (GitHub:@kal26) for her contributions to this vignette.
 
-# 1. Loading Data using `colocboost_pipeline` function
+# 1. ColocBoost analysis with basic QC steps
+
+The `colocboost_analysis()` function from `pecotmr` runs ColocBoost with optional basic QC before model fitting. It uses conventional ColocBoost inputs, such as `X`, `Y`, `sumstat`, and `LD` (or `X_ref`), while adding QC parameters for common data-cleaning steps.
+
+The QC parameters are optional and can be set according to the input data:
+
+- **`missing_rate_thresh`** removes variants with high genotype missingness.
+- **`maf_cutoff`** removes variants with low minor allele frequency.
+- **`xvar_cutoff`** removes variants with low genotype variance.
+- **`ld_reference_meta_file`** filters individual-level variants against a reference variant list.
+- **`pip_cutoff_to_skip_ind`** skips weak individual-level contexts based on single-effect PIP screening; set it to `0` to disable this screening.
+- **`qc_method`** controls summary-statistics QC. Use `"none"` for basic allele/variant harmonization only, or `"slalom"` / `"dentist"` for LD-mismatch outlier detection.
+- **`keep_indel`** controls whether insertion/deletion variants are retained during harmonization.
+- **`pip_cutoff_to_skip_sumstat`** skips weak summary-statistics studies based on single-effect PIP screening; set it to `0` to disable this screening.
+- **`impute`** runs RAISS imputation after QC when set to `TRUE`.
+- **`impute_opts`** sets RAISS imputation options, including `rcond`, `R2_threshold`, `minimum_ld`, and `lamb`.
+- **`LD_reference_info`** provides extra reference metadata for QC when `LD` or `X_ref` variant names are not sufficient.
+- **`variant_convention`** specifies allele order in ColocBoost-style variant IDs, either `"A2_A1"` or `"A1_A2"`.
+
+Example:
+
+```{r colocboost-analysis-basic-qc, eval = FALSE}
+fit <- colocboost_analysis(
+    X = X,
+    Y = Y,
+    dict_YX = dict_YX,
+    sumstat = sumstat,
+    X_ref = X_ref,
+    dict_sumstatLD = dict_sumstatLD,
+    outcome_names = outcome_names,
+    missing_rate_thresh = 0.1,
+    maf_cutoff = 0.0005,
+    xvar_cutoff = 0,
+    pip_cutoff_to_skip_ind = 0,
+    qc_method = "none",
+    keep_indel = TRUE,
+    pip_cutoff_to_skip_sumstat = 0,
+    impute = FALSE,
+    variant_convention = "A2_A1"
+)
+```
+
+# 2. Loading Data using `colocboost_pipeline` function
 
 This function harmonizes the input data and prepares it for colocalization analysis. 
 
@@ -37,7 +79,7 @@ This list is then passed to the `colocboost_pipeline` function for the colocaliz
 
 Below are the input parameters for this function for loading individual-level data:
 
-## 1.1. Loading individual-level data from multiple cohorts
+## 2.1. Loading individual-level data from multiple cohorts
 
 Inputs:
 
@@ -109,7 +151,7 @@ region_data_individual <- load_multitask_regional_data(
 
 
 
-## 1.2. Loading summary statistics from multiple cohorts or datasets
+## 2.2. Loading summary statistics from multiple cohorts or datasets
 
 Inputs:
 
@@ -199,7 +241,7 @@ The LD metadata file is a tab-separated file with the following columns:
 ```
 
 
-# 2. Perform ColocBoost using `colocboost_pipeline` function
+# 3. Perform ColocBoost using `colocboost_pipeline` function
 
 In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_pipeline` function.
 The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data):
@@ -284,4 +326,4 @@ colocboost_plot(colocboost_results$joint_gwas)
 for (i in 1:length(colocboost_results$separate_gwas)) {
     colocboost_plot(colocboost_results$separate_gwas[[i]])
 }
-```
\ No newline at end of file
+```
diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd
index 40330b8..63155cd 100644
--- a/vignettes/announcements.Rmd
+++ b/vignettes/announcements.Rmd
@@ -14,6 +14,12 @@ vignette: >
 - *May 2, 2025*: `colocboost` R package is available on [CRAN](https://CRAN.R-project.org/package=colocboost).
 
 ## Software updates
+- `v1.0.8` (**Upcoming release**) Improvements to summary-statistics workflows, trait-specific result filtering, and computational efficiency.
+  - Added `X_ref` support as a memory-efficient alternative to precomputed LD matrices for large summary-statistics analyses.
+  - Added `get_robust_ucos` to recalibrate and summarize robust trait-specific, uncolocalized events.
+  - Improved computational efficiency for repeated matrix products in large reference-panel workflows.
+  - Improved plotting robustness for extreme association signals and coefficient or z-score displays.
+  - Minor robustness fixes for summary-statistics analyses with reference-panel or LD-free inputs.
 - `v1.0.7` (**Important update**) Improvements to ColocBoost (check out the full details in [PR](https://github.com/StatFunGen/colocboost/pull/116) and [PR](https://github.com/StatFunGen/colocboost/pull/121)). 
   - Enhanced `colocboost` main function with post-filtering and only keep the robust colocalization events.
   - Enhanced `colocboost_plot` function with flexible highlighting options and new visualization styles.

From f5ee5a03700e6cd2e2ffa661c46eb72484bc8402 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 6 Jun 2026 09:02:32 -0400
Subject: [PATCH 2/5] Update colocboost_output.R

---
 R/colocboost_output.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/colocboost_output.R b/R/colocboost_output.R
index 050e510..f98953b 100644
--- a/R/colocboost_output.R
+++ b/R/colocboost_output.R
@@ -255,7 +255,7 @@ get_robust_colocalization <- function(cb_output,
   for (i in 1:length(cos_details$cos$cos_index)) {
     cos_npc_config <- cos_details$cos_outcomes_npc[[i]]
     npc_outcome <- cos_npc_config$npc_outcome
-    pos_pass <- which(npc_outcome >= npc_outcome_cutoff)
+    pos_pass <- which(npc_outcome >= npc_outcome_cutoff & npc_outcome > 0)
     if (!is.null(pvalue_cutoff)) {
       cos_tmp <- cos_details$cos$cos_index[[i]]
       cos_trait <- cos_details$cos_outcomes$outcome_index[[i]]

From d0851ca305ff6b153a6628cf9dccba325b639320 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 6 Jun 2026 10:40:20 -0400
Subject: [PATCH 3/5] pre-submission fix

---
 DESCRIPTION                                   |  4 +--
 R/colocboost.R                                |  5 ++--
 cran-comments.md                              | 28 +++++++++++++++----
 inst/WORDLIST                                 | 11 +++++++-
 man/colocboost_validate_input_data.Rd         |  2 +-
 vignettes/FineBoost_Special_Case.Rmd          |  4 +--
 .../Summary_Statistics_Colocalization.Rmd     | 14 +++++++++-
 vignettes/announcements.Rmd                   |  4 +--
 8 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d932e84..f3bf847 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: colocboost
 Type: Package
-Date: 2025-11-22
+Date: 2026-06-06
 Title: Multi-Context Colocalization Analysis for QTL and GWAS Studies
-Version: 1.0.7
+Version: 1.0.8
 Authors@R: c(
   person(given = "Xuewei", family = "Cao", email = "xc2270@cumc.columbia.edu", role = c("cre", "aut", "cph")),
   person(given = "Haochen", family = "Sun", email = "hs3393@cumc.columbia.edu", role = c("aut", "cph")),
diff --git a/R/colocboost.R b/R/colocboost.R
index 542103c..2712cfe 100644
--- a/R/colocboost.R
+++ b/R/colocboost.R
@@ -424,7 +424,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
 #'   \item{sumstat}{Processed list of summary statistics data.frames}
 #'   \item{LD}{Processed list of LD matrices}
 #'   \item{X_ref}{Processed list of reference genotype matrices}
-#'   \item{ref_label}{Style of reference matrics}
+#'   \item{ref_label}{Style of reference matrices}
 #'   \item{sumstatLD_dict}{Dictionary mapping sumstat to LD}
 #'   \item{keep_variable_sumstat}{List of variant names for each sumstat}
 #'   \item{Z}{List of z-scores for each outcome}
@@ -514,7 +514,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
     })
     keep_variable_individual <- lapply(X, colnames)
     if (!is.list(X) & !is.list(Y)) {
-      warning("Error: Input X and Y must be the list containing genotype matrics and all phenotype vectors!")
+      warning("Error: Input X and Y must be the list containing genotype matrices and all phenotype vectors!")
       return(NULL)
     } else {
       if (length(X) == 1) {
@@ -968,4 +968,3 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
     npc_outcome_cutoff = npc_outcome_cutoff_updated
   ))
 }
-
diff --git a/cran-comments.md b/cran-comments.md
index e63cf89..d782b4a 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,10 +1,26 @@
+## colocboost 1.0.8 release comments
 
-## New release comments
+This is an update to colocboost 1.0.7.
 
-* This release includes three major improvements:
-  - Enhanced main function by adding post-filtering step.
-  - Enhanced plot function with flexible highlighting options.
-  - Optimized performance and computational efficiency.
+This release includes:
+
+* Added X_ref support as a memory-efficient alternative to precomputed LD matrices for summary-statistics workflows.
+* Added and refined robust post-filtering for colocalization and trait-specific uncolocalized events.
+* Improved computational efficiency for repeated matrix products in reference-panel workflows.
+* Improved plotting robustness for extreme association signals and coefficient or z-score displays.
+* Updated documentation and vignettes, including the bioinformatics pipeline vignette.
+
+## R CMD check results
+
+There is one NOTE about installed package size:
+
+* checking installed package size ... NOTE
+  installed size is 5.0 MB
+  sub-directories of 1Mb or more:
+    data 2.0 MB
+    doc  1.9 MB
+
+This NOTE is expected. The installed size is mainly due to reduced example datasets and rendered vignettes with figures. These files are kept to make the tutorials reproducible and self-contained for multi-trait colocalization workflows. No external data are downloaded during examples or vignette rendering.
 
 ## Previous comments
 
@@ -15,4 +31,4 @@
   - Fixed reset users' options issues
   - Added proper COPYRIGHT HOLDER and ORGANIZATION to LICENSE
   - Added explanation of acronyms used in this package to inst/WORDLIST
-* The examples and vignettes use small datasets to avoid long check times
\ No newline at end of file
+* The examples and vignettes use small datasets to avoid long check times
diff --git a/inst/WORDLIST b/inst/WORDLIST
index 072822b..a11345d 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -7,6 +7,7 @@ HyPrColoc       # Hypothesis Prioritization in multi-trait Colocalization method
 SuSiE           # Sum of Single Effects regression model
 
 # Statistical and Genetic Terms
+bim             # PLINK BIM variant information file
 eQTL            # Expression Quantitative Trait Loci
 GWAS            # Genome-Wide Association Study
 INDELs          # Insertions and Deletions
@@ -31,11 +32,14 @@ GTEx            # Genotype-Tissue Expression project
 Micromamba      # Lightweight Conda implementation
 Nealelab        # Lab developing genetics GWAS summary statistics
 PLINK           # Whole genome association analysis toolset
+RAISS           # Summary-statistics imputation method
 tabix           # Tool for indexing genomic data
 UKBB            # UK Biobank dataset
+YAML            # Human-readable configuration file format
 medRxiv         # A preprint resource
 pixi            # An environment manager
 conda           # Package and environment management system
+xz              # Compression format
 
 # Researcher Names
 Jager           # Philip L. de Jager
@@ -48,18 +52,23 @@ et              # and more
 al              # and more
 
 # Technical Terms
+changelog       # Release change log
 cis             # Referring to nearby location of a regulatory element
 chrom           # Chromosome
 decayrate       # Decay rate, an input parameter in our package
 doi             # Digital Object Identifier
+func            # Function abbreviation used in parameter names
 grey            # One color name in R
 iteratively     # Performed through iterations
+loglik          # Log-likelihood abbreviation used in parameter names
 lfsr            # Local False Sign Rate
 lth             # Lower threshold
 modularity      # Property of network structure
 omics           # Collective biological data fields
 phenotypes      # Observable traits
 pos             # Position in genome
+precomputed     # Computed in advance
+precomputes     # Computes in advance
 probabilistically # Based on probability theory
 qc              # Quality Control
 rcond           # Reciprocal condition number
@@ -89,4 +98,4 @@ npc             # Normalization probability of colocalization in our proposed Co
 Pre             # Before
 pre             # Before
 jk              # Index used in ColocBoost
-nd              # Second
\ No newline at end of file
+nd              # Second
diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd
index ec7e40a..16e235f 100644
--- a/man/colocboost_validate_input_data.Rd
+++ b/man/colocboost_validate_input_data.Rd
@@ -68,7 +68,7 @@ A list containing:
 \item{sumstat}{Processed list of summary statistics data.frames}
 \item{LD}{Processed list of LD matrices}
 \item{X_ref}{Processed list of reference genotype matrices}
-\item{ref_label}{Style of reference matrics}
+\item{ref_label}{Style of reference matrices}
 \item{sumstatLD_dict}{Dictionary mapping sumstat to LD}
 \item{keep_variable_sumstat}{List of variant names for each sumstat}
 \item{Z}{List of z-scores for each outcome}
diff --git a/vignettes/FineBoost_Special_Case.Rmd b/vignettes/FineBoost_Special_Case.Rmd
index 547d136..aded0a0 100644
--- a/vignettes/FineBoost_Special_Case.Rmd
+++ b/vignettes/FineBoost_Special_Case.Rmd
@@ -70,6 +70,6 @@ colocboost_plot(res)
 
 
 **Note**: Weak learners SEL in FineBoost may capture noise as putative signals, potentially introducing false positives to our findings. 
-To identify and filter spurious signals, we discard fine-tunned the threshold of $\Delta L_l$ using extensive simulations to balance sensitivity and specificity.
+To identify and filter spurious signals, we use a fine-tuned threshold of $\Delta L_l$ based on extensive simulations to balance sensitivity and specificity.
 This threshold is set to 0.025 by default for ColocBoost when detect the colocalization, but we suggested a less conservative threshold of 0.015 for FineBoost
-when performing single-trait fine-mapping analysis (`check_null_max = 0.015` as we suggested).
\ No newline at end of file
+when performing single-trait fine-mapping analysis (`check_null_max = 0.015` as we suggested).
diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd
index 2dc127a..b8ff0aa 100644
--- a/vignettes/Summary_Statistics_Colocalization.Rmd
+++ b/vignettes/Summary_Statistics_Colocalization.Rmd
@@ -90,6 +90,19 @@ res$cos_details$cos$cos_index
 colocboost_plot(res)
 ```
 
+Alternatively, you can provide the reference panel genotype matrix directly through `X_ref`, which avoids storing the full LD matrix:
+
+```{r one-X-ref}
+# Use reference genotype directly instead of precomputing LD
+X_ref <- Ind_5traits$X[[1]]
+
+# Run colocboost
+res <- colocboost(sumstat = Sumstat_5traits$sumstat, X_ref = X_ref)
+
+# Identified CoS
+res$cos_details$cos$cos_index
+```
+
 
 ### Results Interpretation
 
@@ -234,4 +247,3 @@ res$cos_details$cos$cos_index
 
 See more details about data format to implement LD-free ColocBoost and LD-mismatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html)).
 
-
diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd
index 63155cd..7508ff2 100644
--- a/vignettes/announcements.Rmd
+++ b/vignettes/announcements.Rmd
@@ -14,13 +14,13 @@ vignette: >
 - *May 2, 2025*: `colocboost` R package is available on [CRAN](https://CRAN.R-project.org/package=colocboost).
 
 ## Software updates
-- `v1.0.8` (**Upcoming release**) Improvements to summary-statistics workflows, trait-specific result filtering, and computational efficiency.
+- `v1.0.8` Improvements to summary-statistics workflows, trait-specific result filtering, and computational efficiency.
   - Added `X_ref` support as a memory-efficient alternative to precomputed LD matrices for large summary-statistics analyses.
   - Added `get_robust_ucos` to recalibrate and summarize robust trait-specific, uncolocalized events.
   - Improved computational efficiency for repeated matrix products in large reference-panel workflows.
   - Improved plotting robustness for extreme association signals and coefficient or z-score displays.
   - Minor robustness fixes for summary-statistics analyses with reference-panel or LD-free inputs.
-- `v1.0.7` (**Important update**) Improvements to ColocBoost (check out the full details in [PR](https://github.com/StatFunGen/colocboost/pull/116) and [PR](https://github.com/StatFunGen/colocboost/pull/121)). 
+- `v1.0.7` Improvements to ColocBoost (check out the full details in [PR](https://github.com/StatFunGen/colocboost/pull/116) and [PR](https://github.com/StatFunGen/colocboost/pull/121)).
   - Enhanced `colocboost` main function with post-filtering and only keep the robust colocalization events.
   - Enhanced `colocboost_plot` function with flexible highlighting options and new visualization styles.
   - Optimized performance and computational efficiency

From a4f2738b329c9659b2d3d6b8b620172138b5d21b Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 6 Jun 2026 11:52:42 -0400
Subject: [PATCH 4/5] adding unit test

---
 tests/testthat/test_inference.R |  56 ++++++++++
 tests/testthat/test_sumstats.R  | 182 ++++++++++++++++++++++++++++++++
 2 files changed, 238 insertions(+)

diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R
index d397dcc..9b36923 100644
--- a/tests/testthat/test_inference.R
+++ b/tests/testthat/test_inference.R
@@ -125,6 +125,62 @@ test_that("get_robust_colocalization filters results correctly", {
   expect_error(suppressWarnings(get_robust_colocalization(cb_res, pvalue_cutoff = 0.05)), NA)
 })
 
+test_that("get_robust_colocalization handles validation and early return branches", {
+
+  cb_res <- generate_test_result()
+
+  expect_error(
+    get_robust_colocalization("not_a_colocboost_object"),
+    "colocboost object"
+  )
+
+  no_cos <- cb_res
+  no_cos$cos_details <- NULL
+  expect_message(
+    no_cos_result <- get_robust_colocalization(no_cos),
+    "No colocalization results"
+  )
+  expect_null(no_cos_result$cos_details)
+
+  expect_warning(
+    bad_pvalue_result <- get_robust_colocalization(cb_res, pvalue_cutoff = 1.5),
+    "pvalue cutoff"
+  )
+  expect_equal(bad_pvalue_result, cb_res)
+
+  expect_message(
+    all_events_result <- get_robust_colocalization(
+      cb_res,
+      cos_npc_cutoff = 0,
+      npc_outcome_cutoff = 0
+    ),
+    "All possible colocalization events"
+  )
+  expect_equal(all_events_result, cb_res)
+})
+
+test_that("get_robust_colocalization removes CoS with zero npc_outcome", {
+
+  cb_res <- generate_test_result()
+  skip_if(is.null(cb_res$cos_details), "No CoS detected in test data")
+
+  cb_res$cos_details$cos_outcomes_npc[[1]]$npc_outcome <- 0
+
+  expect_message(
+    filtered <- get_robust_colocalization(
+      cb_res,
+      cos_npc_cutoff = 0.2,
+      npc_outcome_cutoff = 0
+    ),
+    "Extracting colocalization results"
+  )
+
+  expect_s3_class(filtered, "colocboost")
+  expect_null(filtered$cos_details)
+  expect_true("ucos_details" %in% names(filtered))
+  expect_false(is.null(filtered$ucos_details))
+})
+
 # Test for get_hierarchical_clusters
 test_that("get_hierarchical_clusters functions correctly", {
   # Test case 1: Simple 2x2 correlation matrix with high correlation
diff --git a/tests/testthat/test_sumstats.R b/tests/testthat/test_sumstats.R
index ef896c3..5341c81 100644
--- a/tests/testthat/test_sumstats.R
+++ b/tests/testthat/test_sumstats.R
@@ -99,6 +99,26 @@ generate_sumstat_test_data <- function(n = 100, p = 20, L = 2, seed = 42) {
 # Create summary statistics test data
 test_sumstat_data <- generate_sumstat_test_data()
 
+make_validation_x <- function(n = 6, p = 4) {
+  X <- matrix(seq_len(n * p), nrow = n, ncol = p)
+  colnames(X) <- paste0("SNP", seq_len(p))
+  X
+}
+
+make_validation_ld <- function(p = 4) {
+  LD <- diag(p)
+  colnames(LD) <- rownames(LD) <- paste0("SNP", seq_len(p))
+  LD
+}
+
+make_validation_sumstat <- function(p = 4) {
+  data.frame(
+    z = seq(0.1, 0.4, length.out = p),
+    n = 100,
+    variant = paste0("SNP", seq_len(p))
+  )
+}
+
 # Test 1: Basic summary statistics input
 test_that("colocboost runs with basic summary statistics format", {
   # Run colocboost with sumstat and single LD matrix
@@ -355,3 +375,165 @@ test_that("colocboost errors with no common variants", {
   expect_true(any(grepl("is empty after filtering", warnings)))
 })
 
+test_that("colocboost_validate_input_data covers individual input validation edge cases", {
+  X <- make_validation_x()
+  y <- seq_len(nrow(X))
+  Y3 <- list(y, y + 1, y + 2)
+
+  X_unequal_1 <- matrix(seq_len(24), nrow = 6, ncol = 4)
+  X_unequal_2 <- matrix(seq_len(30), nrow = 6, ncol = 5)
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      X = list(X_unequal_1, X_unequal_2),
+      Y = list(y, y + 1)
+    )),
+    "same number of variables"
+  )
+
+  X_dup <- X
+  colnames(X_dup) <- c("SNP1", "SNP1", "SNP2", "SNP3")
+  expect_message(
+    validated_dup <- colocboost_validate_input_data(X = X_dup, Y = y),
+    "Removed duplicate columns from X matrix"
+  )
+  expect_equal(validated_dup$keep_variable_individual[[1]], c("SNP1", "SNP2", "SNP3"))
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      X = list(X, X),
+      Y = Y3
+    )),
+    "dict_YX"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      X = list(X, X),
+      Y = Y3,
+      dict_YX = matrix(c(1, 1, 2, 2), ncol = 2, byrow = TRUE)
+    )),
+    "matched X for outcome 3"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      X = list(X, X),
+      Y = Y3,
+      dict_YX = matrix(c(1, 1, 1, 2, 2, 2, 3, 1), ncol = 2, byrow = TRUE)
+    )),
+    "different matched X for outcome 1"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      X = list(X, X),
+      Y = Y3,
+      dict_YX = matrix(c(1, 1, 2, 2, 3, 3), ncol = 2, byrow = TRUE)
+    )),
+    "enough X matrices"
+  )
+
+  X_na <- X
+  X_na[1, 1] <- NA
+  expect_warning(
+    expect_null(colocboost_validate_input_data(X = X_na, Y = y)),
+    "Input X must not contain missing values"
+  )
+})
+
+test_that("colocboost_validate_input_data covers summary reference mapping validation", {
+  LD <- make_validation_ld()
+  sumstat3 <- rep(list(make_validation_sumstat()), 3)
+  LD2 <- list(LD, LD)
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(sumstat = sumstat3, LD = LD2)),
+    "dict_sumstatLD"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = sumstat3,
+      LD = LD2,
+      dict_sumstatLD = matrix(c(1, 1, 2, 2), ncol = 2, byrow = TRUE)
+    )),
+    "matched.*sumstat"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = sumstat3,
+      LD = LD2,
+      dict_sumstatLD = matrix(c(1, 1, 1, 2, 2, 2, 3, 1), ncol = 2, byrow = TRUE)
+    )),
+    "multiple matched"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = sumstat3,
+      LD = LD2,
+      dict_sumstatLD = matrix(c(1, 1, 2, 2, 3, 3), ncol = 2, byrow = TRUE)
+    )),
+    "enough.*matrices"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = list(make_validation_sumstat()),
+      LD = diag(3)
+    )),
+    "has no variant names"
+  )
+})
+
+test_that("colocboost_validate_input_data covers summary statistic value validation", {
+  LD <- make_validation_ld()
+  variants <- paste0("SNP", 1:4)
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = list(data.frame(beta = 1:4, n = 100, variant = variants)),
+      LD = LD
+    )),
+    "either z"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = list(data.frame(beta = c(1, NA, 3, 4), sebeta = 1, n = 100, variant = variants)),
+      LD = LD
+    )),
+    "cannot have missing values"
+  )
+
+  expect_warning(
+    expect_null(colocboost_validate_input_data(
+      sumstat = list(data.frame(beta = 1:4, sebeta = c(1, 1, 0, 1), n = 100, variant = variants)),
+      LD = LD
+    )),
+    "zero or negative"
+  )
+
+  sumstat_na_z <- make_validation_sumstat()
+  sumstat_na_z$z[1] <- NA
+  expect_warning(
+    validated_na_z <- colocboost_validate_input_data(sumstat = list(sumstat_na_z), LD = LD),
+    "contains NA values"
+  )
+  expect_false(anyNA(validated_na_z$Z[[1]]))
+
+  sumstat_bad_n <- make_validation_sumstat()
+  sumstat_bad_n$n[1] <- 1
+  expect_warning(
+    expect_null(colocboost_validate_input_data(sumstat = list(sumstat_bad_n), LD = LD)),
+    "Sample size N"
+  )
+
+  sumstat_bad_var_y <- make_validation_sumstat()
+  sumstat_bad_var_y$var_y <- -1
+  expect_warning(
+    expect_null(colocboost_validate_input_data(sumstat = list(sumstat_bad_var_y), LD = LD)),
+    "var_y"
+  )
+})

From 36593ca88d7b93669031a609722cb8cb061ca0ac Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:54:41 -0400
Subject: [PATCH 5/5] Update test_inference.R

---
 tests/testthat/test_inference.R | 41 +++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R
index 9b36923..631d9b1 100644
--- a/tests/testthat/test_inference.R
+++ b/tests/testthat/test_inference.R
@@ -107,6 +107,38 @@ generate_ucos_test_data <- function(n = 500, p = 60, L = 3, seed = 42, output_le
   return(result)
 }
 
+generate_cos_test_result <- function(n = 250, p = 30, L = 3, seed = 20260606) {
+  set.seed(seed)
+
+  sigma <- 0.9^abs(outer(seq_len(p), seq_len(p), "-"))
+  X <- MASS::mvrnorm(n, rep(0, p), sigma)
+  colnames(X) <- paste0("SNP", seq_len(p))
+
+  true_beta <- matrix(0, p, L)
+  true_beta[8, 1] <- 1.2
+  true_beta[8, 2] <- 1.1
+  true_beta[22, 3] <- 1.2
+
+  Y <- matrix(0, n, L)
+  for (l in seq_len(L)) {
+    Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 0.6)
+  }
+
+  suppressWarnings({
+    result <- colocboost(
+      X = replicate(L, X, simplify = FALSE),
+      Y = lapply(seq_len(L), function(l) Y[, l]),
+      M = 80,
+      output_level = 3,
+      cos_npc_cutoff = 0,
+      npc_outcome_cutoff = 0,
+      pvalue_cutoff = NULL
+    )
+  })
+
+  return(result)
+}
+
 
 
 # Test for get_strong_colocalization
@@ -127,7 +159,8 @@ test_that("get_robust_colocalization filters results correctly", {
 
 test_that("get_robust_colocalization handles validation and early return branches", {
 
-  cb_res <- generate_test_result()
+  cb_res <- generate_cos_test_result()
+  expect_false(is.null(cb_res$cos_details))
 
   expect_error(
     get_robust_colocalization("not_a_colocboost_object"),
@@ -161,8 +194,8 @@ test_that("get_robust_colocalization handles validation and early return branche
 
 test_that("get_robust_colocalization removes CoS with zero npc_outcome", {
 
-  cb_res <- generate_test_result()
-  skip_if(is.null(cb_res$cos_details), "No CoS detected in test data")
+  cb_res <- generate_cos_test_result()
+  expect_false(is.null(cb_res$cos_details))
 
   cb_res$cos_details$cos_outcomes_npc[[1]]$npc_outcome <- 0
 
@@ -177,8 +210,6 @@ test_that("get_robust_colocalization removes CoS with zero npc_outcome", {
 
   expect_s3_class(filtered, "colocboost")
   expect_null(filtered$cos_details)
-  expect_true("ucos_details" %in% names(filtered))
-  expect_false(is.null(filtered$ucos_details))
 })
 
 # Test for get_hierarchical_clusters