StatFunGen · danielnachun · Jun 25, 2026 · Jun 25, 2026
diff --git a/R/colocboostPipeline.R b/R/colocboostPipeline.R
@@ -72,6 +72,14 @@
 #'   as the focal outcome.
 #' @param xqtlColoc,jointGwas,separateGwas Logical flags selecting which
 #'   colocboost variants to run.
+#' @param pipCutoffToSkip Individual-level pre-filter (ports the legacy
+#'   \code{pip_cutoff_to_skip_ind}). Scalar (applied to every context) or a
+#'   context-named numeric vector. For each context, every outcome is fit
+#'   with a single-effect SuSiE (\code{L = 1}) and dropped unless some
+#'   variant's PIP exceeds the cutoff; a context with no surviving outcome is
+#'   skipped. \code{0} (default) disables it; a negative value uses
+#'   \code{3 / n_variants}. (Summary-statistic skipping is handled upstream by
+#'   \code{\link{summaryStatsQc}}'s own \code{pipCutoffToSkip}.)
 #' @param ... Additional arguments forwarded to
 #'   \code{\link[colocboost]{colocboost}} (e.g., \code{M}, \code{L},
 #'   \code{output_level}).
@@ -144,16 +152,56 @@ setGeneric("colocboostPipeline",
   invisible(NULL)
 }
 
+# Resolve the per-context pipCutoffToSkip from either a scalar (applies to
+# every context) or a named vector keyed by context. Default 0 (no skip).
+.cbResolveCutoff <- function(pipCutoffToSkip, ctx) {
+  if (is.null(pipCutoffToSkip) || length(pipCutoffToSkip) == 0L) return(0)
+  if (!is.null(names(pipCutoffToSkip))) {
+    if (ctx %in% names(pipCutoffToSkip)) return(pipCutoffToSkip[[ctx]])
+    return(0)
+  }
+  pipCutoffToSkip[[1L]]
+}
+
+# Per-outcome single-trait skip (ports the legacy qc_individual_data
+# pip_cutoff_to_skip): for each outcome column of Y, fit a single-effect
+# SuSiE (L = 1, max_iter = 100) on (X, Y[, j]) and keep the outcome only if
+# any variant's PIP exceeds the cutoff. A cutoff < 0 means 3 / n_variants.
+# Returns the retained Y (NULL when no outcome clears the threshold).
+.cbPipSkipOutcomes <- function(X, Y, cutoff) {
+  if (is.null(cutoff) || is.na(cutoff) || cutoff == 0) return(Y)
+  if (!requireNamespace("susieR", quietly = TRUE)) {
+    warning("susieR not available; pipCutoffToSkip filter not applied.")
+    return(Y)
+  }
+  if (!is.double(X)) storage.mode(X) <- "double"  # susieR needs double X
+  thr <- if (cutoff < 0) 3 / ncol(X) else cutoff
+  keep <- logical(ncol(Y))
+  for (j in seq_len(ncol(Y))) {
+    obs <- !is.na(Y[, j])
+    if (sum(obs) < 2L) next
+    pip <- tryCatch(
+      susieR::susie(X[obs, , drop = FALSE], Y[obs, j],
+                    L = 1, max_iter = 100)$pip,
+      error = function(e) NULL)
+    if (!is.null(pip) && any(pip > thr, na.rm = TRUE)) keep[[j]] <- TRUE
+  }
+  if (!any(keep)) return(NULL)
+  Y[, keep, drop = FALSE]
+}
+
 # Materialise an individual-level QtlDataset into the colocboost
 # (X, Y, dict_YX, outcome_names) bundle. Each context becomes one X /
 # Y pair; the YA matrices are split into single-trait columns and
 # dict_YX maps each split column back to its X. Returns NULL when no
-# context survives selection.
+# context survives selection. pipCutoffToSkip (scalar or context-named
+# vector) optionally drops weak-signal outcomes / contexts up front.
 .cbIndividualBundle <- function(qd, contexts = NULL,
                                 traitId = NULL,
                                 region = NULL,
                                 cisWindow = NULL,
-                                samples = NULL) {
+                                samples = NULL,
+                                pipCutoffToSkip = 0) {
   if (is.null(contexts) || length(contexts) == 0L) {
     contexts <- getContexts(qd)
   } else {
@@ -198,6 +246,15 @@ setGeneric("colocboostPipeline",
     }
     X <- X[common, , drop = FALSE]
     Y <- Y[common, , drop = FALSE]
+    cutoffCtx <- .cbResolveCutoff(pipCutoffToSkip, ctx)
+    if (!is.null(cutoffCtx) && !is.na(cutoffCtx) && cutoffCtx != 0) {
+      Y <- .cbPipSkipOutcomes(X, Y, cutoffCtx)
+      if (is.null(Y) || ncol(Y) == 0L) {
+        message("colocboostPipeline: skipping context '", ctx,
+                "' (no outcome cleared pipCutoffToSkip = ", cutoffCtx, ").")
+        next
+      }
+    }
     XperCtx[[ctx]] <- X
     YperCtx[[ctx]] <- Y
   }
@@ -257,7 +314,8 @@ setGeneric("colocboostPipeline",
 # Build a single (sumstat data.frame, LD correlation matrix) pair from a
 # QtlSumStats / GwasSumStats entry. Returns NULL when the entry has no
 # variants overlapping the ldSketch panel.
-.cbSumstatPair <- function(df, ldSketch, varY = NULL) {
+.cbSumstatPair <- function(df, ldSketch, varY = NULL,
+                           nCase = NULL, nControl = NULL) {
   if (is.null(df) || nrow(df) == 0L) return(NULL)
   variantIds <- df$variant_id
   if (anyNA(variantIds)) {
@@ -278,9 +336,16 @@ setGeneric("colocboostPipeline",
   df <- df[keep, , drop = FALSE]
   variantIds <- keptIds
 
+  # Case/control GWAS: use the effective sample size
+  # 4 / (1/nCase + 1/nControl); otherwise the per-variant N.
+  okCC <- !is.null(nCase) && !is.null(nControl) &&
+          !is.na(nCase) && !is.na(nControl) &&
+          nCase > 0 && nControl > 0
+  nVal <- if (okCC) 4 / (1 / nCase + 1 / nControl)
+          else if (!is.null(df$N)) df$N else NA_real_
   ss <- data.frame(
     z       = df$z,
-    n       = if (!is.null(df$N)) df$N else NA_real_,
+    n       = nVal,
     variant = variantIds,
     stringsAsFactors = FALSE)
   if (!is.null(varY) && !is.na(varY)) {
@@ -332,7 +397,9 @@ setGeneric("colocboostPipeline",
     pair <- .cbSumstatPair(
       df       = getSumstatDf(gws, study = st, require = "Z"),
       ldSketch = ldSketch,
-      varY     = if ("varY" %in% names(gws)) gws$varY[[i]] else NA_real_)
+      varY     = if ("varY" %in% names(gws)) gws$varY[[i]] else NA_real_,
+      nCase    = if ("nCase" %in% names(gws)) gws$nCase[[i]] else NA_real_,
+      nControl = if ("nControl" %in% names(gws)) gws$nControl[[i]] else NA_real_)
     if (!is.null(pair)) bundle[[st]] <- pair
   }
   bundle
@@ -540,6 +607,7 @@ setMethod("colocboostPipeline", "QtlDataset",
            jointGwas = FALSE,
            separateGwas = FALSE,
            samples = NULL,
+           pipCutoffToSkip = 0,
            ...) {
     dotArgs <- list(...)
     indBundle <- .cbIndividualBundle(
@@ -548,7 +616,8 @@ setMethod("colocboostPipeline", "QtlDataset",
       traitId      = traitId,
       region       = region,
       cisWindow    = cisWindow,
-      samples      = samples)
+      samples      = samples,
+      pipCutoffToSkip = pipCutoffToSkip)
     .cbDriver(indBundle, qtlPairs = list(), gwasSumStats,
               xqtlColoc, jointGwas, separateGwas,
               focalTrait, dotArgs)
@@ -591,6 +660,7 @@ setMethod("colocboostPipeline", "MultiStudyQtlDataset",
            jointGwas = FALSE,
            separateGwas = FALSE,
            samples = NULL,
+           pipCutoffToSkip = 0,
            ...) {
     dotArgs <- list(...)
 
@@ -611,7 +681,8 @@ setMethod("colocboostPipeline", "MultiStudyQtlDataset",
         traitId      = traitId,
         region       = region,
         cisWindow    = cisWindow,
-        samples      = samples)
+        samples      = samples,
+        pipCutoffToSkip = pipCutoffToSkip)
       if (is.null(sub)) next
       xOffset <- length(combinedX)
       yOffset <- length(combinedY)

diff --git a/R/gwasSumStats.R b/R/gwasSumStats.R
@@ -86,11 +86,20 @@ NULL
 #' @param varY Optional numeric vector of per-study phenotype variances
 #'   (\code{NA_real_} entries allowed). Used by the sufficient-statistic
 #'   interface; z-score RSS analyses should leave entries as NA.
+#' @param nCase,nControl Optional per-study case / control counts. The
+#'   columns are attached \strong{only when supplied} (default \code{NULL}),
+#'   so quantitative-trait collections keep the original schema. When given,
+#'   pass length 1 or length(study) (use \code{NA} for the non-case/control
+#'   studies in a mixed collection). For case/control GWAS, downstream
+#'   consumers (e.g. \code{\link{colocboostPipeline}}) use the effective
+#'   sample size \code{4 / (1/nCase + 1/nControl)} in place of the
+#'   per-variant \code{N}.
 #' @param ... Additional per-study columns to attach to the collection.
 #' @return A \code{GwasSumStats} object.
 #' @export
 GwasSumStats <- function(study, entry, genome, ldSketch,
-                          varY = NA_real_, qcInfo = list(), ...) {
+                          varY = NA_real_, nCase = NULL,
+                          nControl = NULL, qcInfo = list(), ...) {
   if (missing(study) || missing(entry) || missing(genome) || missing(ldSketch)) {
     stop("`study`, `entry`, `genome`, and `ldSketch` are all required.")
   }
@@ -105,18 +114,26 @@ GwasSumStats <- function(study, entry, genome, ldSketch,
     stop("length(entry) (", length(entry),
          ") must equal length(study) (", length(study), ").")
   }
-  if (length(varY) == 1L && length(study) > 1L) {
-    varY <- rep(varY, length(study))
-  }
-  if (length(varY) != length(study)) {
-    stop("`varY` must have length 1 or length(study).")
+  # Per-study scalars: recycle a single value to one-per-study.
+  recycle <- function(v, nm) {
+    if (length(v) == 1L && length(study) > 1L) v <- rep(v, length(study))
+    if (length(v) != length(study))
+      stop("`", nm, "` must have length 1 or length(study).")
+    as.numeric(v)
   }
+  varY <- recycle(varY, "varY")
 
   cols <- list(
     study = as.character(study),
     entry = S4Vectors::SimpleList(entry),
-    varY  = as.numeric(varY)
+    varY  = varY
   )
+  # nCase / nControl are OPTIONAL: the columns are attached only when
+  # supplied (default NULL), so quantitative-trait GwasSumStats keep the
+  # original schema. Provide a vector (length = n studies) with NA for the
+  # non-case/control studies in a mixed collection.
+  if (!is.null(nCase))    cols$nCase    <- recycle(nCase, "nCase")
+  if (!is.null(nControl)) cols$nControl <- recycle(nControl, "nControl")
   extras <- list(...)
   for (nm in names(extras)) cols[[nm]] <- extras[[nm]]
   df <- do.call(S4Vectors::DataFrame, c(cols, list(check.names = FALSE)))

diff --git a/R/sumstatsQc.R b/R/sumstatsQc.R
@@ -3424,6 +3424,11 @@ summaryStatsQc <- function(sumstats,
       genome   = getGenome(sumstats),
       ldSketch = getLdSketch(sumstats),
       varY     = as.numeric(sumstats$varY),
+      # Preserve the optional per-study case/control counts through QC.
+      nCase    = if ("nCase" %in% names(sumstats))
+                   as.numeric(sumstats$nCase) else NULL,
+      nControl = if ("nControl" %in% names(sumstats))
+                   as.numeric(sumstats$nControl) else NULL,
       qcInfo   = qcInfo)
   } else {
     QtlSumStats(

diff --git a/man/GwasSumStats.Rd b/man/GwasSumStats.Rd
diff --git a/man/colocboostPipeline.Rd b/man/colocboostPipeline.Rd
diff --git a/tests/testthat/test_colocboostPipeline.R b/tests/testthat/test_colocboostPipeline.R
@@ -422,3 +422,73 @@ test_that("colocboostPipeline: GWAS ldSketch mismatch errors during the driver",
     "different sample sets"
   )
 })
+
+# ===========================================================================
+# GWAS case/control: optional nCase/nControl columns + effective-N wiring
+# ===========================================================================
+
+test_that("GwasSumStats: nCase/nControl are optional columns (absent by default)", {
+  gr <- GenomicRanges::GRanges(
+    seqnames = "chr1",
+    ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 5L),
+                              width = 1L))
+  S4Vectors::mcols(gr) <- S4Vectors::DataFrame(
+    SNP = paste0("v", 1:5), A1 = "A", A2 = "G", Z = rnorm(5), N = rep(1000L, 5))
+  base <- list(study = "G1", entry = list(gr), genome = "hg19",
+               ldSketch = .cbp_makeHandle(), qcInfo = list(ok = 1))
+  g0 <- do.call(GwasSumStats, base)
+  expect_false(any(c("nCase", "nControl") %in% names(g0)))
+  g1 <- do.call(GwasSumStats, c(base, list(nCase = 500, nControl = 1500)))
+  expect_true(all(c("nCase", "nControl") %in% names(g1)))
+  expect_equal(g1$nCase, 500)
+  expect_equal(g1$nControl, 1500)
+})
+
+test_that("colocboost GWAS bundle: effective N for case/control, per-variant N otherwise", {
+  gr <- GenomicRanges::GRanges(
+    seqnames = "chr1",
+    ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 5L),
+                              width = 1L))
+  S4Vectors::mcols(gr) <- S4Vectors::DataFrame(
+    SNP = paste0("v", 1:5), A1 = "A", A2 = "G", Z = rnorm(5), N = rep(1000L, 5))
+  base <- list(study = "G1", entry = list(gr), genome = "hg19",
+               ldSketch = .cbp_makeHandle(), qcInfo = list(ok = 1))
+  local_mocked_bindings(extractBlockGenotypes = .cbp_mockExtractor(),
+                        .package = "pecotmr")
+  # case/control -> effective N = 4 / (1/500 + 1/1500) = 1500
+  gcc <- do.call(GwasSumStats, c(base, list(nCase = 500, nControl = 1500)))
+  bcc <- pecotmr:::.cbGwasSumStatsBundle(gcc)
+  expect_true(all(bcc[["G1"]]$sumstat$n == 4 / (1/500 + 1/1500)))
+  # quantitative (no nCase/nControl) -> per-variant N (1000)
+  bq <- pecotmr:::.cbGwasSumStatsBundle(do.call(GwasSumStats, base))
+  expect_true(all(bq[["G1"]]$sumstat$n == 1000L))
+})
+
+# ===========================================================================
+# pipCutoffToSkip: per-context single-trait (L=1 SuSiE) outcome skip
+# ===========================================================================
+
+test_that(".cbPipSkipOutcomes: keeps signal outcomes, drops noise, honours cutoff", {
+  skip_if_not_installed("susieR")
+  set.seed(1)
+  n <- 200L; p <- 20L
+  X <- matrix(rbinom(n * p, 2, 0.3), n, p,
+              dimnames = list(paste0("s", 1:n), paste0("v", 1:p)))
+  Y <- cbind(sig   = X[, 1] * 1.5 + rnorm(n, sd = 0.3),  # strong signal at v1
+             noise = rnorm(n))                            # null
+  # cutoff 0 -> no-op
+  expect_identical(pecotmr:::.cbPipSkipOutcomes(X, Y, 0), Y)
+  # cutoff 0.5 -> keep the signal outcome, drop the noise outcome
+  kept <- pecotmr:::.cbPipSkipOutcomes(X, Y, 0.5)
+  expect_equal(colnames(kept), "sig")
+  # all-noise -> NULL (whole context would be skipped)
+  Yn <- cbind(n1 = rnorm(n), n2 = rnorm(n))
+  expect_null(pecotmr:::.cbPipSkipOutcomes(X, Yn, 0.5))
+})
+
+test_that(".cbResolveCutoff: scalar applies to all; named vector is per-context", {
+  expect_equal(pecotmr:::.cbResolveCutoff(0.5, "brain"), 0.5)
+  expect_equal(pecotmr:::.cbResolveCutoff(c(brain = 0.3, blood = 0.7), "blood"), 0.7)
+  expect_equal(pecotmr:::.cbResolveCutoff(c(brain = 0.3), "missing"), 0)
+  expect_equal(pecotmr:::.cbResolveCutoff(NULL, "brain"), 0)
+})
diff --git a/tests/testthat/test_sumstatsQc.R b/tests/testthat/test_sumstatsQc.R
@@ -4464,3 +4464,15 @@ test_that("autoDecision assigns SER for single CS", {
   expect_false(result$tagged_cs[1])
   expect_equal(result$method, "SER")
 })
+
+test_that("summaryStatsQc: preserves optional nCase/nControl columns through QC", {
+  gr <- .ssQ_makeEntryGr(paste0("rs", 1:4), c(100L, 200L, 300L, 400L))
+  ss <- GwasSumStats(study = "g1", entry = list(gr), genome = "hg19",
+                     ldSketch = .ssQ_makeHandle(),
+                     nCase = 500, nControl = 1500)
+  expect_true(all(c("nCase", "nControl") %in% names(ss)))
+  out <- summaryStatsQc(ss, pipCutoffToSkip = 0, nCutoff = 0)
+  expect_true(all(c("nCase", "nControl") %in% names(out)))
+  expect_equal(out$nCase, 500)
+  expect_equal(out$nControl, 1500)
+})