diff --git a/R/colocboostPipeline.R b/R/colocboostPipeline.R index 37ee8a94..d6e8af59 100644 --- a/R/colocboostPipeline.R +++ b/R/colocboostPipeline.R @@ -72,6 +72,14 @@ #' as the focal outcome. #' @param xqtlColoc,jointGwas,separateGwas Logical flags selecting which #' colocboost variants to run. +#' @param pipCutoffToSkip Individual-level pre-filter (ports the legacy +#' \code{pip_cutoff_to_skip_ind}). Scalar (applied to every context) or a +#' context-named numeric vector. For each context, every outcome is fit +#' with a single-effect SuSiE (\code{L = 1}) and dropped unless some +#' variant's PIP exceeds the cutoff; a context with no surviving outcome is +#' skipped. \code{0} (default) disables it; a negative value uses +#' \code{3 / n_variants}. (Summary-statistic skipping is handled upstream by +#' \code{\link{summaryStatsQc}}'s own \code{pipCutoffToSkip}.) #' @param ... Additional arguments forwarded to #' \code{\link[colocboost]{colocboost}} (e.g., \code{M}, \code{L}, #' \code{output_level}). @@ -144,16 +152,56 @@ setGeneric("colocboostPipeline", invisible(NULL) } +# Resolve the per-context pipCutoffToSkip from either a scalar (applies to +# every context) or a named vector keyed by context. Default 0 (no skip). +.cbResolveCutoff <- function(pipCutoffToSkip, ctx) { + if (is.null(pipCutoffToSkip) || length(pipCutoffToSkip) == 0L) return(0) + if (!is.null(names(pipCutoffToSkip))) { + if (ctx %in% names(pipCutoffToSkip)) return(pipCutoffToSkip[[ctx]]) + return(0) + } + pipCutoffToSkip[[1L]] +} + +# Per-outcome single-trait skip (ports the legacy qc_individual_data +# pip_cutoff_to_skip): for each outcome column of Y, fit a single-effect +# SuSiE (L = 1, max_iter = 100) on (X, Y[, j]) and keep the outcome only if +# any variant's PIP exceeds the cutoff. A cutoff < 0 means 3 / n_variants. +# Returns the retained Y (NULL when no outcome clears the threshold). +.cbPipSkipOutcomes <- function(X, Y, cutoff) { + if (is.null(cutoff) || is.na(cutoff) || cutoff == 0) return(Y) + if (!requireNamespace("susieR", quietly = TRUE)) { + warning("susieR not available; pipCutoffToSkip filter not applied.") + return(Y) + } + if (!is.double(X)) storage.mode(X) <- "double" # susieR needs double X + thr <- if (cutoff < 0) 3 / ncol(X) else cutoff + keep <- logical(ncol(Y)) + for (j in seq_len(ncol(Y))) { + obs <- !is.na(Y[, j]) + if (sum(obs) < 2L) next + pip <- tryCatch( + susieR::susie(X[obs, , drop = FALSE], Y[obs, j], + L = 1, max_iter = 100)$pip, + error = function(e) NULL) + if (!is.null(pip) && any(pip > thr, na.rm = TRUE)) keep[[j]] <- TRUE + } + if (!any(keep)) return(NULL) + Y[, keep, drop = FALSE] +} + # Materialise an individual-level QtlDataset into the colocboost # (X, Y, dict_YX, outcome_names) bundle. Each context becomes one X / # Y pair; the YA matrices are split into single-trait columns and # dict_YX maps each split column back to its X. Returns NULL when no -# context survives selection. +# context survives selection. pipCutoffToSkip (scalar or context-named +# vector) optionally drops weak-signal outcomes / contexts up front. .cbIndividualBundle <- function(qd, contexts = NULL, traitId = NULL, region = NULL, cisWindow = NULL, - samples = NULL) { + samples = NULL, + pipCutoffToSkip = 0) { if (is.null(contexts) || length(contexts) == 0L) { contexts <- getContexts(qd) } else { @@ -198,6 +246,15 @@ setGeneric("colocboostPipeline", } X <- X[common, , drop = FALSE] Y <- Y[common, , drop = FALSE] + cutoffCtx <- .cbResolveCutoff(pipCutoffToSkip, ctx) + if (!is.null(cutoffCtx) && !is.na(cutoffCtx) && cutoffCtx != 0) { + Y <- .cbPipSkipOutcomes(X, Y, cutoffCtx) + if (is.null(Y) || ncol(Y) == 0L) { + message("colocboostPipeline: skipping context '", ctx, + "' (no outcome cleared pipCutoffToSkip = ", cutoffCtx, ").") + next + } + } XperCtx[[ctx]] <- X YperCtx[[ctx]] <- Y } @@ -257,7 +314,8 @@ setGeneric("colocboostPipeline", # Build a single (sumstat data.frame, LD correlation matrix) pair from a # QtlSumStats / GwasSumStats entry. Returns NULL when the entry has no # variants overlapping the ldSketch panel. -.cbSumstatPair <- function(df, ldSketch, varY = NULL) { +.cbSumstatPair <- function(df, ldSketch, varY = NULL, + nCase = NULL, nControl = NULL) { if (is.null(df) || nrow(df) == 0L) return(NULL) variantIds <- df$variant_id if (anyNA(variantIds)) { @@ -278,9 +336,16 @@ setGeneric("colocboostPipeline", df <- df[keep, , drop = FALSE] variantIds <- keptIds + # Case/control GWAS: use the effective sample size + # 4 / (1/nCase + 1/nControl); otherwise the per-variant N. + okCC <- !is.null(nCase) && !is.null(nControl) && + !is.na(nCase) && !is.na(nControl) && + nCase > 0 && nControl > 0 + nVal <- if (okCC) 4 / (1 / nCase + 1 / nControl) + else if (!is.null(df$N)) df$N else NA_real_ ss <- data.frame( z = df$z, - n = if (!is.null(df$N)) df$N else NA_real_, + n = nVal, variant = variantIds, stringsAsFactors = FALSE) if (!is.null(varY) && !is.na(varY)) { @@ -332,7 +397,9 @@ setGeneric("colocboostPipeline", pair <- .cbSumstatPair( df = getSumstatDf(gws, study = st, require = "Z"), ldSketch = ldSketch, - varY = if ("varY" %in% names(gws)) gws$varY[[i]] else NA_real_) + varY = if ("varY" %in% names(gws)) gws$varY[[i]] else NA_real_, + nCase = if ("nCase" %in% names(gws)) gws$nCase[[i]] else NA_real_, + nControl = if ("nControl" %in% names(gws)) gws$nControl[[i]] else NA_real_) if (!is.null(pair)) bundle[[st]] <- pair } bundle @@ -540,6 +607,7 @@ setMethod("colocboostPipeline", "QtlDataset", jointGwas = FALSE, separateGwas = FALSE, samples = NULL, + pipCutoffToSkip = 0, ...) { dotArgs <- list(...) indBundle <- .cbIndividualBundle( @@ -548,7 +616,8 @@ setMethod("colocboostPipeline", "QtlDataset", traitId = traitId, region = region, cisWindow = cisWindow, - samples = samples) + samples = samples, + pipCutoffToSkip = pipCutoffToSkip) .cbDriver(indBundle, qtlPairs = list(), gwasSumStats, xqtlColoc, jointGwas, separateGwas, focalTrait, dotArgs) @@ -591,6 +660,7 @@ setMethod("colocboostPipeline", "MultiStudyQtlDataset", jointGwas = FALSE, separateGwas = FALSE, samples = NULL, + pipCutoffToSkip = 0, ...) { dotArgs <- list(...) @@ -611,7 +681,8 @@ setMethod("colocboostPipeline", "MultiStudyQtlDataset", traitId = traitId, region = region, cisWindow = cisWindow, - samples = samples) + samples = samples, + pipCutoffToSkip = pipCutoffToSkip) if (is.null(sub)) next xOffset <- length(combinedX) yOffset <- length(combinedY) diff --git a/R/gwasSumStats.R b/R/gwasSumStats.R index 063a986a..73088d41 100644 --- a/R/gwasSumStats.R +++ b/R/gwasSumStats.R @@ -86,11 +86,20 @@ NULL #' @param varY Optional numeric vector of per-study phenotype variances #' (\code{NA_real_} entries allowed). Used by the sufficient-statistic #' interface; z-score RSS analyses should leave entries as NA. +#' @param nCase,nControl Optional per-study case / control counts. The +#' columns are attached \strong{only when supplied} (default \code{NULL}), +#' so quantitative-trait collections keep the original schema. When given, +#' pass length 1 or length(study) (use \code{NA} for the non-case/control +#' studies in a mixed collection). For case/control GWAS, downstream +#' consumers (e.g. \code{\link{colocboostPipeline}}) use the effective +#' sample size \code{4 / (1/nCase + 1/nControl)} in place of the +#' per-variant \code{N}. #' @param ... Additional per-study columns to attach to the collection. #' @return A \code{GwasSumStats} object. #' @export GwasSumStats <- function(study, entry, genome, ldSketch, - varY = NA_real_, qcInfo = list(), ...) { + varY = NA_real_, nCase = NULL, + nControl = NULL, qcInfo = list(), ...) { if (missing(study) || missing(entry) || missing(genome) || missing(ldSketch)) { stop("`study`, `entry`, `genome`, and `ldSketch` are all required.") } @@ -105,18 +114,26 @@ GwasSumStats <- function(study, entry, genome, ldSketch, stop("length(entry) (", length(entry), ") must equal length(study) (", length(study), ").") } - if (length(varY) == 1L && length(study) > 1L) { - varY <- rep(varY, length(study)) - } - if (length(varY) != length(study)) { - stop("`varY` must have length 1 or length(study).") + # Per-study scalars: recycle a single value to one-per-study. + recycle <- function(v, nm) { + if (length(v) == 1L && length(study) > 1L) v <- rep(v, length(study)) + if (length(v) != length(study)) + stop("`", nm, "` must have length 1 or length(study).") + as.numeric(v) } + varY <- recycle(varY, "varY") cols <- list( study = as.character(study), entry = S4Vectors::SimpleList(entry), - varY = as.numeric(varY) + varY = varY ) + # nCase / nControl are OPTIONAL: the columns are attached only when + # supplied (default NULL), so quantitative-trait GwasSumStats keep the + # original schema. Provide a vector (length = n studies) with NA for the + # non-case/control studies in a mixed collection. + if (!is.null(nCase)) cols$nCase <- recycle(nCase, "nCase") + if (!is.null(nControl)) cols$nControl <- recycle(nControl, "nControl") extras <- list(...) for (nm in names(extras)) cols[[nm]] <- extras[[nm]] df <- do.call(S4Vectors::DataFrame, c(cols, list(check.names = FALSE))) diff --git a/R/sumstatsQc.R b/R/sumstatsQc.R index 91239807..2b6cdb68 100644 --- a/R/sumstatsQc.R +++ b/R/sumstatsQc.R @@ -3424,6 +3424,11 @@ summaryStatsQc <- function(sumstats, genome = getGenome(sumstats), ldSketch = getLdSketch(sumstats), varY = as.numeric(sumstats$varY), + # Preserve the optional per-study case/control counts through QC. + nCase = if ("nCase" %in% names(sumstats)) + as.numeric(sumstats$nCase) else NULL, + nControl = if ("nControl" %in% names(sumstats)) + as.numeric(sumstats$nControl) else NULL, qcInfo = qcInfo) } else { QtlSumStats( diff --git a/man/GwasSumStats.Rd b/man/GwasSumStats.Rd index 5a5c584e..672dad03 100644 --- a/man/GwasSumStats.Rd +++ b/man/GwasSumStats.Rd @@ -10,6 +10,8 @@ GwasSumStats( genome, ldSketch, varY = NA_real_, + nCase = NULL, + nControl = NULL, qcInfo = list(), ... ) @@ -30,6 +32,15 @@ because all entries share the same LD sketch.} (\code{NA_real_} entries allowed). Used by the sufficient-statistic interface; z-score RSS analyses should leave entries as NA.} +\item{nCase, nControl}{Optional per-study case / control counts. The +columns are attached \strong{only when supplied} (default \code{NULL}), +so quantitative-trait collections keep the original schema. When given, +pass length 1 or length(study) (use \code{NA} for the non-case/control +studies in a mixed collection). For case/control GWAS, downstream +consumers (e.g. \code{\link{colocboostPipeline}}) use the effective +sample size \code{4 / (1/nCase + 1/nControl)} in place of the +per-variant \code{N}.} + \item{...}{Additional per-study columns to attach to the collection.} } \value{ diff --git a/man/colocboostPipeline.Rd b/man/colocboostPipeline.Rd index 8bb73373..03357802 100644 --- a/man/colocboostPipeline.Rd +++ b/man/colocboostPipeline.Rd @@ -22,6 +22,7 @@ colocboostPipeline(qtlData, gwasSumStats = NULL, ...) jointGwas = FALSE, separateGwas = FALSE, samples = NULL, + pipCutoffToSkip = 0, ... ) @@ -51,6 +52,7 @@ colocboostPipeline(qtlData, gwasSumStats = NULL, ...) jointGwas = FALSE, separateGwas = FALSE, samples = NULL, + pipCutoffToSkip = 0, ... ) @@ -91,6 +93,15 @@ as the focal outcome.} \item{xqtlColoc, jointGwas, separateGwas}{Logical flags selecting which colocboost variants to run.} + +\item{pipCutoffToSkip}{Individual-level pre-filter (ports the legacy +\code{pip_cutoff_to_skip_ind}). Scalar (applied to every context) or a +context-named numeric vector. For each context, every outcome is fit +with a single-effect SuSiE (\code{L = 1}) and dropped unless some +variant's PIP exceeds the cutoff; a context with no surviving outcome is +skipped. \code{0} (default) disables it; a negative value uses +\code{3 / n_variants}. (Summary-statistic skipping is handled upstream by +\code{\link{summaryStatsQc}}'s own \code{pipCutoffToSkip}.)} } \value{ A list with elements \code{xqtl_coloc}, \code{joint_gwas}, diff --git a/tests/testthat/test_colocboostPipeline.R b/tests/testthat/test_colocboostPipeline.R index 0afdc0d3..d0cc7469 100644 --- a/tests/testthat/test_colocboostPipeline.R +++ b/tests/testthat/test_colocboostPipeline.R @@ -422,3 +422,73 @@ test_that("colocboostPipeline: GWAS ldSketch mismatch errors during the driver", "different sample sets" ) }) + +# =========================================================================== +# GWAS case/control: optional nCase/nControl columns + effective-N wiring +# =========================================================================== + +test_that("GwasSumStats: nCase/nControl are optional columns (absent by default)", { + gr <- GenomicRanges::GRanges( + seqnames = "chr1", + ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 5L), + width = 1L)) + S4Vectors::mcols(gr) <- S4Vectors::DataFrame( + SNP = paste0("v", 1:5), A1 = "A", A2 = "G", Z = rnorm(5), N = rep(1000L, 5)) + base <- list(study = "G1", entry = list(gr), genome = "hg19", + ldSketch = .cbp_makeHandle(), qcInfo = list(ok = 1)) + g0 <- do.call(GwasSumStats, base) + expect_false(any(c("nCase", "nControl") %in% names(g0))) + g1 <- do.call(GwasSumStats, c(base, list(nCase = 500, nControl = 1500))) + expect_true(all(c("nCase", "nControl") %in% names(g1))) + expect_equal(g1$nCase, 500) + expect_equal(g1$nControl, 1500) +}) + +test_that("colocboost GWAS bundle: effective N for case/control, per-variant N otherwise", { + gr <- GenomicRanges::GRanges( + seqnames = "chr1", + ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 5L), + width = 1L)) + S4Vectors::mcols(gr) <- S4Vectors::DataFrame( + SNP = paste0("v", 1:5), A1 = "A", A2 = "G", Z = rnorm(5), N = rep(1000L, 5)) + base <- list(study = "G1", entry = list(gr), genome = "hg19", + ldSketch = .cbp_makeHandle(), qcInfo = list(ok = 1)) + local_mocked_bindings(extractBlockGenotypes = .cbp_mockExtractor(), + .package = "pecotmr") + # case/control -> effective N = 4 / (1/500 + 1/1500) = 1500 + gcc <- do.call(GwasSumStats, c(base, list(nCase = 500, nControl = 1500))) + bcc <- pecotmr:::.cbGwasSumStatsBundle(gcc) + expect_true(all(bcc[["G1"]]$sumstat$n == 4 / (1/500 + 1/1500))) + # quantitative (no nCase/nControl) -> per-variant N (1000) + bq <- pecotmr:::.cbGwasSumStatsBundle(do.call(GwasSumStats, base)) + expect_true(all(bq[["G1"]]$sumstat$n == 1000L)) +}) + +# =========================================================================== +# pipCutoffToSkip: per-context single-trait (L=1 SuSiE) outcome skip +# =========================================================================== + +test_that(".cbPipSkipOutcomes: keeps signal outcomes, drops noise, honours cutoff", { + skip_if_not_installed("susieR") + set.seed(1) + n <- 200L; p <- 20L + X <- matrix(rbinom(n * p, 2, 0.3), n, p, + dimnames = list(paste0("s", 1:n), paste0("v", 1:p))) + Y <- cbind(sig = X[, 1] * 1.5 + rnorm(n, sd = 0.3), # strong signal at v1 + noise = rnorm(n)) # null + # cutoff 0 -> no-op + expect_identical(pecotmr:::.cbPipSkipOutcomes(X, Y, 0), Y) + # cutoff 0.5 -> keep the signal outcome, drop the noise outcome + kept <- pecotmr:::.cbPipSkipOutcomes(X, Y, 0.5) + expect_equal(colnames(kept), "sig") + # all-noise -> NULL (whole context would be skipped) + Yn <- cbind(n1 = rnorm(n), n2 = rnorm(n)) + expect_null(pecotmr:::.cbPipSkipOutcomes(X, Yn, 0.5)) +}) + +test_that(".cbResolveCutoff: scalar applies to all; named vector is per-context", { + expect_equal(pecotmr:::.cbResolveCutoff(0.5, "brain"), 0.5) + expect_equal(pecotmr:::.cbResolveCutoff(c(brain = 0.3, blood = 0.7), "blood"), 0.7) + expect_equal(pecotmr:::.cbResolveCutoff(c(brain = 0.3), "missing"), 0) + expect_equal(pecotmr:::.cbResolveCutoff(NULL, "brain"), 0) +}) diff --git a/tests/testthat/test_sumstatsQc.R b/tests/testthat/test_sumstatsQc.R index 3580a35a..a91de8ae 100644 --- a/tests/testthat/test_sumstatsQc.R +++ b/tests/testthat/test_sumstatsQc.R @@ -4464,3 +4464,15 @@ test_that("autoDecision assigns SER for single CS", { expect_false(result$tagged_cs[1]) expect_equal(result$method, "SER") }) + +test_that("summaryStatsQc: preserves optional nCase/nControl columns through QC", { + gr <- .ssQ_makeEntryGr(paste0("rs", 1:4), c(100L, 200L, 300L, 400L)) + ss <- GwasSumStats(study = "g1", entry = list(gr), genome = "hg19", + ldSketch = .ssQ_makeHandle(), + nCase = 500, nControl = 1500) + expect_true(all(c("nCase", "nControl") %in% names(ss))) + out <- summaryStatsQc(ss, pipCutoffToSkip = 0, nCutoff = 0) + expect_true(all(c("nCase", "nControl") %in% names(out))) + expect_equal(out$nCase, 500) + expect_equal(out$nControl, 1500) +})