Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 78 additions & 7 deletions R/colocboostPipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@
#' as the focal outcome.
#' @param xqtlColoc,jointGwas,separateGwas Logical flags selecting which
#' colocboost variants to run.
#' @param pipCutoffToSkip Individual-level pre-filter (ports the legacy
#' \code{pip_cutoff_to_skip_ind}). Scalar (applied to every context) or a
#' context-named numeric vector. For each context, every outcome is fit
#' with a single-effect SuSiE (\code{L = 1}) and dropped unless some
#' variant's PIP exceeds the cutoff; a context with no surviving outcome is
#' skipped. \code{0} (default) disables it; a negative value uses
#' \code{3 / n_variants}. (Summary-statistic skipping is handled upstream by
#' \code{\link{summaryStatsQc}}'s own \code{pipCutoffToSkip}.)
#' @param ... Additional arguments forwarded to
#' \code{\link[colocboost]{colocboost}} (e.g., \code{M}, \code{L},
#' \code{output_level}).
Expand Down Expand Up @@ -144,16 +152,56 @@ setGeneric("colocboostPipeline",
invisible(NULL)
}

# Resolve the per-context pipCutoffToSkip from either a scalar (applies to
# every context) or a named vector keyed by context. Default 0 (no skip).
.cbResolveCutoff <- function(pipCutoffToSkip, ctx) {
if (is.null(pipCutoffToSkip) || length(pipCutoffToSkip) == 0L) return(0)
if (!is.null(names(pipCutoffToSkip))) {
if (ctx %in% names(pipCutoffToSkip)) return(pipCutoffToSkip[[ctx]])
return(0)
}
pipCutoffToSkip[[1L]]
}

# Per-outcome single-trait skip (ports the legacy qc_individual_data
# pip_cutoff_to_skip): for each outcome column of Y, fit a single-effect
# SuSiE (L = 1, max_iter = 100) on (X, Y[, j]) and keep the outcome only if
# any variant's PIP exceeds the cutoff. A cutoff < 0 means 3 / n_variants.
# Returns the retained Y (NULL when no outcome clears the threshold).
.cbPipSkipOutcomes <- function(X, Y, cutoff) {
if (is.null(cutoff) || is.na(cutoff) || cutoff == 0) return(Y)
if (!requireNamespace("susieR", quietly = TRUE)) {
warning("susieR not available; pipCutoffToSkip filter not applied.")
return(Y)
}
if (!is.double(X)) storage.mode(X) <- "double" # susieR needs double X
thr <- if (cutoff < 0) 3 / ncol(X) else cutoff
keep <- logical(ncol(Y))
for (j in seq_len(ncol(Y))) {
obs <- !is.na(Y[, j])
if (sum(obs) < 2L) next
pip <- tryCatch(
susieR::susie(X[obs, , drop = FALSE], Y[obs, j],
L = 1, max_iter = 100)$pip,
error = function(e) NULL)
if (!is.null(pip) && any(pip > thr, na.rm = TRUE)) keep[[j]] <- TRUE
}
if (!any(keep)) return(NULL)
Y[, keep, drop = FALSE]
}

# Materialise an individual-level QtlDataset into the colocboost
# (X, Y, dict_YX, outcome_names) bundle. Each context becomes one X /
# Y pair; the YA matrices are split into single-trait columns and
# dict_YX maps each split column back to its X. Returns NULL when no
# context survives selection.
# context survives selection. pipCutoffToSkip (scalar or context-named
# vector) optionally drops weak-signal outcomes / contexts up front.
.cbIndividualBundle <- function(qd, contexts = NULL,
traitId = NULL,
region = NULL,
cisWindow = NULL,
samples = NULL) {
samples = NULL,
pipCutoffToSkip = 0) {
if (is.null(contexts) || length(contexts) == 0L) {
contexts <- getContexts(qd)
} else {
Expand Down Expand Up @@ -198,6 +246,15 @@ setGeneric("colocboostPipeline",
}
X <- X[common, , drop = FALSE]
Y <- Y[common, , drop = FALSE]
cutoffCtx <- .cbResolveCutoff(pipCutoffToSkip, ctx)
if (!is.null(cutoffCtx) && !is.na(cutoffCtx) && cutoffCtx != 0) {
Y <- .cbPipSkipOutcomes(X, Y, cutoffCtx)
if (is.null(Y) || ncol(Y) == 0L) {
message("colocboostPipeline: skipping context '", ctx,
"' (no outcome cleared pipCutoffToSkip = ", cutoffCtx, ").")
next
}
}
XperCtx[[ctx]] <- X
YperCtx[[ctx]] <- Y
}
Expand Down Expand Up @@ -257,7 +314,8 @@ setGeneric("colocboostPipeline",
# Build a single (sumstat data.frame, LD correlation matrix) pair from a
# QtlSumStats / GwasSumStats entry. Returns NULL when the entry has no
# variants overlapping the ldSketch panel.
.cbSumstatPair <- function(df, ldSketch, varY = NULL) {
.cbSumstatPair <- function(df, ldSketch, varY = NULL,
nCase = NULL, nControl = NULL) {
if (is.null(df) || nrow(df) == 0L) return(NULL)
variantIds <- df$variant_id
if (anyNA(variantIds)) {
Expand All @@ -278,9 +336,16 @@ setGeneric("colocboostPipeline",
df <- df[keep, , drop = FALSE]
variantIds <- keptIds

# Case/control GWAS: use the effective sample size
# 4 / (1/nCase + 1/nControl); otherwise the per-variant N.
okCC <- !is.null(nCase) && !is.null(nControl) &&
!is.na(nCase) && !is.na(nControl) &&
nCase > 0 && nControl > 0
nVal <- if (okCC) 4 / (1 / nCase + 1 / nControl)
else if (!is.null(df$N)) df$N else NA_real_
ss <- data.frame(
z = df$z,
n = if (!is.null(df$N)) df$N else NA_real_,
n = nVal,
variant = variantIds,
stringsAsFactors = FALSE)
if (!is.null(varY) && !is.na(varY)) {
Expand Down Expand Up @@ -332,7 +397,9 @@ setGeneric("colocboostPipeline",
pair <- .cbSumstatPair(
df = getSumstatDf(gws, study = st, require = "Z"),
ldSketch = ldSketch,
varY = if ("varY" %in% names(gws)) gws$varY[[i]] else NA_real_)
varY = if ("varY" %in% names(gws)) gws$varY[[i]] else NA_real_,
nCase = if ("nCase" %in% names(gws)) gws$nCase[[i]] else NA_real_,
nControl = if ("nControl" %in% names(gws)) gws$nControl[[i]] else NA_real_)
if (!is.null(pair)) bundle[[st]] <- pair
}
bundle
Expand Down Expand Up @@ -540,6 +607,7 @@ setMethod("colocboostPipeline", "QtlDataset",
jointGwas = FALSE,
separateGwas = FALSE,
samples = NULL,
pipCutoffToSkip = 0,
...) {
dotArgs <- list(...)
indBundle <- .cbIndividualBundle(
Expand All @@ -548,7 +616,8 @@ setMethod("colocboostPipeline", "QtlDataset",
traitId = traitId,
region = region,
cisWindow = cisWindow,
samples = samples)
samples = samples,
pipCutoffToSkip = pipCutoffToSkip)
.cbDriver(indBundle, qtlPairs = list(), gwasSumStats,
xqtlColoc, jointGwas, separateGwas,
focalTrait, dotArgs)
Expand Down Expand Up @@ -591,6 +660,7 @@ setMethod("colocboostPipeline", "MultiStudyQtlDataset",
jointGwas = FALSE,
separateGwas = FALSE,
samples = NULL,
pipCutoffToSkip = 0,
...) {
dotArgs <- list(...)

Expand All @@ -611,7 +681,8 @@ setMethod("colocboostPipeline", "MultiStudyQtlDataset",
traitId = traitId,
region = region,
cisWindow = cisWindow,
samples = samples)
samples = samples,
pipCutoffToSkip = pipCutoffToSkip)
if (is.null(sub)) next
xOffset <- length(combinedX)
yOffset <- length(combinedY)
Expand Down
31 changes: 24 additions & 7 deletions R/gwasSumStats.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,20 @@ NULL
#' @param varY Optional numeric vector of per-study phenotype variances
#' (\code{NA_real_} entries allowed). Used by the sufficient-statistic
#' interface; z-score RSS analyses should leave entries as NA.
#' @param nCase,nControl Optional per-study case / control counts. The
#' columns are attached \strong{only when supplied} (default \code{NULL}),
#' so quantitative-trait collections keep the original schema. When given,
#' pass length 1 or length(study) (use \code{NA} for the non-case/control
#' studies in a mixed collection). For case/control GWAS, downstream
#' consumers (e.g. \code{\link{colocboostPipeline}}) use the effective
#' sample size \code{4 / (1/nCase + 1/nControl)} in place of the
#' per-variant \code{N}.
#' @param ... Additional per-study columns to attach to the collection.
#' @return A \code{GwasSumStats} object.
#' @export
GwasSumStats <- function(study, entry, genome, ldSketch,
varY = NA_real_, qcInfo = list(), ...) {
varY = NA_real_, nCase = NULL,
nControl = NULL, qcInfo = list(), ...) {
if (missing(study) || missing(entry) || missing(genome) || missing(ldSketch)) {
stop("`study`, `entry`, `genome`, and `ldSketch` are all required.")
}
Expand All @@ -105,18 +114,26 @@ GwasSumStats <- function(study, entry, genome, ldSketch,
stop("length(entry) (", length(entry),
") must equal length(study) (", length(study), ").")
}
if (length(varY) == 1L && length(study) > 1L) {
varY <- rep(varY, length(study))
}
if (length(varY) != length(study)) {
stop("`varY` must have length 1 or length(study).")
# Per-study scalars: recycle a single value to one-per-study.
recycle <- function(v, nm) {
if (length(v) == 1L && length(study) > 1L) v <- rep(v, length(study))
if (length(v) != length(study))
stop("`", nm, "` must have length 1 or length(study).")
as.numeric(v)
}
varY <- recycle(varY, "varY")

cols <- list(
study = as.character(study),
entry = S4Vectors::SimpleList(entry),
varY = as.numeric(varY)
varY = varY
)
# nCase / nControl are OPTIONAL: the columns are attached only when
# supplied (default NULL), so quantitative-trait GwasSumStats keep the
# original schema. Provide a vector (length = n studies) with NA for the
# non-case/control studies in a mixed collection.
if (!is.null(nCase)) cols$nCase <- recycle(nCase, "nCase")
if (!is.null(nControl)) cols$nControl <- recycle(nControl, "nControl")
extras <- list(...)
for (nm in names(extras)) cols[[nm]] <- extras[[nm]]
df <- do.call(S4Vectors::DataFrame, c(cols, list(check.names = FALSE)))
Expand Down
5 changes: 5 additions & 0 deletions R/sumstatsQc.R
Original file line number Diff line number Diff line change
Expand Up @@ -3424,6 +3424,11 @@ summaryStatsQc <- function(sumstats,
genome = getGenome(sumstats),
ldSketch = getLdSketch(sumstats),
varY = as.numeric(sumstats$varY),
# Preserve the optional per-study case/control counts through QC.
nCase = if ("nCase" %in% names(sumstats))
as.numeric(sumstats$nCase) else NULL,
nControl = if ("nControl" %in% names(sumstats))
as.numeric(sumstats$nControl) else NULL,
qcInfo = qcInfo)
} else {
QtlSumStats(
Expand Down
11 changes: 11 additions & 0 deletions man/GwasSumStats.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions man/colocboostPipeline.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

70 changes: 70 additions & 0 deletions tests/testthat/test_colocboostPipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -422,3 +422,73 @@ test_that("colocboostPipeline: GWAS ldSketch mismatch errors during the driver",
"different sample sets"
)
})

# ===========================================================================
# GWAS case/control: optional nCase/nControl columns + effective-N wiring
# ===========================================================================

test_that("GwasSumStats: nCase/nControl are optional columns (absent by default)", {
gr <- GenomicRanges::GRanges(
seqnames = "chr1",
ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 5L),
width = 1L))
S4Vectors::mcols(gr) <- S4Vectors::DataFrame(
SNP = paste0("v", 1:5), A1 = "A", A2 = "G", Z = rnorm(5), N = rep(1000L, 5))
base <- list(study = "G1", entry = list(gr), genome = "hg19",
ldSketch = .cbp_makeHandle(), qcInfo = list(ok = 1))
g0 <- do.call(GwasSumStats, base)
expect_false(any(c("nCase", "nControl") %in% names(g0)))
g1 <- do.call(GwasSumStats, c(base, list(nCase = 500, nControl = 1500)))
expect_true(all(c("nCase", "nControl") %in% names(g1)))
expect_equal(g1$nCase, 500)
expect_equal(g1$nControl, 1500)
})

test_that("colocboost GWAS bundle: effective N for case/control, per-variant N otherwise", {
gr <- GenomicRanges::GRanges(
seqnames = "chr1",
ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 5L),
width = 1L))
S4Vectors::mcols(gr) <- S4Vectors::DataFrame(
SNP = paste0("v", 1:5), A1 = "A", A2 = "G", Z = rnorm(5), N = rep(1000L, 5))
base <- list(study = "G1", entry = list(gr), genome = "hg19",
ldSketch = .cbp_makeHandle(), qcInfo = list(ok = 1))
local_mocked_bindings(extractBlockGenotypes = .cbp_mockExtractor(),
.package = "pecotmr")
# case/control -> effective N = 4 / (1/500 + 1/1500) = 1500
gcc <- do.call(GwasSumStats, c(base, list(nCase = 500, nControl = 1500)))
bcc <- pecotmr:::.cbGwasSumStatsBundle(gcc)
expect_true(all(bcc[["G1"]]$sumstat$n == 4 / (1/500 + 1/1500)))
# quantitative (no nCase/nControl) -> per-variant N (1000)
bq <- pecotmr:::.cbGwasSumStatsBundle(do.call(GwasSumStats, base))
expect_true(all(bq[["G1"]]$sumstat$n == 1000L))
})

# ===========================================================================
# pipCutoffToSkip: per-context single-trait (L=1 SuSiE) outcome skip
# ===========================================================================

test_that(".cbPipSkipOutcomes: keeps signal outcomes, drops noise, honours cutoff", {
skip_if_not_installed("susieR")
set.seed(1)
n <- 200L; p <- 20L
X <- matrix(rbinom(n * p, 2, 0.3), n, p,
dimnames = list(paste0("s", 1:n), paste0("v", 1:p)))
Y <- cbind(sig = X[, 1] * 1.5 + rnorm(n, sd = 0.3), # strong signal at v1
noise = rnorm(n)) # null
# cutoff 0 -> no-op
expect_identical(pecotmr:::.cbPipSkipOutcomes(X, Y, 0), Y)
# cutoff 0.5 -> keep the signal outcome, drop the noise outcome
kept <- pecotmr:::.cbPipSkipOutcomes(X, Y, 0.5)
expect_equal(colnames(kept), "sig")
# all-noise -> NULL (whole context would be skipped)
Yn <- cbind(n1 = rnorm(n), n2 = rnorm(n))
expect_null(pecotmr:::.cbPipSkipOutcomes(X, Yn, 0.5))
})

test_that(".cbResolveCutoff: scalar applies to all; named vector is per-context", {
expect_equal(pecotmr:::.cbResolveCutoff(0.5, "brain"), 0.5)
expect_equal(pecotmr:::.cbResolveCutoff(c(brain = 0.3, blood = 0.7), "blood"), 0.7)
expect_equal(pecotmr:::.cbResolveCutoff(c(brain = 0.3), "missing"), 0)
expect_equal(pecotmr:::.cbResolveCutoff(NULL, "brain"), 0)
})
12 changes: 12 additions & 0 deletions tests/testthat/test_sumstatsQc.R
Original file line number Diff line number Diff line change
Expand Up @@ -4464,3 +4464,15 @@ test_that("autoDecision assigns SER for single CS", {
expect_false(result$tagged_cs[1])
expect_equal(result$method, "SER")
})

test_that("summaryStatsQc: preserves optional nCase/nControl columns through QC", {
gr <- .ssQ_makeEntryGr(paste0("rs", 1:4), c(100L, 200L, 300L, 400L))
ss <- GwasSumStats(study = "g1", entry = list(gr), genome = "hg19",
ldSketch = .ssQ_makeHandle(),
nCase = 500, nControl = 1500)
expect_true(all(c("nCase", "nControl") %in% names(ss)))
out <- summaryStatsQc(ss, pipCutoffToSkip = 0, nCutoff = 0)
expect_true(all(c("nCase", "nControl") %in% names(out)))
expect_equal(out$nCase, 500)
expect_equal(out$nControl, 1500)
})
Loading