From 00def7d1ee8cf37debedc03d4eca40f6d06437d0 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Tue, 23 Jun 2026 19:13:42 -0700 Subject: [PATCH 1/2] add fine-mapping CV support and other features --- NAMESPACE | 3 + R/AllGenerics.R | 15 + R/FineMappingEntry.R | 24 +- R/QtlDataset.R | 39 +- R/QtlFineMappingResult.R | 8 + R/fineMappingPipeline.R | 414 ++++++++++++++++++++- R/fineMappingWrappers.R | 7 + R/regularizedRegressionWrappers.R | 170 ++++++++- R/twasWeights.R | 17 +- R/twasWeightsPipeline.R | 208 +++++++++-- man/FineMappingEntry.Rd | 6 +- man/fineMappingPipeline.Rd | 20 + man/fsusieWeights.Rd | 63 ++++ man/getCvResult.Rd | 33 ++ man/twasWeightsPipeline.Rd | 10 + tests/testthat/test_FineMappingEntry.R | 24 ++ tests/testthat/test_QtlDataset.R | 32 ++ tests/testthat/test_QtlFineMappingResult.R | 14 + tests/testthat/test_fineMappingPipeline.R | 204 ++++++++++ tests/testthat/test_rrMrmashMvsusie.R | 108 +++++- tests/testthat/test_twasWeightsPipeline.R | 141 ++++++- 21 files changed, 1501 insertions(+), 59 deletions(-) create mode 100644 man/fsusieWeights.Rd create mode 100644 man/getCvResult.Rd diff --git a/NAMESPACE b/NAMESPACE index db66db93..bf58241d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,6 +83,7 @@ export(fitSusieInfThenSusieRss) export(fixupExampleGenotypePaths) export(formatFinemappingOutput) export(fsusieGetCs) +export(fsusieWeights) export(fsusieWrapper) export(getAnnotationMeta) export(getAnnotations) @@ -95,6 +96,7 @@ export(getCorrelation) export(getCs) export(getCtwasMetaData) export(getCvPerformance) +export(getCvResult) export(getDataType) export(getEigenList) export(getEnrichment) @@ -281,6 +283,7 @@ exportMethods(getContexts) exportMethods(getCorrelation) exportMethods(getCs) exportMethods(getCvPerformance) +exportMethods(getCvResult) exportMethods(getDataType) exportMethods(getEigenList) exportMethods(getEnrichment) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 6b32d210..8f93337a 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -397,6 +397,21 @@ setGeneric("getPip", function(x, ...) standardGeneric("getPip")) #' @export setGeneric("getSusieFit", function(x, ...) standardGeneric("getSusieFit")) +#' @title Get Cross-Validation Result +#' @description Extract the cross-validation payload stored on a +#' \code{FineMappingEntry} (or the matching entry of a +#' \code{FineMappingResult}). The payload is a list with components +#' \code{samplePartition} (a \code{data.frame} of \code{Sample}/\code{Fold} +#' assignments), \code{predictions} (a named list of per-method out-of-fold +#' prediction matrices), and \code{performance} (a named list of per-method +#' metric matrices). \code{NULL} when fine-mapping was run without +#' cross-validation (\code{cvFolds <= 1}). +#' @param x A \code{FineMappingEntry} or \code{FineMappingResult}. +#' @param ... Class-specific selection arguments. +#' @return A list (the CV payload) or \code{NULL}. +#' @export +setGeneric("getCvResult", function(x, ...) standardGeneric("getCvResult")) + #' @title Get Marginal Effects #' @description Extract per-variant marginal univariate effects from a #' fine-mapping entry or result. Returns a \code{data.frame} with diff --git a/R/FineMappingEntry.R b/R/FineMappingEntry.R index f948d201..3b1bb0b2 100644 --- a/R/FineMappingEntry.R +++ b/R/FineMappingEntry.R @@ -29,11 +29,15 @@ setClass("FineMappingEntry", representation( variantIds = "character", susieFit = "ANY", - topLoci = "data.frame" + topLoci = "data.frame", + cvResult = "ANY" ), validity = function(object) { errors <- character() n <- length(object@variantIds) + if (!is.null(object@cvResult) && !is.list(object@cvResult)) + errors <- c(errors, + "cvResult must be NULL or a list (samplePartition/predictions/performance)") if (nrow(object@topLoci) > 0L) { # Minimal contract: variant_id + pip. Canonical projector columns # (marginal_*, posterior_*, etc.) are pipeline-populated; tests @@ -85,13 +89,17 @@ setClass("FineMappingEntry", #' (\code{pip, posterior_mean, posterior_sd, cs_*, cs_*_purity}), #' pipeline stamps (\code{method, gene, event, grange_start, #' grange_end}). Unfiltered: one row per variant in the fit. +#' @param cvResult Optional cross-validation payload (list with +#' \code{samplePartition}, \code{predictions}, \code{performance}) recorded +#' when fine-mapping is run with \code{cvFolds > 1}. \code{NULL} otherwise. #' @return A \code{FineMappingEntry} object. #' @export -FineMappingEntry <- function(variantIds, susieFit, topLoci) { +FineMappingEntry <- function(variantIds, susieFit, topLoci, cvResult = NULL) { obj <- new("FineMappingEntry", variantIds = as.character(variantIds), susieFit = susieFit, - topLoci = as.data.frame(topLoci)) + topLoci = as.data.frame(topLoci), + cvResult = cvResult) validObject(obj) obj } @@ -110,6 +118,11 @@ setMethod("getVariantIds", "FineMappingEntry", setMethod("getSusieFit", "FineMappingEntry", function(x, ...) x@susieFit) +#' @rdname getCvResult +#' @export +setMethod("getCvResult", "FineMappingEntry", + function(x, ...) x@cvResult) + #' @rdname getTopLoci #' @export setMethod("getTopLoci", "FineMappingEntry", @@ -223,10 +236,13 @@ setMethod("adjustPips", "FineMappingEntry", } } } + # cvResult is sample-indexed (partition + held-out predictions), not + # variant-indexed, so it carries through a variant-subsetting unchanged. new("FineMappingEntry", variantIds = common, susieFit = fit, - topLoci = newTopLoci) + topLoci = newTopLoci, + cvResult = x@cvResult) }) #' @export diff --git a/R/QtlDataset.R b/R/QtlDataset.R index 6fe18cf4..7625d232 100644 --- a/R/QtlDataset.R +++ b/R/QtlDataset.R @@ -26,14 +26,18 @@ setClass("QtlDataset", xvarCutoff = "numeric", imissCutoff = "numeric", keepSamples = "character", - keepVariants = "character" + keepVariants = "character", + keepIndel = "logical" ), + prototype = prototype(keepIndel = TRUE), validity = function(object) { errors <- character() if (length(object@study) != 1L || !nzchar(object@study)) errors <- c(errors, "'study' must be a single non-empty character string") if (length(object@scaleResiduals) != 1L) errors <- c(errors, "'scaleResiduals' must be a single logical value") + if (length(object@keepIndel) != 1L || is.na(object@keepIndel)) + errors <- c(errors, "'keepIndel' must be a single logical value") for (nm in c("mafCutoff", "macCutoff", "xvarCutoff", "imissCutoff")) { v <- methods::slot(object, nm) if (length(v) != 1L || is.na(v) || !is.finite(v) || v < 0) @@ -132,6 +136,9 @@ setClass("QtlDataset", #' @param genotypeCovariates Numeric matrix of genotype-derived covariates #' (e.g., ancestry PCs); rows are samples. #' @param scaleResiduals Logical (length 1). Default \code{TRUE}. +#' @param keepIndel Logical (length 1). When \code{FALSE}, variants whose +#' alleles are not single nucleotides (indels) are dropped at extraction. +#' Default \code{TRUE} (keep all variants). #' @return A \code{QtlDataset} object. #' @export QtlDataset <- function(study, genotypes, phenotypes, @@ -142,7 +149,8 @@ QtlDataset <- function(study, genotypes, phenotypes, xvarCutoff = 0, imissCutoff = 0, keepSamples = character(0), - keepVariants = character(0)) { + keepVariants = character(0), + keepIndel = TRUE) { obj <- new("QtlDataset", study = as.character(study), genotypes = genotypes, @@ -154,7 +162,8 @@ QtlDataset <- function(study, genotypes, phenotypes, xvarCutoff = as.numeric(xvarCutoff), imissCutoff = as.numeric(imissCutoff), keepSamples = as.character(keepSamples), - keepVariants = as.character(keepVariants)) + keepVariants = as.character(keepVariants), + keepIndel = isTRUE(keepIndel)) validObject(obj) obj } @@ -271,6 +280,12 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) unique(idx) } +# Internal: keepIndel slot read, tolerant of QtlDataset objects serialized +# before the slot existed (treat a missing slot as TRUE = keep indels). +.qtlKeepIndel <- function(x) { + isTRUE(tryCatch(x@keepIndel, error = function(e) TRUE)) +} + # Internal: extract the panel dosage block (samples x variants) for the # requested region, narrow to the requested sample set, and apply lazy QC # (per-sample imiss filter, then per-variant max(mafCutoff, @@ -315,6 +330,24 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) } } + # Drop indels (variants whose A1/A2 are not single nucleotides) unless kept, + # before materialization so we do not extract dosage we will immediately drop. + if (!.qtlKeepIndel(x)) { + si <- x@genotypes@snpInfo + snpMask <- nchar(as.character(si$A1[snpIdx])) == 1L & + nchar(as.character(si$A2[snpIdx])) == 1L + snpIdx <- snpIdx[snpMask] + if (length(snpIdx) == 0L) { + return(list( + geno = matrix(numeric(0), nrow = 0L, ncol = 0L, + dimnames = list(character(0), character(0))), + variantIds = character(0), + sampleIds = character(0), + maf = numeric(0) + )) + } + } + block <- extractBlockGenotypes(x@genotypes, snpIdx, meanImpute = FALSE) # `block` is variants x samples (Bioc convention); transpose to # samples x variants for analysis-style operations. diff --git a/R/QtlFineMappingResult.R b/R/QtlFineMappingResult.R index dbb864c6..90291c5c 100644 --- a/R/QtlFineMappingResult.R +++ b/R/QtlFineMappingResult.R @@ -165,6 +165,14 @@ setMethod("getCs", "QtlFineMappingResult", getCs(entry, coverage = coverage) }) +#' @rdname getCvResult +#' @export +setMethod("getCvResult", "QtlFineMappingResult", + function(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) { + entry <- getFineMappingResult(x, study, context, trait, method) + getCvResult(entry) + }) + #' @rdname getTopLoci #' @export setMethod("getTopLoci", "QtlFineMappingResult", diff --git a/R/fineMappingPipeline.R b/R/fineMappingPipeline.R index 06ba7ce3..9da97e9b 100644 --- a/R/fineMappingPipeline.R +++ b/R/fineMappingPipeline.R @@ -82,9 +82,6 @@ #' level QC lives on the \code{QtlDataset} constructor; sumstat #' QC lives in \code{summaryStatsQc()}. No filtering happens #' inside this pipeline. -#' \item \code{pipCutoffToSkip} pre-screen: this lived in the old -#' pipelines but is not ported. Callers can run a one-shot -#' \code{susieR::susie} check externally if needed. #' \item Diagnostic re-analysis paths #' (\code{singleEffect} / \code{bayesianConditionalRegression} #' reanalysis on the RSS path): these are not exposed as @@ -157,6 +154,30 @@ #' (OR-logic). Default \code{NULL} (off). #' @param fineMappingResult Optional existing \code{FineMappingResult} #' to use as a resume cache; tuples already present are not refit. +#' @param cvFolds Integer. Number of cross-validation folds. Default +#' \code{0} (no CV). When \code{> 1}, each method is refit on the +#' training samples of every fold and used to predict the held-out +#' samples; the fold partition plus per-fold out-of-fold predictions and +#' metrics are stored on each \code{FineMappingEntry}'s \code{cvResult} +#' slot (see \code{\link{getCvResult}}). \code{twasWeightsPipeline} reuses +#' this partition and feeds these predictions into the SR-TWAS ensemble. +#' Individual-level (\code{QtlDataset} / \code{MultiStudyQtlDataset}) +#' input only; ignored for sumstat inputs. +#' @param samplePartition Optional pre-defined CV partition +#' \code{data.frame} with columns \code{Sample} and \code{Fold}. When +#' supplied (and \code{cvFolds > 1}), every method reuses this exact +#' partition; otherwise a fresh partition is generated per +#' \code{(study, context, trait)}. +#' @param seed Optional integer. When non-NULL, \code{set.seed(seed)} is +#' called once at the start of the call for reproducible fits. Default +#' \code{NULL} (no seeding). +#' @param pipCutoffToSkip Numeric (length 1). Individual-level single-effect +#' (SER) pre-screen applied to each residualized \code{(X, y)} block before a +#' full fit: a susie model with \code{L = 1} is fit and the block is skipped +#' when no PIP exceeds the cutoff (no potentially significant variant). The +#' summary-statistics analog lives in \code{summaryStatsQc()}. \code{0} +#' (default) disables the screen; a negative value uses the adaptive +#' \code{3 / nVariants} threshold. #' @param jointSpecification Optional joint-fit specification (NULL by #' default). When NULL, the pipeline runs the implicit multi-context / #' multi-trait mvSuSiE / fSuSiE branches as before. When non-NULL, the @@ -650,6 +671,52 @@ setGeneric("fineMappingPipeline", } } +# Single-effect (SER) pre-screen, individual-level. Fits susie with L = 1 on a +# residualized (X, y) block and reports whether any PIP clears `cutoff` -- i.e. +# whether the block shows any potentially significant variant worth a full fit. +# Ports the deleted multivariate_pipeline.R `skipConditions` / susie_twas +# `pip_cutoff_to_skip` logic (the individual-level analog of the sumstat-path +# `.applyPipScreen`): +# * `cutoff == 0` (or NULL/non-scalar) disables the screen -> always keep. +# * `cutoff < 0` uses the adaptive 3 / nVariants threshold. +# * NA entries of `y` are dropped before fitting. +# The screen is advisory: too few samples/variants or a fit failure keeps the +# block (returns TRUE) rather than discarding a potentially real signal. +# @noRd +.fmSerScreen <- function(X, y, cutoff) { + if (is.null(cutoff) || length(cutoff) != 1L || is.na(cutoff) || cutoff == 0) + return(TRUE) + ok <- !is.na(y) + if (sum(ok) < 2L || ncol(X) < 1L) return(TRUE) + Xs <- X[ok, , drop = FALSE] + ys <- y[ok] + thr <- if (cutoff < 0) 3 / ncol(Xs) else cutoff + pip <- tryCatch(suppressMessages(susieR::susie(Xs, ys, L = 1L))$pip, + error = function(e) NULL) + if (is.null(pip)) return(TRUE) + any(pip > thr) +} + +# Is the SER pre-screen enabled? Only a finite, non-zero scalar activates it; +# this gates the extra screening extraction so the default (cutoff 0) costs +# nothing. +# @noRd +.fmScreenActive <- function(cutoff) { + !is.null(cutoff) && length(cutoff) == 1L && !is.na(cutoff) && cutoff != 0 +} + +# Per-condition SER pre-screen for a joint (multi-context / multi-trait) fit: +# returns a logical vector over the columns of `Y` (the conditions) marking +# which show single-effect signal. The multivariate analog of `.fmSerScreen` +# and a port of the deleted `skipConditions`: callers drop the FALSE columns +# (null contexts / traits) before the joint mvSuSiE fit. +# @noRd +.fmSerScreenColumns <- function(X, Y, cutoff) { + vapply(seq_len(ncol(Y)), + function(j) .fmSerScreen(X, Y[, j], cutoff), + logical(1L)) +} + # Fit every requested univariate token on one residualized (X, y) block, # returning a named list (token -> FineMappingEntry). Extracted from the # univariate dispatch so the same logic serves the cis path (one block), the @@ -657,7 +724,8 @@ setGeneric("fineMappingPipeline", # path (one block per region, merged afterwards via .fmMergeEntries). .fmFitXBlock <- function(X, y, toRun, addSusieInf, coverage, secondaryCoverage, signalCutoff, minAbsCorr, - methodArgs, verbose, ctx, tid) { + methodArgs, verbose, ctx, tid, + cvFolds = 0, samplePartition = NULL) { chainLocal <- .fmResolveSusieChain(toRun, addSusieInf) infFit <- NULL if (chainLocal$runInf) { @@ -687,6 +755,19 @@ setGeneric("fineMappingPipeline", secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, csInput = "X") } + # Per-fold cross-validation across the fitted univariate methods; attach + # each method's out-of-fold predictions to its entry. + if (cvFolds > 1L && length(out) > 0L) { + if (verbose >= 1) + message(sprintf("Cross-validating (%d folds) for (context='%s', trait='%s') ...", + cvFolds, ctx, tid)) + cv <- .fmCrossValidate(X, y, names(out), methodArgs, cvFolds, + samplePartition = samplePartition, + coverage = coverage, verbose = verbose) + for (tk in names(out)) { + out[[tk]] <- .fmAttachCv(out[[tk]], .fmSliceCv(cv, tk)) + } + } out } @@ -738,8 +819,14 @@ setGeneric("fineMappingPipeline", rownames(topLoci) <- NULL susieFit <- setNames(lapply(entries, function(e) e@susieFit), paste0("region", seq_along(entries))) + # Per-region CV partitions/predictions share the same sample set; keep them + # per region under region* names so a multi-region entry retains each block's + # cross-validated predictions (NULL when no region carried CV). + cvList <- setNames(lapply(entries, function(e) e@cvResult), + paste0("region", seq_along(entries))) + cvResult <- if (all(vapply(cvList, is.null, logical(1)))) NULL else cvList FineMappingEntry(variantIds = variantIds, susieFit = susieFit, - topLoci = topLoci) + topLoci = topLoci, cvResult = cvResult) } # Run a joint-method fit (mvsusie / fsusie) once per region block via the @@ -860,6 +947,194 @@ setGeneric("fineMappingPipeline", } +# ============================================================================= +# Per-fold cross-validation of fine-mapping methods +# ----------------------------------------------------------------------------- +# fineMappingPipeline mirrors twasWeightsPipeline's cross-validation: when +# cvFolds > 1, each fine-mapping method is refit on the training samples of +# every fold, its weights extracted and used to predict the held-out samples, +# yielding out-of-fold predictions + per-outcome metrics. The partition and +# predictions are stored on each FineMappingEntry's cvResult slot so +# twasWeightsPipeline can (a) reuse the identical fold partition and (b) feed +# fine-mapping's own cross-validated predictions straight into the SR-TWAS +# ensemble instead of recomputing them. Output shape mirrors twasWeightsCv() +# (samplePartition + per-method _predicted / _performance), keyed by +# the TWAS snake method name (adapter methodKey) for a drop-in merge. +# ============================================================================= + +# Generate a Sample/Fold partition over the rows of X, matching the scheme in +# twasWeightsCv() (shuffle samples, then cut into `fold` contiguous blocks). +# @noRd +.fmMakeSamplePartition <- function(sampleNames, fold) { + idx <- sample(length(sampleNames)) + folds <- cut(seq_along(sampleNames), breaks = fold, labels = FALSE) + data.frame(Sample = sampleNames[idx], Fold = folds, stringsAsFactors = FALSE) +} + +# Snake method key (e.g. "susie_inf") for a fine-mapping token, taken from the +# shared adapter registry so fineMapping CV keys match the TwasWeights `method` +# column and twasWeightsCv()'s prediction keys. +# @noRd +.fmTwasMethodKey <- function(token) { + adapter <- .twasFineMappingMethodAdapters[[token]] + if (is.null(adapter)) return(token) + sub("_weights$", "", adapter$methodKey) +} + +# Compact CV metric row (corr, rsq, adj_rsq, pval, RMSE, MAE) for one outcome, +# mirroring the metric block of twasWeightsCv(). +# @noRd +.fmCvMetricRow <- function(pred, actual) { + out <- setNames(rep(NA_real_, 6L), + c("corr", "rsq", "adj_rsq", "pval", "RMSE", "MAE")) + ok <- !is.na(pred) & !is.na(actual) + pred <- pred[ok]; actual <- actual[ok] + if (length(pred) < 3L || stats::sd(pred) == 0) return(out) + lmFit <- stats::lm(actual ~ pred); s <- summary(lmFit) + out["corr"] <- stats::cor(actual, pred) + out["rsq"] <- s$r.squared + out["adj_rsq"] <- s$adj.r.squared + out["pval"] <- if (nrow(s$coefficients) >= 2L) s$coefficients[2L, 4L] else NA_real_ + res <- actual - pred + out["RMSE"] <- sqrt(mean(res^2)) + out["MAE"] <- mean(abs(res)) + out +} + +# Fit one fine-mapping method on (Xtr, Ytr) for a CV fold and return a +# variants x outcomes weight matrix (rownames = colnames(Xtr)). susie-family +# tokens are fit independently (no chained init) per fold, matching +# twasWeightsCv's per-fold refit. Returns NULL on failure (caller skips it). +# @noRd +.fmFoldWeights <- function(token, Xtr, Ytr, coverage, userArgs, pos) { + asMat <- function(w) { + if (is.matrix(w)) return(w) + matrix(w, ncol = 1L, dimnames = list(names(w), NULL)) + } + if (token %in% c("susie", "susieInf", "susieAsh")) { + y <- if (is.matrix(Ytr)) Ytr[, 1L] else Ytr + fit <- .fmFitSusieIndiv(Xtr, y, token, coverage = coverage, + userArgs = userArgs) + w <- switch(token, + susie = susieWeights(susieFit = fit), + susieInf = susieInfWeights(susieInfFit = fit), + susieAsh = susieAshWeights(susieAshFit = fit)) + w <- as.numeric(w) + names(w) <- colnames(Xtr) + return(asMat(w)) + } + if (token == "mvsusie") { + pv <- mvsusieR::create_mixture_prior(R = ncol(Ytr)) + fit <- do.call(fitMvsusie, + .fmMergeUserArgs(list(X = Xtr, Y = Ytr, prior_variance = pv, + coverage = coverage), + "mvsusie", userArgs)) + W <- as.matrix(mvsusieWeights(mvsusieFit = fit)) + if (is.null(rownames(W))) rownames(W) <- colnames(Xtr) + return(W) + } + if (token == "fsusie") { + fit <- do.call(fitFsusie, + .fmMergeUserArgs(list(X = Xtr, Y = Ytr, pos = pos), + "fsusie", userArgs)) + W <- fsusieWeights(fsusieFit = fit, variantIds = colnames(Xtr)) + return(as.matrix(W)) + } + NULL +} + +# Cross-validate a homogeneous set of fine-mapping `tokens` over (X, Y). For +# univariate tokens Y is a single column; for mvsusie/fsusie Y carries one +# column per condition/feature (and fsusie additionally needs `pos`). Returns +# a list(samplePartition, prediction, performance) shaped like twasWeightsCv(). +# @noRd +.fmCrossValidate <- function(X, Y, tokens, methodArgs, fold, + samplePartition = NULL, coverage = 0.95, + pos = NULL, verbose = 1) { + if (length(tokens) == 0L) return(NULL) + if (!is.matrix(Y)) { + Y <- matrix(Y, ncol = 1L, + dimnames = list(rownames(X), NULL)) + } + if (is.null(rownames(Y))) rownames(Y) <- rownames(X) + sampleNames <- rownames(X) + if (is.null(samplePartition)) { + samplePartition <- .fmMakeSamplePartition(sampleNames, fold) + } + foldIds <- sort(unique(samplePartition$Fold)) + + preds <- setNames( + lapply(tokens, function(tk) { + matrix(NA_real_, nrow(Y), ncol(Y), dimnames = dimnames(Y)) + }), tokens) + + for (j in foldIds) { + testIds <- samplePartition$Sample[samplePartition$Fold == j] + isTest <- rownames(X) %in% testIds + if (all(isTest) || !any(isTest)) next + Xtr <- X[!isTest, , drop = FALSE] + Xte <- X[isTest, , drop = FALSE] + Ytr <- Y[!isTest, , drop = FALSE] + # Drop columns with zero variance in this training fold. + keepCol <- .nonzeroVarColumns(Xtr) + XtrK <- Xtr[, keepCol, drop = FALSE] + for (tk in tokens) { + W <- tryCatch( + .fmFoldWeights(tk, XtrK, Ytr, coverage, methodArgs[[tk]], pos), + error = function(e) { + if (verbose >= 1) + message(sprintf(" CV fold %s, method %s failed: %s", + j, tk, conditionMessage(e))) + NULL + }) + if (is.null(W)) next + common <- intersect(colnames(Xte), rownames(W)) + if (length(common) == 0L) next + yhat <- Xte[, common, drop = FALSE] %*% W[common, , drop = FALSE] + preds[[tk]][rownames(Xte), ] <- yhat + } + } + + prediction <- list() + performance <- list() + for (tk in tokens) { + key <- .fmTwasMethodKey(tk) + prediction[[paste0(key, "_predicted")]] <- preds[[tk]] + perf <- t(vapply(seq_len(ncol(Y)), function(r) { + .fmCvMetricRow(preds[[tk]][, r], Y[, r]) + }, numeric(6L))) + rownames(perf) <- colnames(Y) + performance[[paste0(key, "_performance")]] <- perf + } + list(samplePartition = samplePartition, + prediction = prediction, performance = performance) +} + +# Slice a full .fmCrossValidate() result down to one method's payload, keeping +# the shared samplePartition. Stored on that method's FineMappingEntry. +# @noRd +.fmSliceCv <- function(cv, token) { + if (is.null(cv)) return(NULL) + key <- .fmTwasMethodKey(token) + pk <- paste0(key, "_predicted") + mk <- paste0(key, "_performance") + if (!pk %in% names(cv$prediction)) return(NULL) + list(samplePartition = cv$samplePartition, + prediction = cv$prediction[pk], + performance = cv$performance[mk]) +} + +# Rebuild a FineMappingEntry with a cvResult attached (the class is immutable). +# @noRd +.fmAttachCv <- function(entry, cvResult) { + if (is.null(entry) || is.null(cvResult)) return(entry) + FineMappingEntry(variantIds = entry@variantIds, + susieFit = entry@susieFit, + topLoci = entry@topLoci, + cvResult = cvResult) +} + + # ============================================================================= # QtlDataset method # ============================================================================= @@ -882,6 +1157,10 @@ setMethod("fineMappingPipeline", "QtlDataset", minAbsCorr = 0.8, medianAbsCorr = NULL, fineMappingResult = NULL, + cvFolds = 0, + samplePartition = NULL, + pipCutoffToSkip = 0, + seed = NULL, naAction = c("drop", "impute"), verbose = 1, trim = TRUE, @@ -891,6 +1170,7 @@ setMethod("fineMappingPipeline", "QtlDataset", residualizeGenotypeCovariates = TRUE, ...) { naAction <- match.arg(naAction) + if (!is.null(seed)) set.seed(as.integer(seed)) # `cisWindow` expands a trait's own coordinates; `region` is taken # literally. Supplying both signals a misunderstanding -> reject. if (!is.null(region) && !is.null(cisWindow)) { @@ -1038,9 +1318,19 @@ setMethod("fineMappingPipeline", "QtlDataset", X <- X[common, , drop = FALSE] y <- Y[common, , drop = FALSE] if (ncol(y) > 1L) y <- y[, 1L, drop = TRUE] else y <- drop(y) + # SER pre-screen: skip this block when a single-effect fit finds no + # PIP above pipCutoffToSkip (no potentially significant variant). + if (!.fmSerScreen(X, y, pipCutoffToSkip)) { + if (verbose >= 1) + message(sprintf( + "Skipping (context='%s', trait='%s'): SER pre-screen found no PIP above pipCutoffToSkip.", + ctx, tid)) + return(list()) + } .fmFitXBlock(X, y, toRun, addSusieInf, coverage, secondaryCoverage, signalCutoff, minAbsCorr, - methodArgs, verbose, ctx, tid) + methodArgs, verbose, ctx, tid, + cvFolds = cvFolds, samplePartition = samplePartition) }) for (tk in toRun) { @@ -1111,6 +1401,41 @@ setMethod("fineMappingPipeline", "QtlDataset", next } + # SER pre-screen: drop contexts with no single-effect signal before + # the joint fit (faithful port of skipConditions). Screen the first + # region block; skip the trait entirely when < 2 contexts survive. + if (.fmScreenActive(pipCutoffToSkip)) { + rg0 <- xRegions[[1L]] + Xscr <- if (is.null(rg0)) { + .fmResidGeno(data, contexts = contextsHere, traitId = tid, + cisWindow = cisWindow, samples = baseSamples) + } else { + .fmResidGeno(data, contexts = contextsHere, region = rg0, + samples = baseSamples) + } + csS <- intersect(baseSamples, rownames(Xscr)) + if (length(csS) >= 2L) { + Yscr <- do.call(cbind, lapply(contextsHere, + function(ctx) Yres[[ctx]][csS, 1L])) + kept <- contextsHere[.fmSerScreenColumns( + Xscr[csS, , drop = FALSE], Yscr, pipCutoffToSkip)] + if (length(kept) < 2L) { + if (verbose >= 1) + message(sprintf( + "Skipping mvsusie (multi-context) for trait='%s': < 2 contexts pass the SER pre-screen.", + tid)) + next + } + if (length(kept) < length(contextsHere)) { + if (verbose >= 1) + message(sprintf( + "mvsusie (multi-context) trait='%s': SER pre-screen kept %d of %d contexts.", + tid, length(kept), length(contextsHere))) + contextsHere <- kept + } + } + } + if (verbose >= 1) message(sprintf("Fitting mvsusie (multi-context) for trait='%s' ...", tid)) fitOneRegion <- function(rg) { @@ -1140,11 +1465,18 @@ setMethod("fineMappingPipeline", "QtlDataset", .fmMergeUserArgs(mvBaseArgs, "mvsusie", methodArgs[["mvsusie"]])) fit <- .setFinemappingFitClass(fit, "mvsusie") - .fmPostprocessOne( + entry <- .fmPostprocessOne( fit = fit, method = "mvsusie", dataX = Xc, dataY = NULL, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, csInput = "X") + if (cvFolds > 1L) { + cv <- .fmCrossValidate(Xc, Yc, "mvsusie", methodArgs, cvFolds, + samplePartition = samplePartition, + coverage = coverage, verbose = verbose) + entry <- .fmAttachCv(entry, .fmSliceCv(cv, "mvsusie")) + } + entry } entry <- .fmJointBlocks(xRegions, fitOneRegion) # Share the joint (merged) entry across contexts via copy-on-modify. @@ -1173,6 +1505,41 @@ setMethod("fineMappingPipeline", "QtlDataset", Y <- .fmResidPheno( data, contexts = ctx, traitId = traits, naAction = naAction) + # SER pre-screen: drop traits with no single-effect signal before the + # joint fit (faithful port of skipConditions). Skip the context's + # mvsusie when < 2 traits survive. + if (.fmScreenActive(pipCutoffToSkip)) { + rg0 <- xRegions[[1L]] + Xscr <- if (is.null(rg0)) { + .fmResidGeno(data, contexts = ctx, traitId = traits, + cisWindow = cisWindow, samples = rownames(Y)) + } else { + .fmResidGeno(data, contexts = ctx, region = rg0, + samples = rownames(Y)) + } + csS <- intersect(rownames(Xscr), rownames(Y)) + if (length(csS) >= 2L) { + keep <- .fmSerScreenColumns( + Xscr[csS, , drop = FALSE], Y[csS, , drop = FALSE], + pipCutoffToSkip) + if (sum(keep) < 2L) { + if (verbose >= 1) + message(sprintf( + "Skipping mvsusie (multi-trait) for context='%s': < 2 traits pass the SER pre-screen.", + ctx)) + next + } + if (sum(keep) < length(traits)) { + if (verbose >= 1) + message(sprintf( + "mvsusie (multi-trait) context='%s': SER pre-screen kept %d of %d traits.", + ctx, sum(keep), length(traits))) + traits <- traits[keep] + Y <- Y[, keep, drop = FALSE] + } + } + } + if (verbose >= 1) message(sprintf("Fitting mvsusie (multi-trait) for context='%s' ...", ctx)) fitOneRegion <- function(rg) { @@ -1199,11 +1566,18 @@ setMethod("fineMappingPipeline", "QtlDataset", .fmMergeUserArgs(mvBaseArgs, "mvsusie", methodArgs[["mvsusie"]])) fit <- .setFinemappingFitClass(fit, "mvsusie") - .fmPostprocessOne( + entry <- .fmPostprocessOne( fit = fit, method = "mvsusie", dataX = Xc, dataY = NULL, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, csInput = "X") + if (cvFolds > 1L) { + cv <- .fmCrossValidate(Xc, Yc, "mvsusie", methodArgs, cvFolds, + samplePartition = samplePartition, + coverage = coverage, verbose = verbose) + entry <- .fmAttachCv(entry, .fmSliceCv(cv, "mvsusie")) + } + entry } entry <- .fmJointBlocks(xRegions, fitOneRegion) for (tid in traits) { @@ -1275,12 +1649,26 @@ setMethod("fineMappingPipeline", "QtlDataset", fit <- do.call(fitFsusie, .fmMergeUserArgs(list(X = Xc, Y = Yc, pos = pos), "fsusie", methodArgs[["fsusie"]])) + # Collapse the functional fit to a variants x features TWAS weight + # matrix now, while fitted_wc/csd_X are still present (trimming drops + # them). Stored on $coef so a trimmed fit can still yield weights. + fit$coef <- tryCatch( + fsusieWeights(fsusieFit = fit, variantIds = colnames(Xc)), + error = function(e) NULL) fit <- .setFinemappingFitClass(fit, "fsusie") - .fmPostprocessOne( + entry <- .fmPostprocessOne( fit = fit, method = "fsusie", dataX = Xc, dataY = NULL, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, csInput = "fsusie") + if (cvFolds > 1L) { + cv <- .fmCrossValidate(Xc, Yc, "fsusie", methodArgs, cvFolds, + samplePartition = samplePartition, + coverage = coverage, pos = pos, + verbose = verbose) + entry <- .fmAttachCv(entry, .fmSliceCv(cv, "fsusie")) + } + entry } entry <- .fmJointBlocks(xRegions, fitOneRegion) for (tid in traits) { @@ -1327,6 +1715,10 @@ setMethod("fineMappingPipeline", "MultiStudyQtlDataset", minAbsCorr = 0.8, medianAbsCorr = NULL, fineMappingResult = NULL, + cvFolds = 0, + samplePartition = NULL, + pipCutoffToSkip = 0, + seed = NULL, naAction = c("drop", "impute"), verbose = 1, trim = TRUE, @@ -1393,6 +1785,10 @@ setMethod("fineMappingPipeline", "MultiStudyQtlDataset", signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, fineMappingResult = fineMappingResult, + cvFolds = cvFolds, + samplePartition = samplePartition, + pipCutoffToSkip = pipCutoffToSkip, + seed = seed, naAction = naAction, verbose = verbose, ...) diff --git a/R/fineMappingWrappers.R b/R/fineMappingWrappers.R index 3922cbc1..1a199f79 100644 --- a/R/fineMappingWrappers.R +++ b/R/fineMappingWrappers.R @@ -851,6 +851,13 @@ trimFinemappingFit <- function(fit, effectIdx, method, csTables) { if (!is.null(fit$conditional_lfsr)) trimmed$clfsr <- fit$conditional_lfsr[effectIdx, , , drop = FALSE] } + # fSuSiE: keep the precomputed variants x features TWAS weight matrix + # (fsusieWeights output, attached as $coef before trimming) so downstream + # TWAS can read it without the dropped wavelet slots. + if (method == "fsusie" && !is.null(fit$coef)) { + trimmed$coef <- fit$coef + } + class(trimmed) <- unique(c(method, "susie")) trimmed } diff --git a/R/regularizedRegressionWrappers.R b/R/regularizedRegressionWrappers.R index 242b6c0f..866cfe97 100644 --- a/R/regularizedRegressionWrappers.R +++ b/R/regularizedRegressionWrappers.R @@ -444,13 +444,21 @@ susieAshRssWeights <- function(stat, LD, susieAshRssFit = NULL, retainFit = TRUE #' @param mrmashFit Optional fitted mr.mash object. #' @param X Genotype matrix. Required when `mrmashFit` is NULL. #' @param Y Phenotype matrix. Required when `mrmashFit` is NULL. +#' @param retainFit If TRUE, attach (as the `"fit"` attribute of the returned +#' weights) the parts of the mr.mash fit that `fineMappingPipeline` needs to +#' rebuild the mvSuSiE reweighted mixture prior + residual variance: the +#' original data-driven prior matrices (`dataDrivenPriorMatrices`), the fitted +#' mixture weights (`w0`) and the residual covariance (`V`). The heavy +#' coefficient matrix (`mu1`) is intentionally not retained. Default FALSE. #' @param ... Additional arguments passed to `mrmashWrapper()` when fitting. #' @return Matrix of variant weights. #' @export -mrmashWeights <- function(mrmashFit = NULL, X = NULL, Y = NULL, ...) { +mrmashWeights <- function(mrmashFit = NULL, X = NULL, Y = NULL, + retainFit = FALSE, ...) { if (!requireNamespace("mr.mashr", quietly = TRUE)) { stop("Package 'mr.mashr' is required. Install with: devtools::install_github('stephenslab/mr.mashr')") } + dotArgs <- list(...) if (is.null(mrmashFit)) { message("mrmashFit is not provided; fitting mr.mash now ...") if (is.null(X) || is.null(Y)) { @@ -458,7 +466,18 @@ mrmashWeights <- function(mrmashFit = NULL, X = NULL, Y = NULL, ...) { } mrmashFit <- mrmashWrapper(X, Y, ...) } - return(mr.mashr::coef.mr.mash(mrmashFit)[-1, ]) + out <- mr.mashr::coef.mr.mash(mrmashFit)[-1, ] + if (isTRUE(retainFit)) { + # Lean payload consumed by fineMappingPipeline to reproduce the legacy + # initializeMvsusiePrior reweighting (see the mvSuSiE-prior-from-mr.mash + # note). The original matrices are required for bit-identical results; + # rescaleCovW0(w0) collapses the expanded weights back onto them. + attr(out, "fit") <- list( + dataDrivenPriorMatrices = dotArgs$dataDrivenPriorMatrices, + w0 = mrmashFit$w0, + V = mrmashFit$V) + } + out } #' Compute mvSuSiE TWAS weights @@ -501,6 +520,153 @@ mvsusieWeights <- function(mvsusieFit = NULL, X = NULL, Y = NULL, return(mvsusieR::coef.mvsusie(mvsusieFit)[-1, ]) } +# Build the wavelet synthesis (inverse-DWT) matrix S (n_wac x nFeat) for the +# basis fSuSiE uses, by reconstructing each unit wavelet coefficient through the +# SAME $D / $C assignment as out_prep.susiF (detail columns -> $D, the coarsest +# scaling column -> last $C entry), then `wavethresh::wr`. A wavelet-coefficient +# row `c` then maps to the feature domain as `c %*% S`. `scaleCols` is the +# column index of the scaling coefficient(s) (per the prior family). fSuSiE's +# default basis (DaubLeAsymm, filter 10) matches `wavethresh::wd`'s default, the +# same one out_prep uses, so the plain `wd(rep(0, nWac))` template is consistent. +# @noRd +.fsusieSynthesisMatrix <- function(nWac, scaleCols) { + template <- wavethresh::wd(rep(0, nWac)) + reconstructUnit <- function(k) { + coeffRow <- numeric(nWac) + coeffRow[k] <- 1 + temp <- template + temp$D <- coeffRow[-scaleCols] + temp$C[length(temp$C)] <- sum(coeffRow[scaleCols]) + as.numeric(wavethresh::wr(temp)) + } + do.call(rbind, lapply(seq_len(nWac), reconstructUnit)) +} + +#' Compute fSuSiE feature-level TWAS weights +#' +#' Collapses a functional SuSiE (\code{fsusieR::susiF}) fit back to a +#' \code{variants x features} weight matrix usable for TWAS prediction of each +#' molecular feature. fSuSiE fits the regression in the wavelet domain, storing +#' per-SNP posterior-mean wavelet effects \code{fitted_wc[[l]]} +#' (\code{nSNP x n_wac}) and inclusion probabilities \code{alpha[[l]]}. Because +#' the inverse wavelet transform \code{wr()} is linear, the posterior-mean +#' prediction pushes through to a per-SNP, per-feature weight matrix: +#' \deqn{W[j, f] = \sum_l alpha[[l]][j] \cdot +#' \mathrm{wr}\!\left(fitted\_wc[[l]][j, ] / csd\_X[j]\right)[f].} +#' This is the exact analog of \code{coef.susie} for scalar SuSiE (all SNPs, +#' alpha-weighted), which spreads weight across the credible set — more robust +#' for out-of-sample TWAS than fSuSiE's in-sample lead-SNP summary +#' (\code{update_cal_indf}). +#' +#' The reconstruction uses the raw posterior wavelet coefficients +#' \code{fitted_wc}, so it is independent of the \code{post_processing} mode +#' (\code{"smash"}/\code{"TI"}/\code{"HMM"}/\code{"none"}) — that smoothing only +#' denoises the alpha-collapsed display curve \code{fitted_func}, never the +#' per-SNP predictive coefficients. The \code{$D}/\code{$C} coefficient layout +#' and wavelet basis mirror \code{out_prep.susiF}, so the feature-domain output +#' matches fSuSiE's own conventions. +#' +#' @param fsusieFit A fitted \code{fsusieR::susiF} object. Must retain +#' \code{fitted_wc}, \code{alpha}, \code{csd_X}, \code{n_wac}, and +#' \code{outing_grid} (i.e. an untrimmed fit). Required. +#' @param X,Y Accepted for call-compatibility with the multivariate +#' weight-method dispatch in \code{\link{learnTwasWeights}}, which invokes +#' every method as \code{fn(X = ., Y = ., ...)}. fSuSiE is a functional method +#' that cannot be refit from a bare \code{(X, Y)} pair (it needs feature +#' positions and the wavelet model), so these are ignored: a fitted +#' \code{fsusieFit} is always required. +#' @param variantIds Optional character vector of variant IDs (length = number +#' of SNPs in the fit) for the matrix row names. Defaults to +#' \code{names(fsusieFit$csd_X)} / \code{names(fsusieFit$pip)}. +#' @param featureNames Optional character vector of feature (outcome) names for +#' the matrix column names. Defaults to the fit's \code{outing_grid}. +#' @param retainFit If TRUE, stores the fit as an attribute on the result. +#' @return A numeric matrix of variant (rows) by feature (columns) weights. +#' @export +fsusieWeights <- function(fsusieFit = NULL, X = NULL, Y = NULL, + variantIds = NULL, featureNames = NULL, + retainFit = FALSE) { + if (is.null(fsusieFit)) { + stop("fsusieWeights: `fsusieFit` is required. fSuSiE is functional and ", + "cannot be refit from a bare (X, Y); fit it via fineMappingPipeline() ", + "and pass the fitted fsusieR::susiF object.") + } + # Fast path: a trimmed fit carries the precomputed variants x features weight + # matrix in `$coef` (fineMappingPipeline computes it eagerly while the full + # fit is in hand, because trimming drops fitted_wc/csd_X/...). Return it. + if (is.matrix(fsusieFit$coef) && + is.null(fsusieFit$fitted_wc)) { + W <- fsusieFit$coef + if (!is.null(variantIds) && length(variantIds) == nrow(W)) + rownames(W) <- variantIds + if (retainFit) attr(W, "fit") <- fsusieFit + return(W) + } + if (!requireNamespace("fsusieR", quietly = TRUE)) { + stop("Package 'fsusieR' is required for fsusieWeights().") + } + if (!requireNamespace("wavethresh", quietly = TRUE)) { + stop("Package 'wavethresh' is required for fsusieWeights().") + } + fit <- fsusieFit + missingSlots <- setdiff(c("fitted_wc", "alpha", "csd_X", "n_wac", + "outing_grid"), names(fit)) + if (length(missingSlots) > 0L) { + stop("fsusieWeights: the fSuSiE fit is missing required slot(s): ", + paste(missingSlots, collapse = ", "), + ". Pass an untrimmed fit (these are dropped when trimmed).") + } + + csdX <- as.numeric(fit$csd_X) + p <- length(csdX) + nWac <- fit$n_wac + + # alpha may be a list (one vector per effect, the fsusieR::susiF default) or + # a matrix/data.frame (L x nSNP) after fsusieWrapper reshaping. Normalize to + # a list of per-effect vectors. + alpha <- fit$alpha + alphaList <- if (is.list(alpha) && !is.data.frame(alpha)) { + lapply(alpha, as.numeric) + } else { + am <- as.matrix(alpha) + lapply(seq_len(nrow(am)), function(l) as.numeric(am[l, ])) + } + L <- length(fit$fitted_wc) + + # Scaling-coefficient column(s): the coarsest level for a per-scale prior, + # else the last column. Mirrors the two branches of out_prep.susiF. + perScale <- "mixture_normal_per_scale" %in% class(fsusieR::get_G_prior(fit)) + indxLst <- fsusieR::gen_wavelet_indx(log2(length(fit$outing_grid))) + scaleCols <- if (perScale) indxLst[[length(indxLst)]] + else ncol(as.matrix(fit$fitted_wc[[1L]])) + + # One inverse transform per wavelet coefficient (built once), then every SNP / + # effect is a matrix multiply: W = sum_l (alpha_l/csd_X-scaled fitted_wc_l) %*% S. + S <- .fsusieSynthesisMatrix(nWac, scaleCols) + nFeat <- ncol(S) + invCsd <- 1 / csdX + + W <- matrix(0, nrow = p, ncol = nFeat) + for (l in seq_len(L)) { + wc <- as.matrix(fit$fitted_wc[[l]]) + rowScale <- alphaList[[l]] * invCsd + W <- W + (rowScale * wc) %*% S + } + + rn <- variantIds + if (is.null(rn)) rn <- names(fit$csd_X) + if (is.null(rn)) rn <- names(fit$pip) + if (!is.null(rn) && length(rn) == p) rownames(W) <- rn + cn <- featureNames + if (is.null(cn) && !is.null(fit$outing_grid) && + length(fit$outing_grid) == nFeat) { + cn <- as.character(fit$outing_grid) + } + if (!is.null(cn) && length(cn) == nFeat) colnames(W) <- cn + if (retainFit) attr(W, "fit") <- fit + W +} + #' Compute mr.mash-RSS TWAS weights from summary statistics #' #' Multi-context summary-statistics analog of \code{\link{mrmashWeights}}: diff --git a/R/twasWeights.R b/R/twasWeights.R index a5043dbb..a04b45e4 100644 --- a/R/twasWeights.R +++ b/R/twasWeights.R @@ -351,7 +351,8 @@ setMethod("show", "TwasWeights", function(object) { mcp = list(fn = "mcp_weights", impl = "mcpWeights", args = list()), l0learn = list(fn = "l0learn_weights", impl = "l0learnWeights", args = list()), mvsusie = list(fn = "mvsusie_weights", impl = "mvsusieWeights", args = list(L = 30, L_greedy = 5)), - mrmash = list(fn = "mrmash_weights", impl = "mrmashWeights", args = list()) + mrmash = list(fn = "mrmash_weights", impl = "mrmashWeights", args = list()), + fsusie = list(fn = "fsusie_weights", impl = "fsusieWeights", args = list()) ) # Handle presets @@ -691,7 +692,11 @@ twasWeightsCv <- function(X, Y, fold = NULL, samplePartitions = NULL, weightMeth if (is.null(weightMethods)) { return(list(samplePartition = samplePartition)) } else { - # Hardcoded vector of multivariate weightMethods (accept both snake and camel) + # Hardcoded vector of multivariate weightMethods (accept both snake and camel). + # fSuSiE is excluded from the per-fold CV refit path: it is functional and + # cannot be refit from a bare (X, y) fold split, so its cross-validated + # predictions are supplied by fineMappingPipeline (FineMappingResult + # cvResult) rather than recomputed here. multivariateWeightMethods <- c("mrmash_weights", "mvsusie_weights", "mrmashWeights", "mvsusieWeights") @@ -917,9 +922,13 @@ learnTwasWeights <- function(X, Y, weightMethods, tic() } - # Hardcoded vector of multivariate methods (accept both snake and camel) + # Hardcoded vector of multivariate methods (accept both snake and camel). + # fSuSiE is multivariate (variants x features weight matrix) but is never + # refit here — fsusieWeights extracts from the supplied fsusieFit. multivariateWeightMethods <- c("mrmash_weights", "mvsusie_weights", - "mrmashWeights", "mvsusieWeights") + "fsusie_weights", + "mrmashWeights", "mvsusieWeights", + "fsusieWeights") args <- weightMethods[[methodName]] fnName <- .resolveMethodFunction(methodName, args) diff --git a/R/twasWeightsPipeline.R b/R/twasWeightsPipeline.R index a31b617d..0f712bc0 100644 --- a/R/twasWeightsPipeline.R +++ b/R/twasWeightsPipeline.R @@ -418,7 +418,8 @@ # governed by .fineMappingMethodCapabilities) to its TWAS-weight extractor # wrapper. The wrapper names follow the *Weights / *RssWeights convention, # and the *Fit argument receives the pre-fitted fine-mapping object. -# fSuSiE is intentionally absent: no TWAS-weight extractor exists for it. +# fSuSiE is multivariate (it collapses a functional fit to a variants x +# features weight matrix via fsusieWeights) and has no RSS counterpart. # @noRd .twasFineMappingMethodAdapters <- list( susie = list(weightFn = "susieWeights", @@ -440,7 +441,12 @@ rssWeightFn = "mvsusieRssWeights", fitArg = "mvsusieFit", rssFitArg = "mvsusieRssFit", - methodKey = "mvsusie_weights")) + methodKey = "mvsusie_weights"), + fsusie = list(weightFn = "fsusieWeights", + rssWeightFn = NULL, + fitArg = "fsusieFit", + rssFitArg = NULL, + methodKey = "fsusie_weights")) # Canonical list of fine-mapping tokens recognised by twasWeightsPipeline. # Sourced from fineMappingPipeline's registry minus mrmash (which @@ -581,7 +587,7 @@ } out <- list() methods <- as.character(fineMappingResult$method) - for (canonical in c("susie", "susieInf", "susieAsh", "mvsusie")) { + for (canonical in c("susie", "susieInf", "susieAsh", "mvsusie", "fsusie")) { candidates <- c(canonical, paste0(tolower(substring(canonical, 1L, 1L)), substring(canonical, 2L)), @@ -611,6 +617,46 @@ fits[[token]] } +# Collect the cross-validation payload that fineMappingPipeline stored on the +# FineMappingResult for one (study, context, trait) tuple. fineMapping records +# one cvResult per (study, context, trait, method) entry (samplePartition + +# per-fold predictions/metrics, keyed by the TWAS snake method name); this +# merges them across the fine-mapping methods of the tuple into a single +# twasWeightsCv()-shaped list so twasWeightsPipeline can reuse the partition and +# feed those out-of-fold predictions into the SR-TWAS ensemble without re- +# fitting the fine-mapping models. A multi-region entry stores cvResult as a +# per-region list; the first region carrying CV is used. Returns NULL when no +# fine-mapping entry for the tuple recorded CV. +# @noRd +.twasCvResultFor <- function(fineMappingResult, study, context, trait) { + if (is.null(fineMappingResult)) return(NULL) + if (!is(fineMappingResult, "FineMappingResultBase")) return(NULL) + idx <- which(as.character(fineMappingResult$study) == study & + as.character(fineMappingResult$context) == context & + as.character(fineMappingResult$trait) == trait) + if (length(idx) == 0L) return(NULL) + samplePartition <- NULL + prediction <- list() + performance <- list() + for (i in idx) { + cv <- getCvResult(fineMappingResult$entry[[i]]) + if (is.null(cv)) next + # Multi-region entries store cvResult as a named per-region list; pick the + # first region that carries a partition. + if (is.null(cv$samplePartition)) { + hit <- Filter(function(z) is.list(z) && !is.null(z$samplePartition), cv) + if (length(hit) == 0L) next + cv <- hit[[1L]] + } + if (is.null(samplePartition)) samplePartition <- cv$samplePartition + prediction <- c(prediction, cv$prediction) + performance <- c(performance, cv$performance) + } + if (length(prediction) == 0L) return(NULL) + list(samplePartition = samplePartition, + prediction = prediction, performance = performance) +} + #' TWAS Weights Pipeline #' #' S4-dispatched per-region pipeline for learning TWAS prediction weights. @@ -649,6 +695,16 @@ #' / the RSS sub-pipelines via the \code{fittedModels} slot, avoiding #' a re-fit. #' +#' When the supplied \code{FineMappingResult} was produced with +#' cross-validation (\code{fineMappingPipeline(..., cvFolds > 1)}), each +#' matching \code{(study, context, trait)} entry's \code{cvResult} is +#' reused: its fold partition becomes the CV partition (unless +#' \code{samplePartition} is given explicitly) and its per-fold out-of-fold +#' predictions/metrics are fed directly into the SR-TWAS ensemble in place +#' of re-fitting those fine-mapping methods here. Non-fine-mapping methods +#' (lasso, enet, ...) are still cross-validated on the same shared +#' partition. +#' #' @param data A \code{QtlDataset}, \code{MultiStudyQtlDataset}, or #' \code{QtlSumStats}. The \code{MultiStudyQtlDataset} method iterates #' the embedded individual-level \code{QtlDataset} entries and the @@ -669,6 +725,15 @@ #' @param cisWindow For QtlDataset: cis-window (bp) around each trait's #' genomic position when extracting variants. Required when #' \code{traitId} is supplied. Mutually exclusive with \code{region}. +#' @param minTwasMaf For QtlDataset: optional minimum minor-allele frequency +#' applied to the variant set used for TWAS weight learning, on top of the +#' dataset's construct-time \code{mafCutoff} (the effective cutoff is the +#' larger of the two). Lets the TWAS pass use a stricter MAF threshold than +#' fine mapping. \code{NULL} (default) leaves the construct-time cutoff in +#' place. +#' @param minTwasXvar As \code{minTwasMaf} but for the per-variant genotype +#' variance cutoff (\code{xvarCutoff}). \code{NULL} (default) leaves the +#' construct-time cutoff in place. #' @param jointRegions For QtlDataset with a multi-range \code{region}: #' \code{FALSE} (default) learns weights for each range independently and #' concatenates them into one entry per (study, context, trait, method); @@ -754,6 +819,8 @@ setMethod("twasWeightsPipeline", "QtlDataset", traitId = NULL, region = NULL, cisWindow = NULL, + minTwasMaf = NULL, + minTwasXvar = NULL, jointRegions = FALSE, jointSpecification = NULL, fineMappingResult = NULL, @@ -785,6 +852,14 @@ setMethod("twasWeightsPipeline", "QtlDataset", "coordinates, whereas `region` is the literal variant window.") } xRegions <- .makeXRegions(region, jointRegions) + # TWAS-specific variant filters: tighten the QtlDataset maf/xvar cutoffs for + # weight learning (distinct from the construct-time / fine-mapping cutoffs). + # Modifying the local `data` copy elevates them everywhere downstream + # (runOne, runMultivariate, and the jointSpec dispatcher all extract from it). + if (!is.null(minTwasMaf)) + data@mafCutoff <- max(data@mafCutoff, as.numeric(minTwasMaf)) + if (!is.null(minTwasXvar)) + data@xvarCutoff <- max(data@xvarCutoff, as.numeric(minTwasXvar)) parsedJointSpec <- parseJointSpecification(jointSpecification, data) norm <- .twasNormalizeMethods(methods) .twasCheckMethodCapabilities(norm$tokens, "QtlDataset") @@ -850,10 +925,10 @@ setMethod("twasWeightsPipeline", "QtlDataset", nCtx <- length(useCtx) .twasCheckMultivariateY(norm$tokens, length(allTraits), nCtx) - multivariate <- any(vapply(norm$tokens, function(tk) { - info <- .twasMethodCapabilities[[tk]] - !is.null(info) && isTRUE(info$multivariate) - }, logical(1))) + # Multivariate if any requested token is multivariate in either the TWAS + # capability table (mrmash) or the fine-mapping one (mvsusie / fsusie). + multivariate <- any(vapply(norm$tokens, .twasIsMultivariateToken, + logical(1))) runOne <- function(ctx, tid) { # Resume cache: per-method check against the supplied `twasWeights` @@ -883,6 +958,9 @@ setMethod("twasWeightsPipeline", "QtlDataset", # per-region list and is selected blockwise inside the loop. allFits <- .twasFineMappingFits(fineMappingResult, study = study, context = ctx, trait = tid) + # Fine-mapping's own cross-validated predictions (shared fold partition), + # reused by the ensemble instead of re-fitting the fine-mapping methods. + fmCv <- .twasCvResultFor(fineMappingResult, study, ctx, tid) nBlocks <- length(xRegions) perBlockTw <- lapply(seq_len(nBlocks), function(bi) { rg <- xRegions[[bi]] @@ -911,6 +989,7 @@ setMethod("twasWeightsPipeline", "QtlDataset", fittedModels = .twasFitsForRegion(allFits, bi, nBlocks), cvFolds = cvFolds, samplePartition = samplePartition, + fineMappingCv = fmCv, weightMethods = remaining, maxCvVariants = maxCvVariants, cvThreads = cvThreads, @@ -1019,14 +1098,21 @@ setMethod("twasWeightsPipeline", "QtlDataset", context = meta$context[[1L]], trait = meta$trait[[1L]]), bi, length(xRegions)) + fmCv <- .twasCvResultFor(fineMappingResult, study, + meta$context[[1L]], meta$trait[[1L]]) .twasWeightsPipelineMatrix( X = Xr, y = Y, study = study, context = meta$context, trait = meta$trait, + # Retain the mr.mash fit parts ({dataDrivenPriorMatrices, w0, V}) on + # the entry's `fits` slot so fineMappingPipeline can rebuild the + # mvSuSiE reweighted prior + residual variance from this shared fit. + retainFits = TRUE, fittedModels = jointFits, cvFolds = cvFolds, samplePartition = samplePartition, + fineMappingCv = fmCv, weightMethods = norm$methodList, maxCvVariants = maxCvVariants, cvThreads = cvThreads, @@ -1536,6 +1622,7 @@ setMethod("twasWeightsPipeline", "ANY", fittedModels = NULL, cvFolds = 5, samplePartition = NULL, + fineMappingCv = NULL, weightMethods = "default", maxCvVariants = -1, cvThreads = 1, @@ -1548,6 +1635,7 @@ setMethod("twasWeightsPipeline", "ANY", standardized = FALSE, dataType = NULL, ldSketch = NULL, + retainFits = FALSE, verbose = 1) { if (is.character(weightMethods)) { weightMethods <- .twasMethodLookup(weightMethods) @@ -1618,7 +1706,8 @@ setMethod("twasWeightsPipeline", "ANY", remainingMethods <- weightMethods[remainingFnNames] remainingTw <- do.call(learnTwasWeights, c( list(X = X, Y = y, weightMethods = remainingMethods, - fittedModels = fittedModels, verbose = verbose), + fittedModels = fittedModels, retainFits = retainFits, + verbose = verbose), learnArgs)) res$twasWeights <- .rbindTwasWeights(mrashWeights, remainingTw, ldSketch = ldSketch) @@ -1642,7 +1731,8 @@ setMethod("twasWeightsPipeline", "ANY", # Run all methods at once res$twasWeights <- do.call(learnTwasWeights, c( list(X = X, Y = y, weightMethods = weightMethods, - fittedModels = fittedModels, verbose = verbose), + fittedModels = fittedModels, retainFits = retainFits, + verbose = verbose), learnArgs)) } if (verbose >= 1) { @@ -1672,6 +1762,44 @@ setMethod("twasWeightsPipeline", "ANY", cvWeightMethods <- .filterZeroWeightMethods(weightMethods, res$twasWeights) } + # Fine-mapping handoff: when fineMappingPipeline supplied cross-validated + # predictions for some methods (shared fold partition + per-fold out-of- + # fold predictions), reuse them rather than refitting those methods here. + # Drop them from the CV refit set, adopt the shared partition (unless the + # caller passed one explicitly), and merge their predictions/metrics into + # the CV result below so the SR-TWAS ensemble consumes fine-mapping's own + # cross-validation. + fmCvPrediction <- NULL; fmCvPerformance <- NULL + if (!is.null(fineMappingCv) && length(fineMappingCv$prediction) > 0L) { + if (is.null(samplePartition) && !is.null(fineMappingCv$samplePartition)) { + samplePartition <- fineMappingCv$samplePartition + } + fmBase <- sub("(_predicted|Predicted)$", "", names(fineMappingCv$prediction)) + cvWeightMethods <- cvWeightMethods[ + setdiff(names(cvWeightMethods), paste0(fmBase, "_weights"))] + yMat <- if (is.matrix(y)) y + else matrix(y, ncol = 1L, dimnames = list(names(y), NULL)) + sampleNames <- rownames(X) + outcomeNames <- colnames(yMat) + alignFmPred <- function(mat) { + out <- matrix(NA_real_, length(sampleNames), + max(1L, length(outcomeNames)), + dimnames = list(sampleNames, outcomeNames)) + rs <- intersect(rownames(mat), sampleNames) + cs <- if (!is.null(colnames(mat)) && !is.null(outcomeNames)) + intersect(colnames(mat), outcomeNames) else character(0) + if (length(cs) > 0L) { + out[rs, cs] <- mat[rs, cs, drop = FALSE] + } else if (ncol(mat) == ncol(out)) { + out[rs, ] <- mat[rs, , drop = FALSE] + } + out + } + fmCvPrediction <- setNames(lapply(fineMappingCv$prediction, alignFmPred), + names(fineMappingCv$prediction)) + fmCvPerformance <- fineMappingCv$performance + } + variantsForCv <- c() if (maxCvVariants <= 0) { maxCvVariants <- Inf @@ -1680,26 +1808,48 @@ setMethod("twasWeightsPipeline", "ANY", variantsForCv <- sample(colnames(X), maxCvVariants, replace = FALSE) } - if (verbose >= 1) { - message("Performing cross-validation to assess TWAS weights ...") - tic() + if (length(cvWeightMethods) > 0L) { + if (verbose >= 1) { + message("Performing cross-validation to assess TWAS weights ...") + tic() + } + res$twasCvResult <- twasWeightsCv( + X, + y, + fold = cvFolds, + samplePartitions = samplePartition, + weightMethods = cvWeightMethods, + maxNumVariants = maxCvVariants, + numThreads = cvThreads, + verbose = verbose, + variantsToKeep = if (length(variantsForCv) > 0) variantsForCv else NULL + ) + if (verbose >= 1) { + elapsed <- toc(quiet = TRUE) + message(sprintf("Cross-validation done in %.1fs", elapsed$toc - elapsed$tic)) + } + } else { + # Every CV method came from fine-mapping; no refit needed here. + res$twasCvResult <- list(samplePartition = samplePartition, + prediction = list(), performance = list()) } - res$twasCvResult <- twasWeightsCv( - X, - y, - fold = cvFolds, - samplePartitions = samplePartition, - weightMethods = cvWeightMethods, - maxNumVariants = maxCvVariants, - numThreads = cvThreads, - verbose = verbose, - variantsToKeep = if (length(variantsForCv) > 0) variantsForCv else NULL - ) - if (verbose >= 1) { - elapsed <- toc(quiet = TRUE) - message(sprintf("Cross-validation done in %.1fs", elapsed$toc - elapsed$tic)) + + # Merge fine-mapping's cross-validated predictions/metrics into the CV + # result so downstream splicing + ensemble treat them as first-class. + if (!is.null(fmCvPrediction)) { + res$twasCvResult$prediction <- c(res$twasCvResult$prediction, + fmCvPrediction) + res$twasCvResult$performance <- c(res$twasCvResult$performance, + fmCvPerformance) + if (is.null(res$twasCvResult$samplePartition)) { + res$twasCvResult$samplePartition <- samplePartition + } } + # Number of methods participating in cross-validation / ensemble (refit + # here plus those handed over by fine-mapping). + nCvMethods <- length(res$twasCvResult$prediction) + # Splice per-(method, outcome) CV predictions + metrics into the # corresponding TwasWeightsEntry$cvPerformance slot. res$twasWeights <- .spliceCvIntoTwasWeights(res$twasWeights, @@ -1707,11 +1857,11 @@ setMethod("twasWeightsPipeline", "ANY", ldSketch = ldSketch) # Ensemble learning: learn optimal method combination via stacked regression - if (isTRUE(ensemble) && length(cvWeightMethods) <= 1) { - if (verbose >= 1) message("Ensemble model skipped: only ", length(cvWeightMethods), + if (isTRUE(ensemble) && nCvMethods <= 1) { + if (verbose >= 1) message("Ensemble model skipped: only ", nCvMethods, " weight method provided (need >= 2 for ensemble learning).") } - if (isTRUE(ensemble) && length(cvWeightMethods) > 1) { + if (isTRUE(ensemble) && nCvMethods > 1) { if (!is.null(res$twasCvResult$performance)) { # Extract R-squared for each method from CV performance table methodRsq <- vapply(res$twasCvResult$performance, function(perf) { diff --git a/man/FineMappingEntry.Rd b/man/FineMappingEntry.Rd index 75069044..d030ecd4 100644 --- a/man/FineMappingEntry.Rd +++ b/man/FineMappingEntry.Rd @@ -4,7 +4,7 @@ \alias{FineMappingEntry} \title{Create a FineMappingEntry Object} \usage{ -FineMappingEntry(variantIds, susieFit, topLoci) +FineMappingEntry(variantIds, susieFit, topLoci, cvResult = NULL) } \arguments{ \item{variantIds}{Character vector of variant IDs in fit order.} @@ -19,6 +19,10 @@ marginal_se, marginal_z, marginal_p}), posterior columns (\code{pip, posterior_mean, posterior_sd, cs_*, cs_*_purity}), pipeline stamps (\code{method, gene, event, grange_start, grange_end}). Unfiltered: one row per variant in the fit.} + +\item{cvResult}{Optional cross-validation payload (list with +\code{samplePartition}, \code{predictions}, \code{performance}) recorded +when fine-mapping is run with \code{cvFolds > 1}. \code{NULL} otherwise.} } \value{ A \code{FineMappingEntry} object. diff --git a/man/fineMappingPipeline.Rd b/man/fineMappingPipeline.Rd index 379ba9c1..07e57636 100644 --- a/man/fineMappingPipeline.Rd +++ b/man/fineMappingPipeline.Rd @@ -27,6 +27,8 @@ fineMappingPipeline(data, ...) minAbsCorr = 0.8, medianAbsCorr = NULL, fineMappingResult = NULL, + cvFolds = 0, + samplePartition = NULL, naAction = c("drop", "impute"), verbose = 1, trim = TRUE, @@ -53,6 +55,8 @@ fineMappingPipeline(data, ...) minAbsCorr = 0.8, medianAbsCorr = NULL, fineMappingResult = NULL, + cvFolds = 0, + samplePartition = NULL, naAction = c("drop", "impute"), verbose = 1, trim = TRUE, @@ -179,6 +183,22 @@ kept if it passes either \code{minAbsCorr} or \code{medianAbsCorr} \item{fineMappingResult}{Optional existing \code{FineMappingResult} to use as a resume cache; tuples already present are not refit.} +\item{cvFolds}{Integer. Number of cross-validation folds. Default +\code{0} (no CV). When \code{> 1}, each method is refit on the +training samples of every fold and used to predict the held-out +samples; the fold partition plus per-fold out-of-fold predictions and +metrics are stored on each \code{FineMappingEntry}'s \code{cvResult} +slot (see \code{\link{getCvResult}}). \code{twasWeightsPipeline} reuses +this partition and feeds these predictions into the SR-TWAS ensemble. +Individual-level (\code{QtlDataset} / \code{MultiStudyQtlDataset}) +input only; ignored for sumstat inputs.} + +\item{samplePartition}{Optional pre-defined CV partition +\code{data.frame} with columns \code{Sample} and \code{Fold}. When +supplied (and \code{cvFolds > 1}), every method reuses this exact +partition; otherwise a fresh partition is generated per +\code{(study, context, trait)}.} + \item{verbose}{Verbosity (0 silent, 1 default). Default \code{1}.} \item{trim}{Logical (length 1). When \code{TRUE} (default) the diff --git a/man/fsusieWeights.Rd b/man/fsusieWeights.Rd new file mode 100644 index 00000000..fad49604 --- /dev/null +++ b/man/fsusieWeights.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regularizedRegressionWrappers.R +\name{fsusieWeights} +\alias{fsusieWeights} +\title{Compute fSuSiE feature-level TWAS weights} +\usage{ +fsusieWeights( + fsusieFit = NULL, + X = NULL, + Y = NULL, + variantIds = NULL, + featureNames = NULL, + retainFit = FALSE +) +} +\arguments{ +\item{fsusieFit}{A fitted \code{fsusieR::susiF} object. Must retain +\code{fitted_wc}, \code{alpha}, \code{csd_X}, \code{n_wac}, and +\code{outing_grid} (i.e. an untrimmed fit). Required.} + +\item{X, Y}{Accepted for call-compatibility with the multivariate +weight-method dispatch in \code{\link{learnTwasWeights}}, which invokes +every method as \code{fn(X = ., Y = ., ...)}. fSuSiE is a functional method +that cannot be refit from a bare \code{(X, Y)} pair (it needs feature +positions and the wavelet model), so these are ignored: a fitted +\code{fsusieFit} is always required.} + +\item{variantIds}{Optional character vector of variant IDs (length = number +of SNPs in the fit) for the matrix row names. Defaults to +\code{names(fsusieFit$csd_X)} / \code{names(fsusieFit$pip)}.} + +\item{featureNames}{Optional character vector of feature (outcome) names for +the matrix column names. Defaults to the fit's \code{outing_grid}.} + +\item{retainFit}{If TRUE, stores the fit as an attribute on the result.} +} +\value{ +A numeric matrix of variant (rows) by feature (columns) weights. +} +\description{ +Collapses a functional SuSiE (\code{fsusieR::susiF}) fit back to a +\code{variants x features} weight matrix usable for TWAS prediction of each +molecular feature. fSuSiE fits the regression in the wavelet domain, storing +per-SNP posterior-mean wavelet effects \code{fitted_wc[[l]]} +(\code{nSNP x n_wac}) and inclusion probabilities \code{alpha[[l]]}. Because +the inverse wavelet transform \code{wr()} is linear, the posterior-mean +prediction pushes through to a per-SNP, per-feature weight matrix: +\deqn{W[j, f] = \sum_l alpha[[l]][j] \cdot + \mathrm{wr}\!\left(fitted\_wc[[l]][j, ] / csd\_X[j]\right)[f].} +This is the exact analog of \code{coef.susie} for scalar SuSiE (all SNPs, +alpha-weighted), which spreads weight across the credible set — more robust +for out-of-sample TWAS than fSuSiE's in-sample lead-SNP summary +(\code{update_cal_indf}). +} +\details{ +The reconstruction uses the raw posterior wavelet coefficients +\code{fitted_wc}, so it is independent of the \code{post_processing} mode +(\code{"smash"}/\code{"TI"}/\code{"HMM"}/\code{"none"}) — that smoothing only +denoises the alpha-collapsed display curve \code{fitted_func}, never the +per-SNP predictive coefficients. The \code{$D}/\code{$C} coefficient layout +and wavelet basis mirror \code{out_prep.susiF}, so the feature-domain output +matches fSuSiE's own conventions. +} diff --git a/man/getCvResult.Rd b/man/getCvResult.Rd new file mode 100644 index 00000000..1cf22c96 --- /dev/null +++ b/man/getCvResult.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AllGenerics.R, R/FineMappingEntry.R, +% R/QtlFineMappingResult.R +\name{getCvResult} +\alias{getCvResult} +\alias{getCvResult,FineMappingEntry-method} +\alias{getCvResult,QtlFineMappingResult-method} +\title{Get Cross-Validation Result} +\usage{ +getCvResult(x, ...) + +\S4method{getCvResult}{FineMappingEntry}(x, ...) + +\S4method{getCvResult}{QtlFineMappingResult}(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) +} +\arguments{ +\item{x}{A \code{FineMappingEntry} or \code{FineMappingResult}.} + +\item{...}{Class-specific selection arguments.} +} +\value{ +A list (the CV payload) or \code{NULL}. +} +\description{ +Extract the cross-validation payload stored on a + \code{FineMappingEntry} (or the matching entry of a + \code{FineMappingResult}). The payload is a list with components + \code{samplePartition} (a \code{data.frame} of \code{Sample}/\code{Fold} + assignments), \code{predictions} (a named list of per-method out-of-fold + prediction matrices), and \code{performance} (a named list of per-method + metric matrices). \code{NULL} when fine-mapping was run without + cross-validation (\code{cvFolds <= 1}). +} diff --git a/man/twasWeightsPipeline.Rd b/man/twasWeightsPipeline.Rd index 2678b759..ab1d31da 100644 --- a/man/twasWeightsPipeline.Rd +++ b/man/twasWeightsPipeline.Rd @@ -232,4 +232,14 @@ source of pre-fit SuSiE / SuSiE-inf / SuSiE-ash objects; their \code{trimmedFit} payloads are passed through to \code{learnTwasWeights} / the RSS sub-pipelines via the \code{fittedModels} slot, avoiding a re-fit. + +When the supplied \code{FineMappingResult} was produced with +cross-validation (\code{fineMappingPipeline(..., cvFolds > 1)}), each +matching \code{(study, context, trait)} entry's \code{cvResult} is +reused: its fold partition becomes the CV partition (unless +\code{samplePartition} is given explicitly) and its per-fold out-of-fold +predictions/metrics are fed directly into the SR-TWAS ensemble in place +of re-fitting those fine-mapping methods here. Non-fine-mapping methods +(lasso, enet, ...) are still cross-validated on the same shared +partition. } diff --git a/tests/testthat/test_FineMappingEntry.R b/tests/testthat/test_FineMappingEntry.R index d70d5cbe..b7c2af48 100644 --- a/tests/testthat/test_FineMappingEntry.R +++ b/tests/testthat/test_FineMappingEntry.R @@ -181,6 +181,30 @@ test_that("FineMappingEntry: validity errors when topLoci is missing required co ) }) + +# === cvResult slot (cross-validation payload) === + +test_that("FineMappingEntry stores and returns a cvResult payload", { + tl <- data.frame(variant_id = c("v1", "v2"), pip = c(0.8, 0.2), + stringsAsFactors = FALSE) + cv <- list(samplePartition = data.frame(Sample = c("s1", "s2"), Fold = c(1L, 2L)), + prediction = list(susie_predicted = matrix(0, 2, 1)), + performance = list(susie_performance = matrix(0, 1, 6))) + e <- FineMappingEntry(variantIds = tl$variant_id, susieFit = list(), + topLoci = tl, cvResult = cv) + expect_identical(getCvResult(e), cv) +}) + +test_that("FineMappingEntry cvResult defaults to NULL and rejects non-list", { + tl <- data.frame(variant_id = "v1", pip = 0.5, stringsAsFactors = FALSE) + e <- FineMappingEntry(variantIds = "v1", susieFit = list(), topLoci = tl) + expect_null(getCvResult(e)) + expect_error( + FineMappingEntry(variantIds = "v1", susieFit = list(), topLoci = tl, + cvResult = 1:3), + "cvResult must be NULL or a list") +}) + # =========================================================================== # TwasWeightsEntry # =========================================================================== diff --git a/tests/testthat/test_QtlDataset.R b/tests/testthat/test_QtlDataset.R index 28df4d96..d3f30654 100644 --- a/tests/testthat/test_QtlDataset.R +++ b/tests/testthat/test_QtlDataset.R @@ -546,6 +546,38 @@ test_that(".qtlExtractBlock: keepVariants with empty intersection returns empty expect_equal(nrow(blk$geno), 0L) }) +test_that(".qtlExtractBlock: keepIndel = FALSE drops multi-base (indel) variants", { + qd <- .qh_makeDataset() + # Make rs2 (insertion) and rs4 (deletion) indels; the rest stay SNPs. + qd@genotypes@snpInfo$A1[2] <- "AT" + qd@genotypes@snpInfo$A2[4] <- "GC" + qd@keepIndel <- FALSE + local_mocked_bindings( + extractBlockGenotypes = .qh_mockExtractor(), + .package = "pecotmr") + blk <- pecotmr:::.qtlExtractBlock(qd) + expect_equal(blk$variantIds, c("rs1", "rs3", "rs5", "rs6")) +}) + +test_that(".qtlExtractBlock: keepIndel = TRUE (default) keeps indel variants", { + qd <- .qh_makeDataset() + qd@genotypes@snpInfo$A1[2] <- "AT" + expect_true(qd@keepIndel) # default + local_mocked_bindings( + extractBlockGenotypes = .qh_mockExtractor(), + .package = "pecotmr") + blk <- pecotmr:::.qtlExtractBlock(qd) + expect_equal(length(blk$variantIds), 6L) +}) + +test_that("QtlDataset: keepIndel defaults to TRUE; validity rejects non-scalar", { + qd <- .qh_makeDataset() + expect_true(qd@keepIndel) + # The constructor coerces via isTRUE(); validity guards direct new()/slot sets. + qd@keepIndel <- c(TRUE, FALSE) + expect_error(validObject(qd), "keepIndel.*single logical") +}) + test_that(".qtlExtractBlock: mafCutoff drops low-MAF variants", { qd <- .qh_makeDataset() # The mocked extractor returns binomial(0.3) dosages: realized MAFs hover diff --git a/tests/testthat/test_QtlFineMappingResult.R b/tests/testthat/test_QtlFineMappingResult.R index 5af72035..5daf89cd 100644 --- a/tests/testthat/test_QtlFineMappingResult.R +++ b/tests/testthat/test_QtlFineMappingResult.R @@ -296,6 +296,20 @@ test_that("QtlFineMappingResult: getStudy/getContexts/getTraits/getMethodNames a expect_setequal(getMethodNames(res), c("susie", "susieRss")) }) + +test_that("getCvResult works at the QtlFineMappingResult collection level", { + tl <- data.frame(variant_id = "v1", pip = 0.5, stringsAsFactors = FALSE) + cv <- list(samplePartition = data.frame(Sample = "s1", Fold = 1L), + prediction = list(susie_predicted = matrix(0, 1, 1)), + performance = list(susie_performance = matrix(0, 1, 6))) + e <- FineMappingEntry("v1", list(), tl, cvResult = cv) + fmr <- QtlFineMappingResult(study = "S", context = "C", trait = "T", + method = "susie", entry = list(e)) + expect_identical( + getCvResult(fmr, study = "S", context = "C", trait = "T", method = "susie"), + cv) +}) + # =========================================================================== # GwasFineMappingResult collection accessors # =========================================================================== diff --git a/tests/testthat/test_fineMappingPipeline.R b/tests/testthat/test_fineMappingPipeline.R index 1b49f2a9..be966131 100644 --- a/tests/testthat/test_fineMappingPipeline.R +++ b/tests/testthat/test_fineMappingPipeline.R @@ -451,6 +451,166 @@ test_that("fineMappingPipeline(QtlDataset): runs univariate dispatch with mocked expect_setequal(getMethodNames(res), "susie") }) +test_that("fineMappingPipeline(QtlDataset): seed argument is accepted and runs", { + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmFitSusieIndiv = .fmp_mockFitIndiv(), + .fmPostprocessOne = .fmp_mockPostprocess(), + .package = "pecotmr") + res <- suppressMessages( + fineMappingPipeline(qd, methods = "susie", cisWindow = 1000L, + addSusieInf = FALSE, seed = 42L)) + expect_s4_class(res, "QtlFineMappingResult") + expect_equal(nrow(res), 1L) +}) + +test_that(".fmSerScreen: disables on 0, skips no-signal, keeps signal + adaptive", { + skip_if_not_installed("susieR") + set.seed(1) + n <- 150L; p <- 25L + X <- matrix(rnorm(n * p), n, p); colnames(X) <- paste0("v", seq_len(p)) + yNull <- rnorm(n) # no association + ySig <- X[, 1] * 2 + rnorm(n, sd = 0.3) # strong single effect at v1 + fn <- function(...) suppressMessages(pecotmr:::.fmSerScreen(...)) + expect_true(fn(X, yNull, 0)) # cutoff 0 disables -> always keep + expect_false(fn(X, yNull, 0.5)) # no PIP that high -> skip + expect_true(fn(X, ySig, 0.5)) # strong signal clears 0.5 -> keep + expect_true(fn(X, ySig, -1)) # adaptive 3/p: signal keeps + expect_false(fn(X, yNull, -1)) # adaptive 3/p: null skips + expect_true(fn(X, yNull, NA)) # malformed cutoff -> advisory keep +}) + +test_that("fineMappingPipeline(QtlDataset): pipCutoffToSkip skips no-signal univariate traits", { + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = c("ENSG_A", "ENSG_B")) + # Stateful screen: reject the first block (ENSG_A), keep the rest (ENSG_B). + seen <- 0L + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmFitSusieIndiv = .fmp_mockFitIndiv(), + .fmPostprocessOne = .fmp_mockPostprocess(), + .fmSerScreen = function(X, y, cutoff) { seen <<- seen + 1L; seen > 1L }, + .package = "pecotmr") + res <- suppressMessages( + fineMappingPipeline(qd, methods = "susie", cisWindow = 1000L, + addSusieInf = FALSE, pipCutoffToSkip = -1)) + # ENSG_A screened out, ENSG_B kept -> a single row. + expect_equal(nrow(res), 1L) + expect_setequal(getTraits(res), "ENSG_B") +}) + +test_that("fineMappingPipeline(QtlDataset, cvFolds>1): attaches cvResult end to end", { + skip_if_not_installed("susieR") + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + # Real susie fits drive CV; the genotype extraction and the full-data + # post-processor are mocked (the mock SNP ids are not chr:pos:a1:a2, which the + # real buildTopLoci requires). CV runs its own real .fmFitSusieIndiv, so the + # cvResult is genuine. + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmPostprocessOne = .fmp_mockPostprocess(), + .package = "pecotmr") + res <- suppressMessages( + fineMappingPipeline(qd, methods = "susie", cisWindow = 1000L, + addSusieInf = FALSE, cvFolds = 3, verbose = 0)) + cv <- getCvResult(res, study = "study1", context = "brain", + trait = "ENSG_A", method = "susie") + expect_false(is.null(cv)) + expect_setequal(colnames(cv$samplePartition), c("Sample", "Fold")) + expect_setequal(sort(unique(cv$samplePartition$Fold)), 1:3) + expect_true("susie_predicted" %in% names(cv$prediction)) + expect_true("susie_performance" %in% names(cv$performance)) + # One out-of-fold prediction per sample (no fold leaves a sample unscored). + expect_false(anyNA(cv$prediction[["susie_predicted"]])) +}) + +test_that("fineMappingPipeline(QtlDataset, cvFolds=0): leaves cvResult NULL", { + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmFitSusieIndiv = .fmp_mockFitIndiv(), + .fmPostprocessOne = .fmp_mockPostprocess(), + .package = "pecotmr") + res <- suppressMessages( + fineMappingPipeline(qd, methods = "susie", cisWindow = 1000L, + addSusieInf = FALSE)) + expect_null(getCvResult(res, study = "study1", context = "brain", + trait = "ENSG_A", method = "susie")) +}) + +# =========================================================================== +# Cross-validation internals: .fmMakeSamplePartition / .fmCrossValidate / +# .fmSliceCv / .fmAttachCv (unit-level counterparts to the cvFolds end-to-end +# tests above). +# =========================================================================== + +test_that(".fmMakeSamplePartition partitions every sample into the requested folds", { + part <- pecotmr:::.fmMakeSamplePartition(paste0("s", 1:20), fold = 4L) + expect_setequal(part$Sample, paste0("s", 1:20)) + expect_setequal(sort(unique(part$Fold)), 1:4) + expect_equal(nrow(part), 20L) +}) + +test_that(".fmCrossValidate returns twasWeightsCv-shaped output keyed by snake method", { + skip_if_not_installed("susieR") + set.seed(42) + n <- 60L; p <- 12L + X <- matrix(rnorm(n * p), n, p, + dimnames = list(paste0("s", seq_len(n)), paste0("v", seq_len(p)))) + y <- X[, 2] * 1.5 + rnorm(n, sd = 0.5) + names(y) <- rownames(X) + cv <- pecotmr:::.fmCrossValidate( + X, y, tokens = "susie", + methodArgs = list(susie = list()), fold = 3L, + coverage = 0.95, verbose = 0) + expect_named(cv, c("samplePartition", "prediction", "performance")) + expect_setequal(colnames(cv$samplePartition), c("Sample", "Fold")) + # Keyed by the TWAS snake method name (adapter methodKey base). + expect_true("susie_predicted" %in% names(cv$prediction)) + expect_true("susie_performance" %in% names(cv$performance)) + pred <- cv$prediction[["susie_predicted"]] + expect_equal(dim(pred), c(n, 1L)) + # Every sample is held out exactly once => no missing out-of-fold predictions. + expect_false(anyNA(pred)) + perf <- cv$performance[["susie_performance"]] + expect_equal(colnames(perf), c("corr", "rsq", "adj_rsq", "pval", "RMSE", "MAE")) + # A real causal signal should yield positive out-of-fold correlation. + expect_gt(perf[1, "corr"], 0) +}) + +test_that(".fmCrossValidate reuses a supplied samplePartition verbatim", { + skip_if_not_installed("susieR") + set.seed(7) + n <- 40L; p <- 8L + X <- matrix(rnorm(n * p), n, p, + dimnames = list(paste0("s", seq_len(n)), paste0("v", seq_len(p)))) + y <- X[, 1] + rnorm(n, sd = 0.5); names(y) <- rownames(X) + part <- pecotmr:::.fmMakeSamplePartition(rownames(X), fold = 4L) + cv <- pecotmr:::.fmCrossValidate(X, y, tokens = "susie", + methodArgs = list(susie = list()), + fold = 4L, samplePartition = part, + coverage = 0.95, verbose = 0) + expect_identical(cv$samplePartition, part) +}) + +test_that(".fmSliceCv / .fmAttachCv slice one method and round-trip onto an entry", { + full <- list( + samplePartition = data.frame(Sample = c("s1", "s2"), Fold = c(1L, 2L)), + prediction = list(susie_predicted = matrix(1, 2, 1), + susie_inf_predicted = matrix(2, 2, 1)), + performance = list(susie_performance = matrix(0, 1, 6), + susie_inf_performance = matrix(0, 1, 6))) + sl <- pecotmr:::.fmSliceCv(full, "susieInf") + expect_identical(names(sl$prediction), "susie_inf_predicted") + expect_identical(names(sl$performance), "susie_inf_performance") + expect_identical(sl$samplePartition, full$samplePartition) + + tl <- data.frame(variant_id = "v1", pip = 0.5, stringsAsFactors = FALSE) + e <- FineMappingEntry("v1", list(), tl) + e2 <- pecotmr:::.fmAttachCv(e, sl) + expect_identical(getCvResult(e2), sl) +}) + test_that("fineMappingPipeline(QtlDataset): RSS-only method rejected by capability check", { qd <- .fmp_makeQtlDataset() expect_error( @@ -556,6 +716,50 @@ test_that("fineMappingPipeline(QtlDataset): mvsusie multi-context single-trait d expect_setequal(getTraits(res), "ENSG_A") }) +test_that("fineMappingPipeline(QtlDataset): pipCutoffToSkip drops null contexts before joint mvsusie", { + qd <- .fmp_makeQtlDataset(contexts = c("brain", "liver", "heart"), + traits = "ENSG_A") + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmPostprocessOne = .fmp_mockPostprocess(), + # Drop the middle context (liver); keep brain + heart. + .fmSerScreenColumns = function(X, Y, cutoff) c(TRUE, FALSE, TRUE), + .package = "pecotmr") + local_mocked_bindings( + mvsusie = .fmp_mockMvsusie(), + create_mixture_prior = .fmp_mockMixturePrior(), + .package = "mvsusieR") + res <- suppressMessages( + fineMappingPipeline(qd, methods = "mvsusie", cisWindow = 1000L, + pipCutoffToSkip = -1)) + # liver screened out -> the joint fit runs on brain + heart only. + expect_equal(nrow(res), 2L) + expect_setequal(getContexts(res), c("brain", "heart")) +}) + +test_that("fineMappingPipeline(QtlDataset): pipCutoffToSkip skips mvsusie when < 2 contexts survive", { + # susie runs alongside so the result still has rows after mvsusie is skipped. + qd <- .fmp_makeQtlDataset(contexts = c("brain", "liver"), traits = "ENSG_A") + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmFitSusieIndiv = .fmp_mockFitIndiv(), + .fmPostprocessOne = .fmp_mockPostprocess(), + .fmSerScreen = function(X, y, cutoff) TRUE, # keep univariate susie + .fmSerScreenColumns = function(X, Y, cutoff) c(TRUE, FALSE), # only brain + .package = "pecotmr") + local_mocked_bindings( + mvsusie = .fmp_mockMvsusie(), + create_mixture_prior = .fmp_mockMixturePrior(), + .package = "mvsusieR") + res <- suppressMessages( + fineMappingPipeline(qd, methods = c("susie", "mvsusie"), + cisWindow = 1000L, addSusieInf = FALSE, + pipCutoffToSkip = -1)) + # mvsusie skipped (only 1 context survives); susie still produced per-context. + expect_setequal(getMethodNames(res), "susie") + expect_false("mvsusie" %in% getMethodNames(res)) +}) + test_that("fineMappingPipeline(QtlDataset): mvsusie both multi falls back to per-context multi-trait", { qd <- .fmp_makeQtlDataset(contexts = c("brain", "liver"), traits = c("ENSG_A", "ENSG_B")) diff --git a/tests/testthat/test_rrMrmashMvsusie.R b/tests/testthat/test_rrMrmashMvsusie.R index b23a3394..61468a81 100644 --- a/tests/testthat/test_rrMrmashMvsusie.R +++ b/tests/testthat/test_rrMrmashMvsusie.R @@ -1,4 +1,4 @@ -context("regularized_regression - mrmash / mvsusie") +context("regularized_regression - mrmash / mvsusie / fsusie") # ---- mrmashWeights ---- test_that("mrmashWeights errors when mr.mashr package is not available", { @@ -18,6 +18,31 @@ test_that("mrmashWeights errors when X and Y are NULL and fit is NULL", { "Both X and Y must be provided") }) +test_that("mrmashWeights(retainFit=TRUE) attaches {dataDrivenPriorMatrices, w0, V}", { + skip_if_not(requireNamespace("mr.mashr", quietly = TRUE), + "mr.mashr not installed") + # These are exactly the parts fineMappingPipeline needs to rebuild the + # mvSuSiE reweighted mixture prior (w0 -> rescaleCovW0, original $U) and the + # residual variance (V); the heavy mu1 coefficient matrix is not retained. + ddpm <- list(U = list(comp = diag(2))) + fakeFit <- structure( + list(w0 = c(null = 0.4, comp_grid1 = 0.6), V = diag(2) * 2), + class = "mr.mash") + fakeCoef <- matrix(0.1, nrow = 5, ncol = 2) + local_mocked_bindings(coef.mr.mash = function(object, ...) fakeCoef, + .package = "mr.mashr") + w <- mrmashWeights(mrmashFit = fakeFit, + dataDrivenPriorMatrices = ddpm, retainFit = TRUE) + fit <- attr(w, "fit") + expect_true(is.list(fit)) + expect_identical(fit$dataDrivenPriorMatrices, ddpm) + expect_identical(fit$w0, fakeFit$w0) + expect_identical(fit$V, fakeFit$V) + # Default (retainFit = FALSE) leaves the weights free of the fit attribute. + expect_null(attr( + mrmashWeights(mrmashFit = fakeFit, dataDrivenPriorMatrices = ddpm), "fit")) +}) + # ---- mvsusieWeights ---- test_that("mvsusieWeights errors when mvsusieR package is not available", { skip_if(requireNamespace("mvsusieR", quietly = TRUE), @@ -85,3 +110,84 @@ test_that("mvsusieWeights returns coefficients from provided fit", { expect_equal(dim(result), c(p, R)) expect_equal(result, fake_coef[-1, ]) }) + +# ---- fsusieWeights ---- +# Collapse a functional SuSiE fit to a variants x features TWAS weight matrix +# (the all-SNP wavelet posterior mean, the coef.susie analog). + +.fw_makeFsusieFit <- function(seed = 1, n = 150L, p = 24L, J = 16L) { + set.seed(seed) + X <- matrix(rnorm(n * p), n, p, + dimnames = list(paste0("s", seq_len(n)), paste0("v", seq_len(p)))) + b1 <- sin(seq(0, 2 * pi, length.out = J)) + b2 <- cos(seq(0, pi, length.out = J)) + Y <- X[, 3] %o% b1 + X[, 10] %o% b2 + + matrix(rnorm(n * J, sd = 0.3), n, J) + colnames(Y) <- paste0("f", seq_len(J)) + list(X = X, Y = Y, + fit = suppressWarnings(fsusieR::susiF( + X = X, Y = Y, pos = seq_len(J), L = 5, + post_processing = "none", verbose = FALSE))) +} + +test_that("fsusieWeights returns a variants x features matrix with variant rownames", { + skip_if_not_installed("fsusieR") + skip_if_not_installed("wavethresh") + obj <- .fw_makeFsusieFit() + W <- fsusieWeights(fsusieFit = obj$fit, variantIds = colnames(obj$X)) + expect_true(is.matrix(W)) + expect_equal(nrow(W), ncol(obj$X)) + expect_equal(ncol(W), ncol(obj$Y)) + expect_equal(rownames(W), colnames(obj$X)) +}) + +test_that("fsusieWeights matches fsusieR's own out_prep reconstruction (post_processing='none')", { + skip_if_not_installed("fsusieR") + skip_if_not_installed("wavethresh") + obj <- .fw_makeFsusieFit() + fit <- obj$fit + # The alpha-weighted sum over SNPs of the per-SNP feature-domain curves that + # fsusieWeights reconstructs must equal fSuSiE's own fitted_func[[l]] (built + # by out_prep.susiF) for every effect l. + csdX <- as.numeric(fit$csd_X) + perScale <- "mixture_normal_per_scale" %in% class(fsusieR::get_G_prior(fit)) + indxLst <- fsusieR::gen_wavelet_indx(log2(length(fit$outing_grid))) + scaleCols <- if (perScale) indxLst[[length(indxLst)]] + else ncol(as.matrix(fit$fitted_wc[[1L]])) + S <- pecotmr:::.fsusieSynthesisMatrix(fit$n_wac, scaleCols) + maxErr <- 0 + for (l in seq_along(fit$fitted_wc)) { + al <- as.numeric(fit$alpha[[l]]) + contrib <- colSums((al * (1 / csdX) * as.matrix(fit$fitted_wc[[l]])) %*% S) + maxErr <- max(maxErr, max(abs(contrib - as.numeric(fit$fitted_func[[l]])))) + } + expect_lt(maxErr, 1e-8) +}) + +test_that("fsusieWeights concentrates weight on the causal SNPs", { + skip_if_not_installed("fsusieR") + skip_if_not_installed("wavethresh") + obj <- .fw_makeFsusieFit() + W <- fsusieWeights(fsusieFit = obj$fit, variantIds = colnames(obj$X)) + rowNorm <- sqrt(rowSums(W^2)) + top2 <- names(sort(rowNorm, decreasing = TRUE))[1:2] + expect_setequal(top2, c("v3", "v10")) +}) + +test_that("fsusieWeights fast path returns precomputed $coef for a trimmed fit", { + # A trimmed fSuSiE fit drops fitted_wc but keeps the precomputed weight + # matrix in $coef; fsusieWeights returns it without touching wavelet slots. + W0 <- matrix(c(1, 0, 2, 0, 0, 3), nrow = 3, + dimnames = list(c("v1", "v2", "v3"), c("f1", "f2"))) + trimmed <- list(coef = W0, pip = c(0.1, 0.2, 0.7)) + class(trimmed) <- c("fsusie", "susie") + W <- fsusieWeights(fsusieFit = trimmed) + expect_identical(W, W0) +}) + +test_that("fsusieWeights errors without a fit and on an unusable (trimmed, no coef) fit", { + expect_error(fsusieWeights(fsusieFit = NULL), "is required") + bad <- list(pip = c(0.1, 0.9)) # no coef, no fitted_wc + class(bad) <- c("fsusie", "susie") + expect_error(fsusieWeights(fsusieFit = bad), "missing required slot") +}) diff --git a/tests/testthat/test_twasWeightsPipeline.R b/tests/testthat/test_twasWeightsPipeline.R index 8cd5963d..62a8cee9 100644 --- a/tests/testthat/test_twasWeightsPipeline.R +++ b/tests/testthat/test_twasWeightsPipeline.R @@ -179,6 +179,28 @@ test_that("twasWeightsPipeline(QtlDataset): runs end-to-end with mocked solvers" expect_setequal(getTraits(res), c("ENSG_A", "ENSG_B")) }) +test_that("twasWeightsPipeline(QtlDataset): minTwasMaf/minTwasXvar tighten the variant set", { + qd <- .tp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + do.call(local_mocked_bindings, + c(list(extractBlockGenotypes = .tp_mockExtractor()), + .tp_mockIndividualWeights(), list(.package = "pecotmr"))) + call <- function(...) suppressMessages(twasWeightsPipeline( + qd, methods = list(lasso_weights = list()), traitId = "ENSG_A", + cisWindow = 1000L, cvFolds = 0, ensemble = FALSE, estimatePi = FALSE, + verbose = 0, ...)) + vcount <- function(res) length(getWeights(res, study = "study1", + context = "brain", trait = "ENSG_A", method = "lasso")) + # Baseline: all 20 mock variants are retained. + expect_equal(vcount(call()), 20L) + # Mock MAF spans 0.225..0.388; a 0.25 cutoff drops the 3 lowest-MAF variants + # while leaving the rest, so the TWAS variant set strictly shrinks. + tight <- vcount(call(minTwasMaf = 0.25)) + expect_lt(tight, 20L) + expect_gt(tight, 0L) + # minTwasXvar is wired through the same elevation; a no-op value is accepted. + expect_s4_class(call(minTwasXvar = 0), "TwasWeights") +}) + test_that("twasWeightsPipeline(QtlDataset): contexts filter restricts the per-context loop", { qd <- .tp_makeQtlDataset(contexts = c("brain", "liver"), traits = "ENSG_A") @@ -385,16 +407,19 @@ test_that("gate: mvsusie without fineMappingResult errors (QtlDataset)", { "are fine-mapping methods and may not be re-fit") }) -test_that("gate: fsusie has no TWAS-weight extractor (rejected by name)", { - qd <- .tp_makeQtlDataset(contexts = c("brain", "liver"), - traits = c("ENSG_A", "ENSG_B")) +test_that("gate: fsusie now has a TWAS-weight extractor (accepted with FMR)", { fmr <- .tp_makeStubFineMappingResult( study = "study1", contexts = c("brain", "liver"), traits = "ENSG_A", method = "fsusie") - # Even with a fineMappingResult, fsusie can't produce TWAS weights. + # fsusie is registered in .twasFineMappingMethodAdapters (fsusieWeights), so + # the gate accepts it when a FineMappingResult is supplied ... + expect_silent( + pecotmr:::.twasCheckFineMappingMethods("fsusie", fmr, "QtlDataset")) + # ... but still requires a FineMappingResult (it is a fine-mapping method and + # is not re-fit by twasWeightsPipeline). expect_error( - twasWeightsPipeline(qd, methods = "fsusie", fineMappingResult = fmr), - "have no TWAS-weight extractor") + pecotmr:::.twasCheckFineMappingMethods("fsusie", NULL, "QtlDataset"), + "may not be re-fit") }) test_that("gate: fsusie on QtlSumStats delegates to .fmCheckMethodCapabilities", { @@ -595,6 +620,40 @@ test_that("twasWeightsPipeline(QtlDataset): mr.mash multivariate path with 2 tra expect_setequal(getMethodNames(res), "mrmash") }) +test_that("twasWeightsPipeline(QtlDataset): mr.mash retains its fit parts in the fits slot", { + # The multivariate dispatch runs with retainFits=TRUE so the mr.mash fit + # parts (the shared fit fineMappingPipeline consumes to build the mvSuSiE + # prior) land on the entry's `fits` slot. + qd <- .tp_makeQtlDataset(contexts = c("brain", "liver"), traits = "ENSG_A") + ddpm <- list(U = list(comp = diag(2))) + mocks <- list( + extractBlockGenotypes = .tp_mockExtractor(), + mrmashWeights = function(X, Y, retainFit = FALSE, ...) { + w <- matrix(0, nrow = ncol(X), ncol = ncol(Y), + dimnames = list(colnames(X), colnames(Y))) + if (isTRUE(retainFit)) { + dots <- list(...) + attr(w, "fit") <- list( + dataDrivenPriorMatrices = dots$dataDrivenPriorMatrices, + w0 = c(null = 0.5, comp = 0.5), V = diag(ncol(Y))) + } + w + }) + do.call(local_mocked_bindings, c(mocks, list(.package = "pecotmr"))) + res <- suppressMessages(suppressWarnings( + twasWeightsPipeline( + qd, + methods = list(mrmash_weights = list(dataDrivenPriorMatrices = ddpm)), + cisWindow = 1000L, cvFolds = 0, ensemble = FALSE, + estimatePi = FALSE, verbose = 0))) + fit <- getFits(res, study = "study1", context = "brain", + trait = "ENSG_A", method = "mrmash") + expect_true(is.list(fit)) + expect_identical(fit$dataDrivenPriorMatrices, ddpm) + expect_false(is.null(fit$w0)) + expect_false(is.null(fit$V)) +}) + # --------------------------------------------------------------------------- # QtlSumStats: multivariate dispatch builds a (variants x contexts) Z matrix # and invokes the *Rss solver once per (study, trait) group. @@ -2125,3 +2184,73 @@ test_that("twasWeightsPipeline(QtlDataset): mr.mash jointRegions=FALSE concatena trait = "ENSG_A", method = "mrmash") expect_equal(NROW(w), 20L) }) + +# =========================================================================== +# fineMapping -> TWAS cross-validation handoff: .twasCvResultFor merges the +# per-method cvResult stashed on a fine-mapping tuple's entries, and +# .twasWeightsPipelineMatrix adopts that partition + predictions instead of +# refitting the handed-over methods. +# =========================================================================== + +test_that(".twasCvResultFor merges per-method cvResult across a tuple's entries", { + tl <- data.frame(variant_id = "v1", pip = 0.5, stringsAsFactors = FALSE) + part <- data.frame(Sample = c("s1", "s2"), Fold = c(1L, 2L)) + mkCv <- function(key) list( + samplePartition = part, + prediction = setNames(list(matrix(0, 2, 1)), paste0(key, "_predicted")), + performance = setNames(list(matrix(0, 1, 6)), paste0(key, "_performance"))) + eSusie <- FineMappingEntry("v1", list(), tl, cvResult = mkCv("susie")) + eInf <- FineMappingEntry("v1", list(), tl, cvResult = mkCv("susie_inf")) + fmr <- QtlFineMappingResult( + study = c("S", "S"), context = c("C", "C"), trait = c("T", "T"), + method = c("susie", "susieInf"), entry = list(eSusie, eInf)) + merged <- pecotmr:::.twasCvResultFor(fmr, "S", "C", "T") + expect_setequal(names(merged$prediction), + c("susie_predicted", "susie_inf_predicted")) + expect_setequal(names(merged$performance), + c("susie_performance", "susie_inf_performance")) + expect_identical(merged$samplePartition, part) +}) + +test_that(".twasCvResultFor returns NULL when no entry carries CV", { + tl <- data.frame(variant_id = "v1", pip = 0.5, stringsAsFactors = FALSE) + e <- FineMappingEntry("v1", list(), tl) + fmr <- QtlFineMappingResult(study = "S", context = "C", trait = "T", + method = "susie", entry = list(e)) + expect_null(pecotmr:::.twasCvResultFor(fmr, "S", "C", "T")) +}) + +test_that(".twasWeightsPipelineMatrix merges fineMappingCv predictions and adopts its partition", { + skip_if_not_installed("glmnet") + set.seed(11) + n <- 50L; p <- 10L + X <- matrix(rnorm(n * p), n, p, + dimnames = list(paste0("s", seq_len(n)), paste0("v", seq_len(p)))) + y <- X[, 1] * 1.2 + rnorm(n, sd = 0.6) + y <- matrix(y, ncol = 1L, dimnames = list(rownames(X), "ENSG_A")) + + # Fine-mapping hands over a shared partition + out-of-fold "susie" predictions. + part <- pecotmr:::.fmMakeSamplePartition(rownames(X), fold = 3L) + susiePred <- matrix(X[, 1] * 0.9, ncol = 1L, + dimnames = list(rownames(X), "ENSG_A")) + susiePerf <- matrix(c(0.5, 0.25, 0.24, 0.01, 0.4, 0.3), nrow = 1L, + dimnames = list("ENSG_A", + c("corr", "rsq", "adj_rsq", "pval", "RMSE", "MAE"))) + fmCv <- list(samplePartition = part, + prediction = list(susie_predicted = susiePred), + performance = list(susie_performance = susiePerf)) + + res <- suppressMessages(pecotmr:::.twasWeightsPipelineMatrix( + X = X, y = y, study = "study1", context = "brain", trait = "ENSG_A", + weightMethods = list(lasso_weights = list()), + cvFolds = 3, fineMappingCv = fmCv, ensemble = FALSE, verbose = 0)) + + # lasso was refit here; susie came from the handoff (not refit). + expect_true("lasso_predicted" %in% names(res$twasCvResult$prediction)) + expect_true("susie_predicted" %in% names(res$twasCvResult$prediction)) + # The CV used fine-mapping's partition verbatim. + expect_identical(res$twasCvResult$samplePartition, part) + # The handed-over susie predictions were aligned to the pipeline samples. + expect_equal(rownames(res$twasCvResult$prediction$susie_predicted), + rownames(X)) +}) From 6f1c3dc0938d1ec165e453f715f5d3093153d3d5 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Tue, 23 Jun 2026 20:06:55 -0700 Subject: [PATCH 2/2] fix getAF bug --- NAMESPACE | 2 + R/AllGenerics.R | 14 +++++ R/QtlDataset.R | 38 +++++++++++-- R/fineMappingPipeline.R | 47 +++++++++++++--- man/QtlDataset.Rd | 7 ++- man/fineMappingPipeline.Rd | 19 +++++-- man/getAf.Rd | 28 ++++++++++ man/mrmashWeights.Rd | 9 +++- man/twasWeightsPipeline.Rd | 13 +++++ tests/testthat/helper-s4Constructors.R | 2 +- tests/testthat/test_FineMappingEntry.R | 16 ++++++ tests/testthat/test_QtlDataset.R | 64 ++++++++++++++++++++++ tests/testthat/test_fineMappingPipeline.R | 66 +++++++++++++++++++++++ 13 files changed, 308 insertions(+), 17 deletions(-) create mode 100644 man/getAf.Rd diff --git a/NAMESPACE b/NAMESPACE index bf58241d..dd30007a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -85,6 +85,7 @@ export(formatFinemappingOutput) export(fsusieGetCs) export(fsusieWeights) export(fsusieWrapper) +export(getAf) export(getAnnotationMeta) export(getAnnotations) export(getBaseline) @@ -275,6 +276,7 @@ exportMethods(colocboostPipeline) exportMethods(computeLdScores) exportMethods(estimateH2) exportMethods(fineMappingPipeline) +exportMethods(getAf) exportMethods(getAnnotationMeta) exportMethods(getAnnotations) exportMethods(getBlockMetadata) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 8f93337a..db9f0c57 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -142,6 +142,20 @@ setGeneric("getN", function(x, ...) standardGeneric("getN")) #' @export setGeneric("getMaf", function(x, ...) standardGeneric("getMaf")) +#' @title Get Effect-Allele Frequencies +#' @description Extract the directional effect-allele (A1) frequency vector +#' for a \code{QtlDataset}. Unlike \code{\link{getMaf}}, the value is +#' \emph{not} folded to the minor allele: it is the frequency of the +#' dosage-counted allele (A1, the effect allele), matching the allele the +#' marginal effect sizes and the fine-mapping \code{af} column report. +#' @param x A \code{QtlDataset} object. +#' @param ... Class-specific selection arguments (e.g., \code{traitId}, +#' \code{region}, \code{cisWindow}, \code{samples} for \code{QtlDataset}). +#' @return Named numeric vector of effect-allele frequencies (names are +#' variant IDs), or an empty vector when no variants are selected. +#' @export +setGeneric("getAf", function(x, ...) standardGeneric("getAf")) + #' @title Get Number of SNPs #' @description Number of SNPs in a \code{GwasSumStats} or #' \code{QtlSumStats} entry, selected by its identity tuple. diff --git a/R/QtlDataset.R b/R/QtlDataset.R index 7625d232..684f6724 100644 --- a/R/QtlDataset.R +++ b/R/QtlDataset.R @@ -298,6 +298,11 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) # variantIds : character vector of kept variant IDs (= colnames(geno)) # sampleIds : character vector of kept sample IDs (= rownames(geno)) # maf : numeric vector of per-variant MAF for kept variants +# af : numeric vector of per-variant effect-allele (A1) frequency +# for kept variants. Directional (NOT folded to the minor +# allele): the frequency of the dosage-counted allele, which +# is A1 by the same convention the marginal betas use. `maf` +# is `pmin(af, 1 - af)`. .qtlExtractBlock <- function(x, traitId = NULL, region = NULL, cisWindow = NULL, samples = NULL) { gr <- .qtlResolveVariantRegion(x, traitId = traitId, region = region, @@ -309,7 +314,8 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) dimnames = list(x@genotypes@sampleIds, character(0))), variantIds = character(0), sampleIds = x@genotypes@sampleIds, - maf = numeric(0) + maf = numeric(0), + af = numeric(0) )) } @@ -325,7 +331,8 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) dimnames = list(character(0), character(0))), variantIds = character(0), sampleIds = character(0), - maf = numeric(0) + maf = numeric(0), + af = numeric(0) )) } } @@ -343,7 +350,8 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) dimnames = list(character(0), character(0))), variantIds = character(0), sampleIds = character(0), - maf = numeric(0) + maf = numeric(0), + af = numeric(0) )) } } @@ -368,7 +376,8 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) geno = dosage[integer(0), , drop = FALSE], variantIds = colnames(dosage), sampleIds = character(0), - maf = rep(NA_real_, ncol(dosage)) + maf = rep(NA_real_, ncol(dosage)), + af = rep(NA_real_, ncol(dosage)) )) } dosage <- dosage[keep, , drop = FALSE] @@ -389,6 +398,10 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) nObs <- colSums(!is.na(dosage)) sumD <- colSums(dosage, na.rm = TRUE) p <- ifelse(nObs > 0L, sumD / (2 * nObs), NA_real_) + # `p` is the frequency of the dosage-counted allele (A1, the effect + # allele) -> exported directly as the directional `af`. `mafVec` folds + # it to the minor allele for the QC threshold and the `maf` accessor. + afVec <- p mafVec <- pmin(p, 1 - p) effectiveMaf <- max(x@mafCutoff, if (nSamp > 0L) x@macCutoff / (2 * nSamp) else 0) @@ -404,8 +417,10 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) } dosage <- dosage[, keepVarMask, drop = FALSE] mafVec <- mafVec[keepVarMask] + afVec <- afVec[keepVarMask] } else { mafVec <- numeric(0) + afVec <- numeric(0) } # Mean-impute remaining missing dosage cells so downstream linear @@ -425,7 +440,8 @@ setMethod("getScaleResiduals", "QtlDataset", function(x) x@scaleResiduals) geno = dosage, variantIds = colnames(dosage), sampleIds = rownames(dosage), - maf = mafVec + maf = mafVec, + af = afVec ) } @@ -449,6 +465,18 @@ setMethod("getMaf", "QtlDataset", out }) +#' @rdname getAf +#' @export +setMethod("getAf", "QtlDataset", + function(x, traitId = NULL, region = NULL, cisWindow = NULL, + samples = NULL, ...) { + block <- .qtlExtractBlock(x, traitId = traitId, region = region, + cisWindow = cisWindow, samples = samples) + out <- block$af + names(out) <- block$variantIds + out + }) + #' @rdname getPhenotypes #' @export setMethod("getPhenotypes", "QtlDataset", diff --git a/R/fineMappingPipeline.R b/R/fineMappingPipeline.R index 9da97e9b..cc828259 100644 --- a/R/fineMappingPipeline.R +++ b/R/fineMappingPipeline.R @@ -613,6 +613,32 @@ setGeneric("fineMappingPipeline", c(list(x = x, ...), .resPickFlags())) } +# Directional effect-allele (A1) frequency for the variants in a fitted +# genotype block `X` (samples x variants, post-residualization and post +# sample-intersection). Re-extracts the allele frequency from the dataset +# `data` over the SAME selection used to build `X` and aligns it to +# `colnames(X)`; variants `getAf` does not return (e.g. dropped by a +# borderline MAF re-check on the final sample set) come back as NA. Returns +# NULL when `X` is empty or the dataset exposes no `getAf` (non-QtlDataset +# sources whose entries already carry `af`). The branch mirrors the +# `.fmResidGeno` call that built `X`: `region`-driven when a joint range is +# given, else `traitId` + `cisWindow` for the cis window. +.fmAfForX <- function(data, X, traitId = NULL, region = NULL, + cisWindow = NULL) { + if (is.null(X) || ncol(X) == 0L || nrow(X) == 0L) return(NULL) + if (!is(data, "QtlDataset")) return(NULL) + afAll <- tryCatch( + if (is.null(region)) { + getAf(data, traitId = traitId, cisWindow = cisWindow, + samples = rownames(X)) + } else { + getAf(data, region = region, samples = rownames(X)) + }, + error = function(e) NULL) + if (is.null(afAll) || length(afAll) == 0L) return(NULL) + unname(afAll[colnames(X)]) +} + .fmPostprocessOne <- function(fit, method, dataX, dataY, coverage, secondaryCoverage, signalCutoff, minAbsCorr, csInput = NULL, af = NULL, @@ -725,7 +751,7 @@ setGeneric("fineMappingPipeline", .fmFitXBlock <- function(X, y, toRun, addSusieInf, coverage, secondaryCoverage, signalCutoff, minAbsCorr, methodArgs, verbose, ctx, tid, - cvFolds = 0, samplePartition = NULL) { + cvFolds = 0, samplePartition = NULL, af = NULL) { chainLocal <- .fmResolveSusieChain(toRun, addSusieInf) infFit <- NULL if (chainLocal$runInf) { @@ -753,7 +779,7 @@ setGeneric("fineMappingPipeline", out[[tk]] <- .fmPostprocessOne( fit = fit, method = tk, dataX = X, dataY = y, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, - minAbsCorr = minAbsCorr, csInput = "X") + minAbsCorr = minAbsCorr, af = af, csInput = "X") } # Per-fold cross-validation across the fitted univariate methods; attach # each method's out-of-fold predictions to its entry. @@ -1327,10 +1353,13 @@ setMethod("fineMappingPipeline", "QtlDataset", ctx, tid)) return(list()) } + afVec <- .fmAfForX(data, X, traitId = tid, region = rg, + cisWindow = cisWindow) .fmFitXBlock(X, y, toRun, addSusieInf, coverage, secondaryCoverage, signalCutoff, minAbsCorr, methodArgs, verbose, ctx, tid, - cvFolds = cvFolds, samplePartition = samplePartition) + cvFolds = cvFolds, samplePartition = samplePartition, + af = afVec) }) for (tk in toRun) { @@ -1452,6 +1481,8 @@ setMethod("fineMappingPipeline", "QtlDataset", "insufficient shared samples across selected contexts.") } Xc <- X[cs, , drop = FALSE] + afVec <- .fmAfForX(data, Xc, traitId = tid, region = rg, + cisWindow = cisWindow) Yc <- do.call(cbind, lapply(contextsHere, function(ctx) { ym <- Yres[[ctx]][cs, , drop = FALSE] colnames(ym) <- ctx @@ -1469,7 +1500,7 @@ setMethod("fineMappingPipeline", "QtlDataset", fit = fit, method = "mvsusie", dataX = Xc, dataY = NULL, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, - csInput = "X") + af = afVec, csInput = "X") if (cvFolds > 1L) { cv <- .fmCrossValidate(Xc, Yc, "mvsusie", methodArgs, cvFolds, samplePartition = samplePartition, @@ -1558,6 +1589,8 @@ setMethod("fineMappingPipeline", "QtlDataset", } Xc <- X[common, , drop = FALSE] Yc <- Y[common, , drop = FALSE] + afVec <- .fmAfForX(data, Xc, traitId = traits, region = rg, + cisWindow = cisWindow) mvBaseArgs <- list( X = Xc, Y = Yc, prior_variance = mvsusieR::create_mixture_prior(R = ncol(Yc)), @@ -1570,7 +1603,7 @@ setMethod("fineMappingPipeline", "QtlDataset", fit = fit, method = "mvsusie", dataX = Xc, dataY = NULL, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, - csInput = "X") + af = afVec, csInput = "X") if (cvFolds > 1L) { cv <- .fmCrossValidate(Xc, Yc, "mvsusie", methodArgs, cvFolds, samplePartition = samplePartition, @@ -1646,6 +1679,8 @@ setMethod("fineMappingPipeline", "QtlDataset", } Xc <- X[common, , drop = FALSE] Yc <- Y[common, , drop = FALSE] + afVec <- .fmAfForX(data, Xc, traitId = traits, region = rg, + cisWindow = cisWindow) fit <- do.call(fitFsusie, .fmMergeUserArgs(list(X = Xc, Y = Yc, pos = pos), "fsusie", methodArgs[["fsusie"]])) @@ -1660,7 +1695,7 @@ setMethod("fineMappingPipeline", "QtlDataset", fit = fit, method = "fsusie", dataX = Xc, dataY = NULL, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, - csInput = "fsusie") + af = afVec, csInput = "fsusie") if (cvFolds > 1L) { cv <- .fmCrossValidate(Xc, Yc, "fsusie", methodArgs, cvFolds, samplePartition = samplePartition, diff --git a/man/QtlDataset.Rd b/man/QtlDataset.Rd index 890bf9aa..132f0204 100644 --- a/man/QtlDataset.Rd +++ b/man/QtlDataset.Rd @@ -15,7 +15,8 @@ QtlDataset( xvarCutoff = 0, imissCutoff = 0, keepSamples = character(0), - keepVariants = character(0) + keepVariants = character(0), + keepIndel = TRUE ) } \arguments{ @@ -31,6 +32,10 @@ positions and \code{colData} carrying per-context phenotype covariates.} (e.g., ancestry PCs); rows are samples.} \item{scaleResiduals}{Logical (length 1). Default \code{TRUE}.} + +\item{keepIndel}{Logical (length 1). When \code{FALSE}, variants whose +alleles are not single nucleotides (indels) are dropped at extraction. +Default \code{TRUE} (keep all variants).} } \value{ A \code{QtlDataset} object. diff --git a/man/fineMappingPipeline.Rd b/man/fineMappingPipeline.Rd index 07e57636..0ed6530b 100644 --- a/man/fineMappingPipeline.Rd +++ b/man/fineMappingPipeline.Rd @@ -29,6 +29,8 @@ fineMappingPipeline(data, ...) fineMappingResult = NULL, cvFolds = 0, samplePartition = NULL, + pipCutoffToSkip = 0, + seed = NULL, naAction = c("drop", "impute"), verbose = 1, trim = TRUE, @@ -57,6 +59,8 @@ fineMappingPipeline(data, ...) fineMappingResult = NULL, cvFolds = 0, samplePartition = NULL, + pipCutoffToSkip = 0, + seed = NULL, naAction = c("drop", "impute"), verbose = 1, trim = TRUE, @@ -199,6 +203,18 @@ supplied (and \code{cvFolds > 1}), every method reuses this exact partition; otherwise a fresh partition is generated per \code{(study, context, trait)}.} +\item{pipCutoffToSkip}{Numeric (length 1). Individual-level single-effect +(SER) pre-screen applied to each residualized \code{(X, y)} block before a +full fit: a susie model with \code{L = 1} is fit and the block is skipped +when no PIP exceeds the cutoff (no potentially significant variant). The +summary-statistics analog lives in \code{summaryStatsQc()}. \code{0} +(default) disables the screen; a negative value uses the adaptive +\code{3 / nVariants} threshold.} + +\item{seed}{Optional integer. When non-NULL, \code{set.seed(seed)} is +called once at the start of the call for reproducible fits. Default +\code{NULL} (no seeding).} + \item{verbose}{Verbosity (0 silent, 1 default). Default \code{1}.} \item{trim}{Logical (length 1). When \code{TRUE} (default) the @@ -339,9 +355,6 @@ deliberately not ported here: level QC lives on the \code{QtlDataset} constructor; sumstat QC lives in \code{summaryStatsQc()}. No filtering happens inside this pipeline. - \item \code{pipCutoffToSkip} pre-screen: this lived in the old - pipelines but is not ported. Callers can run a one-shot - \code{susieR::susie} check externally if needed. \item Diagnostic re-analysis paths (\code{singleEffect} / \code{bayesianConditionalRegression} reanalysis on the RSS path): these are not exposed as diff --git a/man/getAf.Rd b/man/getAf.Rd new file mode 100644 index 00000000..7c4a720b --- /dev/null +++ b/man/getAf.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AllGenerics.R, R/QtlDataset.R +\name{getAf} +\alias{getAf} +\alias{getAf,QtlDataset-method} +\title{Get Effect-Allele Frequencies} +\usage{ +getAf(x, ...) + +\S4method{getAf}{QtlDataset}(x, traitId = NULL, region = NULL, cisWindow = NULL, samples = NULL, ...) +} +\arguments{ +\item{x}{A \code{QtlDataset} object.} + +\item{...}{Class-specific selection arguments (e.g., \code{traitId}, +\code{region}, \code{cisWindow}, \code{samples} for \code{QtlDataset}).} +} +\value{ +Named numeric vector of effect-allele frequencies (names are + variant IDs), or an empty vector when no variants are selected. +} +\description{ +Extract the directional effect-allele (A1) frequency vector + for a \code{QtlDataset}. Unlike \code{\link{getMaf}}, the value is + \emph{not} folded to the minor allele: it is the frequency of the + dosage-counted allele (A1, the effect allele), matching the allele the + marginal effect sizes and the fine-mapping \code{af} column report. +} diff --git a/man/mrmashWeights.Rd b/man/mrmashWeights.Rd index a7a46afb..60714e9c 100644 --- a/man/mrmashWeights.Rd +++ b/man/mrmashWeights.Rd @@ -4,7 +4,7 @@ \alias{mrmashWeights} \title{Compute mr.mash TWAS weights} \usage{ -mrmashWeights(mrmashFit = NULL, X = NULL, Y = NULL, ...) +mrmashWeights(mrmashFit = NULL, X = NULL, Y = NULL, retainFit = FALSE, ...) } \arguments{ \item{mrmashFit}{Optional fitted mr.mash object.} @@ -13,6 +13,13 @@ mrmashWeights(mrmashFit = NULL, X = NULL, Y = NULL, ...) \item{Y}{Phenotype matrix. Required when `mrmashFit` is NULL.} +\item{retainFit}{If TRUE, attach (as the `"fit"` attribute of the returned +weights) the parts of the mr.mash fit that `fineMappingPipeline` needs to +rebuild the mvSuSiE reweighted mixture prior + residual variance: the +original data-driven prior matrices (`dataDrivenPriorMatrices`), the fitted +mixture weights (`w0`) and the residual covariance (`V`). The heavy +coefficient matrix (`mu1`) is intentionally not retained. Default FALSE.} + \item{...}{Additional arguments passed to `mrmashWrapper()` when fitting.} } \value{ diff --git a/man/twasWeightsPipeline.Rd b/man/twasWeightsPipeline.Rd index ab1d31da..9c44b0d6 100644 --- a/man/twasWeightsPipeline.Rd +++ b/man/twasWeightsPipeline.Rd @@ -17,6 +17,8 @@ twasWeightsPipeline(data, ...) traitId = NULL, region = NULL, cisWindow = NULL, + minTwasMaf = NULL, + minTwasXvar = NULL, jointRegions = FALSE, jointSpecification = NULL, fineMappingResult = NULL, @@ -105,6 +107,17 @@ Mutually exclusive with \code{traitId}.} genomic position when extracting variants. Required when \code{traitId} is supplied. Mutually exclusive with \code{region}.} +\item{minTwasMaf}{For QtlDataset: optional minimum minor-allele frequency +applied to the variant set used for TWAS weight learning, on top of the +dataset's construct-time \code{mafCutoff} (the effective cutoff is the +larger of the two). Lets the TWAS pass use a stricter MAF threshold than +fine mapping. \code{NULL} (default) leaves the construct-time cutoff in +place.} + +\item{minTwasXvar}{As \code{minTwasMaf} but for the per-variant genotype +variance cutoff (\code{xvarCutoff}). \code{NULL} (default) leaves the +construct-time cutoff in place.} + \item{jointRegions}{For QtlDataset with a multi-range \code{region}: \code{FALSE} (default) learns weights for each range independently and concatenates them into one entry per (study, context, trait, method); diff --git a/tests/testthat/helper-s4Constructors.R b/tests/testthat/helper-s4Constructors.R index 19a104bf..083376b9 100644 --- a/tests/testthat/helper-s4Constructors.R +++ b/tests/testthat/helper-s4Constructors.R @@ -28,7 +28,7 @@ context("s4Constructors") A1 = rep("G", n), A2 = rep("A", n), N = rep(1000, n), - MAF = rep(0.1, n), + af = rep(0.87, n), # directional effect-allele freq (> 0.5) marginal_beta = rep(0.1, n), marginal_se = rep(0.05, n), marginal_z = rep(2.0, n), diff --git a/tests/testthat/test_FineMappingEntry.R b/tests/testthat/test_FineMappingEntry.R index b7c2af48..6243ed4f 100644 --- a/tests/testthat/test_FineMappingEntry.R +++ b/tests/testthat/test_FineMappingEntry.R @@ -171,6 +171,22 @@ test_that("FineMappingEntry: getCs filters to rows in any credible set", { }) +test_that("FineMappingEntry: getCs/getTopLoci surface the directional af column", { + # Regression: the posterior view must carry the topLoci `af` (effect-allele + # frequency) through to getCs() / getTopLoci(), not drop it to NA. The + # value is directional (0.87 > 0.5), so a folded MAF would be a bug. + entry <- .sc_makeFineMappingEntry(3) + cs <- getCs(entry) + expect_true("af" %in% names(cs)) + expect_equal(unname(cs$af), rep(0.87, nrow(cs))) + expect_false(anyNA(cs$af)) + + tl <- getTopLoci(entry, signalCutoff = 0) + expect_true("af" %in% names(tl)) + expect_equal(unname(tl$af), rep(0.87, nrow(tl))) +}) + + test_that("FineMappingEntry: validity errors when topLoci is missing required cols", { expect_error( FineMappingEntry( diff --git a/tests/testthat/test_QtlDataset.R b/tests/testthat/test_QtlDataset.R index d3f30654..44b5346e 100644 --- a/tests/testthat/test_QtlDataset.R +++ b/tests/testthat/test_QtlDataset.R @@ -602,6 +602,70 @@ test_that(".qtlExtractBlock: mafCutoff retains variants above the threshold", { expect_true(all(blk$maf >= 0.4)) }) +# Deterministic dosage panel so the directional (un-folded) effect-allele +# frequency is exactly known: rs1 = 0.70, rs2 = 0.20, rs3 = 0.50, rest 0. +.qh_directionalExtractor <- function() { + function(handle, snpIdx, meanImpute = TRUE) { + panel <- matrix(0, nrow = length(handle@sampleIds), + ncol = nrow(handle@snpInfo), + dimnames = list(handle@sampleIds, handle@snpInfo$SNP)) + panel[, "rs1"] <- c(2, 2, 2, 2, 1, 1, 1, 1, 1, 1) # sum 14 -> p = 0.70 + panel[, "rs2"] <- c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0) # sum 4 -> p = 0.20 + panel[, "rs3"] <- 1 # sum 10 -> p = 0.50 + sub <- panel[, snpIdx, drop = FALSE] + rr <- GenomicRanges::GRanges( + seqnames = paste0("chr", handle@snpInfo$CHR[snpIdx]), + ranges = IRanges::IRanges(start = handle@snpInfo$BP[snpIdx], width = 1L)) + S4Vectors::mcols(rr) <- S4Vectors::DataFrame( + SNP = handle@snpInfo$SNP[snpIdx], A1 = handle@snpInfo$A1[snpIdx], + A2 = handle@snpInfo$A2[snpIdx]) + dosage <- t(sub) + rownames(dosage) <- handle@snpInfo$SNP[snpIdx] + colnames(dosage) <- handle@sampleIds + SummarizedExperiment::SummarizedExperiment( + assays = list(dosage = dosage), + rowRanges = rr, + colData = S4Vectors::DataFrame(sampleId = handle@sampleIds, + row.names = handle@sampleIds)) + } +} + +test_that(".qtlExtractBlock: returns directional af; maf is its minor-allele fold", { + qd <- .qh_makeDataset() + local_mocked_bindings( + extractBlockGenotypes = .qh_mockExtractor(), + .package = "pecotmr") + blk <- pecotmr:::.qtlExtractBlock(qd) + expect_equal(length(blk$af), length(blk$variantIds)) + # `maf` is exactly the minor-allele fold of the directional `af`. + expect_equal(unname(blk$maf), pmin(unname(blk$af), 1 - unname(blk$af))) +}) + +test_that("getAf: returns directional effect-allele frequency (not folded to MAF)", { + qd <- .qh_makeDataset(n_samples = 10L) + local_mocked_bindings( + extractBlockGenotypes = .qh_directionalExtractor(), + .package = "pecotmr") + af <- getAf(qd) + expect_named(af) + # Directional: 0.70 / 0.20 retained verbatim, NOT folded to 0.30 / 0.20. + expect_equal(unname(af[["rs1"]]), 0.70) + expect_equal(unname(af[["rs2"]]), 0.20) + expect_equal(unname(af[["rs3"]]), 0.50) +}) + +test_that("getMaf stays folded while getAf is directional (they fold into each other)", { + qd <- .qh_makeDataset(n_samples = 10L) + local_mocked_bindings( + extractBlockGenotypes = .qh_directionalExtractor(), + .package = "pecotmr") + af <- getAf(qd) + maf <- getMaf(qd) + # getMaf folds rs1's 0.70 down to 0.30; getAf does not. + expect_equal(unname(maf[["rs1"]]), 0.30) + expect_equal(unname(maf[names(af)]), pmin(unname(af), 1 - unname(af))) +}) + context("QtlDataset residualization methods") diff --git a/tests/testthat/test_fineMappingPipeline.R b/tests/testthat/test_fineMappingPipeline.R index be966131..dbf2a9e7 100644 --- a/tests/testthat/test_fineMappingPipeline.R +++ b/tests/testthat/test_fineMappingPipeline.R @@ -451,6 +451,72 @@ test_that("fineMappingPipeline(QtlDataset): runs univariate dispatch with mocked expect_setequal(getMethodNames(res), "susie") }) +test_that(".fmAfForX: returns directional effect-allele af aligned to colnames(X)", { + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .package = "pecotmr") + region <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1L, 100000L)) + # The helper aligns af to dimnames; values come from getAf over the same + # selection. Columns deliberately reordered to test name-based alignment. + X <- matrix(0, nrow = 5L, ncol = 3L, + dimnames = list(paste0("s", 1:5), c("v3", "v1", "v2"))) + af <- pecotmr:::.fmAfForX(qd, X, region = region) + expect_length(af, 3L) + expect_false(anyNA(af)) # region matched -> every fitted variant has an af + expect_equal( + af, + unname(getAf(qd, region = region, samples = rownames(X))[colnames(X)])) +}) + +test_that(".fmAfForX: returns NULL for an empty block or a non-QtlDataset source", { + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + emptyX <- matrix(numeric(0), nrow = 0L, ncol = 0L) + expect_null(pecotmr:::.fmAfForX(qd, emptyX)) + X <- matrix(0, nrow = 2L, ncol = 1L, + dimnames = list(c("s1", "s2"), "v1")) + expect_null(pecotmr:::.fmAfForX(list(not = "a dataset"), X)) +}) + +test_that("fineMappingPipeline(QtlDataset): threads directional af into postprocess", { + # Regression for af = NA in getCs: the individual-level univariate path must + # forward a non-NULL, directional effect-allele frequency to the + # post-processor (which writes it into the topLoci `af` column). + qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") + captured <- new.env(parent = emptyenv()) + captured$af <- "UNSET" + captured$cols <- NULL + recordingPostprocess <- function(fit, method, dataX, dataY, coverage, + secondaryCoverage, signalCutoff, minAbsCorr, + csInput = NULL, af = NULL, region = NULL) { + captured$af <- af + captured$cols <- colnames(dataX) + vids <- colnames(dataX) + FineMappingEntry( + variantIds = vids, + susieFit = list(method = method), + topLoci = data.frame(variant_id = vids, + pip = seq(0.9, by = -0.1, + length.out = length(vids)), + af = af, + stringsAsFactors = FALSE)) + } + local_mocked_bindings( + extractBlockGenotypes = .fmp_mockExtractor(), + .fmFitSusieIndiv = .fmp_mockFitIndiv(), + .fmPostprocessOne = recordingPostprocess, + .package = "pecotmr") + suppressMessages( + fineMappingPipeline(qd, methods = "susie", cisWindow = 1000L, + addSusieInf = FALSE)) + # af forwarded (not left at the NULL default), one value per fitted variant. + expect_false(identical(captured$af, "UNSET")) + expect_false(is.null(captured$af)) + expect_length(captured$af, length(captured$cols)) + expect_true(any(!is.na(captured$af))) + expect_true(all(captured$af >= 0 & captured$af <= 1, na.rm = TRUE)) +}) + test_that("fineMappingPipeline(QtlDataset): seed argument is accepted and runs", { qd <- .fmp_makeQtlDataset(contexts = "brain", traits = "ENSG_A") local_mocked_bindings(