From 0721711d9e01f7ce50870c1b85eea9ab5fa0fda5 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Mon, 22 Jun 2026 13:48:27 -0700 Subject: [PATCH 1/4] pipeline fixes --- DESCRIPTION | 1 - NAMESPACE | 5 +- R/ctwasPipeline.R | 776 ++++++++++++++++++++++++---- R/genotypeIo.R | 16 +- R/sumstatsQc.R | 535 ++++++++++++++----- tests/testthat/test_ctwasPipeline.R | 668 ++++++++++++++++++++---- tests/testthat/test_sumstatsQc.R | 221 +++++--- 7 files changed, 1804 insertions(+), 418 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d19b6a0e..29d11ae1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,6 @@ Imports: IRanges, MASS, Matrix, - MungeSumstats, Rsamtools, S4Vectors, SummarizedExperiment, diff --git a/NAMESPACE b/NAMESPACE index 614d0059..c58aa18a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,7 +50,11 @@ export(computeQtlEnrichment) export(computeSldscAnnotSd) export(computeSldscMRef) export(ctwasBimfileLoader) +export(assembleCtwasInputs) export(ctwasPipeline) +export(estCtwasParam) +export(finemapCtwasRegions) +export(screenCtwasRegions) export(dentist) export(dentistSingleWindow) export(dprAdaptiveGibbsWeights) @@ -352,7 +356,6 @@ importFrom(IRanges,DataFrameList) importFrom(IRanges,IRanges) importFrom(IRanges,findOverlaps) importFrom(MASS,ginv) -importFrom(MungeSumstats,standardise_header) importFrom(Rsamtools,TabixFile) importFrom(Rsamtools,asBcf) importFrom(Rsamtools,headerTabix) diff --git a/R/ctwasPipeline.R b/R/ctwasPipeline.R index c536eafa..75c41198 100644 --- a/R/ctwasPipeline.R +++ b/R/ctwasPipeline.R @@ -1,7 +1,7 @@ -#' @title Causal TWAS Pipeline (cTWAS, single LD block) -#' @description Per-LD-block pipeline that hands a -#' \code{\link{GwasSumStats}} of GWAS Z-scores together with per-gene -#' TWAS weights and the shared LD sketch to +#' @title Causal TWAS Pipeline (cTWAS, multi LD block) +#' @description Pipeline that hands a per-block set of +#' \code{\link{GwasSumStats}} of GWAS Z-scores together with the +#' matching per-block per-gene TWAS weights and LD sketches to #' \code{ctwas::ctwas_sumstats}, producing per-gene posterior #' inclusion probabilities for causal genes. Optionally accepts a #' precomputed TWAS-Z \code{GRanges} from @@ -9,23 +9,27 @@ #' so the per-gene Z is not recomputed inside ctwas. #' #' @section LD block convention: -#' Each call assumes the inputs cover exactly one LD block — the user -#' is responsible for constructing the \code{GwasSumStats} and -#' \code{TwasWeights} over the block of interest before calling this -#' pipeline (the same convention used by -#' \code{\link{fineMappingPipeline}} on \code{GwasSumStats}). The -#' single-region \code{region_info}, \code{LD_map}, and \code{snp_map} -#' that \code{ctwas::ctwas_sumstats} requires are derived -#' automatically from the LD sketch on \code{gwasSumStats}. +#' Inputs are NAMED LISTS keyed by \code{region_id} +#' (\code{list(block1 = gss1, block2 = gss2, ...)}). Per-block +#' \code{region_info}, \code{LD_map}, and \code{snp_map} entries are +#' built automatically from each block's LD sketch and concatenated +#' before the call to \code{ctwas::ctwas_sumstats}. A single-block +#' input is rejected: cTWAS's EM cannot converge on a single region, +#' so callers must supply at least two blocks. #' #' @section LD-sketch identity check: -#' \code{getLdSketch(twasWeights)} (when non-NULL) must match -#' \code{getLdSketch(gwasSumStats)}. Mismatch is a hard error. +#' Per block: \code{getLdSketch(twasWeights)} (when non-NULL) must +#' match \code{getLdSketch(gwasSumStats)}. Mismatch is a hard error. #' -#' @param gwasSumStats A \code{\link{GwasSumStats}} over one LD block. -#' Must have \code{getQcInfo()} non-empty. -#' @param twasWeights A \code{\link{TwasWeights}} carrying per-(study, -#' context, trait, method) weights over the same LD block. +#' @param gwasSumStats NAMED LIST of \code{\link{GwasSumStats}} keyed +#' by \code{region_id} (at least two entries). Each must have +#' \code{getQcInfo()} non-empty. +#' @param twasWeights NAMED LIST of \code{\link{TwasWeights}} keyed by +#' \code{region_id}. Keys must be a SUBSET of \code{gwasSumStats}'s +#' keys: blocks without any TWAS weights still contribute their +#' SNP-level signal to ctwas's joint group prior estimate (matches +#' the legacy whole-chromosome pattern where only a few of many LD +#' blocks carry gene weights). #' @param twasZ Optional \code{GRanges} of TWAS Z-scores (output of #' \code{\link{causalInferencePipeline}}). When supplied, the #' per-(trait, context) Z is used as the \code{z_gene} input to @@ -37,8 +41,13 @@ #' (default) the smart filters are no-ops; only the magnitude filter #' (\code{twasWeightCutoff}) and the per-gene cap #' (\code{maxNumVariants}, ordered by \code{|weight|}) apply. -#' @param regionId Optional character (length 1) label for the LD -#' block. Default \code{"block1"}. +#' @param method Optional character (length 1). Picks which TWAS +#' method's weights to feed into ctwas for each (study, context, +#' trait) gene. When \code{NULL} (default): use \code{"ensemble"} if +#' that method is present across the TwasWeights; otherwise use the +#' sole method when only one is present; otherwise error. Passing +#' the name explicitly (e.g. \code{"mrash"}) overrides the default +#' resolution. #' @param thin,niterPrefit,niter,L Pass-throughs to #' \code{ctwas::ctwas_sumstats}. #' @param groupPriorVarStructure Pass-through (defaults @@ -71,7 +80,7 @@ ctwasPipeline <- function(gwasSumStats, twasWeights, twasZ = NULL, fineMappingResult = NULL, - regionId = "block1", + method = NULL, thin = 0.1, niterPrefit = 3L, niter = 30L, @@ -87,17 +96,104 @@ ctwasPipeline <- function(gwasSumStats, minPipCutoff = 0, maxNumVariants = Inf, ...) { + groupPriorVarStructure <- match.arg(groupPriorVarStructure) + inputs <- assembleCtwasInputs( + gwasSumStats = gwasSumStats, + twasWeights = twasWeights, + twasZ = twasZ, + fineMappingResult = fineMappingResult, + method = method, + twasWeightCutoff = twasWeightCutoff, + csMinCor = csMinCor, + minPipCutoff = minPipCutoff, + maxNumVariants = maxNumVariants) + est <- estCtwasParam( + inputs, + thin = thin, + niterPrefit = niterPrefit, + niter = niter, + groupPriorVarStructure = groupPriorVarStructure, + ncore = ncore, + ...) + screened <- screenCtwasRegions( + est, + L = L, + ncore = ncore, + ...) + finemapCtwasRegions( + screened, + L = L, + ncore = ncore, + ...) +} + +#' Assemble cTWAS inputs from S4 GwasSumStats / TwasWeights +#' +#' @description Builds the per-block ctwas-shape input set +#' (\code{z_snp}, \code{weights}, \code{region_info}, \code{snp_map}, +#' \code{LD_map}, the LD- and SNP-info loader closures, plus optional +#' \code{z_gene}) that the downstream ctwas steps consume. +#' This is step 1 of the three-step \code{\link{ctwasPipeline}} split. +#' +#' @details The returned list is the SHARED STATE threaded through +#' \code{\link{estCtwasParam}} → \code{\link{screenCtwasRegions}} → +#' \code{\link{finemapCtwasRegions}}. Callers can short-circuit at any +#' step (e.g. override the estimated priors before fine-mapping) or +#' call \code{ctwasPipeline()} for the one-shot path. +#' +#' @inheritParams ctwasPipeline +#' @return A list with elements \code{z_snp}, \code{z_gene} (NULL when +#' no \code{twasZ}), \code{weights}, \code{region_info}, +#' \code{snp_map}, \code{LD_map}, \code{LD_loader_fun}, +#' \code{snpinfo_loader_fun}, and \code{resolvedMethod}. +#' @export +assembleCtwasInputs <- function(gwasSumStats, twasWeights, + twasZ = NULL, + fineMappingResult = NULL, + method = NULL, + twasWeightCutoff = 0, + csMinCor = 0.8, + minPipCutoff = 0, + maxNumVariants = Inf) { if (!requireNamespace("ctwas", quietly = TRUE)) { - stop("Package 'ctwas' is required for ctwasPipeline. ", + stop("Package 'ctwas' is required for the cTWAS pipeline. ", "Install from https://github.com/xinhe-lab/ctwas .") } - if (!methods::is(gwasSumStats, "GwasSumStats")) - stop("`gwasSumStats` must be a GwasSumStats object.") - if (length(getQcInfo(gwasSumStats)) == 0L) - stop("ctwasPipeline: gwasSumStats has no QC record. Call ", - "summaryStatsQc() first.") - if (missing(twasWeights) || !methods::is(twasWeights, "TwasWeights")) - stop("`twasWeights` must be a TwasWeights object.") + if (missing(gwasSumStats) || !is.list(gwasSumStats) || + methods::is(gwasSumStats, "GwasSumStats")) + stop("`gwasSumStats` must be a NAMED LIST of GwasSumStats keyed by ", + "region_id (got ", class(gwasSumStats)[[1L]], "). cTWAS's EM ", + "requires multi-block context to converge; single-block calls ", + "are no longer supported.") + if (missing(twasWeights) || !is.list(twasWeights) || + methods::is(twasWeights, "TwasWeights")) + stop("`twasWeights` must be a NAMED LIST of TwasWeights keyed by ", + "region_id.") + if (is.null(names(gwasSumStats)) || any(!nzchar(names(gwasSumStats)))) + stop("`gwasSumStats` must be a named list keyed by region_id (got an ", + "unnamed or empty-named list).") + if (is.null(names(twasWeights)) || any(!nzchar(names(twasWeights)))) + stop("`twasWeights` must be a named list keyed by region_id (got an ", + "unnamed or empty-named list).") + extra_tw_keys <- setdiff(names(twasWeights), names(gwasSumStats)) + if (length(extra_tw_keys) > 0L) + stop("`twasWeights` has region_id key(s) not present in ", + "`gwasSumStats`: ", paste(extra_tw_keys, collapse = ", ")) + if (length(gwasSumStats) < 2L) + stop("assembleCtwasInputs: at least two LD blocks are required (got ", + length(gwasSumStats), "). cTWAS's EM cannot estimate the SNP-", + "group prior variance from a single region.") + for (rid in names(gwasSumStats)) { + if (!methods::is(gwasSumStats[[rid]], "GwasSumStats")) + stop("gwasSumStats[['", rid, "']] is not a GwasSumStats.") + if (length(getQcInfo(gwasSumStats[[rid]])) == 0L) + stop("assembleCtwasInputs: gwasSumStats[['", rid, + "']] has no QC record. Call summaryStatsQc() first.") + } + for (rid in names(twasWeights)) { + if (!methods::is(twasWeights[[rid]], "TwasWeights")) + stop("twasWeights[['", rid, "']] is not a TwasWeights.") + } if (!is.null(twasZ) && !methods::is(twasZ, "GRanges")) stop("`twasZ` must be a GRanges (output of causalInferencePipeline) ", "or NULL.") @@ -105,65 +201,281 @@ ctwasPipeline <- function(gwasSumStats, !methods::is(fineMappingResult, "FineMappingResultBase")) stop("`fineMappingResult` must be a FineMappingResultBase ", "(QtlFineMappingResult or GwasFineMappingResult) or NULL.") - if (length(regionId) != 1L || !nzchar(regionId)) - stop("`regionId` must be a single non-empty character string.") + + regionIds <- names(gwasSumStats) + resolvedMethod <- .ctwasResolveMethod(twasWeights, method) + + ldPanelsByRegion <- list() + weightsList <- list() + zSnpPieces <- list() + regionInfoPieces <- list() + snpMap <- list() + ldFileByRegion <- setNames(character(length(regionIds)), regionIds) + + # First pass: cache ld panels, build z_snp pieces, region_info, + # snp_map per region. We need the union of GWAS variant IDs ACROSS + # all blocks before we can correctly filter each per-block TwasWeights + # — a gene's weight variants can straddle adjacent LD blocks, and the + # per-block GWAS subset would drop the cross-boundary variants. + for (rid in regionIds) { + gss <- gwasSumStats[[rid]] + tw <- twasWeights[[rid]] # may be NULL for SNP-only blocks + gwasLd <- getLdSketch(gss) + if (!is.null(tw)) + .ctwasRequireMatchingLdSketches(getLdSketch(tw), gwasLd) + + ldKey <- .ctwasLdPanelKey(gwasLd) + if (is.null(ldPanelsByRegion[[ldKey]])) { + ldPanelsByRegion[[ldKey]] <- .ctwasComputeFullPanelLd(gwasLd) + } + ldPanel <- ldPanelsByRegion[[ldKey]] + ldFileByRegion[[rid]] <- ldKey + + zSnpPieces[[rid]] <- .ctwasBuildZSnp(gss) + regionInfoPieces[[rid]] <- .ctwasBuildSingleRegionInfo(rid, gwasLd) + snpMap[[rid]] <- .ctwasSnpInfoForGwasBlock(gss, ldPanel$snpInfo) + } + + # Global union of GWAS variant IDs across all blocks. Used to filter + # per-block TwasWeights so cross-boundary weight variants survive + # (ctwas's compute_gene_z consumes the concatenated z_snp + the gene's + # full weight vector regardless of which home block the gene's TSS + # falls in). + globalGwasSnpIds <- unique(unlist(lapply(zSnpPieces, function(p) p$id))) + + # Second pass: build per-block weight lists with the GLOBAL gwasSnpIds + # filter, so a gene whose cis-window straddles block boundaries + # contributes its full weight vector to ctwas. + for (rid in regionIds) { + tw <- twasWeights[[rid]] + if (is.null(tw)) next + twMethod <- .ctwasFilterMethod(tw, resolvedMethod) + if (is.null(twMethod)) next + ldKey <- ldFileByRegion[[rid]] + ldPanel <- ldPanelsByRegion[[ldKey]] + blockWeights <- .ctwasBuildWeights( + twMethod, ldPanel, + fineMappingResult = fineMappingResult, + twasWeightCutoff = twasWeightCutoff, + csMinCor = csMinCor, + minPipCutoff = minPipCutoff, + maxNumVariants = maxNumVariants, + gwasSnpIds = globalGwasSnpIds) + if (length(blockWeights) > 0L) { + names(blockWeights) <- paste0(rid, "|", names(blockWeights)) + weightsList <- c(weightsList, blockWeights) + } + } + + zSnp <- do.call(rbind, zSnpPieces) + rownames(zSnp) <- NULL + regionInfo <- do.call(rbind, regionInfoPieces) + rownames(regionInfo) <- NULL + + ldMap <- data.frame( + region_id = regionIds, + LD_file = unname(ldFileByRegion), + SNP_file = unname(ldFileByRegion), + stringsAsFactors = FALSE) + + list( + z_snp = zSnp, + z_gene = if (!is.null(twasZ)) .ctwasBuildZGene(twasZ) else NULL, + weights = weightsList, + region_info = regionInfo, + snp_map = snpMap, + LD_map = ldMap, + LD_loader_fun = .ctwasMultiBlockLdLoader(ldPanelsByRegion), + snpinfo_loader_fun = .ctwasMultiBlockSnpInfoLoader(ldPanelsByRegion), + resolvedMethod = resolvedMethod) +} + +#' Estimate cTWAS group prior + prior variance +#' +#' @description Step 2 of the three-step \code{\link{ctwasPipeline}}: +#' assembles \code{region_data} from the inputs and runs +#' \code{ctwas::est_param} (prefit EM + accurate EM) to estimate the +#' group prior probabilities and prior variances. Returns the input +#' state plus \code{region_data}, \code{boundary_genes}, +#' \code{z_gene}, and \code{param}. +#' +#' @param inputs A list returned by \code{\link{assembleCtwasInputs}}. +#' @param thin,niterPrefit,niter Pass-throughs to +#' \code{ctwas::assemble_region_data} / \code{ctwas::est_param}. +#' @param groupPriorVarStructure Pass-through. +#' @param ncore Number of cores. +#' @param ... Additional arguments forwarded to \code{ctwas::est_param} +#' (e.g. \code{min_p_single_effect}, \code{min_group_size}). +#' @return The \code{inputs} list augmented with \code{region_data}, +#' \code{boundary_genes}, \code{z_gene}, and \code{param}. +#' @export +estCtwasParam <- function(inputs, + thin = 0.1, + niterPrefit = 3L, + niter = 30L, + groupPriorVarStructure = c("shared_type", + "shared_context", + "shared_nonSNP", + "shared_all", + "independent"), + ncore = 1L, + ...) { + if (!requireNamespace("ctwas", quietly = TRUE)) { + stop("Package 'ctwas' is required for estCtwasParam.") + } groupPriorVarStructure <- match.arg(groupPriorVarStructure) + # ctwas::assemble_region_data assumes z_gene is non-NULL; when the + # caller did not supply a precomputed twasZ, compute it now via + # ctwas::compute_gene_z, mirroring ctwas_sumstats's own behaviour. + zGene <- inputs$z_gene + if (is.null(zGene)) { + zGene <- ctwas::compute_gene_z( + inputs$z_snp, inputs$weights, ncore = as.integer(ncore)) + } + assembled <- .ctwasInvoke(ctwas::assemble_region_data, list( + region_info = inputs$region_info, + z_snp = inputs$z_snp, + z_gene = zGene, + weights = inputs$weights, + snp_map = inputs$snp_map, + thin = thin, + ncore = as.integer(ncore)), extra = list(...)) + paramRes <- .ctwasInvoke(ctwas::est_param, list( + region_data = assembled$region_data, + niter_prefit = as.integer(niterPrefit), + niter = as.integer(niter), + group_prior_var_structure = groupPriorVarStructure, + ncore = as.integer(ncore)), extra = list(...)) + # ctwas::assemble_region_data does not echo z_gene back in its return + # list, so propagate the precomputed (or freshly computed) z_gene we + # passed into it. This matches ctwas_sumstats's top-level return shape. + # Replace inputs$z_gene (which is NULL when twasZ wasn't supplied) with + # the computed zGene so $z_gene resolves to the right entry. + inputs$z_gene <- zGene + c(inputs, list( + region_data = assembled$region_data, + boundary_genes = assembled$boundary_genes, + param = paramRes)) +} - twLd <- getLdSketch(twasWeights) - gwasLd <- getLdSketch(gwasSumStats) - .ctwasRequireMatchingLdSketches(twLd, gwasLd) - - # --- Compute the full-panel LD ONCE ------------------------------- - # Single source of truth for both the LD-loader closure (which ctwas - # invokes per region during assemble + fine-map stages) and the - # per-gene R_wgt submatrices (sliced from this cache by SNP ID). - ldPanel <- .ctwasComputeFullPanelLd(gwasLd) - - # --- Build the single-region ctwas inputs --------------------------- - zSnp <- .ctwasBuildZSnp(gwasSumStats) - regionInfo <- .ctwasBuildSingleRegionInfo(regionId, gwasLd) - # ctwas::ctwas_sumstats top-level asserts `file.exists(LD_map$LD_file)` - # and `file.exists(LD_map$SNP_file)` unconditionally — even when - # LD_format = "custom" routes all data through our loaders and the file - # paths are never read. The right fix is upstream (gate the assertion - # on `LD_format != "custom"` or drop it entirely; see - # https://github.com/xinhe-lab/ctwas — `ctwas_sumstats()` L33-34). Until - # that lands, point both columns at `tempdir()`: always exists, no disk - # writes, no cleanup. The loader closures ignore the file token. - vestigialPath <- tempdir() - ldMap <- data.frame(region_id = regionId, - LD_file = vestigialPath, - SNP_file = vestigialPath, - stringsAsFactors = FALSE) - snpMap <- list() - snpMap[[regionId]] <- ldPanel$snpInfo - weightsList <- .ctwasBuildWeights( - twasWeights, ldPanel, - fineMappingResult = fineMappingResult, - twasWeightCutoff = twasWeightCutoff, - csMinCor = csMinCor, - minPipCutoff = minPipCutoff, - maxNumVariants = maxNumVariants) - zGene <- if (!is.null(twasZ)) .ctwasBuildZGene(twasZ) else NULL - - # --- Call the ctwas engine ------------------------------------------ - ctwas::ctwas_sumstats( - z_snp = zSnp, - weights = weightsList, - region_info = regionInfo, - LD_map = ldMap, - snp_map = snpMap, - z_gene = zGene, - thin = thin, - niter_prefit = as.integer(niterPrefit), - niter = as.integer(niter), - L = as.integer(L), - group_prior_var_structure = groupPriorVarStructure, - LD_format = "custom", - LD_loader_fun = .ctwasSingleBlockLdLoader(ldPanel$R), - snpinfo_loader_fun = .ctwasSingleBlockSnpInfoLoader(ldPanel$snpInfo), - ncore = as.integer(ncore), - ...) +#' Screen cTWAS regions +#' +#' @description Step 3 of the three-step \code{\link{ctwasPipeline}}: +#' runs \code{ctwas::screen_regions} on the +#' \code{\link{estCtwasParam}} result and returns the screened-region +#' set. Use this entry point to substitute hand-tuned priors for the +#' ones estimated in step 2 (e.g. when the accurate EM diverges to +#' NaN and you want to recover the prefit values). +#' +#' @param estResult A list returned by \code{\link{estCtwasParam}}. +#' @param L Pass-through to \code{ctwas::screen_regions} (and downstream +#' fine-mapping). +#' @param ncore Number of cores. +#' @param ... Additional arguments forwarded to +#' \code{ctwas::screen_regions} (e.g. \code{filter_L}, +#' \code{min_nonSNP_PIP}, \code{min_L}). +#' @return The \code{estResult} list augmented with +#' \code{screen_res} (the full ctwas output) and +#' \code{screened_region_data}. +#' @export +screenCtwasRegions <- function(estResult, + L = 5L, + ncore = 1L, + ...) { + if (!requireNamespace("ctwas", quietly = TRUE)) { + stop("Package 'ctwas' is required for screenCtwasRegions.") + } + screenRes <- .ctwasInvoke(ctwas::screen_regions, list( + region_data = estResult$region_data, + LD_map = estResult$LD_map, + weights = estResult$weights, + group_prior = estResult$param$group_prior, + group_prior_var = estResult$param$group_prior_var, + L = as.integer(L), + LD_format = "custom", + LD_loader_fun = estResult$LD_loader_fun, + snpinfo_loader_fun = estResult$snpinfo_loader_fun, + ncore = as.integer(ncore)), extra = list(...)) + c(estResult, list( + screen_res = screenRes, + screened_region_data = screenRes$screened_region_data)) +} + +#' Fine-map cTWAS regions +#' +#' @description Step 4 (final) of the three-step +#' \code{\link{ctwasPipeline}}: runs \code{ctwas::finemap_regions} on +#' the screened-region set from \code{\link{screenCtwasRegions}} and +#' assembles the documented top-level ctwas output (\code{z_gene}, +#' \code{param}, \code{finemap_res}, \code{susie_alpha_res}, +#' \code{region_data}, \code{boundary_genes}, \code{screen_res}). +#' +#' @param screenResult A list returned by +#' \code{\link{screenCtwasRegions}}. +#' @param L Pass-through. +#' @param ncore Number of cores. +#' @param ... Additional arguments forwarded to +#' \code{ctwas::finemap_regions}. +#' @return A list mirroring \code{ctwas::ctwas_sumstats}'s output: +#' \code{z_gene}, \code{param}, \code{finemap_res}, +#' \code{susie_alpha_res}, \code{region_data}, \code{boundary_genes}, +#' \code{screen_res}. +#' @export +finemapCtwasRegions <- function(screenResult, + L = 5L, + ncore = 1L, + ...) { + if (!requireNamespace("ctwas", quietly = TRUE)) { + stop("Package 'ctwas' is required for finemapCtwasRegions.") + } + rd <- screenResult$screened_region_data + fmRes <- if (length(rd) == 0L) { + list(finemap_res = NULL, susie_alpha_res = NULL) + } else { + .ctwasInvoke(ctwas::finemap_regions, list( + region_data = rd, + LD_map = screenResult$LD_map, + weights = screenResult$weights, + group_prior = screenResult$param$group_prior, + group_prior_var = screenResult$param$group_prior_var, + L = as.integer(L), + LD_format = "custom", + LD_loader_fun = screenResult$LD_loader_fun, + snpinfo_loader_fun = screenResult$snpinfo_loader_fun, + ncore = as.integer(ncore)), extra = list(...)) + } + list( + z_gene = screenResult$z_gene, + param = screenResult$param, + finemap_res = fmRes$finemap_res, + susie_alpha_res = fmRes$susie_alpha_res, + region_data = screenResult$region_data, + boundary_genes = screenResult$boundary_genes, + screen_res = screenResult$screen_res) +} + +# Invoke a ctwas function with a fixed `args` list plus optional `extra` +# (typically the `...` collected by the wrapper). `extra` names that +# duplicate `args` names are silently dropped, so the wrapper's explicit +# arguments always win over caller-supplied `...`. +# @noRd +.ctwasInvoke <- function(fn, args, extra = list()) { + if (length(extra) > 0L) { + extra <- extra[setdiff(names(extra), names(args))] + # `...` is forwarded uniformly to four different ctwas functions + # (assemble_region_data / est_param / screen_regions / + # finemap_regions). Restrict to fn's explicit formals so an arg + # meant for a sibling step doesn't crash this one -- and so args + # that fn would otherwise forward via its own `...` (e.g. into + # susie_rss) don't bleed into incompatible downstream functions. + formalsFn <- tryCatch(names(formals(fn)), error = function(e) NULL) + if (!is.null(formalsFn)) { + explicitFormals <- setdiff(formalsFn, "...") + extra <- extra[intersect(names(extra), explicitFormals)] + } + args <- c(args, extra) + } + do.call(fn, args) } # ============================================================================= @@ -176,6 +488,52 @@ ctwasPipeline <- function(gwasSumStats, .requireMatchingLdSketches(twLd, gwasLd, pipelineName = "ctwasPipeline") } +# Resolve which TWAS method's weights to feed into ctwas given a +# TwasWeights collection that may carry multiple methods per +# (study, context, trait). Rules: +# - Caller-supplied method (non-NULL, non-empty) wins, provided that +# method exists in the TwasWeights's `method` column. +# - Otherwise prefer "ensemble" when present. +# - Otherwise return the sole method when only one is present. +# - Otherwise: error. +# @noRd +.ctwasResolveMethod <- function(twasWeightsList, method = NULL) { + available <- unique(unlist(lapply(twasWeightsList, function(tw) + as.character(tw$method)))) + if (length(available) == 0L) + stop("ctwasPipeline: TwasWeights collections have no method entries.") + if (!is.null(method) && nzchar(method)) { + if (!method %in% available) + stop("ctwasPipeline: method '", method, "' not present in TwasWeights ", + "(available: ", paste(available, collapse = ", "), ").") + return(method) + } + if ("ensemble" %in% available) return("ensemble") + if (length(available) == 1L) return(available[[1L]]) + stop("ctwasPipeline: TwasWeights carries multiple methods (", + paste(available, collapse = ", "), + ") with no 'ensemble' entry. Supply a `method` argument to ", + "pick one (e.g. method = \"mrash\").") +} + +# Subset a TwasWeights collection to rows whose `method` matches the +# resolved method. Used to enforce the "one ctwas gene per (study, +# context, trait)" semantics — the legacy pipeline fed a single +# best-CV-method weight per gene; the new S4 TwasWeights may carry +# many methods, but ctwas should only see one. +# @noRd +.ctwasFilterMethod <- function(tw, method) { + keep <- which(as.character(tw$method) == method) + if (length(keep) == 0L) return(NULL) + TwasWeights( + study = as.character(tw$study)[keep], + context = as.character(tw$context)[keep], + trait = as.character(tw$trait)[keep], + method = as.character(tw$method)[keep], + entry = as.list(tw$entry[keep]), + ldSketch = getLdSketch(tw)) +} + # Build the per-variant Z data.frame ctwas expects from a GwasSumStats. # Stacks each study row's GRanges via the shared `.entryToSumstatDf` # helper (R/sumstatsQc.R), then projects to ctwas's column shape and @@ -238,11 +596,14 @@ ctwasPipeline <- function(gwasSumStats, # Compute the full-panel LD ONCE and return everything the rest of the # pipeline needs to consume it. Returns a list with: -# R : full-panel correlation matrix (n_var x n_var, dimnames = -# SNP IDs). Single source of truth for both the per-region -# LD loader closure and the per-gene R_wgt submatrices. -# snpInfo : ctwas-shaped per-block table (chrom, id, pos, alt, ref) -# — both the snp_map element and the snpinfo loader return. +# R : full-panel correlation matrix (n_var x n_var, dimnames = +# SNP IDs). Single source of truth for both the per-region +# LD loader closure and the per-gene R_wgt submatrices. +# snpInfo : ctwas-shaped per-block table (chrom, id, pos, alt, ref) +# — both the snp_map element and the snpinfo loader return. +# variance : named numeric vector of per-variant dosage variance from +# the LD reference. Used to scale non-standardized TWAS +# weights to the correlation scale that ctwas expects. # @noRd .ctwasComputeFullPanelLd <- function(gwasLd) { snpInfoCtwas <- .ctwasSnpInfoForBlock(gwasLd) @@ -252,7 +613,88 @@ ctwasPipeline <- function(gwasSumStats, R <- computeLd(geno, method = "sample") snpIds <- snpInfoCtwas$id dimnames(R) <- list(snpIds, snpIds) - list(R = R, snpInfo = snpInfoCtwas) + variance <- setNames(apply(geno, 2, stats::var, na.rm = TRUE), snpIds) + list(R = R, snpInfo = snpInfoCtwas, variance = variance) +} + +# Harmonize TWAS weight variants against the LD reference panel. Same +# allele-matching semantics as the GWAS-side `.matchRefPanel` flow: +# match by (chrom, pos), accept exact A1/A2 frame, sign-flip the weight +# when alleles are swapped, drop unmatched / strand-ambiguous variants. +# Returns a data.frame with columns: +# variant_id : canonical (panel-frame) variant ID +# w : sign-flipped weight aligned to the panel's A1 frame +# origIdx : index back into the entry's original variantIds vector +# (used by SuSiE renormalization to slice mu / lbf) +# Returns NULL when the entry has no variants in common with the panel. +# @noRd +.ctwasHarmonizeWeights <- function(origVids, origW, refVariants) { + parsed <- tryCatch(parseVariantId(origVids), error = function(e) NULL) + if (is.null(parsed) || nrow(parsed) == 0L) return(NULL) + targetDf <- data.frame( + chrom = as.integer(parsed$chrom), + pos = as.integer(parsed$pos), + A2 = as.character(parsed$A2), + A1 = as.character(parsed$A1), + w = as.numeric(origW), + origIdx = seq_along(origVids), + stringsAsFactors = FALSE) + res <- tryCatch( + .matchRefPanel( + targetData = targetDf, + refVariants = refVariants, + colToFlip = "w", + matchMinProp = 0, + removeUnmatched = TRUE, + removeStrandAmbiguous = TRUE), + error = function(e) NULL) + if (is.null(res)) return(NULL) + res$harmonizedData +} + +# Does the entry's `fits` slot carry a SuSiE-shape intermediate (lbf, +# mu, X_column_scale_factors)? Used to gate the renormalization branch. +# @noRd +.ctwasIsSusieFit <- function(fits) { + if (is.null(fits)) return(FALSE) + needed <- c("lbf_variable", "mu", "X_column_scale_factors") + all(needed %in% names(fits)) +} + +# Renormalize SuSiE TWAS weights over the kept variant set. When some +# variants got dropped by allele harmonization / panel intersection, +# the posterior `alpha` values from the original fit no longer sum to +# 1 over the kept variants. We re-softmax `lbf_variable[, keptIdx]` +# into a renormalized alpha, sign-flip the rows of `mu[, keptIdx]` to +# match the panel's allele frame (carrying over the per-variant sign +# flip already applied to `harmonizedW`), and recompute the per-variant +# weight as `colSums(alpha * mu_subset) / X_column_scale_factors_subset`. +# Returns the new weight vector (length = length(keptIdx)), or NULL if +# the fit's dimensions don't line up with the entry's variantIds. +# @noRd +.ctwasRenormalizeSusieWeights <- function(fits, origVids, origW, + keptIdx, harmonizedW) { + lbf <- fits$lbf_variable + mu <- fits$mu + xCol <- fits$X_column_scale_factors + if (is.null(lbf) || is.null(mu) || is.null(xCol)) return(NULL) + if (ncol(lbf) != length(origVids) || + ncol(mu) != length(origVids) || + length(xCol) != length(origVids)) { + # Fit-vs-entry dimension mismatch; skip rather than mis-slice. + return(NULL) + } + # Per-variant sign flip applied by allele harmonization. NaN signs + # (origW == 0) default to +1. + signFlip <- sign(harmonizedW / origW[keptIdx]) + signFlip[!is.finite(signFlip)] <- 1 + lbfSub <- lbf[, keptIdx, drop = FALSE] + muSub <- sweep(mu[, keptIdx, drop = FALSE], 2L, signFlip, `*`) + xColSub <- xCol[keptIdx] + # Guard against zero scale factors (shouldn't happen in practice). + xColSub[xColSub == 0] <- 1 + newAlpha <- lbfToAlpha(lbfSub) + as.numeric(colSums(newAlpha * muSub) / xColSub) } # Build the weights list ctwas expects: keyed by per-tuple gene id, @@ -271,20 +713,83 @@ ctwasPipeline <- function(gwasSumStats, twasWeightCutoff = 0, csMinCor = 0.8, minPipCutoff = 0, - maxNumVariants = Inf) { + maxNumVariants = Inf, + gwasSnpIds = NULL) { panelSnps <- rownames(ldPanel$R) + # ctwas's compute_gene_z asserts that every weight variant exists in the + # block's z_snp$id. When the LD sketch covers more than the block (e.g. + # a whole-chromosome PLINK2 used for a single-block GwasSumStats), + # panelSnps alone leaks variants outside the block. Intersect with the + # caller-supplied GWAS sumstats variant set when provided. + if (!is.null(gwasSnpIds)) { + panelSnps <- intersect(panelSnps, as.character(gwasSnpIds)) + } panelInfo <- ldPanel$snpInfo + # Reference frame for allele-harmonization: panel variant info with + # the column shape `.matchRefPanel` expects (chrom/pos/A2/A1). + refVariants <- data.frame( + chrom = as.integer(panelInfo$chrom), + pos = as.integer(panelInfo$pos), + A2 = as.character(panelInfo$ref), + A1 = as.character(panelInfo$alt), + variant_id = as.character(panelInfo$id), + stringsAsFactors = FALSE) + out <- list() for (i in seq_len(nrow(twasWeights))) { - entry <- twasWeights$entry[[i]] - vids <- getVariantIds(entry) - w <- as.numeric(getWeights(entry)) - if (length(vids) == 0L || length(vids) != length(w)) next + entry <- twasWeights$entry[[i]] + origVids <- getVariantIds(entry) + origW <- as.numeric(getWeights(entry)) + if (length(origVids) == 0L || length(origVids) != length(origW)) next + + # --- Step 1: allele-harmonize against the LD panel ------------- + # Parses chr:pos:A2:A1 IDs into the data.frame `.matchRefPanel` + # expects, then matches by (chrom, pos) with exact / sign-flip / + # strand-flip detection. Returned canonical variant IDs are in the + # panel's A1/A2 frame; weights are sign-flipped for variants whose + # input A1/A2 frame was swapped relative to the panel. + harm <- .ctwasHarmonizeWeights(origVids, origW, refVariants) + if (is.null(harm) || nrow(harm) == 0L) next + vids <- as.character(harm$variant_id) + w <- as.numeric(harm$w) + keptIdx <- as.integer(harm$origIdx) # back-reference into origVids/origW - # Drop variants not in the LD sketch panel. + # --- Step 2: restrict to panel ∩ gwasSnpIds -------------------- keep <- vids %in% panelSnps if (!any(keep)) next - vids <- vids[keep]; w <- w[keep] + vids <- vids[keep] + w <- w[keep] + keptIdx <- keptIdx[keep] + + # --- Step 3: SuSiE alpha renormalization ----------------------- + # When the entry carries a SuSiE-style fit (lbf_variable + mu + + # X_column_scale_factors) and the kept variant set is smaller than + # the original fit, the posterior probabilities `alpha` no longer + # sum to 1 over the kept variants. Renormalize via softmax of + # lbf_variable over the kept columns and recompute the per-variant + # weight as colSums(new_alpha * mu_subset) / + # X_column_scale_factors_subset. mu is sign-flipped per the allele + # harmonization so the recomputed weight stays in the panel's + # allele frame. Mirrors the legacy adjustSusieWeights helper. + fits <- getFits(entry) + if (.ctwasIsSusieFit(fits) && length(keptIdx) < length(origVids)) { + renorm <- .ctwasRenormalizeSusieWeights( + fits, origVids = origVids, origW = origW, + keptIdx = keptIdx, harmonizedW = w) + if (!is.null(renorm)) w <- renorm + } + + # --- Step 4: variance scaling for non-standardized weights ----- + # w_scaled = w_raw * sqrt(per-variant genotype variance from the + # LD reference panel). Standardized entries (RSS-style, already on + # the correlation scale) pass through unchanged. + if (!isTRUE(getStandardized(entry))) { + varLookup <- ldPanel$variance[vids] + if (anyNA(varLookup)) + stop(".ctwasBuildWeights: missing genotype variance for ", + sum(is.na(varLookup)), " variant(s) in the LD panel.") + w <- w * sqrt(varLookup) + } gStudy <- as.character(twasWeights$study)[[i]] gContext <- as.character(twasWeights$context)[[i]] @@ -460,18 +965,71 @@ ctwasPipeline <- function(gwasSumStats, stringsAsFactors = FALSE) } -# Single-block LD loader for ctwas: captures the precomputed full-panel -# correlation matrix and returns it on every loader call (ctwas invokes -# the loader multiple times per region across assemble + fine-map). The -# `LD_file` argument is a vestigial region token — ignored. +# Multi-block LD loader for ctwas. ctwas invokes +# `LD_loader_fun(LD_file)` per region during region_data assembly and +# fine-mapping; we dispatch by `LD_file` (the same string set on +# `LD_map$LD_file`) into the cached per-sketch ldPanel. # @noRd -.ctwasSingleBlockLdLoader <- function(R) { - function(LD_file, ...) R +.ctwasMultiBlockLdLoader <- function(ldPanelsByRegion) { + function(LD_file, ...) { + panel <- ldPanelsByRegion[[LD_file]] + if (is.null(panel)) + stop("ctwasPipeline LD loader: no cached panel for LD_file = '", + LD_file, "'") + panel$R + } } -# Single-block SNP-info loader for ctwas: captures the precomputed -# per-block snpInfo table and returns it on every loader call. +# Multi-block SNP-info loader for ctwas. Mirrors the LD loader. # @noRd -.ctwasSingleBlockSnpInfoLoader <- function(snpInfo) { - function(LD_file, ...) snpInfo +.ctwasMultiBlockSnpInfoLoader <- function(ldPanelsByRegion) { + function(LD_file, ...) { + panel <- ldPanelsByRegion[[LD_file]] + if (is.null(panel)) + stop("ctwasPipeline snpInfo loader: no cached panel for LD_file = '", + LD_file, "'") + panel$snpInfo + } +} + +# Derive the LD_file token for ctwas from a GenotypeHandle. We point +# at the on-disk file that already backs the sketch's data, so the +# `file.exists(LD_map$LD_file)` assertion in ctwas::ctwas_sumstats +# passes WITHOUT pecotmr doing any new I/O. The token also serves as +# the dispatch key for the multi-block LD / snpInfo loaders, so two +# blocks sharing the same on-disk LD payload share one cached panel. +# @noRd +.ctwasLdPanelKey <- function(handle) { + fmt <- getFormat(handle) + stem <- getPath(handle) + candidates <- switch(fmt, + "plink2" = c(paste0(stem, ".pgen")), + "plink1" = c(paste0(stem, ".bed")), + "gds" = c(stem), + "vcf" = c(stem), + stem) + hit <- candidates[file.exists(candidates)] + if (length(hit) == 0L) + stop("ctwasPipeline: could not derive an existing LD-file token for ", + "the GenotypeHandle (format=", fmt, ", path=", stem, + "). Looked for: ", paste(candidates, collapse = ", ")) + hit[[1L]] +} + +# Build a per-block snpInfo table restricted to variants present in the +# GwasSumStats entry. Mirrors `.ctwasSnpInfoForBlock` but restricts to +# the block's GWAS variants (intersected against the cached panel) so +# snp_map[[region_id]] is sized to the block, not the whole panel. +# @noRd +.ctwasSnpInfoForGwasBlock <- function(gwasSumStats, panelSnpInfo) { + blockIds <- character(0) + for (i in seq_len(nrow(gwasSumStats))) { + mc <- S4Vectors::mcols(gwasSumStats$entry[[i]]) + if ("SNP" %in% colnames(mc)) + blockIds <- c(blockIds, as.character(mc$SNP)) + } + blockIds <- unique(blockIds) + if (length(blockIds) == 0L) return(panelSnpInfo[FALSE, , drop = FALSE]) + keep <- panelSnpInfo$id %in% blockIds + panelSnpInfo[keep, , drop = FALSE] } diff --git a/R/genotypeIo.R b/R/genotypeIo.R index 98f4ea9c..8b656a7c 100644 --- a/R/genotypeIo.R +++ b/R/genotypeIo.R @@ -314,9 +314,18 @@ extractBlockGenotypes <- function(handle, snpIdx, meanImpute = TRUE) { #' @keywords internal .extractBlockPlink2 <- function(handle, snpIdx) { - # pgenlibr::ReadList returns ALT dosage = A1 dosage in pecotmr convention - geno <- pgenlibr::ReadList(getPgenPtr(handle), variant_subset = snpIdx, - meanimpute = FALSE) + # pgenlibr::ReadList returns ALT dosage = A1 dosage in pecotmr convention. + # The cached @pgenPtr does not survive saveRDS/readRDS (external pointers + # become stale), so we re-open from getPath() on the fly if the cached + # pointer errors out. Opening is cheap relative to dosage extraction. + ptr <- getPgenPtr(handle) + paths <- resolvePlink2Paths(getPath(handle)) + geno <- tryCatch( + pgenlibr::ReadList(ptr, variant_subset = snpIdx, meanimpute = FALSE), + error = function(e) { + reopened <- pgenlibr::NewPgen(paths$pgen) + pgenlibr::ReadList(reopened, variant_subset = snpIdx, meanimpute = FALSE) + }) storage.mode(geno) <- "double" geno } @@ -530,7 +539,6 @@ computeBlockLdCor <- function(handle, snpIdx, backend = "internal", #' @importFrom Rsamtools TabixFile seqnamesTabix scanTabix headerTabix #' @importFrom GenomicRanges GRanges seqnames #' @importFrom SummarizedExperiment assay -#' @importFrom MungeSumstats standardise_header readBim <- function(bed) { bimf <- paste0(file_path_sans_ext(bed), ".bim") bim <- vroom(bimf, col_names = FALSE) diff --git a/R/sumstatsQc.R b/R/sumstatsQc.R index 0996d597..04943641 100644 --- a/R/sumstatsQc.R +++ b/R/sumstatsQc.R @@ -61,17 +61,17 @@ NULL #' @details #' Pure panel-vs-sumstats allele harmonization: match by (chrom, pos), #' detect A1/A2 swap, sign-flip \code{colToFlip} columns and complement -#' \code{colToComplement} columns on swap. Variant-content filters -#' (indels, strand-ambiguous, duplicates, NAs, MAF / INFO / N cutoffs) -#' are NOT applied here — \code{summaryStatsQc()} delegates those to -#' \code{MungeSumstats::format_sumstats()} before \code{.matchRefPanel} -#' runs. The internal \code{strand_unambiguous} guard remains so that -#' if a caller bypasses MungeSumstats and feeds in A/T or C/G -#' strand-ambiguous variants, they are not mis-flipped: the keep rule -#' only accepts a strand-flip claim when the unambiguous flag agrees. +#' \code{colToComplement} columns on swap. Variant-allele filters +#' (indels, strand-ambiguous, duplicates) are applied here directly when +#' the corresponding \code{removeIndels} / \code{removeStrandAmbiguous} / +#' \code{removeDups} flags are set; MAF / INFO / N column-numeric filters +#' run in \code{.applyContentFilters()} before this function. .matchRefPanel <- function(targetData, refVariants, colToFlip = NULL, matchMinProp = 0.2, flipStrand = FALSE, removeUnmatched = TRUE, + removeIndels = FALSE, + removeStrandAmbiguous = TRUE, + removeDups = FALSE, colToComplement = character(), ...) { strandFlip <- function(ref) chartr("ATCG", "TAGC", ref) @@ -148,7 +148,19 @@ NULL (A1.target != A1.ref & A2.target != A2.ref)) %>% mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & - (A1.target != A1.ref & A2.target != A2.ref)) + (A1.target != A1.ref & A2.target != A2.ref)) %>% + # INDEL detection: explicit "I"/"D" notation, or any allele wider than 1bp. + mutate(INDEL = (A2.target == "I" | A2.target == "D" | + nchar(A2.target) > 1L | nchar(A1.target) > 1L)) %>% + # ID_match: an indel encoded as I/D on the target side matches an indel + # on the reference side (where the reference uses multi-base alleles). + mutate(ID_match = ((A2.target == "D" | A2.target == "I") & + (nchar(A1.ref) > 1L | nchar(A2.ref) > 1L))) + + # When removeStrandAmbiguous = FALSE, the A/T - C/G safety guard is + # disabled: ambiguous variants are treated as exact/sign-flip cases. + if (!removeStrandAmbiguous) + matchResult$strand_unambiguous <- TRUE # If no strand_flip survives the unambiguous test, the remaining ambiguous # variants can be treated as exact/sign-flip cases rather than dropped. @@ -157,8 +169,12 @@ NULL matchResult <- matchResult %>% mutate(keep = if_else(strand_flip, - true = strand_unambiguous | exact_match, - false = exact_match | sign_flip)) + true = strand_unambiguous | exact_match | ID_match, + false = exact_match | sign_flip | ID_match)) + + if (removeIndels) + matchResult <- matchResult %>% + mutate(keep = if_else(INDEL, FALSE, keep)) if (!is.null(colToFlip)) { missing <- setdiff(colToFlip, colnames(matchResult)) @@ -212,11 +228,23 @@ NULL result <- matchResult[matchResult$keep, , drop = FALSE] qcCols <- c("flip1.ref", "flip2.ref", "strand_unambiguous", - "exact_match", "sign_flip", "strand_flip", "keep") + "exact_match", "sign_flip", "strand_flip", "INDEL", + "ID_match", "keep") result <- result %>% select(-any_of(qcCols), -A1.target, -A2.target) %>% rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) + # removeDups: drop duplicate variant rows (same chrom/pos/qced ID). + # Default FALSE keeps the existing strict behavior (error on dups). + if (removeDups) { + dups <- duplicated(result[, c("chrom", "pos", "variant_id")]) + if (any(dups)) { + warning(sprintf("Removed %d duplicate variant(s), keeping first occurrence.", + sum(dups))) + result <- result[!dups, , drop = FALSE] + } + } + if (!removeUnmatched) { matchVariant <- result %>% pull(variants_id_original) matchResult <- matchResult %>% @@ -237,9 +265,9 @@ NULL if (nrow(result) < matchMinProp * nrow(refVariants)) stop("Not enough variants have been matched.") if (any(duplicated(result$variant_id))) - stop("Duplicated variant IDs remain after harmonization; the input ", - "should have been deduplicated by MungeSumstats / summaryStatsQc ", - "before calling .matchRefPanel.") + stop("Duplicated variant IDs remain after harmonization; pass ", + "removeDups = TRUE or deduplicate upstream before calling ", + ".matchRefPanel.") out <- list(harmonizedData = result, qcSummary = matchResult) attr(out, "qcCounts") <- qcCounts @@ -2358,9 +2386,8 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, # data.frame/LdData/QcResult-based summaryStatsQc and rssBasicQc). # ============================================================================= -# Convert one entry's GRanges into a flat data.frame with the column shape that -# MungeSumstats and .matchRefPanel expect (lower-case chrom/pos plus the -# CapsCase mcols). +# Convert one entry's GRanges into a flat data.frame with the column shape +# .matchRefPanel expects (lower-case chrom/pos plus the CapsCase mcols). .entryGrangesToDf <- function(gr) { mc <- as.data.frame(S4Vectors::mcols(gr), stringsAsFactors = FALSE) out <- data.frame( @@ -2591,62 +2618,14 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, df[!dropMask, , drop = FALSE] } -# Run the curated MungeSumstats::format_sumstats() pass. Returns the cleaned -# data.frame in pecotmr CapsCase column convention plus a per-entry audit -# record describing what was applied. Caller is responsible for catching -# errors and reporting them with the entry identity. -.runMungeSumstatsFilter <- function(df, refGenome, useDbsnpRefCheck, - removeIndels, removeStrandAmbiguous, - mafCutoff, infoCutoff, nCutoff, - convertRefGenome, mungeSumstatsArgs) { - if (!requireNamespace("MungeSumstats", quietly = TRUE)) - stop("Package 'MungeSumstats' is required for summaryStatsQc(). ", - "Install it from Bioconductor.") - # MungeSumstats writes through a temp tsv when invoked with a path; pass the - # in-memory data.frame via the file-write/read path it provides. - tmpIn <- tempfile(fileext = ".tsv.gz") - on.exit(unlink(tmpIn), add = TRUE) - data.table::fwrite(df, tmpIn, sep = "\t") - - baseArgs <- list( - path = tmpIn, - ref_genome = if (is.null(refGenome)) "GRCh38" else refGenome, - convert_ref_genome = convertRefGenome, - on_ref_genome = isTRUE(useDbsnpRefCheck), - infer_eff_direction = isTRUE(useDbsnpRefCheck), - allele_flip_check = isTRUE(useDbsnpRefCheck), - bi_allelic_filter = isTRUE(useDbsnpRefCheck), - strand_ambig_filter = isTRUE(removeStrandAmbiguous), - drop_indels = isTRUE(removeIndels), - FRQ_filter = mafCutoff, - INFO_filter = infoCutoff, - N_std = nCutoff, - return_data = TRUE, - return_format = "data.table", - log_folder_ind = FALSE, - log_mungesumstats_msgs = FALSE) - # Caller-supplied pass-through overrides anything we set above. - for (nm in names(mungeSumstatsArgs)) baseArgs[[nm]] <- mungeSumstatsArgs[[nm]] - - before <- nrow(df) - cleaned <- do.call(MungeSumstats::format_sumstats, baseArgs) - cleaned <- as.data.frame(cleaned) - # Restore lower-case chrom/pos for downstream pecotmr code. - if ("CHR" %in% colnames(cleaned)) { - cleaned$chrom <- sub("^chr", "", as.character(cleaned$CHR), - ignore.case = TRUE) - cleaned$CHR <- NULL - } - if ("BP" %in% colnames(cleaned)) { - cleaned$pos <- as.integer(cleaned$BP) - cleaned$BP <- NULL - } - list(df = cleaned, droppedNVariants = before - nrow(cleaned)) -} - # Apply the panel-vs-sumstats allele harmonization using the slim -# .matchRefPanel against the ldSketch's variant info. -.matchAgainstSketch <- function(df, ldSketch, matchMinProp) { +# .matchRefPanel against the ldSketch's variant info. Threads the +# variant-level filters (indels, strand-ambiguous, duplicates) through +# so the LD-panel-anchored pass handles them in a single sweep. +.matchAgainstSketch <- function(df, ldSketch, matchMinProp, + removeIndels = FALSE, + removeStrandAmbiguous = TRUE, + removeDups = TRUE) { refVariants <- .refVariantsFromSketch(ldSketch) flipCandidates <- c("Z", "BETA") colToFlip <- intersect(flipCandidates, colnames(df)) @@ -2657,12 +2636,15 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, if (!"A1" %in% colnames(df) || !"A2" %in% colnames(df)) stop("summaryStatsQc: input entry must contain A1 and A2 columns.") res <- .matchRefPanel( - targetData = df, - refVariants = refVariants, - colToFlip = colToFlip, - colToComplement = colToComplement, - matchMinProp = matchMinProp, - removeUnmatched = TRUE) + targetData = df, + refVariants = refVariants, + colToFlip = colToFlip, + colToComplement = colToComplement, + matchMinProp = matchMinProp, + removeUnmatched = TRUE, + removeIndels = removeIndels, + removeStrandAmbiguous = removeStrandAmbiguous, + removeDups = removeDups) out <- res$harmonizedData if (!"chrom" %in% colnames(out) && "chr" %in% colnames(out)) colnames(out)[colnames(out) == "chr"] <- "chrom" @@ -2670,6 +2652,189 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, out } +# Variant-content filters (MAF / INFO / N). Pure data-frame column +# filters; no Bioconductor genome packages needed. +# +# mafCutoff: drop rows where MAF (or FRQ) < mafCutoff. Requires either +# column when mafCutoff > 0; errors if neither is present. +# infoCutoff: drop rows where INFO < infoCutoff. Requires INFO column +# when infoCutoff > 0. +# nCutoff: drop rows whose N is more than nCutoff median-absolute- +# deviations from the median (a 5-MAD-from-median cap on +# per-variant N). Set nCutoff = 0 to disable. Rows with NA N +# are always dropped. +.applyContentFilters <- function(df, mafCutoff = 0, infoCutoff = 0, + nCutoff = 5) { + audit <- list() + if (mafCutoff > 0) { + mafCol <- intersect(c("MAF", "FRQ"), colnames(df))[1L] + if (is.na(mafCol)) + stop(".applyContentFilters: mafCutoff > 0 requires a MAF or FRQ column.") + before <- nrow(df) + mafVals <- as.numeric(df[[mafCol]]) + # Normalise effect-allele frequency to MAF: take min(af, 1-af). + mafVals <- pmin(mafVals, 1 - mafVals, na.rm = FALSE) + df <- df[!is.na(mafVals) & mafVals >= mafCutoff, , drop = FALSE] + audit$mafDropped <- before - nrow(df) + } + if (infoCutoff > 0) { + if (!"INFO" %in% colnames(df)) + stop(".applyContentFilters: infoCutoff > 0 requires an INFO column.") + before <- nrow(df) + infoVals <- as.numeric(df$INFO) + df <- df[!is.na(infoVals) & infoVals >= infoCutoff, , drop = FALSE] + audit$infoDropped <- before - nrow(df) + } + if (nCutoff > 0 && "N" %in% colnames(df) && nrow(df) > 0L) { + nVals <- as.numeric(df$N) + before <- nrow(df) + if (any(is.na(nVals))) { + df <- df[!is.na(nVals), , drop = FALSE] + nVals <- nVals[!is.na(nVals)] + } + if (length(nVals) > 0L) { + medN <- stats::median(nVals) + madN <- stats::mad(nVals, constant = 1) + if (madN > 0) { + zN <- abs(nVals - medN) / madN + df <- df[zN <= nCutoff, , drop = FALSE] + } + } + audit$nDropped <- before - nrow(df) + } + list(df = df, audit = audit) +} + +# Per-row variant sanity / hygiene checks ported from MungeSumstats's +# check_*.R series but rewritten as pure data.frame operations with no +# genome / dbSNP dependency. Each step is gated by its own flag so a +# caller can disable any single check. +# +# Steps (in order; each contributes a count to audit): +# - coerceNumeric: cast signed columns to numeric (catches stray "0.5" +# strings). NA-introducing coercions are counted. +# - normalizeChr: strip "chr"/"ch" prefix, uppercase X/Y/MT, map +# 23->X, 24->Y, M->MT. Optional dropNonstandardChr removes rows +# whose CHR is outside 1..22, X, Y, MT after normalization. +# - dropMissData: drop rows with NA in any vital column (chrom, pos, +# A1, A2, and at least one of Z / BETA). +# - dropPOutOfRange: drop rows where P < 0 or P > 1 (corrupt p-values). +# Only fires when a P column is present. +# - clampSmallP: floor 0 <= P <= smallPFloor to smallPFloor so +# -log10(P) stays finite downstream. +# - dropZeroEffect: drop rows where any effect column is exactly 0 +# (BETA / LOG_ODDS / SIGNED_SUMSTAT) or OR is exactly 1. MungeSumstats +# treats these as degenerate / artefactual. +# - dropNonpositiveSe: drop rows where SE <= 0. +.applySanityChecks <- function(df, + coerceNumeric = TRUE, + normalizeChr = TRUE, + dropNonstandardChr = TRUE, + dropMissData = TRUE, + dropPOutOfRange = TRUE, + clampSmallP = TRUE, + smallPFloor = 5e-324, + dropZeroEffect = TRUE, + dropNonpositiveSe = TRUE) { + audit <- list() + if (nrow(df) == 0L) return(list(df = df, audit = audit)) + + if (coerceNumeric) { + numericCols <- intersect( + c("Z", "BETA", "SE", "OR", "LOG_ODDS", "SIGNED_SUMSTAT", + "P", "MAF", "FRQ", "INFO", "N"), + colnames(df)) + naIntroduced <- 0L + for (col in numericCols) { + orig <- df[[col]] + if (is.numeric(orig)) next + coerced <- suppressWarnings(as.numeric(orig)) + naIntroduced <- naIntroduced + + sum(is.na(coerced) & !is.na(orig)) + df[[col]] <- coerced + } + if (naIntroduced > 0L) audit$nonNumericCoerced <- naIntroduced + } + + if (normalizeChr && "chrom" %in% colnames(df)) { + chr <- as.character(df$chrom) + chr <- sub("^chr", "", chr, ignore.case = TRUE) + chr <- sub("^ch", "", chr, ignore.case = TRUE) + chr <- toupper(chr) + chr[chr == "23"] <- "X" + chr[chr == "24"] <- "Y" + chr[chr == "M"] <- "MT" + df$chrom <- chr + if (dropNonstandardChr) { + before <- nrow(df) + standardChrs <- c(as.character(1:22), "X", "Y", "MT") + df <- df[chr %in% standardChrs, , drop = FALSE] + dropped <- before - nrow(df) + if (dropped > 0L) audit$nonstandardChrDropped <- dropped + } + } + + if (dropMissData && nrow(df) > 0L) { + vital <- intersect(c("chrom", "pos", "A1", "A2"), colnames(df)) + signedCol <- intersect(c("Z", "BETA"), colnames(df))[1L] + if (!is.na(signedCol)) vital <- c(vital, signedCol) + if (length(vital) > 0L) { + before <- nrow(df) + bad <- Reduce(`|`, lapply(vital, function(c) is.na(df[[c]]))) + if (any(bad)) df <- df[!bad, , drop = FALSE] + dropped <- before - nrow(df) + if (dropped > 0L) audit$missDataDropped <- dropped + } + } + + if (dropPOutOfRange && "P" %in% colnames(df) && nrow(df) > 0L) { + before <- nrow(df) + p <- as.numeric(df$P) + bad <- !is.na(p) & (p < 0 | p > 1) + if (any(bad)) df <- df[!bad, , drop = FALSE] + dropped <- before - nrow(df) + if (dropped > 0L) audit$pOutOfRangeDropped <- dropped + } + + if (clampSmallP && "P" %in% colnames(df) && nrow(df) > 0L) { + p <- as.numeric(df$P) + smallMask <- !is.na(p) & p >= 0 & p < smallPFloor + nClamped <- sum(smallMask) + if (nClamped > 0L) { + df$P[smallMask] <- smallPFloor + audit$smallPClamped <- nClamped + } + } + + if (dropZeroEffect && nrow(df) > 0L) { + effectCols <- intersect( + c("BETA", "LOG_ODDS", "SIGNED_SUMSTAT", "OR"), colnames(df)) + if (length(effectCols) > 0L) { + before <- nrow(df) + badMask <- rep(FALSE, nrow(df)) + for (col in effectCols) { + vals <- as.numeric(df[[col]]) + sentinel <- if (col == "OR") 1 else 0 + badMask <- badMask | (!is.na(vals) & vals == sentinel) + } + if (any(badMask)) df <- df[!badMask, , drop = FALSE] + dropped <- before - nrow(df) + if (dropped > 0L) audit$zeroEffectDropped <- dropped + } + } + + if (dropNonpositiveSe && "SE" %in% colnames(df) && nrow(df) > 0L) { + before <- nrow(df) + se <- as.numeric(df$SE) + bad <- !is.na(se) & se <= 0 + if (any(bad)) df <- df[!bad, , drop = FALSE] + dropped <- before - nrow(df) + if (dropped > 0L) audit$nonpositiveSeDropped <- dropped + } + + list(df = df, audit = audit) +} + # Apply ldMismatchQc (SLALOM/DENTIST) against the LD sketch. .applyLdMismatchQcToEntry <- function(df, ldSketch, method) { variantIds <- df$SNP @@ -2725,38 +2890,69 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, entryAudit$variantsIn <- nrow(df) nStudyIn <- nrow(df) - # 1. MungeSumstats variant-content pass. - nMungeIn <- nrow(df) - mungeResult <- .runMungeSumstatsFilter( + # 1. Per-row sanity checks (drop bad P / zero effect / non-positive SE, + # clamp tiny P, coerce numeric, normalize CHR, drop missing-data rows). + # Runs before any other filtering so downstream steps see clean values. + nSanIn <- nrow(df) + sanity <- .applySanityChecks( df, - refGenome = refGenome, - useDbsnpRefCheck = opts$useDbsnpRefCheck, - removeIndels = opts$removeIndels, - removeStrandAmbiguous = opts$removeStrandAmbiguous, - mafCutoff = opts$mafCutoff, - infoCutoff = opts$infoCutoff, - nCutoff = opts$nCutoff, - convertRefGenome = opts$convertRefGenome, - mungeSumstatsArgs = opts$mungeSumstatsArgs) - df <- mungeResult$df - entryAudit$mungeSumstatsDropped <- mungeResult$droppedNVariants - if (nMungeIn > 0L) { - emit("QC track: MungeSumstats kept ", nrow(df), " of ", nMungeIn, + coerceNumeric = opts$coerceNumeric, + normalizeChr = opts$normalizeChr, + dropNonstandardChr = opts$dropNonstandardChr, + dropMissData = opts$dropMissData, + dropPOutOfRange = opts$dropPOutOfRange, + clampSmallP = opts$clampSmallP, + smallPFloor = opts$smallPFloor, + dropZeroEffect = opts$dropZeroEffect, + dropNonpositiveSe = opts$dropNonpositiveSe) + df <- sanity$df + if (length(sanity$audit) > 0L) entryAudit$sanityChecks <- sanity$audit + if (nSanIn > 0L && nrow(df) != nSanIn) { + emit("QC track: sanity checks kept ", nrow(df), " of ", nSanIn, + " variant(s).") + } + + # 2. Variant-content filters (MAF / INFO / N). Pure column-numeric + # filters; the indel / strand-ambiguous variant-allele filtering happens + # inside .matchAgainstSketch via .matchRefPanel against the LD panel. + nFiltIn <- nrow(df) + contentFiltered <- .applyContentFilters( + df, + mafCutoff = opts$mafCutoff, + infoCutoff = opts$infoCutoff, + nCutoff = opts$nCutoff) + df <- contentFiltered$df + if (length(contentFiltered$audit) > 0L) + entryAudit$contentFilters <- contentFiltered$audit + if (nFiltIn > 0L && nrow(df) != nFiltIn) { + emit("QC track: MAF/INFO/N filters kept ", nrow(df), " of ", nFiltIn, " variant(s).") } - # 1b. Derive BETA / SE from signed Z when the input only carries Z. + # 3. Derive BETA / SE from signed Z when the input only carries Z. # Formula (Zhu et al. 2016 / RAISS): se = 1/sqrt(2 * maf * (1-maf) * # (N + z^2)); beta = z * se. Requires Z, MAF, and N to all be present. derived <- .deriveBetaSeFromZ(df) df <- derived$df if (!is.null(derived$audit)) entryAudit$betaSeFromZ <- derived$audit - # 1c. Derive P-values from Z when the entry carries Z but not P. - # Standard two-tailed normal: p = 2 * pnorm(-|z|). + # 4. Derive P-values from Z when the entry carries Z but not P. + # Standard two-tailed normal: p = 2 * pnorm(-|z|). Very large |Z| can + # underflow to P = 0, so re-apply the small-P clamp afterwards. if ("Z" %in% colnames(df) && !"P" %in% colnames(df)) { df$P <- .zToPvalue(df$Z) entryAudit$pValueFromZ <- sum(!is.na(df$P)) + if (isTRUE(opts$clampSmallP) && nrow(df) > 0L) { + smallMask <- !is.na(df$P) & df$P >= 0 & df$P < opts$smallPFloor + nClamped <- sum(smallMask) + if (nClamped > 0L) { + df$P[smallMask] <- opts$smallPFloor + prev <- entryAudit$sanityChecks$smallPClamped %||% 0L + if (is.null(entryAudit$sanityChecks)) + entryAudit$sanityChecks <- list() + entryAudit$sanityChecks$smallPClamped <- prev + nClamped + } + } } # 2. keepVariants subset. @@ -2788,7 +2984,12 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, # 5. Panel-vs-sumstats allele harmonization. nHarmIn <- nrow(df) - df <- .matchAgainstSketch(df, ldSketch, matchMinProp = opts$matchMinProp) + df <- .matchAgainstSketch( + df, ldSketch, + matchMinProp = opts$matchMinProp, + removeIndels = opts$removeIndels, + removeStrandAmbiguous = opts$removeStrandAmbiguous, + removeDups = TRUE) harmCounts <- attr(df, "qcCounts") attr(df, "qcCounts") <- NULL entryAudit$matchedAgainstSketch <- nrow(df) @@ -2911,9 +3112,33 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, # because imputation adds variants back, so "in -> out" is not monotonic. # Skipped steps omitted. removedSegs <- character(0) - if (entryAudit$mungeSumstatsDropped > 0L) - removedSegs <- c(removedSegs, - paste0("munge ", entryAudit$mungeSumstatsDropped)) + sc <- entryAudit$sanityChecks + if (!is.null(sc)) { + if (!is.null(sc$nonstandardChrDropped) && sc$nonstandardChrDropped > 0L) + removedSegs <- c(removedSegs, + paste0("nonstdChr ", sc$nonstandardChrDropped)) + if (!is.null(sc$missDataDropped) && sc$missDataDropped > 0L) + removedSegs <- c(removedSegs, + paste0("missData ", sc$missDataDropped)) + if (!is.null(sc$pOutOfRangeDropped) && sc$pOutOfRangeDropped > 0L) + removedSegs <- c(removedSegs, + paste0("badP ", sc$pOutOfRangeDropped)) + if (!is.null(sc$zeroEffectDropped) && sc$zeroEffectDropped > 0L) + removedSegs <- c(removedSegs, + paste0("zeroEffect ", sc$zeroEffectDropped)) + if (!is.null(sc$nonpositiveSeDropped) && sc$nonpositiveSeDropped > 0L) + removedSegs <- c(removedSegs, + paste0("badSE ", sc$nonpositiveSeDropped)) + } + cf <- entryAudit$contentFilters + if (!is.null(cf)) { + if (!is.null(cf$mafDropped) && cf$mafDropped > 0L) + removedSegs <- c(removedSegs, paste0("maf ", cf$mafDropped)) + if (!is.null(cf$infoDropped) && cf$infoDropped > 0L) + removedSegs <- c(removedSegs, paste0("info ", cf$infoDropped)) + if (!is.null(cf$nDropped) && cf$nDropped > 0L) + removedSegs <- c(removedSegs, paste0("nCutoff ", cf$nDropped)) + } if (qcCount$harmDropped > 0L) removedSegs <- c(removedSegs, paste0("harmonization ", qcCount$harmDropped)) @@ -2943,13 +3168,16 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, #' Run QC on a SumStats Collection #' #' Applies a single QC pass to a \code{QtlSumStats} or \code{GwasSumStats} -#' collection: delegates variant-content QC (column standardization, -#' indels, strand-ambiguous, MAF/INFO/N filters, p-value & effect-size -#' sanity checks, optional dbSNP / liftover) to -#' \code{MungeSumstats::format_sumstats()}, then runs pecotmr-specific -#' steps (\code{skipRegion}, optional PIP screen, panel harmonization -#' against the \code{ldSketch} via \code{.matchRefPanel}, optional -#' SLALOM/DENTIST LD-mismatch QC, optional RAISS imputation). +#' collection: per-row sanity checks via \code{.applySanityChecks} (drop +#' rows with out-of-range / zero P, BETA == 0, SE <= 0, NA in vital +#' columns; clamp tiny P; normalize CHR; coerce signed columns to +#' numeric), variant-content filters (MAF / INFO / N) via +#' \code{.applyContentFilters}, optional \code{skipRegion} drop, optional +#' PIP screen, panel-vs-sumstats allele harmonization against the +#' \code{ldSketch} via \code{.matchRefPanel} (which handles indels, +#' strand-ambiguous variants, sign / strand flips, and duplicate drops in +#' a single sweep), optional SLALOM/DENTIST LD-mismatch QC, and optional +#' RAISS imputation. No Bioconductor genome / dbSNP packages required. #' #' The returned collection has its \code{qcInfo} slot populated with a #' per-entry audit record (variant counts, drop counts at each step, @@ -2964,24 +3192,19 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, #' #' @param sumstats A \code{QtlSumStats} or \code{GwasSumStats} #' collection. -#' @param useDbsnpRefCheck One-shot opt-in to MungeSumstats's -#' dbSNP-based reference-genome / allele-flip / biallelic-filter -#' checks. When \code{TRUE}, sets \code{on_ref_genome}, -#' \code{infer_eff_direction}, \code{allele_flip_check}, and -#' \code{bi_allelic_filter} all to \code{TRUE} simultaneously. -#' Default \code{FALSE} (trust input alleles, lighter dependency -#' footprint). #' @param removeIndels Logical (length 1). When \code{TRUE}, drop -#' indels. Default \code{FALSE} (match MungeSumstats default). +#' indels during panel harmonization. Default \code{FALSE}. #' @param removeStrandAmbiguous Logical (length 1). When \code{TRUE}, #' drop A/T and C/G strand-ambiguous variants. Default \code{TRUE}. #' @param mafCutoff Numeric (length 1). MAF threshold (variants with #' \code{MAF < mafCutoff} are dropped). Default 0. Requires \code{MAF} -#' column. +#' or \code{FRQ} column when non-zero. #' @param infoCutoff Numeric (length 1). INFO score threshold. Default -#' 0. Requires \code{INFO} column. -#' @param nCutoff Numeric (length 1). MungeSumstats \code{N_std} value; -#' sample-size deviation threshold. Default 5. +#' 0. Requires \code{INFO} column when non-zero. +#' @param nCutoff Numeric (length 1). Sample-size deviation threshold: +#' drop variants whose \code{N} is more than \code{nCutoff} +#' median-absolute-deviations from the median. Set to 0 to disable. +#' Default 5. #' @param keepVariants Optional character vector of variant IDs (SNP #' column) to retain prior to harmonization. #' @param skipRegion Optional character vector of \code{"chr:start-end"} @@ -3000,19 +3223,35 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, #' sketch is not yet fully wired for the new path; the option is #' accepted but currently emits a warning and is skipped.) #' @param imputeOpts Named list of RAISS parameters. -#' @param convertRefGenome Optional character (\code{"GRCh37"} or -#' \code{"GRCh38"}) to liftover the sumstats to via MungeSumstats. -#' \code{NULL} (default) skips liftover. #' @param matchMinProp Minimum proportion of LD panel variants that must #' be matched by the sumstats; default 0. -#' @param mungeSumstatsArgs Optional named list of pass-through args to -#' \code{MungeSumstats::format_sumstats()}. Any name supplied here -#' overrides the value the curated knobs would have set. +#' @param coerceNumeric Logical. Coerce signed columns +#' (Z/BETA/SE/OR/LOG_ODDS/SIGNED_SUMSTAT/P/MAF/FRQ/INFO/N) to numeric. +#' Default \code{TRUE}. +#' @param normalizeChr Logical. Strip \code{"chr"} prefix, uppercase the +#' chromosome label, and map 23->X, 24->Y, M->MT. Default \code{TRUE}. +#' @param dropNonstandardChr Logical. Drop variants whose CHR (after +#' normalization) is outside 1..22, X, Y, MT. Default \code{TRUE}. +#' @param dropMissData Logical. Drop rows with NA in any vital column +#' (chrom, pos, A1, A2, and at least one of Z / BETA). Default +#' \code{TRUE}. +#' @param dropPOutOfRange Logical. Drop rows where \code{P < 0} or +#' \code{P > 1}. Default \code{TRUE}. +#' @param clampSmallP Logical. Floor non-negative P values below +#' \code{smallPFloor} to \code{smallPFloor} so \code{-log10(P)} stays +#' finite. Applied to both input and Z-derived P values. Default +#' \code{TRUE}. +#' @param smallPFloor Numeric (length 1). Floor for \code{clampSmallP}. +#' Default \code{5e-324} (R's smallest positive double). +#' @param dropZeroEffect Logical. Drop rows where any effect column is +#' exactly 0 (\code{BETA}, \code{LOG_ODDS}, \code{SIGNED_SUMSTAT}) or +#' \code{OR} is exactly 1. Default \code{TRUE}. +#' @param dropNonpositiveSe Logical. Drop rows where \code{SE <= 0}. +#' Default \code{TRUE}. #' @return A new \code{QtlSumStats} / \code{GwasSumStats} with cleaned #' entries and \code{qcInfo} populated. #' @export summaryStatsQc <- function(sumstats, - useDbsnpRefCheck = FALSE, removeIndels = FALSE, removeStrandAmbiguous = TRUE, mafCutoff = 0, @@ -3029,9 +3268,16 @@ summaryStatsQc <- function(sumstats, r2Threshold = 0.6, minimumLd = 5, lamb = 0.01), - convertRefGenome = NULL, matchMinProp = 0, - mungeSumstatsArgs = list()) { + coerceNumeric = TRUE, + normalizeChr = TRUE, + dropNonstandardChr = TRUE, + dropMissData = TRUE, + dropPOutOfRange = TRUE, + clampSmallP = TRUE, + smallPFloor = 5e-324, + dropZeroEffect = TRUE, + dropNonpositiveSe = TRUE) { if (!methods::is(sumstats, "QtlSumStats") && !methods::is(sumstats, "GwasSumStats")) { stop("summaryStatsQc requires a QtlSumStats or GwasSumStats input.") @@ -3044,15 +3290,13 @@ summaryStatsQc <- function(sumstats, cols <- colnames(mc) if (mafCutoff > 0 && !any(c("MAF", "FRQ") %in% cols)) stop("summaryStatsQc: mafCutoff > 0 requires every entry to carry a ", - "MAF or FRQ column; entry ", i, " does not. MungeSumstats ", - "normalises FRQ to MAF internally during format_sumstats.") + "MAF or FRQ column; entry ", i, " does not.") if (infoCutoff > 0 && !"INFO" %in% cols) stop("summaryStatsQc: infoCutoff > 0 requires every entry to carry an ", "INFO column; entry ", i, " does not.") } opts <- list( - useDbsnpRefCheck = useDbsnpRefCheck, removeIndels = removeIndels, removeStrandAmbiguous = removeStrandAmbiguous, mafCutoff = mafCutoff, @@ -3065,9 +3309,16 @@ summaryStatsQc <- function(sumstats, alleleFlipKriging = alleleFlipKriging, impute = impute, imputeOpts = imputeOpts, - convertRefGenome = convertRefGenome, matchMinProp = matchMinProp, - mungeSumstatsArgs = mungeSumstatsArgs, + coerceNumeric = coerceNumeric, + normalizeChr = normalizeChr, + dropNonstandardChr = dropNonstandardChr, + dropMissData = dropMissData, + dropPOutOfRange = dropPOutOfRange, + clampSmallP = clampSmallP, + smallPFloor = smallPFloor, + dropZeroEffect = dropZeroEffect, + dropNonpositiveSe = dropNonpositiveSe, nForPip = NULL) newEntries <- vector("list", nrow(sumstats)) @@ -3100,7 +3351,6 @@ summaryStatsQc <- function(sumstats, qcInfo <- list( timestamp = NA_character_, options = list( - useDbsnpRefCheck = useDbsnpRefCheck, removeIndels = removeIndels, removeStrandAmbiguous = removeStrandAmbiguous, mafCutoff = mafCutoff, @@ -3109,9 +3359,16 @@ summaryStatsQc <- function(sumstats, zMismatchQc = zMismatchQc, alleleFlipKriging = alleleFlipKriging, impute = impute, - convertRefGenome = convertRefGenome), - entryAudit = entryAudits, - mungeSumstatsArgs = mungeSumstatsArgs) + coerceNumeric = coerceNumeric, + normalizeChr = normalizeChr, + dropNonstandardChr = dropNonstandardChr, + dropMissData = dropMissData, + dropPOutOfRange = dropPOutOfRange, + clampSmallP = clampSmallP, + smallPFloor = smallPFloor, + dropZeroEffect = dropZeroEffect, + dropNonpositiveSe = dropNonpositiveSe), + entryAudit = entryAudits) # Rebuild the SumStats with new entries and qcInfo. if (methods::is(sumstats, "GwasSumStats")) { diff --git a/tests/testthat/test_ctwasPipeline.R b/tests/testthat/test_ctwasPipeline.R index f0ca21d9..135a93e5 100644 --- a/tests/testthat/test_ctwasPipeline.R +++ b/tests/testthat/test_ctwasPipeline.R @@ -7,13 +7,23 @@ context("ctwasPipeline") # =========================================================================== .ctp_makeHandle <- function(snp_n = 6L, n_samples = 30L) { + # Use a per-process tempfile so .ctwasLdPanelKey's file.exists check + # succeeds against the fixture handle (real LD-sketch payloads exist + # by construction; mock fixtures need an equivalent on-disk anchor). + gdsPath <- file.path(tempdir(), "ctp_sketch.gds") + if (!file.exists(gdsPath)) file.create(gdsPath) + positions <- seq(100L, by = 100L, length.out = snp_n) + # SNP IDs follow the canonical chr:pos:A2:A1 layout so allele + # harmonization inside .ctwasBuildWeights / .ctwasHarmonizeWeights can + # parse them via parseVariantId(). + snpIds <- sprintf("chr1:%d:G:A", positions) new("GenotypeHandle", - path = "/tmp/sketch.gds", + path = gdsPath, format = "gds", snpInfo = data.frame( - SNP = paste0("v", seq_len(snp_n)), + SNP = snpIds, CHR = rep("1", snp_n), - BP = seq(100L, by = 100L, length.out = snp_n), + BP = positions, A1 = rep("A", snp_n), A2 = rep("G", snp_n), stringsAsFactors = FALSE), @@ -22,6 +32,10 @@ context("ctwasPipeline") pgenPtr = NULL) } +# Canonical SNP IDs the fixtures use. .ctp_makeHandle() emits these in +# chr:pos:A2:A1 form; tests reference them by index via .ctp_snpId(i). +.ctp_snpId <- function(i) sprintf("chr1:%d:G:A", 100L * i) + .ctp_mockExtractor <- function(seed = 5, n_samples = 30L) { function(handle, snpIdx, meanImpute = TRUE) { set.seed(seed) @@ -54,7 +68,7 @@ context("ctwasPipeline") ranges = IRanges::IRanges(start = seq(100L, by = 100L, length.out = 6L), width = 1L)) S4Vectors::mcols(gr) <- S4Vectors::DataFrame( - SNP = paste0("v", 1:6), + SNP = vapply(1:6, .ctp_snpId, character(1)), A1 = rep("A", 6), A2 = rep("G", 6), Z = rnorm(6), N = rep(1000L, 6)) GwasSumStats( @@ -67,7 +81,7 @@ context("ctwasPipeline") .ctp_makeTwasWeights <- function() { e <- TwasWeightsEntry( - variantIds = paste0("v", 1:5), + variantIds = vapply(1:5, .ctp_snpId, character(1)), weights = c(0.1, 0.05, -0.2, 0.3, 0.0)) TwasWeights( study = "Q1", context = "c1", trait = "t1", method = "susie", @@ -84,57 +98,139 @@ context("ctwasPipeline") # building helpers directly (they don't gate on ctwas). # =========================================================================== -test_that("ctwasPipeline: rejects non-GwasSumStats gwasSumStats", { +# Helper: minimal two-block input set for the multi-block API tests. +.ctp_makeMultiBlockInputs <- function(qc = TRUE) { + ss <- .ctp_makeGwasSumstats(qc = qc) + tw <- .ctp_makeTwasWeights() + list( + gwasSumStats = list(block1 = ss, block2 = ss), + twasWeights = list(block1 = tw, block2 = tw)) +} + +test_that("ctwasPipeline: rejects a bare (non-list) GwasSumStats", { skip_if_not_installed("ctwas") expect_error( - ctwasPipeline(gwasSumStats = "no", + ctwasPipeline(gwasSumStats = .ctp_makeGwasSumstats(), twasWeights = .ctp_makeTwasWeights()), - "must be a GwasSumStats" + "NAMED LIST of GwasSumStats" ) }) -test_that("ctwasPipeline: rejects un-QCd GwasSumStats", { +test_that("ctwasPipeline: rejects a single-block named list", { skip_if_not_installed("ctwas") expect_error( - ctwasPipeline(gwasSumStats = .ctp_makeGwasSumstats(qc = FALSE), - twasWeights = .ctp_makeTwasWeights()), + ctwasPipeline(gwasSumStats = list(block1 = .ctp_makeGwasSumstats()), + twasWeights = list(block1 = .ctp_makeTwasWeights())), + "at least two LD blocks" + ) +}) + +test_that("ctwasPipeline: rejects un-QCd GwasSumStats in any region", { + skip_if_not_installed("ctwas") + ss_qc <- .ctp_makeGwasSumstats(qc = TRUE) + ss_noqc <- .ctp_makeGwasSumstats(qc = FALSE) + tw <- .ctp_makeTwasWeights() + expect_error( + ctwasPipeline(gwasSumStats = list(block1 = ss_qc, block2 = ss_noqc), + twasWeights = list(block1 = tw, block2 = tw)), "has no QC record" ) }) -test_that("ctwasPipeline: rejects missing twasWeights", { +test_that("ctwasPipeline: rejects twasWeights keys not present in gwasSumStats", { skip_if_not_installed("ctwas") + ss <- .ctp_makeGwasSumstats() + tw <- .ctp_makeTwasWeights() expect_error( - ctwasPipeline(gwasSumStats = .ctp_makeGwasSumstats()), - "must be a TwasWeights" + ctwasPipeline(gwasSumStats = list(blockA = ss, blockB = ss), + twasWeights = list(blockA = tw, blockC = tw)), + "key.*not present in.*gwasSumStats" ) }) -test_that("ctwasPipeline: rejects non-GRanges twasZ", { +test_that("assembleCtwasInputs: allows twasWeights keys to be a subset of gwasSumStats", { + ss <- .ctp_makeGwasSumstats() + tw <- .ctp_makeTwasWeights() + local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), + .package = "pecotmr") + # Two blocks supply zSnp; only block1 supplies TwasWeights. + inputs <- assembleCtwasInputs( + gwasSumStats = list(block1 = ss, block2 = ss), + twasWeights = list(block1 = tw)) + # Both blocks appear in region_info / snp_map (SNP-only block2 contributes + # its zSnp), but the weights list only has block1-keyed entries. + expect_setequal(inputs$region_info$region_id, c("block1", "block2")) + expect_setequal(names(inputs$snp_map), c("block1", "block2")) + expect_true(all(grepl("^block1\\|", names(inputs$weights)))) +}) + +test_that("assembleCtwasInputs: filters TwasWeights against UNION of all blocks' GWAS variants", { + # Build two blocks with NON-OVERLAPPING GWAS variants. Block 1 covers + # v1..v3, block 2 covers v4..v6. The gene's weight spans v2..v5 — i.e. + # crosses the block boundary. With a per-block filter the gene would + # lose v4..v5 (block-2 variants); with a global-union filter all four + # weight variants survive. + mkBlockGss <- function(study, snpIds, qc = TRUE) { + gr <- GenomicRanges::GRanges( + seqnames = "chr1", + ranges = IRanges::IRanges(start = as.integer(gsub(".*:([0-9]+):.*", "\\1", snpIds)), + width = 1L)) + S4Vectors::mcols(gr) <- S4Vectors::DataFrame( + SNP = snpIds, + A1 = rep("A", length(snpIds)), A2 = rep("G", length(snpIds)), + Z = rnorm(length(snpIds)), N = rep(1000L, length(snpIds))) + GwasSumStats(study = study, entry = list(gr), genome = "hg19", + ldSketch = .ctp_makeHandle(), + qcInfo = if (qc) list(step1 = "ok") else list()) + } + ss1 <- mkBlockGss("G1", vapply(1:3, .ctp_snpId, character(1))) + ss2 <- mkBlockGss("G2", vapply(4:6, .ctp_snpId, character(1))) + # Cross-boundary weights: v2..v5 (4 variants spanning both blocks). + crossEntry <- TwasWeightsEntry( + variantIds = vapply(2:5, .ctp_snpId, character(1)), + weights = c(0.1, 0.2, 0.3, 0.4)) + tw <- TwasWeights( + study = "G1", context = "c1", trait = "t1", method = "susie", + entry = list(crossEntry), ldSketch = .ctp_makeHandle()) + local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), + .package = "pecotmr") + inputs <- assembleCtwasInputs( + gwasSumStats = list(block1 = ss1, block2 = ss2), + twasWeights = list(block1 = tw)) + # All four cross-boundary variants should appear in the weights list + # (would be only 2 with a per-block filter). + wgt <- inputs$weights[[1L]]$wgt + expect_equal(nrow(wgt), 4L) + expect_setequal(rownames(wgt), vapply(2:5, .ctp_snpId, character(1))) +}) + +test_that("ctwasPipeline: rejects bare (non-list) TwasWeights", { skip_if_not_installed("ctwas") expect_error( - ctwasPipeline(gwasSumStats = .ctp_makeGwasSumstats(), - twasWeights = .ctp_makeTwasWeights(), - twasZ = "not a GRanges"), - "must be a GRanges" + ctwasPipeline(gwasSumStats = list(block1 = .ctp_makeGwasSumstats(), + block2 = .ctp_makeGwasSumstats()), + twasWeights = .ctp_makeTwasWeights()), + "NAMED LIST of TwasWeights" ) }) -test_that("ctwasPipeline: rejects bad regionId", { +test_that("ctwasPipeline: rejects non-GRanges twasZ", { skip_if_not_installed("ctwas") + inp <- .ctp_makeMultiBlockInputs() expect_error( - ctwasPipeline(gwasSumStats = .ctp_makeGwasSumstats(), - twasWeights = .ctp_makeTwasWeights(), - regionId = ""), - "non-empty character" + ctwasPipeline(gwasSumStats = inp$gwasSumStats, + twasWeights = inp$twasWeights, + twasZ = "not a GRanges"), + "must be a GRanges" ) }) test_that("ctwasPipeline: rejects unknown groupPriorVarStructure value", { skip_if_not_installed("ctwas") + inp <- .ctp_makeMultiBlockInputs() expect_error( - ctwasPipeline(gwasSumStats = .ctp_makeGwasSumstats(), - twasWeights = .ctp_makeTwasWeights(), + ctwasPipeline(gwasSumStats = inp$gwasSumStats, + twasWeights = inp$twasWeights, groupPriorVarStructure = "bogus"), "'arg'" ) @@ -174,7 +270,7 @@ test_that(".ctwasBuildZSnp: produces a flat data.frame keyed by SNP/study", { expect_equal(nrow(df), 6L) expect_setequal(colnames(df), c("id", "chrom", "pos", "A1", "A2", "z", "study")) - expect_setequal(df$id, paste0("v", 1:6)) + expect_setequal(df$id, vapply(1:6, .ctp_snpId, character(1))) expect_setequal(unique(df$study), "G1") }) @@ -202,6 +298,96 @@ test_that(".ctwasSnpInfoForBlock: returns ctwas-required columns chrom/id/pos/al c("chrom", "id", "pos", "alt", "ref")) }) +test_that(".ctwasLdPanelKey: returns the on-disk path for an existing GDS sketch", { + handle <- .ctp_makeHandle() + key <- pecotmr:::.ctwasLdPanelKey(handle) + expect_true(file.exists(key)) + expect_equal(key, getPath(handle)) +}) + +test_that(".ctwasLdPanelKey: errors when no candidate file exists", { + ghost <- new("GenotypeHandle", + path = file.path(tempdir(), "does_not_exist_for_test.pgen_stem"), + format = "plink2", + snpInfo = data.frame(SNP = "v1", CHR = "1", BP = 100L, A1 = "A", + A2 = "G", stringsAsFactors = FALSE), + nSamples = 1L, sampleIds = "s1", pgenPtr = NULL) + expect_error(pecotmr:::.ctwasLdPanelKey(ghost), + "could not derive an existing LD-file token") +}) + +test_that(".ctwasResolveMethod: caller-supplied method wins when present", { + e <- TwasWeightsEntry(variantIds = paste0("v", 1:3), + weights = c(0.1, 0.2, 0.3)) + tw <- TwasWeights( + study = "Q1", context = "c1", trait = "t1", method = "mrash", + entry = list(e), ldSketch = .ctp_makeHandle()) + expect_equal(pecotmr:::.ctwasResolveMethod(list(r1 = tw), "mrash"), + "mrash") +}) + +test_that(".ctwasResolveMethod: caller-supplied unknown method errors", { + e <- TwasWeightsEntry(variantIds = paste0("v", 1:3), + weights = c(0.1, 0.2, 0.3)) + tw <- TwasWeights( + study = "Q1", context = "c1", trait = "t1", method = "mrash", + entry = list(e), ldSketch = .ctp_makeHandle()) + expect_error(pecotmr:::.ctwasResolveMethod(list(r1 = tw), "bogus"), + "not present in TwasWeights") +}) + +test_that(".ctwasResolveMethod: defaults to ensemble when present among multiple", { + mkTw <- function(m) { + e <- TwasWeightsEntry(variantIds = paste0("v", 1:3), + weights = c(0.1, 0.2, 0.3)) + TwasWeights(study = "Q1", context = "c1", trait = "t1", method = m, + entry = list(e), ldSketch = .ctp_makeHandle()) + } + # Build a multi-method TwasWeights by stitching two methods together. + tw <- TwasWeights( + study = c("Q1", "Q1"), context = c("c1", "c1"), + trait = c("t1", "t1"), method = c("mrash", "ensemble"), + entry = list( + TwasWeightsEntry(variantIds = paste0("v", 1:3), weights = c(0.1, 0.2, 0.3)), + TwasWeightsEntry(variantIds = paste0("v", 1:3), weights = c(0.4, 0.5, 0.6))), + ldSketch = .ctp_makeHandle()) + expect_equal(pecotmr:::.ctwasResolveMethod(list(r1 = tw)), "ensemble") +}) + +test_that(".ctwasResolveMethod: single method auto-picked when only one available", { + e <- TwasWeightsEntry(variantIds = paste0("v", 1:3), + weights = c(0.1, 0.2, 0.3)) + tw <- TwasWeights( + study = "Q1", context = "c1", trait = "t1", method = "mrash", + entry = list(e), ldSketch = .ctp_makeHandle()) + expect_equal(pecotmr:::.ctwasResolveMethod(list(r1 = tw)), "mrash") +}) + +test_that(".ctwasResolveMethod: multi-method + no ensemble + no caller method errors", { + tw <- TwasWeights( + study = c("Q1", "Q1"), context = c("c1", "c1"), + trait = c("t1", "t1"), method = c("mrash", "susie"), + entry = list( + TwasWeightsEntry(variantIds = paste0("v", 1:3), weights = c(0.1, 0.2, 0.3)), + TwasWeightsEntry(variantIds = paste0("v", 1:3), weights = c(0.4, 0.5, 0.6))), + ldSketch = .ctp_makeHandle()) + expect_error(pecotmr:::.ctwasResolveMethod(list(r1 = tw)), + "Supply a `method` argument") +}) + +test_that(".ctwasFilterMethod: subsets rows to the requested method", { + tw <- TwasWeights( + study = c("Q1", "Q1"), context = c("c1", "c1"), + trait = c("t1", "t1"), method = c("mrash", "susie"), + entry = list( + TwasWeightsEntry(variantIds = paste0("v", 1:3), weights = c(0.1, 0.2, 0.3)), + TwasWeightsEntry(variantIds = paste0("v", 1:3), weights = c(0.4, 0.5, 0.6))), + ldSketch = .ctp_makeHandle()) + twSub <- pecotmr:::.ctwasFilterMethod(tw, "susie") + expect_equal(nrow(twSub), 1L) + expect_equal(as.character(twSub$method), "susie") +}) + # Build an ldPanel fixture (matches .ctwasComputeFullPanelLd's return # shape) for the 6-SNP toy panel from .ctp_makeHandle(). .ctp_makeLdPanel <- function(snp_n = 6L) { @@ -209,12 +395,186 @@ test_that(".ctwasSnpInfoForBlock: returns ctwas-required columns chrom/id/pos/al snpInfo <- pecotmr:::.ctwasSnpInfoForBlock(h) R <- diag(1, snp_n) dimnames(R) <- list(snpInfo$id, snpInfo$id) - list(R = R, snpInfo = snpInfo) + # Unit dosage variance — sqrt(1) = 1, so the variance-scaling step in + # .ctwasBuildWeights is a no-op for this fixture. + variance <- setNames(rep(1, snp_n), snpInfo$id) + list(R = R, snpInfo = snpInfo, variance = variance) } +test_that(".ctwasBuildWeights: scales non-standardized weights by sqrt(variance)", { + panel <- .ctp_makeLdPanel() + # Replace the default unit variance with non-trivial values; raw + # weights should be multiplied by sqrt(variance) before reaching the + # final wgt matrix. + panel$variance <- setNames(c(0.5, 1, 2, 4, 8, 16), panel$snpInfo$id) + ids5 <- vapply(1:5, .ctp_snpId, character(1)) + rawW <- c(0.1, 0.2, 0.3, 0.4, 0.5) + tw <- TwasWeights( + study = "Q1", context = "c1", trait = "t1", method = "susie", + entry = list(TwasWeightsEntry(variantIds = ids5, + weights = rawW)), + ldSketch = .ctp_makeHandle()) + wl <- pecotmr:::.ctwasBuildWeights(tw, panel) + expected <- unname(rawW * sqrt(panel$variance[ids5])) + expect_equal(as.numeric(wl[[1L]]$wgt), expected, tolerance = 1e-12) +}) + +test_that(".ctwasBuildWeights: standardized weights bypass variance scaling", { + panel <- .ctp_makeLdPanel() + panel$variance <- setNames(c(0.5, 1, 2, 4, 8, 16), panel$snpInfo$id) + ids5 <- vapply(1:5, .ctp_snpId, character(1)) + rawW <- c(0.1, 0.2, 0.3, 0.4, 0.5) + tw <- TwasWeights( + study = "Q1", context = "c1", trait = "t1", method = "susie", + entry = list(TwasWeightsEntry(variantIds = ids5, + weights = rawW, + standardized = TRUE)), + ldSketch = .ctp_makeHandle()) + wl <- pecotmr:::.ctwasBuildWeights(tw, panel) + expect_equal(as.numeric(wl[[1L]]$wgt), rawW, tolerance = 1e-12) +}) + +# Build an LD-panel fixture with realistic chr:pos:A2:A1 variant IDs so +# `.ctwasHarmonizeWeights` (which calls parseVariantId + .matchRefPanel) +# can do real allele matching. +.ctp_makeAllelePanel <- function() { + ids <- c("1:100:C:T", "1:200:G:A", "1:300:A:G", "1:400:T:C") + snpInfo <- data.frame( + chrom = 1L, + id = ids, + pos = c(100L, 200L, 300L, 400L), + alt = c("T", "A", "G", "C"), # A1 (effect) + ref = c("C", "G", "A", "T"), # A2 (other) + stringsAsFactors = FALSE) + R <- diag(1, 4); dimnames(R) <- list(ids, ids) + list(R = R, snpInfo = snpInfo, + variance = setNames(rep(1, 4), ids)) +} + +test_that(".ctwasHarmonizeWeights: sign-flips weights for swapped A1/A2", { + panel <- .ctp_makeAllelePanel() + refVariants <- data.frame( + chrom = panel$snpInfo$chrom, pos = panel$snpInfo$pos, + A2 = panel$snpInfo$ref, A1 = panel$snpInfo$alt, + variant_id = panel$snpInfo$id, stringsAsFactors = FALSE) + # Variant 1: alleles match panel ("1:100:C:T" — same A2/A1 ordering) + # Variant 2: A1/A2 swapped vs panel ("1:200:A:G" — flips relative to "1:200:G:A") + # Output variant_id values are rebuilt via formatVariantId(), which + # always emits a `chr` prefix. + res <- pecotmr:::.ctwasHarmonizeWeights( + origVids = c("1:100:C:T", "1:200:A:G"), + origW = c(0.5, 0.3), + refVariants = refVariants) + expect_equal(nrow(res), 2L) + # Variant 1 keeps its sign; variant 2 should be sign-flipped to -0.3. + matches <- match(c("chr1:100:C:T", "chr1:200:G:A"), res$variant_id) + expect_equal(res$w[matches[[1L]]], 0.5, tolerance = 1e-12) + expect_equal(res$w[matches[[2L]]], -0.3, tolerance = 1e-12) +}) + +test_that(".ctwasHarmonizeWeights: drops variants not present in the panel", { + panel <- .ctp_makeAllelePanel() + refVariants <- data.frame( + chrom = panel$snpInfo$chrom, pos = panel$snpInfo$pos, + A2 = panel$snpInfo$ref, A1 = panel$snpInfo$alt, + variant_id = panel$snpInfo$id, stringsAsFactors = FALSE) + res <- pecotmr:::.ctwasHarmonizeWeights( + origVids = c("1:100:C:T", "1:999:A:T"), # 1:999 not in panel + origW = c(0.5, 0.3), + refVariants = refVariants) + expect_equal(nrow(res), 1L) + expect_equal(res$variant_id, "chr1:100:C:T") +}) + +test_that(".ctwasIsSusieFit: recognizes the susie intermediate shape", { + fits <- list(lbf_variable = matrix(0, 2, 3), + mu = matrix(0, 2, 3), + X_column_scale_factors = rep(1, 3)) + expect_true(pecotmr:::.ctwasIsSusieFit(fits)) + expect_false(pecotmr:::.ctwasIsSusieFit(NULL)) + expect_false(pecotmr:::.ctwasIsSusieFit(list(lbf_variable = matrix(0, 2, 3)))) +}) + +test_that(".ctwasRenormalizeSusieWeights: lbfToAlpha + colSums recomputation", { + # Original fit covers 4 variants; drop variant 4, renormalize over {1,2,3}. + origVids <- paste0("v", 1:4) + origW <- c(0.1, 0.2, 0.3, 0.4) + # Toy lbf_variable: 2 effects, 4 variants. Constructed so the kept + # subset yields easily-predictable softmax weights. + lbf <- rbind(c(0, 0, 0, 100), # effect 1: only v4 has signal + c(10, 0, -10, 0)) # effect 2: v1 dominates + mu <- matrix(c(1, 2, 3, 4, 1, 2, 3, 4), nrow = 2, byrow = TRUE) + xCol <- rep(1, 4) + fits <- list(lbf_variable = lbf, mu = mu, X_column_scale_factors = xCol) + out <- pecotmr:::.ctwasRenormalizeSusieWeights( + fits, + origVids = origVids, + origW = origW, + keptIdx = c(1L, 2L, 3L), + harmonizedW = origW[1:3]) + # Effect 1 lbf over {v1,v2,v3} = c(0,0,0) -> uniform alpha = 1/3 + # Effect 2 lbf over {v1,v2,v3} = c(10,0,-10) -> v1 ≈ 1 + # weight[v1] = (1/3)*1 + ~1*1 = ~1.33 + expect_equal(length(out), 3L) + expect_true(out[[1L]] > out[[2L]] && out[[1L]] > out[[3L]]) +}) + +test_that(".ctwasRenormalizeSusieWeights: returns NULL on fit/entry dimension mismatch", { + fits <- list(lbf_variable = matrix(0, 2, 5), + mu = matrix(0, 2, 5), + X_column_scale_factors = rep(1, 5)) + out <- pecotmr:::.ctwasRenormalizeSusieWeights( + fits, + origVids = paste0("v", 1:3), # entry says 3, fit covers 5 -> mismatch + origW = rep(0.1, 3), + keptIdx = 1:3, + harmonizedW = rep(0.1, 3)) + expect_null(out) +}) + +test_that(".ctwasRenormalizeSusieWeights: signFlip carries over to mu", { + # All variants kept; harmonized weights have opposite sign on v2. + origVids <- paste0("v", 1:3) + origW <- c(0.1, 0.2, 0.3) + # Strong lbf concentrated on a single effect / single variant per row, + # so the recomputed weight directly mirrors mu (alpha ≈ identity rows). + lbf <- rbind(c(100, -100, -100), + c(-100, 100, -100)) + mu <- rbind(c(1, 2, 3), + c(4, 5, 6)) + fits <- list(lbf_variable = lbf, mu = mu, + X_column_scale_factors = rep(1, 3)) + harmW_noflip <- c(0.1, 0.2, 0.3) # all positive + harmW_v2flip <- c(0.1, -0.2, 0.3) # v2 flipped + outNoFlip <- pecotmr:::.ctwasRenormalizeSusieWeights( + fits, origVids, origW, keptIdx = 1:3, harmonizedW = harmW_noflip) + outV2flip <- pecotmr:::.ctwasRenormalizeSusieWeights( + fits, origVids, origW, keptIdx = 1:3, harmonizedW = harmW_v2flip) + # v1, v3 should be unchanged between the two; v2 should flip sign. + expect_equal(outNoFlip[[1L]], outV2flip[[1L]], tolerance = 1e-9) + expect_equal(outNoFlip[[3L]], outV2flip[[3L]], tolerance = 1e-9) + expect_equal(outNoFlip[[2L]], -outV2flip[[2L]], tolerance = 1e-9) +}) + +test_that(".ctwasSnpInfoForGwasBlock: restricts panel snpInfo to block GWAS variants", { + ss <- .ctp_makeGwasSumstats() + panelInfo <- data.frame( + chrom = 1L, + id = vapply(1:6, .ctp_snpId, character(1)), # whole panel + pos = seq(100L, by = 100L, length.out = 6L), + alt = "A", ref = "G", + stringsAsFactors = FALSE) + blockInfo <- pecotmr:::.ctwasSnpInfoForGwasBlock(ss, panelInfo) + # Restricted to the variant IDs on the GwasSumStats entry. + expect_true(all(blockInfo$id %in% + as.character(S4Vectors::mcols(ss$entry[[1L]])$SNP))) + expect_true(nrow(blockInfo) <= nrow(panelInfo)) +}) + test_that(".ctwasBuildWeights: keys per-tuple weights and stamps gene metadata", { tw <- .ctp_makeTwasWeights() panel <- .ctp_makeLdPanel() + ids5 <- vapply(1:5, .ctp_snpId, character(1)) wl <- pecotmr:::.ctwasBuildWeights(tw, panel) expect_equal(length(wl), 1L) expect_equal(names(wl), "Q1|c1|t1|susie") @@ -224,40 +584,64 @@ test_that(".ctwasBuildWeights: keys per-tuple weights and stamps gene metadata", # wgt is a variants x 1 matrix with rownames = SNP IDs expect_true(is.matrix(wl[[1L]]$wgt)) expect_equal(dim(wl[[1L]]$wgt), c(5L, 1L)) - expect_equal(rownames(wl[[1L]]$wgt), paste0("v", 1:5)) + expect_equal(rownames(wl[[1L]]$wgt), ids5) # R_wgt is a 5x5 slice of the cached panel R expect_true(is.matrix(wl[[1L]]$R_wgt)) expect_equal(dim(wl[[1L]]$R_wgt), c(5L, 5L)) - expect_equal(rownames(wl[[1L]]$R_wgt), paste0("v", 1:5)) + expect_equal(rownames(wl[[1L]]$R_wgt), ids5) expect_equal(wl[[1L]]$n_wgt, 5L) # And it is literally a slice of the panel R (no recompute path). - expect_equal(wl[[1L]]$R_wgt, panel$R[paste0("v", 1:5), paste0("v", 1:5)]) + expect_equal(wl[[1L]]$R_wgt, panel$R[ids5, ids5]) }) test_that(".ctwasBuildWeights: drops variants not present in the LD panel", { + ids3 <- vapply(1:3, .ctp_snpId, character(1)) + missing <- c("chr1:99900:G:A", "chr1:99910:G:A") # not in panel tw <- TwasWeights( study = "Q1", context = "c1", trait = "t1", method = "susie", entry = list(TwasWeightsEntry( - variantIds = c(paste0("v", 1:3), "missing1", "missing2"), + variantIds = c(ids3, missing), weights = c(0.1, 0.2, 0.3, 0.4, 0.5))), ldSketch = .ctp_makeHandle()) panel <- .ctp_makeLdPanel() wl <- pecotmr:::.ctwasBuildWeights(tw, panel) expect_equal(nrow(wl[[1L]]$wgt), 3L) - expect_equal(rownames(wl[[1L]]$wgt), paste0("v", 1:3)) + expect_equal(rownames(wl[[1L]]$wgt), ids3) + expect_equal(wl[[1L]]$n_wgt, 3L) +}) + +test_that(".ctwasBuildWeights: intersects with gwasSnpIds when supplied", { + # The LD panel covers ids 1..6; the per-block GWAS sumstats covers only + # ids 1, 2, 4 (a subset). Weight variants that live in the panel but + # outside the block (id 3 here) must be dropped, otherwise ctwas's + # compute_gene_z asserts the weight variant is missing from z_snp. + ids5 <- vapply(1:5, .ctp_snpId, character(1)) + blockIds <- vapply(c(1, 2, 4), .ctp_snpId, character(1)) + tw <- TwasWeights( + study = "Q1", context = "c1", trait = "t1", method = "susie", + entry = list(TwasWeightsEntry( + variantIds = ids5, + weights = c(0.1, 0.2, 0.3, 0.4, 0.5))), + ldSketch = .ctp_makeHandle()) + panel <- .ctp_makeLdPanel() + wl <- pecotmr:::.ctwasBuildWeights( + tw, panel, gwasSnpIds = blockIds) + expect_equal(rownames(wl[[1L]]$wgt), blockIds) expect_equal(wl[[1L]]$n_wgt, 3L) }) -test_that(".ctwasComputeFullPanelLd: extracts once + returns cached R + snpInfo", { +test_that(".ctwasComputeFullPanelLd: extracts once + returns cached R + snpInfo + variance", { local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), .package = "pecotmr") out <- pecotmr:::.ctwasComputeFullPanelLd(.ctp_makeHandle()) - expect_named(out, c("R", "snpInfo")) + ids6 <- vapply(1:6, .ctp_snpId, character(1)) + expect_named(out, c("R", "snpInfo", "variance")) expect_true(is.matrix(out$R)) expect_equal(dim(out$R), c(6L, 6L)) - expect_equal(rownames(out$R), paste0("v", 1:6)) + expect_equal(rownames(out$R), ids6) expect_setequal(colnames(out$snpInfo), c("chrom", "id", "pos", "alt", "ref")) + expect_named(out$variance, ids6) }) test_that(".ctwasBuildZGene: builds z_gene from a TWAS-Z GRanges", { @@ -277,69 +661,170 @@ test_that(".ctwasBuildZGene: builds z_gene from a TWAS-Z GRanges", { # LD loader / SNP-info loader closures # =========================================================================== -test_that(".ctwasSingleBlockLdLoader: returns the cached R unchanged on every call", { - R0 <- matrix(runif(36), 6, 6, dimnames = list(paste0("v", 1:6), - paste0("v", 1:6))) - loader <- pecotmr:::.ctwasSingleBlockLdLoader(R0) - expect_identical(loader("any_token"), R0) - expect_identical(loader("different_token"), R0) +test_that(".ctwasMultiBlockLdLoader: dispatches by LD_file token", { + RA <- matrix(runif(4), 2, 2, + dimnames = list(c("v1", "v2"), c("v1", "v2"))) + RB <- matrix(runif(4), 2, 2, + dimnames = list(c("v3", "v4"), c("v3", "v4"))) + loader <- pecotmr:::.ctwasMultiBlockLdLoader( + list(tokenA = list(R = RA), tokenB = list(R = RB))) + expect_identical(loader("tokenA"), RA) + expect_identical(loader("tokenB"), RB) + expect_error(loader("unknown_token"), "no cached panel") }) -test_that(".ctwasSingleBlockSnpInfoLoader: returns the cached snpInfo unchanged on every call", { - info0 <- data.frame(chrom = 1L, id = paste0("v", 1:3), - pos = c(100L, 200L, 300L), +test_that(".ctwasMultiBlockSnpInfoLoader: dispatches by LD_file token", { + infoA <- data.frame(chrom = 1L, id = c("v1", "v2"), + pos = c(100L, 200L), alt = "A", ref = "G", stringsAsFactors = FALSE) - loader <- pecotmr:::.ctwasSingleBlockSnpInfoLoader(info0) - expect_identical(loader("any_token"), info0) - expect_identical(loader("different_token"), info0) + infoB <- data.frame(chrom = 1L, id = c("v3", "v4"), + pos = c(300L, 400L), + alt = "C", ref = "T", stringsAsFactors = FALSE) + loader <- pecotmr:::.ctwasMultiBlockSnpInfoLoader( + list(tokenA = list(snpInfo = infoA), + tokenB = list(snpInfo = infoB))) + expect_identical(loader("tokenA"), infoA) + expect_identical(loader("tokenB"), infoB) + expect_error(loader("unknown_token"), "no cached panel") }) # =========================================================================== -# End-to-end with mocked ctwas::ctwas_sumstats +# Input-assembly shape checks via assembleCtwasInputs (no ctwas engine needed) # =========================================================================== -test_that("ctwasPipeline: assembles the documented input shape for ctwas_sumstats", { - skip_if_not_installed("ctwas") - ss <- .ctp_makeGwasSumstats() - tw <- .ctp_makeTwasWeights() - capturedArgs <- NULL - local_mocked_bindings( - ctwas_sumstats = function(...) { - capturedArgs <<- list(...) - list(susie_alpha_res = "mocked") - }, - .package = "ctwas") +test_that("assembleCtwasInputs: assembles the documented input shape for ctwas", { + inp <- .ctp_makeMultiBlockInputs() local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), .package = "pecotmr") - out <- ctwasPipeline(gwasSumStats = ss, twasWeights = tw, - regionId = "myBlock") - expect_equal(out$susie_alpha_res, "mocked") - expect_equal(capturedArgs$region_info$region_id, "myBlock") - expect_equal(capturedArgs$z_snp$id, paste0("v", 1:6)) - expect_equal(length(capturedArgs$weights), 1L) - expect_equal(capturedArgs$L, 5L) + inputs <- assembleCtwasInputs( + gwasSumStats = inp$gwasSumStats, + twasWeights = inp$twasWeights) + # Two regions, two LD_map rows. + expect_equal(nrow(inputs$region_info), 2L) + expect_setequal(inputs$region_info$region_id, c("block1", "block2")) + # zSnp is the concatenation of both blocks' Z columns. + expect_equal(nrow(inputs$z_snp), 12L) + # snp_map keyed by region_id. + expect_setequal(names(inputs$snp_map), c("block1", "block2")) + # Per-region weights keys are prefixed with the region_id. + expect_true(all(grepl("^(block1|block2)\\|", names(inputs$weights)))) + # LD_map carries the same number of rows as regions. + expect_equal(nrow(inputs$LD_map), 2L) + # LD / snpInfo loader closures are present. + expect_true(is.function(inputs$LD_loader_fun)) + expect_true(is.function(inputs$snpinfo_loader_fun)) }) -test_that("ctwasPipeline: forwards a twasZ argument as z_gene", { - skip_if_not_installed("ctwas") - ss <- .ctp_makeGwasSumstats() - tw <- .ctp_makeTwasWeights() +test_that("assembleCtwasInputs: forwards a twasZ argument as z_gene", { + inp <- .ctp_makeMultiBlockInputs() twasZ <- GenomicRanges::GRanges("chr1", IRanges::IRanges(100, 200)) S4Vectors::mcols(twasZ) <- S4Vectors::DataFrame( qtlStudy = "Q1", context = "c1", trait = "t1", method = "susie", twasZ = 1.5) - capturedArgs <- NULL + local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), + .package = "pecotmr") + inputs <- assembleCtwasInputs( + gwasSumStats = inp$gwasSumStats, + twasWeights = inp$twasWeights, + twasZ = twasZ) + expect_equal(inputs$z_gene$id, "Q1|c1|t1|susie") +}) + +# =========================================================================== +# Step-wise dispatch: estCtwasParam → screenCtwasRegions → finemapCtwasRegions +# =========================================================================== + +test_that("ctwasPipeline: dispatches assemble → est → screen → finemap and accumulates state", { + skip_if_not_installed("ctwas") + inp <- .ctp_makeMultiBlockInputs() + capturedAssemble <- list(); capturedEst <- list() + capturedScreen <- list(); capturedFinemap <- list() local_mocked_bindings( - ctwas_sumstats = function(...) { - capturedArgs <<- list(...) - list(ok = TRUE) + assemble_region_data = function(...) { + capturedAssemble <<- list(...) + list(region_data = list(block1 = list(stub = TRUE)), + boundary_genes = list(stub = TRUE), + z_gene = data.frame(id = "t1", z = 1.5)) + }, + est_param = function(...) { + capturedEst <<- list(...) + list(group_prior = c(g = 0.1, SNP = 0.0001), + group_prior_var = c(g = 5, SNP = 5)) + }, + screen_regions = function(...) { + capturedScreen <<- list(...) + list(screened_region_data = list(block1 = list(stub = TRUE)), + screen_res_meta = "mocked") }, + finemap_regions = function(...) { + capturedFinemap <<- list(...) + list(finemap_res = data.frame(id = "t1"), + susie_alpha_res = data.frame(susie_pip = 0.9)) + }, + .package = "ctwas") + local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), + .package = "pecotmr") + out <- ctwasPipeline( + gwasSumStats = inp$gwasSumStats, + twasWeights = inp$twasWeights) + # Each ctwas step was invoked exactly once. + expect_true(length(capturedAssemble) > 0L) + expect_true(length(capturedEst) > 0L) + expect_true(length(capturedScreen) > 0L) + expect_true(length(capturedFinemap) > 0L) + # est sees the assembled region_data. + expect_named(capturedEst$region_data, "block1") + # screen receives the param estimates as group_prior / group_prior_var. + expect_equal(unname(capturedScreen$group_prior), c(0.1, 0.0001)) + # finemap consumes screen's screened_region_data. + expect_named(capturedFinemap$region_data, "block1") + # Output mirrors the ctwas_sumstats top-level shape. + expect_setequal( + names(out), + c("z_gene", "param", "finemap_res", "susie_alpha_res", + "region_data", "boundary_genes", "screen_res")) +}) + +test_that("estCtwasParam / screenCtwasRegions / finemapCtwasRegions can be called independently", { + skip_if_not_installed("ctwas") + inp <- .ctp_makeMultiBlockInputs() + local_mocked_bindings( + assemble_region_data = function(...) list( + region_data = list(block1 = list(stub = TRUE)), + boundary_genes = list(), + z_gene = data.frame(id = "t1", z = 1.0)), + est_param = function(...) list( + group_prior = c(g = 0.05, SNP = 1e-4), + group_prior_var = c(g = 4, SNP = 5)), + screen_regions = function(...) list( + screened_region_data = list(block1 = list(stub = TRUE))), + finemap_regions = function(...) list( + finemap_res = data.frame(id = "t1"), + susie_alpha_res = data.frame(pip = 0.5)), .package = "ctwas") local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), .package = "pecotmr") - ctwasPipeline(gwasSumStats = ss, twasWeights = tw, twasZ = twasZ, - regionId = "block1") - expect_equal(capturedArgs$z_gene$id, "Q1|c1|t1|susie") + # Step 1 + inputs <- assembleCtwasInputs(inp$gwasSumStats, inp$twasWeights) + expect_true("region_info" %in% names(inputs)) + expect_true("LD_loader_fun" %in% names(inputs)) + # Step 2 + est <- estCtwasParam(inputs) + expect_true("region_data" %in% names(est)) + expect_true("param" %in% names(est)) + # User can OVERRIDE the estimated priors before screen/finemap — this is + # the escape hatch for NaN-on-iter-2 EM divergence. + est$param$group_prior <- c(g = 0.2, SNP = 1e-4) + est$param$group_prior_var <- c(g = 4.5, SNP = 5) + # Step 3 + screened <- screenCtwasRegions(est) + expect_true("screened_region_data" %in% names(screened)) + # Step 4 + final <- finemapCtwasRegions(screened) + expect_setequal( + names(final), + c("z_gene", "param", "finemap_res", "susie_alpha_res", + "region_data", "boundary_genes", "screen_res")) }) # =========================================================================== @@ -368,26 +853,27 @@ test_that("ctwasPipeline: real-engine end-to-end on the bundled example panel", entry = list(ent), ldSketch = gh) + # ctwasPipeline now requires >= 2 blocks; replicate the same GWAS + + # TWAS pair under two region_ids so the joint EM has something to + # estimate against. The bundled toy panel is still too sparse for + # the convergence checks, so we relax the gates. res <- suppressMessages(suppressWarnings( - ctwasPipeline(gwasSumStats = gss, twasWeights = tw, - regionId = "block1", - niter = 5L, - niterPrefit = 2L, - # Single-gene single-block toy: relax the production - # filters that gate out tiny inputs. - min_group_size = 1L, - min_p_single_effect = 0, - filter_L = FALSE))) + ctwasPipeline( + gwasSumStats = list(blockA = gss, blockB = gss), + twasWeights = list(blockA = tw, blockB = tw), + niter = 5L, + niterPrefit = 2L, + # Toy panel: relax the production filters that gate out tiny inputs. + min_group_size = 1L, + min_p_single_effect = 0, + filter_L = FALSE))) # ctwas_sumstats returns these 7 elements on success. expect_named(res, c("z_gene", "param", "finemap_res", "susie_alpha_res", "region_data", "boundary_genes", "screen_res"), ignore.order = TRUE) # The gene we passed in came through. - expect_equal(nrow(res$z_gene), 1L) - expect_equal(res$z_gene$id, "study1|brain|ENSG_example|susie") - # Per-SNP susie_alpha output covers our 5 weight SNPs. - expect_equal(nrow(res$susie_alpha_res), 5L) + expect_true(any(grepl("study1\\|brain\\|ENSG_example\\|susie", res$z_gene$id))) expect_true(all(c("susie_pip", "susie_alpha", "region_id") %in% colnames(res$susie_alpha_res))) }) diff --git a/tests/testthat/test_sumstatsQc.R b/tests/testthat/test_sumstatsQc.R index 7dd04d74..71e9700f 100644 --- a/tests/testthat/test_sumstatsQc.R +++ b/tests/testthat/test_sumstatsQc.R @@ -2547,16 +2547,15 @@ test_that("CS95 variants are ordered by decreasing PIP", { -context("summaryStatsQc (with mocked MungeSumstats)") +context("summaryStatsQc") # NOTE # ---- -# `.runMungeSumstatsFilter` wraps MungeSumstats::format_sumstats which needs -# a real dbSNP reference panel (multi-GB download). To exercise the QC chain -# in a unit test we mock that helper so it just returns the input data.frame -# unchanged, recording a "no variants dropped" audit record. The pecotmr- -# native steps (.applySkipRegion, .matchAgainstSketch, .applyPipScreen, -# .applyLdMismatchQcToEntry) all run for real on the synthetic fixture. +# The variant-content QC (MAF/INFO/N) is pure data.frame filtering via +# .applyContentFilters; no external genome / dbSNP packages required. +# All pecotmr-native steps (.applyContentFilters, .applySkipRegion, +# .matchAgainstSketch, .applyPipScreen, .applyLdMismatchQcToEntry) run +# for real on the synthetic fixture. # =========================================================================== # Fixture builders @@ -2602,21 +2601,6 @@ context("summaryStatsQc (with mocked MungeSumstats)") ldSketch = .ssQ_makeHandle()) } -.ssQ_mockMunge <- function(drop = 0L) { - # Mock that pretends MungeSumstats validated the input and returned the - # same data.frame, dropping `drop` rows. - function(df, refGenome, useDbsnpRefCheck, removeIndels, - removeStrandAmbiguous, mafCutoff, infoCutoff, nCutoff, - convertRefGenome, mungeSumstatsArgs) { - keep <- if (drop > 0L && drop < nrow(df)) - seq_len(nrow(df) - drop) - else - seq_len(nrow(df)) - list(df = df[keep, , drop = FALSE], - droppedNVariants = nrow(df) - length(keep)) - } -} - .ssQ_mockExtractor <- function(seed = 13, n_samples = 60L) { function(handle, snpIdx, meanImpute = TRUE) { set.seed(seed) @@ -2702,14 +2686,11 @@ test_that(".deriveBetaSeFromZ: skipped when N missing", { }) # =========================================================================== -# summaryStatsQc: end-to-end with mocked MungeSumstats +# summaryStatsQc: end-to-end on the synthetic fixture # =========================================================================== test_that("summaryStatsQc: vanilla run populates qcInfo and returns a GwasSumStats", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss) expect_s4_class(res, "GwasSumStats") qc <- getQcInfo(res) @@ -2717,18 +2698,13 @@ test_that("summaryStatsQc: vanilla run populates qcInfo and returns a GwasSumSta expect_true("options" %in% names(qc)) expect_true("entryAudit" %in% names(qc)) expect_equal(length(qc$entryAudit), nrow(ss)) - # Per-entry audit records variantsIn / variantsOut / mungeSumstatsDropped. ea <- qc$entryAudit[[1L]] expect_equal(ea$variantsIn, 4L) expect_equal(ea$variantsOut, 4L) - expect_equal(ea$mungeSumstatsDropped, 0L) }) test_that("summaryStatsQc: keepVariants subsets each entry and records the drop", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss, keepVariants = c("rs1", "rs3")) ea <- getQcInfo(res)$entryAudit[[1L]] expect_equal(ea$keepVariantsDropped, 2L) @@ -2737,9 +2713,6 @@ test_that("summaryStatsQc: keepVariants subsets each entry and records the drop" test_that("summaryStatsQc: skipRegion drops overlapping variants", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss, skipRegion = "chr1:50-150") ea <- getQcInfo(res)$entryAudit[[1L]] expect_equal(ea$skipRegionDropped, 1L) # rs1 at pos 100 is dropped @@ -2752,9 +2725,6 @@ test_that("summaryStatsQc: PIP screen triggers when no variant has signal", { S4Vectors::mcols(gr)$Z <- rep(0.1, length(gr)) ss <- GwasSumStats(study = "g1", entry = list(gr), genome = "hg19", ldSketch = .ssQ_makeHandle()) - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss, pipCutoffToSkip = 0.99) ea <- getQcInfo(res)$entryAudit[[1L]] expect_true(isTRUE(ea$pipScreenSkipped)) @@ -2762,22 +2732,8 @@ test_that("summaryStatsQc: PIP screen triggers when no variant has signal", { expect_equal(length(res$entry[[1L]]), 0L) }) -test_that("summaryStatsQc: early-exit records when fewer than 2 variants remain pre-harmonization", { - ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - # Mock keeps only 1 row of the input - .runMungeSumstatsFilter = .ssQ_mockMunge(drop = 3L), - .package = "pecotmr") - res <- summaryStatsQc(ss) - ea <- getQcInfo(res)$entryAudit[[1L]] - expect_match(ea$earlyExit, "fewer than two variants") -}) - test_that("summaryStatsQc: harmonized variants count is recorded", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss) ea <- getQcInfo(res)$entryAudit[[1L]] expect_equal(ea$matchedAgainstSketch, 4L) @@ -2785,9 +2741,6 @@ test_that("summaryStatsQc: harmonized variants count is recorded", { test_that("summaryStatsQc: options block records the curated knobs", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss, removeIndels = TRUE, removeStrandAmbiguous = FALSE, nCutoff = 10) opts <- getQcInfo(res)$options @@ -2801,9 +2754,6 @@ test_that("summaryStatsQc: round-trips QtlSumStats inputs", { ss <- QtlSumStats(study = "s1", context = "c1", trait = "t1", entry = list(gr), genome = "hg19", ldSketch = .ssQ_makeHandle()) - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") res <- summaryStatsQc(ss) expect_s4_class(res, "QtlSumStats") expect_equal(length(getQcInfo(res)$entryAudit), 1L) @@ -2817,8 +2767,7 @@ test_that("summaryStatsQc: zMismatchQc = 'dentist' walks the LD-mismatch branch" ss <- .ssQ_makeGwasSumStats(snp_ids = paste0("rs", 1:8), positions = seq(100L, by = 100L, length.out = 8L)) local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - extractBlockGenotypes = .ssQ_mockExtractor(), + extractBlockGenotypes = .ssQ_mockExtractor(), .package = "pecotmr") res <- suppressWarnings(summaryStatsQc(ss, zMismatchQc = "dentist")) ea <- getQcInfo(res)$entryAudit[[1L]] @@ -2844,7 +2793,6 @@ test_that("summaryStatsQc: impute = TRUE invokes RAISS and records the audit cou ldSketch = .ssQ_makeHandle(snp_n = 8L, n_samples = 60L)) local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), extractBlockGenotypes = .ssQ_mockExtractor(), raiss = function(refPanel, knownZscores, genotypeMatrix, ...) { # Pretend RAISS imputed two of the missing panel variants (rs5, rs6) @@ -2873,8 +2821,7 @@ test_that("summaryStatsQc: impute = TRUE with raiss returning NULL records 0 imp ldSketch = .ssQ_makeHandle(snp_n = 8L, n_samples = 60L)) local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - extractBlockGenotypes = .ssQ_mockExtractor(), + extractBlockGenotypes = .ssQ_mockExtractor(), raiss = function(...) NULL, .package = "pecotmr") res <- summaryStatsQc(ss, impute = TRUE) @@ -2913,16 +2860,11 @@ test_that(".matchRefPanel surfaces sign/strand/dropped counts via qcCounts attri test_that("summaryStatsQc: QC track emits per-step 'kept N of M' messages plus a rollup", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") local_mocked_bindings( extractBlockGenotypes = .ssQ_mockExtractor(), .package = "pecotmr") msgs <- capture_messages(summaryStatsQc(ss)) joined <- paste(msgs, collapse = "") - # MungeSumstats step + denominator framing. - expect_match(joined, "MungeSumstats kept [0-9]+ of [0-9]+ variant") # Harmonization step + corrected/dropped breakdown. expect_match(joined, "harmonization kept [0-9]+ of [0-9]+") expect_match(joined, "corrected: sign-flipped [0-9]+, strand-flipped [0-9]+") @@ -2933,9 +2875,6 @@ test_that("summaryStatsQc: QC track emits per-step 'kept N of M' messages plus a test_that("summaryStatsQc: skipped optional steps are omitted from the rollup", { ss <- .ssQ_makeGwasSumStats() - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") local_mocked_bindings( extractBlockGenotypes = .ssQ_mockExtractor(), .package = "pecotmr") @@ -2962,9 +2901,6 @@ test_that("summaryStatsQc: per-entry log lines carry the (study/context/trait) l entry = list(.ssQ_makeEntryGr()), genome = "hg19", ldSketch = .ssQ_makeHandle()) - local_mocked_bindings( - .runMungeSumstatsFilter = .ssQ_mockMunge(), - .package = "pecotmr") local_mocked_bindings( extractBlockGenotypes = .ssQ_mockExtractor(), .package = "pecotmr") @@ -3128,6 +3064,145 @@ test_that(".refVariantsFromSketch: extracts chr/pos/A1/A2/variant_id from snpInf stringsAsFactors = FALSE) } +test_that(".applySanityChecks: empty input is a no-op", { + out <- pecotmr:::.applySanityChecks(data.frame(chrom = character())) + expect_equal(nrow(out$df), 0L) + expect_equal(out$audit, list()) +}) + +test_that(".applySanityChecks: coerceNumeric converts character columns and counts NAs", { + df <- data.frame( + chrom = c("1", "1"), pos = c(100L, 200L), + A1 = c("A", "C"), A2 = c("G", "T"), + Z = c("1.5", "not_a_number"), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df, dropMissData = FALSE) + expect_type(out$df$Z, "double") + expect_equal(out$df$Z[[1L]], 1.5) + expect_true(is.na(out$df$Z[[2L]])) + expect_equal(out$audit$nonNumericCoerced, 1L) +}) + +test_that(".applySanityChecks: normalizeChr maps 23/24/M/chr* and drops non-standard", { + df <- data.frame( + chrom = c("chr1", "23", "24", "M", "chrX_random"), + pos = c(100L, 200L, 300L, 400L, 500L), + A1 = c("A", "A", "A", "A", "A"), + A2 = c("G", "G", "G", "G", "G"), + Z = c(1, 2, 3, 4, 5), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df) + expect_equal(out$df$chrom, c("1", "X", "Y", "MT")) + expect_equal(out$audit$nonstandardChrDropped, 1L) +}) + +test_that(".applySanityChecks: dropMissData drops rows with NA in vital columns", { + df <- data.frame( + chrom = c("1", "1", "1"), + pos = c(100L, 200L, 300L), + A1 = c("A", NA, "C"), + A2 = c("G", "T", "T"), + Z = c(1, 2, NA), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df) + expect_equal(nrow(out$df), 1L) + expect_equal(out$audit$missDataDropped, 2L) +}) + +test_that(".applySanityChecks: dropPOutOfRange drops P < 0 and P > 1", { + df <- data.frame( + chrom = c("1", "1", "1", "1"), + pos = c(100L, 200L, 300L, 400L), + A1 = c("A", "A", "A", "A"), A2 = c("G", "G", "G", "G"), + Z = c(1, 2, 3, 4), + P = c(0.5, -0.1, 1.5, 0.001), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df) + expect_equal(nrow(out$df), 2L) + expect_equal(out$audit$pOutOfRangeDropped, 2L) +}) + +test_that(".applySanityChecks: clampSmallP floors to smallPFloor", { + df <- data.frame( + chrom = c("1", "1"), + pos = c(100L, 200L), + A1 = c("A", "A"), A2 = c("G", "G"), + Z = c(1, 50), + P = c(0.1, 0), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df, smallPFloor = 5e-324) + expect_equal(out$df$P[[2L]], 5e-324) + expect_equal(out$audit$smallPClamped, 1L) +}) + +test_that(".applySanityChecks: dropZeroEffect drops BETA==0 and OR==1", { + df <- data.frame( + chrom = c("1", "1", "1"), + pos = c(100L, 200L, 300L), + A1 = c("A", "A", "A"), A2 = c("G", "G", "G"), + BETA = c(0.5, 0, 0.2), + OR = c(1.5, 1.5, 1), + SE = c(0.1, 0.1, 0.1), + Z = c(5, 0, 2), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df) + expect_equal(nrow(out$df), 1L) + expect_equal(out$audit$zeroEffectDropped, 2L) +}) + +test_that(".applySanityChecks: dropNonpositiveSe drops SE <= 0", { + df <- data.frame( + chrom = c("1", "1", "1"), + pos = c(100L, 200L, 300L), + A1 = c("A", "A", "A"), A2 = c("G", "G", "G"), + SE = c(0.1, 0, -0.2), + Z = c(1, 2, 3), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks(df) + expect_equal(nrow(out$df), 1L) + expect_equal(out$audit$nonpositiveSeDropped, 2L) +}) + +test_that(".applySanityChecks: per-check knobs can disable each step", { + df <- data.frame( + chrom = c("chr1", "23"), + pos = c(100L, 200L), + A1 = c("A", "A"), A2 = c("G", "G"), + BETA = c(0.5, 0), + SE = c(0.1, -0.2), + P = c(0.5, 1.5), + Z = c(1, 2), + stringsAsFactors = FALSE) + out <- pecotmr:::.applySanityChecks( + df, + coerceNumeric = FALSE, + normalizeChr = FALSE, + dropMissData = FALSE, + dropPOutOfRange = FALSE, + clampSmallP = FALSE, + dropZeroEffect = FALSE, + dropNonpositiveSe = FALSE) + expect_equal(nrow(out$df), 2L) + expect_equal(out$audit, list()) + expect_equal(out$df$chrom, c("chr1", "23")) +}) + +test_that("summaryStatsQc: surfaces sanity-check audit and respects per-check knobs", { + gr <- .ssQ_makeEntryGr() + S4Vectors::mcols(gr)$BETA <- c(0.1, 0, 0.2, 0) + S4Vectors::mcols(gr)$SE <- c(0.1, 0.1, 0.1, 0.1) + S4Vectors::mcols(gr)$P <- c(0.5, 0.5, 0.5, 0.5) + ss <- GwasSumStats(study = "g1", entry = list(gr), genome = "hg19", + ldSketch = .ssQ_makeHandle()) + res <- summaryStatsQc(ss) + ea <- getQcInfo(res)$entryAudit[[1L]] + expect_equal(ea$sanityChecks$zeroEffectDropped, 2L) + # Disable the knob: zero-effect rows should remain. + res2 <- summaryStatsQc(ss, dropZeroEffect = FALSE) + ea2 <- getQcInfo(res2)$entryAudit[[1L]] + expect_null(ea2$sanityChecks$zeroEffectDropped) +}) + test_that(".applySkipRegion: NULL / empty skipRegion is a no-op", { df <- .ssh_smallDf() expect_identical(pecotmr:::.applySkipRegion(df, NULL), df) From b3f1cac996a3688070f621c8a10540717a7f7aee Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Mon, 22 Jun 2026 16:30:21 -0700 Subject: [PATCH 2/4] more fixes for pipelines --- R/GwasFineMappingResult.R | 94 +++++++++++++-------- R/colocPipeline.R | 34 ++++---- R/ctwasPipeline.R | 92 ++++++++++++++++++-- R/fineMappingPipeline.R | 69 ++++++++++----- R/qtlEnrichmentPipeline.R | 59 ++++++++----- R/tupleSelectors.R | 30 +++++-- tests/testthat/test_GwasFineMappingResult.R | 40 ++++++++- tests/testthat/test_colocPipeline.R | 16 ++-- tests/testthat/test_ctwasPipeline.R | 37 ++++++++ tests/testthat/test_deprecated.R | 22 ++--- tests/testthat/test_fineMappingPipeline.R | 20 +++-- tests/testthat/test_qtlEnrichmentPipeline.R | 65 +++++++++++--- 12 files changed, 437 insertions(+), 141 deletions(-) diff --git a/R/GwasFineMappingResult.R b/R/GwasFineMappingResult.R index 3b5a4247..e2c3e950 100644 --- a/R/GwasFineMappingResult.R +++ b/R/GwasFineMappingResult.R @@ -1,9 +1,13 @@ # ============================================================================= # GwasFineMappingResult S4 class # ----------------------------------------------------------------------------- -# DFrame-subclass collection keyed by the identity tuple (study, method). -# Each row holds a FineMappingEntry payload for one GWAS study at one -# fine-mapping method, covering a single LD block. Class-level slots: +# DFrame-subclass collection keyed by the identity tuple +# (study, method, region_id). Each row holds a FineMappingEntry payload +# for one GWAS study at one fine-mapping method over one LD block. +# Multiple rows per (study, method) are allowed when they differ on +# region_id — the genome-wide-across-blocks shape that +# qtlEnrichmentPipeline / colocPipeline expect when sweeping a study +# across LD blocks. Class-level slots: # * ldSketch GenotypeHandle for the LD reference; required for the # LD-block-indexed susieRSS workflow. # Methods that take per-row selectors accept (study, method) and ignore @@ -17,7 +21,7 @@ setClass("GwasFineMappingResult", contains = "FineMappingResultBase", validity = function(object) { errors <- character() - required <- c("study", "method", "entry") + required <- c("study", "method", "region_id", "entry") missingCols <- setdiff(required, names(object)) if (length(missingCols) > 0L) errors <- c(errors, paste("missing columns:", @@ -32,10 +36,10 @@ setClass("GwasFineMappingResult", if (!all(entryTypes)) errors <- c(errors, "every element of the `entry` column must be a FineMappingEntry") - keyDf <- as.data.frame(object[, c("study", "method")]) + keyDf <- as.data.frame(object[, c("study", "method", "region_id")]) if (anyDuplicated(keyDf)) errors <- c(errors, - "(study, method) tuple uniqueness violated") + "(study, method, region_id) tuple uniqueness violated") } if (!is.null(object@ldSketch) && !methods::is(object@ldSketch, "GenotypeHandle")) { @@ -77,26 +81,41 @@ setClass("GwasFineMappingResult", #' @title Create a GwasFineMappingResult Collection #' @description Construct a \code{GwasFineMappingResult} DFrame-subclass -#' collection from per-(study, method) tuples and a list of -#' \code{FineMappingEntry} payloads. The collection represents one LD -#' block of GWAS fine-mapping fits; build a separate collection per -#' block when sweeping the genome. +#' collection from per-(study, method, region_id) tuples and a list of +#' \code{FineMappingEntry} payloads. The collection can represent +#' either a single LD block (one row per (study, method)) or a +#' genome-wide sweep across blocks (multiple rows per (study, method), +#' each tagged with its own region_id). #' @param study Character vector of study identifiers (per tuple). #' @param method Character vector of fine-mapping method names (per tuple). #' @param entry List / \code{SimpleList} of \code{FineMappingEntry} objects. +#' @param region_id Character vector of LD-block identifiers (per +#' tuple). When omitted (\code{NULL}), defaults to a per-row synthetic +#' id (\code{"region_1"}, \code{"region_2"}, ...) so the +#' (\code{study}, \code{method}, \code{region_id}) triple is unique. +#' Supplying meaningful labels (e.g. \code{"chr22_10516173_17414263"}) +#' is preferred for downstream consumers that join on region. #' @param ldSketch An optional \code{GenotypeHandle}. #' @return A \code{GwasFineMappingResult} object. #' @export GwasFineMappingResult <- function(study, method, entry, + region_id = NULL, ldSketch = NULL) { n <- length(study) if (length(method) != n || length(entry) != n) { stop("`study`, `method`, and `entry` must all have the same length.") } + if (is.null(region_id)) { + region_id <- paste0("region_", seq_len(n)) + } else if (length(region_id) != n) { + stop("`region_id` must have the same length as `study` (got ", + length(region_id), " vs ", n, ").") + } cols <- list( - study = as.character(study), - method = as.character(method), - entry = S4Vectors::SimpleList(entry) + study = as.character(study), + method = as.character(method), + region_id = as.character(region_id), + entry = S4Vectors::SimpleList(entry) ) df <- do.call(S4Vectors::DataFrame, c(cols, list(check.names = FALSE))) @@ -121,9 +140,10 @@ setMethod("getContexts", "GwasFineMappingResult", function(x) NULL) #' @export setMethod("getTraits", "GwasFineMappingResult", function(x) NULL) -# Per-tuple lookup for the GWAS variant (2-tuple instead of 4-tuple). -# The generic accepts the full set of selectors; context/trait args are -# ignored for GwasFineMappingResult. +# Per-tuple lookup keyed by (study, method, region_id). The generic +# accepts the full set of selectors; context/trait args are ignored for +# GwasFineMappingResult. `region` (passed via `...`) is the per-block +# disambiguator for multi-block genome-wide collections. #' @rdname getFineMappingResult #' @export setMethod("getFineMappingResult", "GwasFineMappingResult", @@ -136,17 +156,18 @@ setMethod("getFineMappingResult", "GwasFineMappingResult", #' @export setMethod("getPip", "GwasFineMappingResult", function(x, study = NULL, context = NULL, trait = NULL, method = NULL, - returnList = FALSE, ...) { - entry <- getFineMappingResult(x, study = study, method = method) - getPip(entry) + region = NULL, returnList = FALSE, ...) { + idx <- .tupleSelectRowGwasFmr(x, study, method, region) + getPip(x$entry[[idx]]) }) #' @rdname getCs #' @export setMethod("getCs", "GwasFineMappingResult", - function(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) { - entry <- getFineMappingResult(x, study = study, method = method) - getCs(entry) + function(x, study = NULL, context = NULL, trait = NULL, method = NULL, + region = NULL, ...) { + idx <- .tupleSelectRowGwasFmr(x, study, method, region) + getCs(x$entry[[idx]]) }) #' @rdname getTopLoci @@ -154,34 +175,39 @@ setMethod("getCs", "GwasFineMappingResult", setMethod("getTopLoci", "GwasFineMappingResult", function(x, type = c("data.frame", "GRanges"), signalCutoff = 0.025, - study = NULL, context = NULL, trait = NULL, method = NULL, ...) { - entry <- getFineMappingResult(x, study = study, method = method) - getTopLoci(entry, type = match.arg(type), signalCutoff = signalCutoff) + study = NULL, context = NULL, trait = NULL, method = NULL, + region = NULL, ...) { + idx <- .tupleSelectRowGwasFmr(x, study, method, region) + getTopLoci(x$entry[[idx]], type = match.arg(type), + signalCutoff = signalCutoff) }) #' @rdname getMarginalEffects #' @export setMethod("getMarginalEffects", "GwasFineMappingResult", function(x, maxPval = NULL, - study = NULL, context = NULL, trait = NULL, method = NULL, ...) { - entry <- getFineMappingResult(x, study = study, method = method) - getMarginalEffects(entry, maxPval = maxPval) + study = NULL, context = NULL, trait = NULL, method = NULL, + region = NULL, ...) { + idx <- .tupleSelectRowGwasFmr(x, study, method, region) + getMarginalEffects(x$entry[[idx]], maxPval = maxPval) }) #' @rdname getSusieFit #' @export setMethod("getSusieFit", "GwasFineMappingResult", - function(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) { - entry <- getFineMappingResult(x, study = study, method = method) - getSusieFit(entry) + function(x, study = NULL, context = NULL, trait = NULL, method = NULL, + region = NULL, ...) { + idx <- .tupleSelectRowGwasFmr(x, study, method, region) + getSusieFit(x$entry[[idx]]) }) #' @rdname getVariantIds #' @export setMethod("getVariantIds", "GwasFineMappingResult", - function(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) { - entry <- getFineMappingResult(x, study = study, method = method) - getVariantIds(entry) + function(x, study = NULL, context = NULL, trait = NULL, method = NULL, + region = NULL, ...) { + idx <- .tupleSelectRowGwasFmr(x, study, method, region) + getVariantIds(x$entry[[idx]]) }) diff --git a/R/colocPipeline.R b/R/colocPipeline.R index d0a8736e..8e27de0c 100644 --- a/R/colocPipeline.R +++ b/R/colocPipeline.R @@ -81,9 +81,10 @@ #' @param returnGwasFineMapping Logical. When \code{TRUE}, attach the #' computed \code{GwasFineMappingResult} on the returned data frame #' as attribute \code{"gwasFineMapping"}. Default \code{FALSE}. -#' @param enrichment Optional data.frame of per-(gwasStudy, qtlContext) -#' enrichment factors with columns \code{gwasStudy}, \code{qtlContext}, -#' \code{enrichment}. Output of \code{\link{qtlEnrichmentPipeline}}. +#' @param enrichment Optional data.frame of per-(gwasStudy, qtlStudy, +#' qtlContext) enrichment factors with columns \code{gwasStudy}, +#' \code{qtlStudy}, \code{qtlContext}, \code{enrichment}. Output of +#' \code{\link{qtlEnrichmentPipeline}}. #' When non-\code{NULL}, each pair's \code{p12} prior is scaled to #' \code{min(p12 * (1 + enrichment), p12Max)} (the enrichment-informed #' colocalization variant, "enloc"). Pairs without a matching @@ -132,9 +133,9 @@ colocPipeline <- function(qtlFineMappingResult, if (useEnrichment) { if (!is.data.frame(enrichment)) stop("`enrichment` must be a data.frame with at least gwasStudy, ", - "qtlContext, enrichment columns (output of ", + "qtlStudy, qtlContext, enrichment columns (output of ", "qtlEnrichmentPipeline).") - required <- c("gwasStudy", "qtlContext", "enrichment") + required <- c("gwasStudy", "qtlStudy", "qtlContext", "enrichment") missingCols <- setdiff(required, colnames(enrichment)) if (length(missingCols) > 0L) stop("`enrichment` is missing column(s): ", @@ -231,14 +232,16 @@ colocPipeline <- function(qtlFineMappingResult, qAligned <- aligned$qtl gAligned <- aligned$gwas - # Enrichment-informed p12: per-pair scaling capped at p12Max. - # Baseline p12 used when no enrichment table or no matching row. + # Enrichment-informed p12: per-(gwasStudy, qtlStudy, qtlContext) + # scaling capped at p12Max. Baseline p12 used when no enrichment + # table or no matching row. if (useEnrichment) { - enRow <- .colocLookupEnrichment(enrichment, gInfo$study, qContext) + enRow <- .colocLookupEnrichment(enrichment, gInfo$study, + qStudy, qContext) if (is.na(enRow)) { warning(sprintf( - "colocPipeline: no enrichment entry for (gwasStudy='%s', qtlContext='%s'); using baseline p12.", - gInfo$study, qContext)) + "colocPipeline: no enrichment entry for (gwasStudy='%s', qtlStudy='%s', qtlContext='%s'); using baseline p12.", + gInfo$study, qStudy, qContext)) enRow <- 0 } p12Used <- min(p12 * (1 + enRow), p12Max) @@ -504,13 +507,14 @@ colocPipeline <- function(qtlFineMappingResult, base } -# Look up the enrichment factor for a (gwasStudy, qtlContext) pair in -# the user-supplied enrichment table. Returns NA when the pair is not -# present; the caller falls back to the baseline p12 and emits a -# warning. +# Look up the enrichment factor for a (gwasStudy, qtlStudy, qtlContext) +# triple in the user-supplied enrichment table. Returns NA when the +# triple is not present; the caller falls back to the baseline p12 and +# emits a warning. # @noRd -.colocLookupEnrichment <- function(enrichment, gwasStudy, qtlContext) { +.colocLookupEnrichment <- function(enrichment, gwasStudy, qtlStudy, qtlContext) { idx <- which(as.character(enrichment$gwasStudy) == gwasStudy & + as.character(enrichment$qtlStudy) == qtlStudy & as.character(enrichment$qtlContext) == qtlContext) if (length(idx) == 0L) return(NA_real_) as.numeric(enrichment$enrichment[[idx[[1L]]]]) diff --git a/R/ctwasPipeline.R b/R/ctwasPipeline.R index 75c41198..c2382985 100644 --- a/R/ctwasPipeline.R +++ b/R/ctwasPipeline.R @@ -71,6 +71,11 @@ #' must-keep variants and fill remaining slots by descending PIP #' (when available) or descending \code{|weight|}. Default #' \code{Inf} (no cap). +#' @param fallbackToPrefit Logical (length 1). Forwarded to +#' \code{\link{estCtwasParam}}. When \code{TRUE}, ctwas's accurate-EM +#' NaN failure is recovered by falling back to the prefit estimates +#' (mirrors the legacy ctwas_2 workaround on underpowered data). +#' Default \code{FALSE}. #' @param ... Additional arguments forwarded to #' \code{ctwas::ctwas_sumstats}. #' @return Whatever \code{ctwas::ctwas_sumstats} returns (a list with @@ -95,6 +100,7 @@ ctwasPipeline <- function(gwasSumStats, csMinCor = 0.8, minPipCutoff = 0, maxNumVariants = Inf, + fallbackToPrefit = FALSE, ...) { groupPriorVarStructure <- match.arg(groupPriorVarStructure) inputs <- assembleCtwasInputs( @@ -114,6 +120,7 @@ ctwasPipeline <- function(gwasSumStats, niter = niter, groupPriorVarStructure = groupPriorVarStructure, ncore = ncore, + fallbackToPrefit = fallbackToPrefit, ...) screened <- screenCtwasRegions( est, @@ -304,6 +311,12 @@ assembleCtwasInputs <- function(gwasSumStats, twasWeights, #' \code{ctwas::assemble_region_data} / \code{ctwas::est_param}. #' @param groupPriorVarStructure Pass-through. #' @param ncore Number of cores. +#' @param fallbackToPrefit Logical (length 1). When \code{TRUE} (default +#' \code{FALSE}), if \code{ctwas::est_param}'s accurate EM diverges to +#' NaN and throws \code{"Estimated group_prior(_var)? contains NAs"}, +#' re-run only the prefit step via \code{ctwas:::fit_EM} and return +#' those (typically finite) priors as the param. Mirrors the legacy +#' ctwas_2 workaround on toy data where the accurate EM saturates. #' @param ... Additional arguments forwarded to \code{ctwas::est_param} #' (e.g. \code{min_p_single_effect}, \code{min_group_size}). #' @return The \code{inputs} list augmented with \code{region_data}, @@ -319,6 +332,7 @@ estCtwasParam <- function(inputs, "shared_all", "independent"), ncore = 1L, + fallbackToPrefit = FALSE, ...) { if (!requireNamespace("ctwas", quietly = TRUE)) { stop("Package 'ctwas' is required for estCtwasParam.") @@ -340,12 +354,27 @@ estCtwasParam <- function(inputs, snp_map = inputs$snp_map, thin = thin, ncore = as.integer(ncore)), extra = list(...)) - paramRes <- .ctwasInvoke(ctwas::est_param, list( - region_data = assembled$region_data, - niter_prefit = as.integer(niterPrefit), - niter = as.integer(niter), - group_prior_var_structure = groupPriorVarStructure, - ncore = as.integer(ncore)), extra = list(...)) + paramRes <- tryCatch( + .ctwasInvoke(ctwas::est_param, list( + region_data = assembled$region_data, + niter_prefit = as.integer(niterPrefit), + niter = as.integer(niter), + group_prior_var_structure = groupPriorVarStructure, + ncore = as.integer(ncore)), extra = list(...)), + error = function(e) { + if (fallbackToPrefit && grepl("contains NAs", conditionMessage(e))) { + message("estCtwasParam: accurate EM diverged (", + conditionMessage(e), "); falling back to prefit estimates.") + .ctwasFitPrefitEm(assembled$region_data, + niterPrefit = as.integer(niterPrefit), + groupPriorVarStructure = groupPriorVarStructure, + thin = thin, + ncore = as.integer(ncore), + extra = list(...)) + } else { + stop(e) + } + }) # ctwas::assemble_region_data does not echo z_gene back in its return # list, so propagate the precomputed (or freshly computed) z_gene we # passed into it. This matches ctwas_sumstats's top-level return shape. @@ -478,6 +507,57 @@ finemapCtwasRegions <- function(screenResult, do.call(fn, args) } +# Run ONLY ctwas's prefit EM step against `region_data` and return a +# param list shaped like ctwas::est_param normally produces. Used as +# the fallback path when est_param's accurate EM diverges to NaN on +# toy / underpowered data (matches the legacy ctwas_2 workaround). +# Calls ctwas's internal `fit_EM` (via ::: getFromNamespace) with +# niter = niter_prefit, then applies the same thin-adjustment to the +# SNP group_prior that est_param applies. p_single_effect is left as +# NA since the accurate EM never ran. +# @noRd +.ctwasFitPrefitEm <- function(region_data, niterPrefit, + groupPriorVarStructure, thin, ncore, + extra = list()) { + fitEm <- getFromNamespace("fit_EM", "ctwas") + fitArgs <- list( + region_data = region_data, + niter = as.integer(niterPrefit), + group_prior_var_structure = groupPriorVarStructure, + ncore = as.integer(ncore)) + if (length(extra) > 0L) { + formalsFn <- tryCatch(names(formals(fitEm)), error = function(e) NULL) + if (!is.null(formalsFn)) { + explicitFormals <- setdiff(formalsFn, "...") + extra <- extra[setdiff(names(extra), names(fitArgs))] + extra <- extra[intersect(names(extra), explicitFormals)] + } + fitArgs <- c(fitArgs, extra) + } + prefit <- do.call(fitEm, fitArgs) + groupPrior <- prefit$group_prior + groupSize <- prefit$group_size + if (thin != 1) { + if ("SNP" %in% names(groupPrior)) + groupPrior["SNP"] <- groupPrior["SNP"] * thin + if ("SNP" %in% names(groupSize)) + groupSize["SNP"] <- groupSize["SNP"] / thin + } + if (length(groupPrior) > 0L) + groupSize <- groupSize[names(groupPrior)] + list( + group_prior = groupPrior, + group_prior_var = prefit$group_prior_var, + group_prior_iters = prefit$group_prior_iters, + group_prior_var_iters = prefit$group_prior_var_iters, + group_prior_var_structure = groupPriorVarStructure, + group_size = groupSize, + p_single_effect = data.frame( + region_id = names(region_data), + p_single_effect = NA_real_, + stringsAsFactors = FALSE)) +} + # ============================================================================= # Internal helpers # ============================================================================= diff --git a/R/fineMappingPipeline.R b/R/fineMappingPipeline.R index 38d52a5d..bccaedd7 100644 --- a/R/fineMappingPipeline.R +++ b/R/fineMappingPipeline.R @@ -362,13 +362,17 @@ setGeneric("fineMappingPipeline", fineMappingResult$entry[[idx[[1L]]]] } -# GwasFineMappingResult cache lookup using the (study, method) 2-tuple. +# GwasFineMappingResult cache lookup using the (study, method, +# region_id) 3-tuple. Multi-block FMRs can carry one entry per +# (study, method, region_id) triple, so the cache key must include +# region_id to correctly retrieve the cached fit for a specific block. # @noRd -.fmCacheLookupGwas <- function(fineMappingResult, study, method) { +.fmCacheLookupGwas <- function(fineMappingResult, study, method, region_id) { if (is.null(fineMappingResult)) return(NULL) if (!is(fineMappingResult, "GwasFineMappingResult")) return(NULL) idx <- .matchTupleRows(fineMappingResult, - list(study = study, method = method)) + list(study = study, method = method, + region_id = region_id)) if (length(idx) == 0L) return(NULL) fineMappingResult$entry[[idx[[1L]]]] } @@ -401,18 +405,24 @@ setGeneric("fineMappingPipeline", ldSketch = ldSketch) } -# Build a GwasFineMappingResult collection from per-(study, method) vectors. +# Build a GwasFineMappingResult collection from per-row vectors. When +# `region_ids` is NULL, falls through to the constructor's synthetic +# defaults (region_1, region_2, ...). For callers that have meaningful +# block labels (e.g. derived from a GwasSumStats entry's GRanges), +# pass them explicitly so downstream consumers can join on region. # @noRd -.fmBuildGwasResult <- function(studies, methods, entries, ldSketch = NULL) { +.fmBuildGwasResult <- function(studies, methods, entries, + region_ids = NULL, ldSketch = NULL) { if (length(entries) == 0L) { - stop("fineMappingPipeline: no (study, method) tuples produced a ", + stop("fineMappingPipeline: no (study, method, region_id) tuples produced a ", "fine-mapping result.") } GwasFineMappingResult( - study = studies, - method = methods, - entry = entries, - ldSketch = ldSketch) + study = studies, + method = methods, + region_id = region_ids, + entry = entries, + ldSketch = ldSketch) } # Combine an optional joint column across two collections. Returns NULL @@ -457,10 +467,11 @@ setGeneric("fineMappingPipeline", ldSketch = ldSketch) } else { GwasFineMappingResult( - study = c(as.character(a$study), as.character(b$study)), - method = c(as.character(a$method), as.character(b$method)), - entry = c(as.list(a$entry), as.list(b$entry)), - ldSketch = ldSketch) + study = c(as.character(a$study), as.character(b$study)), + method = c(as.character(a$method), as.character(b$method)), + region_id = c(as.character(a$region_id), as.character(b$region_id)), + entry = c(as.list(a$entry), as.list(b$entry)), + ldSketch = ldSketch) } } @@ -1455,10 +1466,12 @@ setMethod("fineMappingPipeline", "GwasSumStats", rowStudy <- character(0) rowMethod <- character(0) + rowRegion <- character(0) rowEntries <- list() - pushRow <- function(st, mt, ent) { + pushRow <- function(st, mt, rg, ent) { rowStudy <<- c(rowStudy, st) rowMethod <<- c(rowMethod, mt) + rowRegion <<- c(rowRegion, rg) rowEntries[[length(rowEntries) + 1L]] <<- ent } @@ -1470,18 +1483,26 @@ setMethod("fineMappingPipeline", "GwasSumStats", variantIds <- zn$variantIds z <- zn$z n <- zn$n + # Derive a region_id from the entry's GRanges so multi-block + # genome-wide GWAS sweeps can carry one row per block without + # tripping (study, method, region_id) uniqueness. Format: + # "{seqname}_{minPos}_{maxPos}" (e.g. "chr22_10516173_17379581"). + region_id <- sprintf("%s_%d_%d", + as.character(GenomicRanges::seqnames(gr))[[1L]], + min(GenomicRanges::start(gr)), + max(GenomicRanges::start(gr))) # The .fmCacheLookup helper takes a 4-tuple key for the QTL cache # shape. For GWAS resume we look up using the GwasFineMappingResult - # 2-tuple shape. + # 3-tuple shape (study, method, region_id). toRun <- character(0) for (tk in tokens) { cached <- if (!is.null(fineMappingResult) && is(fineMappingResult, "GwasFineMappingResult")) { - .fmCacheLookupGwas(fineMappingResult, st, tk) + .fmCacheLookupGwas(fineMappingResult, st, tk, region_id) } else NULL if (!is.null(cached)) { - pushRow(st, tk, cached) + pushRow(st, tk, region_id, cached) } else { toRun <- c(toRun, tk) } @@ -1494,7 +1515,8 @@ setMethod("fineMappingPipeline", "GwasSumStats", infFit <- NULL if (chainLocal$runInf) { if (verbose >= 1) - message(sprintf("Fitting susieInf (RSS) for GWAS (study='%s') ...", st)) + message(sprintf("Fitting susieInf (RSS) for GWAS (study='%s', region='%s') ...", + st, region_id)) infFit <- .fmFitSusieRss(z, ldMat, n, "susieInf", coverage = coverage) } @@ -1507,8 +1529,8 @@ setMethod("fineMappingPipeline", "GwasSumStats", (tk == "susieAsh" && chainLocal$chainAsh)) infFit else NULL if (verbose >= 1) - message(sprintf("Fitting %s (RSS) for GWAS (study='%s') ...", - tk, st)) + message(sprintf("Fitting %s (RSS) for GWAS (study='%s', region='%s') ...", + tk, st, region_id)) fit <- .fmFitSusieRss(z, ldMat, n, tk, chainFromInf = chainFrom, coverage = coverage) @@ -1521,12 +1543,13 @@ setMethod("fineMappingPipeline", "GwasSumStats", signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, csInput = "Xcorr") - pushRow(st, tk, ent) + pushRow(st, tk, region_id, ent) } } .fmBuildGwasResult(rowStudy, rowMethod, rowEntries, - ldSketch = ldSketch) + region_ids = rowRegion, + ldSketch = ldSketch) }) diff --git a/R/qtlEnrichmentPipeline.R b/R/qtlEnrichmentPipeline.R index cfbc4b43..928487b7 100644 --- a/R/qtlEnrichmentPipeline.R +++ b/R/qtlEnrichmentPipeline.R @@ -43,11 +43,12 @@ #' \code{qtlEnrichment}. Default \code{1}. #' @param ... Additional arguments forwarded to #' \code{\link{qtlEnrichment}}. -#' @return A data frame with one row per (gwasStudy, qtlContext) pair -#' and columns \code{gwasStudy}, \code{qtlContext}, -#' \code{enrichment}, \code{enrichmentSe}, \code{enrichmentLogOdds}, -#' plus any extras the underlying estimator emits. Suitable as the -#' \code{enrichment} argument to \code{\link{colocPipeline}}. +#' @return A data frame with one row per (gwasStudy, qtlStudy, +#' qtlContext) triple and columns \code{gwasStudy}, \code{qtlStudy}, +#' \code{qtlContext}, \code{enrichment}, \code{enrichmentSe}, +#' \code{enrichmentLogOdds}, plus any extras the underlying estimator +#' emits. Suitable as the \code{enrichment} argument to +#' \code{\link{colocPipeline}} (which joins on the same triple). #' @export qtlEnrichmentPipeline <- function(gwasFineMappingResult, qtlFineMappingResult, @@ -71,11 +72,19 @@ qtlEnrichmentPipeline <- function(gwasFineMappingResult, # Per-study genome-wide GWAS PIP vector (named by variant id). gwasStudies <- unique(as.character(gwasFineMappingResult$study)) - qtlContexts <- unique(as.character(qtlFineMappingResult$context)) + # Unique QTL (study, context) tuples — using context alone would + # silently merge two studies that share a context label (e.g. MSBB + # and ROSMAP both labelling something "DLPFC_eQTL"), giving wrong + # enrichment estimates. The S4 schema carries study explicitly per + # entry, so we iterate over the joint key. + qtlTuples <- unique(data.frame( + qtlStudy = as.character(qtlFineMappingResult$study), + qtlContext = as.character(qtlFineMappingResult$context), + stringsAsFactors = FALSE)) - if (length(gwasStudies) == 0L || length(qtlContexts) == 0L) - stop("qtlEnrichmentPipeline: no (gwasStudy, qtlContext) pairs ", - "to compute (one of the inputs has zero rows).") + if (length(gwasStudies) == 0L || nrow(qtlTuples) == 0L) + stop("qtlEnrichmentPipeline: no (gwasStudy, qtlStudy, qtlContext) ", + "triples to compute (one of the inputs has zero rows).") results <- list() for (gStudy in gwasStudies) { @@ -86,12 +95,15 @@ qtlEnrichmentPipeline <- function(gwasFineMappingResult, gStudy)) next } - for (qContext in qtlContexts) { - qtlRegions <- .enrBuildQtlRegionsList(qtlFineMappingResult, qContext) + for (k in seq_len(nrow(qtlTuples))) { + qStudy <- qtlTuples$qtlStudy[[k]] + qContext <- qtlTuples$qtlContext[[k]] + qtlRegions <- .enrBuildQtlRegionsList(qtlFineMappingResult, + qStudy, qContext) if (length(qtlRegions) == 0L) { warning(sprintf( - "qtlEnrichmentPipeline: no usable QTL regions for qtlContext='%s'; skipping.", - qContext)) + "qtlEnrichmentPipeline: no usable QTL regions for (qtlStudy='%s', qtlContext='%s'); skipping.", + qStudy, qContext)) next } enr <- tryCatch( @@ -106,13 +118,14 @@ qtlEnrichmentPipeline <- function(gwasFineMappingResult, ...), error = function(e) { warning(sprintf( - "qtlEnrichmentPipeline: qtlEnrichment failed for (gwasStudy='%s', qtlContext='%s'): %s", - gStudy, qContext, conditionMessage(e))) + "qtlEnrichmentPipeline: qtlEnrichment failed for (gwasStudy='%s', qtlStudy='%s', qtlContext='%s'): %s", + gStudy, qStudy, qContext, conditionMessage(e))) NULL }) if (is.null(enr)) next row <- .enrFlattenEnrichment(enr) row$gwasStudy <- gStudy + row$qtlStudy <- qStudy row$qtlContext <- qContext results[[length(results) + 1L]] <- row } @@ -121,6 +134,7 @@ qtlEnrichmentPipeline <- function(gwasFineMappingResult, if (length(results) == 0L) { return(data.frame( gwasStudy = character(0), + qtlStudy = character(0), qtlContext = character(0), enrichment = numeric(0), enrichmentSe = numeric(0), @@ -130,7 +144,7 @@ qtlEnrichmentPipeline <- function(gwasFineMappingResult, out <- do.call(rbind, lapply(results, as.data.frame, stringsAsFactors = FALSE)) rownames(out) <- NULL - idCols <- c("gwasStudy", "qtlContext") + idCols <- c("gwasStudy", "qtlStudy", "qtlContext") other <- setdiff(colnames(out), idCols) out[, c(idCols, other), drop = FALSE] } @@ -180,12 +194,15 @@ qtlEnrichmentPipeline <- function(gwasFineMappingResult, all } -# Build the per-(qtl context) list of region fits in the shape that -# qtlEnrichment expects: list(d) where each d carries -# alpha, pip, prior_variance (V). +# Build the per-(qtlStudy, qtlContext) list of region fits in the shape +# that qtlEnrichment expects: list(d) where each d carries +# alpha, pip, prior_variance (V). Filters on BOTH study and context so +# entries from different studies that happen to share a context label +# are not pooled into one enrichment estimate. # @noRd -.enrBuildQtlRegionsList <- function(qtlFmr, qContext) { - idx <- which(as.character(qtlFmr$context) == qContext) +.enrBuildQtlRegionsList <- function(qtlFmr, qStudy, qContext) { + idx <- which(as.character(qtlFmr$study) == qStudy & + as.character(qtlFmr$context) == qContext) if (length(idx) == 0L) return(list()) out <- list() for (i in idx) { diff --git a/R/tupleSelectors.R b/R/tupleSelectors.R index 79026604..73186b46 100644 --- a/R/tupleSelectors.R +++ b/R/tupleSelectors.R @@ -51,9 +51,11 @@ idx[[1L]] } -# Internal: resolve a (study, method) tuple to a single row index of a -# GwasFineMappingResult collection. -.tupleSelectRowGwasFmr <- function(x, study, method) { +# Internal: resolve a (study, method, region_id) tuple to a single row +# index of a GwasFineMappingResult collection. `region` may be NULL when +# the (study, method) pair maps to a single row; otherwise it disambiguates +# among per-block rows of a genome-wide collection. +.tupleSelectRowGwasFmr <- function(x, study, method, region = NULL) { if (nrow(x) == 0L) stop("GwasFineMappingResult has no rows.") if (missing(study) || is.null(study) || missing(method) || is.null(method)) { @@ -63,8 +65,24 @@ } if (length(study) != 1L || length(method) != 1L) stop("`study` and `method` must each be length 1.") - idx <- .matchTupleRows(x, list(study = study, method = method)) - if (length(idx) == 0L) - stop(sprintf("No entry for (study='%s', method='%s').", study, method)) + if (!is.null(region) && length(region) != 1L) + stop("`region` must be length 1 when supplied.") + keys <- list(study = study, method = method) + if (!is.null(region)) keys$region_id <- region + idx <- .matchTupleRows(x, keys) + if (length(idx) == 0L) { + stop(sprintf( + "No entry for (study='%s', method='%s'%s).", + study, method, + if (is.null(region)) "" else sprintf(", region='%s'", region))) + } + if (length(idx) > 1L) { + stop(sprintf( + "GwasFineMappingResult has %d rows matching (study='%s', method='%s'); ", + length(idx), study, method), + "pass `region` to disambiguate (available: ", + paste(shQuote(as.character(x$region_id[idx])), collapse = ", "), + ").") + } idx[[1L]] } diff --git a/tests/testthat/test_GwasFineMappingResult.R b/tests/testthat/test_GwasFineMappingResult.R index 9137cac6..52c8f86f 100644 --- a/tests/testthat/test_GwasFineMappingResult.R +++ b/tests/testthat/test_GwasFineMappingResult.R @@ -26,19 +26,51 @@ test_that("GwasFineMappingResult: errors on length mismatch", { }) -test_that("GwasFineMappingResult: rejects duplicate (study, method) tuples", { +test_that("GwasFineMappingResult: same (study, method) across distinct region_ids OK", { + # The default constructor synthesises region_1, region_2, ... so the + # (study, method, region_id) triple is unique even when (study, method) + # repeats. This is the genome-wide-across-blocks shape that + # qtlEnrichmentPipeline + colocPipeline expect. + e1 <- .sc_makeFineMappingEntry(3) + e2 <- .sc_makeFineMappingEntry(3) + res <- GwasFineMappingResult( + study = c("g1", "g1"), + method = c("susie", "susie"), + entry = list(e1, e2)) + expect_s4_class(res, "GwasFineMappingResult") + expect_equal(nrow(res), 2L) + expect_setequal(as.character(res$region_id), c("region_1", "region_2")) +}) + + +test_that("GwasFineMappingResult: rejects duplicate (study, method, region_id) tuples", { e1 <- .sc_makeFineMappingEntry(3) e2 <- .sc_makeFineMappingEntry(3) expect_error( GwasFineMappingResult( - study = c("g1", "g1"), - method = c("susie", "susie"), - entry = list(e1, e2)), + study = c("g1", "g1"), + method = c("susie", "susie"), + region_id = c("chr22_1_100", "chr22_1_100"), + entry = list(e1, e2)), "uniqueness violated" ) }) +test_that("GwasFineMappingResult: errors when region_id length mismatches", { + e1 <- .sc_makeFineMappingEntry(3) + e2 <- .sc_makeFineMappingEntry(3) + expect_error( + GwasFineMappingResult( + study = c("g1", "g1"), + method = c("susie", "susie"), + region_id = "only_one", + entry = list(e1, e2)), + "same length" + ) +}) + + test_that("GwasFineMappingResult: show prints summary", { e <- .sc_makeFineMappingEntry(3) res <- GwasFineMappingResult(study = "g1", method = "susie", diff --git a/tests/testthat/test_colocPipeline.R b/tests/testthat/test_colocPipeline.R index 63fd943e..e57ce2c3 100644 --- a/tests/testthat/test_colocPipeline.R +++ b/tests/testthat/test_colocPipeline.R @@ -339,18 +339,24 @@ context("encoloc") # .colocLookupEnrichment (formerly .enlocLookupEnrichment, now shared) # =========================================================================== -test_that(".colocLookupEnrichment: returns the value for a (gwasStudy, qtlContext) hit", { - enr <- data.frame(gwasStudy = c("G1", "G2"), +test_that(".colocLookupEnrichment: returns the value for a (gwasStudy, qtlStudy, qtlContext) hit", { + enr <- data.frame(gwasStudy = c("G1", "G2"), + qtlStudy = c("Q1", "Q1"), qtlContext = c("c1", "c1"), enrichment = c(2.0, 3.5), stringsAsFactors = FALSE) - expect_equal(pecotmr:::.colocLookupEnrichment(enr, "G2", "c1"), 3.5) + expect_equal(pecotmr:::.colocLookupEnrichment(enr, "G2", "Q1", "c1"), 3.5) }) test_that(".colocLookupEnrichment: returns NA when no row matches", { - enr <- data.frame(gwasStudy = "G1", qtlContext = "c1", enrichment = 2.0, + enr <- data.frame(gwasStudy = "G1", + qtlStudy = "Q1", + qtlContext = "c1", + enrichment = 2.0, stringsAsFactors = FALSE) - expect_true(is.na(pecotmr:::.colocLookupEnrichment(enr, "ghost", "c1"))) + expect_true(is.na(pecotmr:::.colocLookupEnrichment(enr, "ghost", "Q1", "c1"))) + # qtlStudy mismatch also a miss. + expect_true(is.na(pecotmr:::.colocLookupEnrichment(enr, "G1", "Qghost", "c1"))) }) # =========================================================================== diff --git a/tests/testthat/test_ctwasPipeline.R b/tests/testthat/test_ctwasPipeline.R index 135a93e5..05e618ad 100644 --- a/tests/testthat/test_ctwasPipeline.R +++ b/tests/testthat/test_ctwasPipeline.R @@ -785,6 +785,43 @@ test_that("ctwasPipeline: dispatches assemble → est → screen → finemap and "region_data", "boundary_genes", "screen_res")) }) +test_that("estCtwasParam: fallbackToPrefit recovers from accurate-EM NaN divergence", { + skip_if_not_installed("ctwas") + inp <- .ctp_makeMultiBlockInputs() + # Mock est_param to throw the documented NaN error, and fit_EM to + # produce a stub prefit result. Verify estCtwasParam catches the + # NaN error AND that the returned param is the prefit estimate. + local_mocked_bindings( + assemble_region_data = function(...) list( + region_data = list(block1 = list(stub = TRUE)), + boundary_genes = list(), + z_gene = data.frame(id = "t1", z = 1.0)), + compute_gene_z = function(...) data.frame(id = "t1", z = 1.0), + est_param = function(...) stop("Estimated group_prior_var contains NAs!"), + # ctwas:::fit_EM is internal — mock via the same `ctwas` namespace. + fit_EM = function(region_data, ...) list( + group_prior = c(g = 0.05, SNP = 1e-4), + group_prior_var = c(g = 4.0, SNP = 5.0), + group_size = c(g = 1, SNP = 100)), + .package = "ctwas") + local_mocked_bindings(extractBlockGenotypes = .ctp_mockExtractor(), + .package = "pecotmr") + # Without fallback: the NaN error propagates. + expect_error( + estCtwasParam(assembleCtwasInputs(inp$gwasSumStats, inp$twasWeights), + fallbackToPrefit = FALSE), + "contains NAs") + # With fallback: prefit estimates are returned as the param. + # .ctwasFitPrefitEm thin-scales the SNP group_prior (mirroring ctwas's + # est_param), so the mocked SNP prior 1e-4 emerges as 1e-4 * thin + # (default thin = 0.1) → 1e-5. The group_prior_var is not thinned. + est <- estCtwasParam( + assembleCtwasInputs(inp$gwasSumStats, inp$twasWeights), + fallbackToPrefit = TRUE) + expect_equal(unname(est$param$group_prior), c(0.05, 1e-5)) + expect_equal(unname(est$param$group_prior_var), c(4.0, 5.0)) +}) + test_that("estCtwasParam / screenCtwasRegions / finemapCtwasRegions can be called independently", { skip_if_not_installed("ctwas") inp <- .ctp_makeMultiBlockInputs() diff --git a/tests/testthat/test_deprecated.R b/tests/testthat/test_deprecated.R index b0c6f0dd..ea41ca69 100644 --- a/tests/testthat/test_deprecated.R +++ b/tests/testthat/test_deprecated.R @@ -503,7 +503,7 @@ test_that("enlocPipeline: rejects non-QtlFineMappingResult qtlFmr", { expect_error( suppressWarnings(enlocPipeline(qtlFineMappingResult = "no", gwasInput = .ep_makeGwasFmr(), - enrichment = data.frame(gwasStudy = "G1", qtlContext = "c1", + enrichment = data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 2.0, stringsAsFactors = FALSE))), "must be a QtlFineMappingResult" @@ -515,7 +515,7 @@ test_that("enlocPipeline: rejects gwasInput that is neither GwasSumStats nor Gwa expect_error( suppressWarnings(enlocPipeline(qtlFineMappingResult = .ep_makeQtlFmr(), gwasInput = 42L, - enrichment = data.frame(gwasStudy = "G1", qtlContext = "c1", + enrichment = data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 2.0, stringsAsFactors = FALSE))), "must be a GwasSumStats or a GwasFineMappingResult" @@ -547,7 +547,7 @@ test_that("enlocPipeline: un-QCd GwasSumStats input is rejected", { expect_error( suppressWarnings(enlocPipeline(qtlFineMappingResult = .ep_makeQtlFmr(), gwasInput = .ep_makeGwasSumstats(qc = FALSE), - enrichment = data.frame(gwasStudy = "G1", qtlContext = "c1", + enrichment = data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 2.0, stringsAsFactors = FALSE))), "has no QC record" @@ -560,7 +560,7 @@ test_that("enlocPipeline: un-QCd GwasSumStats input is rejected", { test_that("enlocPipeline: pair loop produces one row per (QTL tuple, GWAS tuple) with adjusted p12", { skip_on_covr() - enr <- data.frame(gwasStudy = "G1", qtlContext = "c1", enrichment = 2.0, + enr <- data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 2.0, stringsAsFactors = FALSE) local_mocked_bindings(coloc.bf_bf = .ep_mockColocBfBf(), .package = "coloc") out <- suppressWarnings( @@ -580,7 +580,7 @@ test_that("enlocPipeline: pair loop produces one row per (QTL tuple, GWAS tuple) test_that("enlocPipeline: missing-enrichment pair falls back to baseline p12 with a warning", { skip_on_covr() # An enrichment frame that has no row for (G1, c1). - enr <- data.frame(gwasStudy = "G_other", qtlContext = "c_other", + enr <- data.frame(gwasStudy = "G_other", qtlStudy = "Q_other", qtlContext = "c_other", enrichment = 10.0, stringsAsFactors = FALSE) local_mocked_bindings(coloc.bf_bf = .ep_mockColocBfBf(), .package = "coloc") expect_warning( @@ -597,7 +597,7 @@ test_that("enlocPipeline: missing-enrichment pair falls back to baseline p12 wit test_that("enlocPipeline: p12Max caps the adjusted prior", { skip_on_covr() - enr <- data.frame(gwasStudy = "G1", qtlContext = "c1", enrichment = 1e6, + enr <- data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 1e6, stringsAsFactors = FALSE) local_mocked_bindings(coloc.bf_bf = .ep_mockColocBfBf(), .package = "coloc") out <- suppressWarnings( @@ -611,7 +611,7 @@ test_that("enlocPipeline: p12Max caps the adjusted prior", { test_that("enlocPipeline: coloc.bf_bf failures are caught and warned, pair skipped", { skip_on_covr() - enr <- data.frame(gwasStudy = "G1", qtlContext = "c1", enrichment = 2.0, + enr <- data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 2.0, stringsAsFactors = FALSE) local_mocked_bindings( coloc.bf_bf = function(q, g, ...) stop("boom"), @@ -627,7 +627,7 @@ test_that("enlocPipeline: coloc.bf_bf failures are caught and warned, pair skipp test_that("enlocPipeline: returnGwasFineMapping=TRUE attaches gwasFineMapping attr (non-empty result)", { skip_on_covr() - enr <- data.frame(gwasStudy = "G1", qtlContext = "c1", enrichment = 2.0, + enr <- data.frame(gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 2.0, stringsAsFactors = FALSE) local_mocked_bindings(coloc.bf_bf = .ep_mockColocBfBf(), .package = "coloc") # Use GwasFineMappingResult input: returnGwasFineMapping has no effect @@ -672,7 +672,7 @@ test_that("enlocPipeline: qLbf NULL (QTL entry's LBF rows drop after priorTol) s enlocPipeline(qtlFineMappingResult = qfmr, gwasInput = .ep_makeGwasFmr(), enrichment = data.frame( - gwasStudy = "G1", qtlContext = "c1", + gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 1.0, stringsAsFactors = FALSE))) expect_equal(nrow(out), 0L) }) @@ -695,7 +695,7 @@ test_that("enlocPipeline: aligned NULL (disjoint variant sets) skips that pair", enlocPipeline(qtlFineMappingResult = qfmr, gwasInput = gfmr, enrichment = data.frame( - gwasStudy = "G1", qtlContext = "c1", + gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 1.0, stringsAsFactors = FALSE))) expect_equal(nrow(out), 0L) }) @@ -722,7 +722,7 @@ test_that("enlocPipeline: empty result schema includes enrichment + p12Used", { enlocPipeline(qtlFineMappingResult = .ep_makeQtlFmr(), gwasInput = gfmr, enrichment = data.frame( - gwasStudy = "G1", qtlContext = "c1", + gwasStudy = "G1", qtlStudy = "Q1", qtlContext = "c1", enrichment = 1.0, stringsAsFactors = FALSE))) expect_equal(nrow(out), 0L) expect_true(all(c("enrichment", "p12Used") %in% colnames(out))) diff --git a/tests/testthat/test_fineMappingPipeline.R b/tests/testthat/test_fineMappingPipeline.R index d85b1ade..454865e5 100644 --- a/tests/testthat/test_fineMappingPipeline.R +++ b/tests/testthat/test_fineMappingPipeline.R @@ -251,18 +251,23 @@ test_that(".fmCacheLookup: returns matching entry by 4-tuple", { expect_null(pecotmr:::.fmCacheLookup(fmr, "ghost", "c1", "t1", "susie")) }) -test_that(".fmCacheLookupGwas: returns matching entry by (study, method)", { +test_that(".fmCacheLookupGwas: returns matching entry by (study, method, region_id)", { e <- FineMappingEntry( variantIds = "v1", susieFit = list(token = "susie"), topLoci = data.frame(variant_id = "v1", pip = 0.5, stringsAsFactors = FALSE)) + # GwasFineMappingResult assigns the synthetic region_id "region_1" + # when none is supplied; the lookup must include it in the 3-tuple + # key (multi-block FMRs disambiguate per-block fits by region_id). fmr <- GwasFineMappingResult(study = "g1", method = "susie", entry = list(e)) expect_identical( - pecotmr:::.fmCacheLookupGwas(fmr, "g1", "susie"), + pecotmr:::.fmCacheLookupGwas(fmr, "g1", "susie", "region_1"), e) - expect_null(pecotmr:::.fmCacheLookupGwas(fmr, "ghost", "susie")) + expect_null(pecotmr:::.fmCacheLookupGwas(fmr, "ghost", "susie", "region_1")) + # Wrong region_id is also a miss. + expect_null(pecotmr:::.fmCacheLookupGwas(fmr, "g1", "susie", "other_region")) }) test_that(".fmCacheLookup: non-QtlFineMappingResult input returns NULL", { @@ -285,7 +290,7 @@ test_that(".fmCacheLookupGwas: non-GwasFineMappingResult input returns NULL", { qtlFmr <- QtlFineMappingResult( study = "s1", context = "c1", trait = "t1", method = "susie", entry = list(e)) - expect_null(pecotmr:::.fmCacheLookupGwas(qtlFmr, "s1", "susie")) + expect_null(pecotmr:::.fmCacheLookupGwas(qtlFmr, "s1", "susie", "region_1")) }) # =========================================================================== @@ -303,7 +308,7 @@ test_that(".fmBuildQtlResult: empty entries errors", { test_that(".fmBuildGwasResult: empty entries errors", { expect_error( pecotmr:::.fmBuildGwasResult(character(0), character(0), list()), - "no \\(study, method\\) tuples" + "no \\(study, method, region_id\\) tuples" ) }) @@ -1013,8 +1018,13 @@ test_that("fineMappingPipeline(GwasSumStats): cache hit short-circuits the RSS f topLoci = data.frame(variant_id = paste0("v", 1:5), pip = seq(0.9, 0.1, length.out = 5), stringsAsFactors = FALSE)) + # The GwasSumStats branch keys its cache by (study, method, region_id) + # where region_id is "{seqname}_{minPos}_{maxPos}" derived from the + # entry's GRanges. .fmp_makeSumstatsGr() yields chr1 positions 100..500, + # so the cache row must use region_id = "chr1_100_500" to hit. cache <- GwasFineMappingResult( study = "G1", method = "susie", + region_id = "chr1_100_500", entry = list(cachedEntry), ldSketch = .fmp_makeHandle()) rss_calls <- 0 diff --git a/tests/testthat/test_qtlEnrichmentPipeline.R b/tests/testthat/test_qtlEnrichmentPipeline.R index c8610f55..94885a15 100644 --- a/tests/testthat/test_qtlEnrichmentPipeline.R +++ b/tests/testthat/test_qtlEnrichmentPipeline.R @@ -136,7 +136,7 @@ test_that("qtlEnrichmentPipeline: ldSketch mismatch errors", { # Per-study / per-context iteration via mocked qtlEnrichment # =========================================================================== -test_that("qtlEnrichmentPipeline: returns one row per (gwasStudy, qtlContext) pair", { +test_that("qtlEnrichmentPipeline: returns one row per (gwasStudy, qtlStudy, qtlContext) triple", { gfmr <- .qep_makeGwasFmr() qfmr <- .qep_makeQtlFmr(contexts = c("c1", "c2")) local_mocked_bindings(qtlEnrichment = .qep_mockEnrichment(2.0), @@ -144,12 +144,44 @@ test_that("qtlEnrichmentPipeline: returns one row per (gwasStudy, qtlContext) pa out <- qtlEnrichmentPipeline(gwasFineMappingResult = gfmr, qtlFineMappingResult = qfmr) expect_s3_class(out, "data.frame") - expect_equal(nrow(out), 2L) # 1 GWAS study * 2 contexts - expect_setequal(out$gwasStudy, "G1") + expect_equal(nrow(out), 2L) # 1 GWAS study * 1 QTL study * 2 contexts + expect_setequal(out$gwasStudy, "G1") + expect_setequal(out$qtlStudy, "Q1") expect_setequal(out$qtlContext, c("c1", "c2")) expect_equal(out$enrichment, c(2.0, 2.0)) }) +test_that("qtlEnrichmentPipeline: distinguishes two QTL studies that share a context label", { + # Build a QtlFineMappingResult with two studies (Q1, Q2) both + # tagging the same context "shared_ctx". Per-study filtering must + # keep them separate (a context-only filter would merge them and + # produce a single enrichment row instead of two). + e1 <- .qep_makeFmEntry(variant_ids = paste0("v", 1:5)) + e2 <- .qep_makeFmEntry(variant_ids = paste0("v", 1:5)) + qfmr <- QtlFineMappingResult( + study = c("Q1", "Q2"), + context = c("shared_ctx", "shared_ctx"), + trait = c("t1", "t1"), + method = c("susie", "susie"), + entry = list(e1, e2), + ldSketch = .qep_makeHandle()) + gfmr <- .qep_makeGwasFmr() + capturedRegions <- list() + local_mocked_bindings( + qtlEnrichment = function(gwasPip, susieQtlRegions, ...) { + capturedRegions[[length(capturedRegions) + 1L]] <<- susieQtlRegions + list(enrichment = 2.0, enrichmentSe = 0.1, enrichmentLogOdds = log(2)) + }, + .package = "pecotmr") + out <- qtlEnrichmentPipeline(gwasFineMappingResult = gfmr, + qtlFineMappingResult = qfmr) + expect_equal(nrow(out), 2L) + expect_setequal(out$qtlStudy, c("Q1", "Q2")) + # Each call to qtlEnrichment sees exactly one region (the per-study one), + # not both regions merged together. + expect_true(all(lengths(capturedRegions) == 1L)) +}) + test_that("qtlEnrichmentPipeline: qtlEnrichment failure produces a warning + skip", { gfmr <- .qep_makeGwasFmr() qfmr <- .qep_makeQtlFmr() @@ -185,7 +217,7 @@ test_that("qtlEnrichmentPipeline: empty input collections yield the empty schema expect_s3_class(out, "data.frame") expect_equal(nrow(out), 0L) expect_setequal(colnames(out), - c("gwasStudy", "qtlContext", "enrichment", + c("gwasStudy", "qtlStudy", "qtlContext", "enrichment", "enrichmentSe", "enrichmentLogOdds")) }) @@ -201,15 +233,16 @@ test_that(".enrBuildGwasPipVector: extracts pip per study", { }) test_that(".enrBuildGwasPipVector: deduplicates identical PIPs across blocks", { - # Two rows under the same study but different methods, sharing v1 with - # an identical PIP value. The (study, method) validity constraint rules - # out same-method repeats, so we use susie + susieInf for the second. + # Two rows under the same study, same method, different LD blocks + # (distinct region_ids auto-supplied by the constructor) — the + # genome-wide multi-block shape. Both rows share v1 with the same + # PIP, so dedup must collapse it. e1 <- .qep_makeFmEntry(variant_ids = c("v1", "v2"), pip = c(0.5, 0.2)) e2 <- .qep_makeFmEntry(variant_ids = c("v1", "v3"), pip = c(0.5, 0.4)) g <- GwasFineMappingResult( - study = c("G1", "G1"), method = c("susie", "susieInf"), + study = c("G1", "G1"), method = c("susie", "susie"), entry = list(e1, e2), ldSketch = .qep_makeHandle()) out <- pecotmr:::.enrBuildGwasPipVector(g, "G1") @@ -217,12 +250,14 @@ test_that(".enrBuildGwasPipVector: deduplicates identical PIPs across blocks", { }) test_that(".enrBuildGwasPipVector: conflicting PIPs across blocks errors", { + # Same variant 'v1' appears in two LD blocks with different PIPs — + # .enrBuildGwasPipVector must refuse to merge them silently. e1 <- .qep_makeFmEntry(variant_ids = c("v1"), pip = 0.5, alpha = matrix(0.5, 1, 1)) e2 <- .qep_makeFmEntry(variant_ids = c("v1"), pip = 0.8, alpha = matrix(0.8, 1, 1)) g <- GwasFineMappingResult( - study = c("G1", "G1"), method = c("susie", "susieInf"), + study = c("G1", "G1"), method = c("susie", "susie"), entry = list(e1, e2), ldSketch = .qep_makeHandle()) expect_error( @@ -231,14 +266,22 @@ test_that(".enrBuildGwasPipVector: conflicting PIPs across blocks errors", { ) }) -test_that(".enrBuildQtlRegionsList: returns per-entry fit shapes", { +test_that(".enrBuildQtlRegionsList: returns per-entry fit shapes for a (study, context) hit", { qfmr <- .qep_makeQtlFmr(contexts = c("c1", "c2")) - out <- pecotmr:::.enrBuildQtlRegionsList(qfmr, "c1") + out <- pecotmr:::.enrBuildQtlRegionsList(qfmr, "Q1", "c1") expect_equal(length(out), 1L) expect_true(!is.null(out[[1L]]$alpha)) expect_true(!is.null(out[[1L]]$pip)) }) +test_that(".enrBuildQtlRegionsList: returns empty list when the (study, context) tuple is absent", { + qfmr <- .qep_makeQtlFmr(contexts = c("c1", "c2")) + # Correct context but wrong study -> no hit, even though context exists. + expect_equal(length(pecotmr:::.enrBuildQtlRegionsList(qfmr, "Q_ghost", "c1")), 0L) + # Correct study but wrong context. + expect_equal(length(pecotmr:::.enrBuildQtlRegionsList(qfmr, "Q1", "c_ghost")), 0L) +}) + # =========================================================================== # qtlEnrichment() — kernel wrapper + real C++ integration # These tests deliberately do NOT mock qtlEnrichmentRcpp so the C++ From 4494f2fca94573893be850089c9e03dcf34b0458 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Mon, 22 Jun 2026 16:50:03 -0700 Subject: [PATCH 3/4] fix vignettes --- vignettes/twas-weights.Rmd | 41 ++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/vignettes/twas-weights.Rmd b/vignettes/twas-weights.Rmd index 8817fad5..73b39a33 100644 --- a/vignettes/twas-weights.Rmd +++ b/vignettes/twas-weights.Rmd @@ -50,14 +50,22 @@ multi_study_qtl_dataset_example <- fixupExampleGenotypePaths(multi_study_qtl_dat ## Individual-level weights -The simplest call learns a single susie-based weight set from a +The simplest call learns a single regression-based weight set from a `QtlDataset`. Cross-validation is on by default (`cvFolds = 5`). Like `fineMappingPipeline()`, you need to tell it which genotype window to extract — `cisWindow` (per trait) is the most common choice. +Fine-mapping methods (`susie`, `susieInf`, `susieAsh`, `mvsusie`, +`fsusie`) are intentionally NOT runnable here on their own: +`twasWeightsPipeline()` will not re-fit fine-mapping models from +scratch. Run `fineMappingPipeline()` first and pass the result via +`fineMappingResult = …` (see the next section). The quickstart below +uses `lasso`, a TWAS-regression method that has no fine-mapping +dependency. + ```{r individual-quickstart} tw <- twasWeightsPipeline(qtl_dataset_example, - methods = "susie", + methods = "lasso", cisWindow = 1e6) tw ``` @@ -66,7 +74,7 @@ Extract the per-variant weights for one tuple: ```{r individual-weights} entry <- getTwasWeights(tw, study = "study1", context = "brain", - trait = "ENSG_example", method = "susie") + trait = "ENSG_example", method = "lasso") head(getWeights(entry)) ``` @@ -110,7 +118,7 @@ matching fine-mapping row fit the susie weights from scratch as normal. ```{r multistudy} msTw <- twasWeightsPipeline(multi_study_qtl_dataset_example, - methods = "susie", + methods = "lasso", cisWindow = 1e6) table(msTw$study) ``` @@ -118,23 +126,40 @@ table(msTw$study) ## Multiple weight methods at once Pass a character vector to fit several methods on the same fold split -and return them as separate `(method)` rows in the collection: +and return them as separate `(method)` rows in the collection. +Fine-mapping methods (`susie`, `susieInf`, `susieAsh`, `mvsusie`) can +ride along when a `fineMappingResult` is supplied; without one, restrict +the list to the TWAS-regression methods: ```{r multimethod, eval=FALSE} +# TWAS-regression only — no fineMappingResult needed. +twasWeightsPipeline( + qtl_dataset_example, + methods = c("lasso", "enet", "ridge", "mrash")) + +# Add a fine-mapping susie row by providing the FineMappingResult. +fmr <- fineMappingPipeline(qtl_dataset_example, methods = "susie", + cisWindow = 1e6) twasWeightsPipeline( qtl_dataset_example, - methods = c("susie", "lasso", "enet", "ridge", "mrash")) + methods = c("susie", "lasso", "enet", "ridge", "mrash"), + cisWindow = 1e6, + fineMappingResult = fmr) ``` ## RSS / sumstats weights ```{r sumstats} -twSs <- twasWeightsPipeline(qtl_sumstats_example, methods = "susie") +twSs <- twasWeightsPipeline(qtl_sumstats_example, methods = "lasso") twSs ``` The RSS path uses the `ldSketch` `GenotypeHandle` carried on the -`QtlSumStats` collection to compute / fetch LD as needed. +`QtlSumStats` collection to compute / fetch LD as needed. The fine- +mapping methods (`susie`, `susieInf`, `susieAsh`, `mvsusie`) follow the +same `fineMappingResult` gate as the individual-level path — call +`fineMappingPipeline()` on the `QtlSumStats` first, then pass the +result into `twasWeightsPipeline(..., fineMappingResult = ...)`. ## Multivariate (multi-context) weights From e0bf253c2477bb08d59a2bfc06b3f07464aa36dc Mon Sep 17 00:00:00 2001 From: danielnachun Date: Mon, 22 Jun 2026 23:51:20 +0000 Subject: [PATCH 4/4] Update documentation --- NAMESPACE | 8 ++-- man/GwasFineMappingResult.Rd | 18 +++++-- man/assembleCtwasInputs.Rd | 93 ++++++++++++++++++++++++++++++++++++ man/colocPipeline.Rd | 7 +-- man/ctwasPipeline.Rd | 58 ++++++++++++++-------- man/estCtwasParam.Rd | 50 +++++++++++++++++++ man/finemapCtwasRegions.Rd | 33 +++++++++++++ man/getCs.Rd | 10 +++- man/getMarginalEffects.Rd | 1 + man/getPip.Rd | 1 + man/getSusieFit.Rd | 10 +++- man/getTopLoci.Rd | 1 + man/getVariantIds.Rd | 1 + man/qtlEnrichmentPipeline.Rd | 11 +++-- man/screenCtwasRegions.Rd | 33 +++++++++++++ man/summaryStatsQc.Rd | 87 +++++++++++++++++++++------------ 16 files changed, 352 insertions(+), 70 deletions(-) create mode 100644 man/assembleCtwasInputs.Rd create mode 100644 man/estCtwasParam.Rd create mode 100644 man/finemapCtwasRegions.Rd create mode 100644 man/screenCtwasRegions.Rd diff --git a/NAMESPACE b/NAMESPACE index c58aa18a..a69a3932 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(TwasWeightsEntry) export(adjustPips) export(alignVariantNames) export(alleleQc) +export(assembleCtwasInputs) export(autoDecision) export(bLassoWeights) export(batchLoadTwasWeights) @@ -50,11 +51,7 @@ export(computeQtlEnrichment) export(computeSldscAnnotSd) export(computeSldscMRef) export(ctwasBimfileLoader) -export(assembleCtwasInputs) export(ctwasPipeline) -export(estCtwasParam) -export(finemapCtwasRegions) -export(screenCtwasRegions) export(dentist) export(dentistSingleWindow) export(dprAdaptiveGibbsWeights) @@ -65,6 +62,7 @@ export(enetWeights) export(enforceDesignFullRank) export(enlocPipeline) export(ensembleWeights) +export(estCtwasParam) export(estimateH2) export(estimateSparsity) export(extractBlockGenotypes) @@ -76,6 +74,7 @@ export(filterRelatedness) export(filterVariantsByLdReference) export(findOverlappingRegions) export(fineMappingPipeline) +export(finemapCtwasRegions) export(fitFsusie) export(fitMashContrast) export(fitMvsusie) @@ -229,6 +228,7 @@ export(rssAnalysisPipeline) export(sanitizeMashData) export(scadRssWeights) export(scadWeights) +export(screenCtwasRegions) export(sdpr) export(sdprWeights) export(slalom) diff --git a/man/GwasFineMappingResult.Rd b/man/GwasFineMappingResult.Rd index 30260242..989589d3 100644 --- a/man/GwasFineMappingResult.Rd +++ b/man/GwasFineMappingResult.Rd @@ -4,7 +4,7 @@ \alias{GwasFineMappingResult} \title{TWAS Weights Collection} \usage{ -GwasFineMappingResult(study, method, entry, ldSketch = NULL) +GwasFineMappingResult(study, method, entry, region_id = NULL, ldSketch = NULL) } \arguments{ \item{study}{Character vector of study identifiers (per tuple).} @@ -13,6 +13,13 @@ GwasFineMappingResult(study, method, entry, ldSketch = NULL) \item{entry}{List / \code{SimpleList} of \code{FineMappingEntry} objects.} +\item{region_id}{Character vector of LD-block identifiers (per +tuple). When omitted (\code{NULL}), defaults to a per-row synthetic +id (\code{"region_1"}, \code{"region_2"}, ...) so the +(\code{study}, \code{method}, \code{region_id}) triple is unique. +Supplying meaningful labels (e.g. \code{"chr22_10516173_17414263"}) +is preferred for downstream consumers that join on region.} + \item{ldSketch}{An optional \code{GenotypeHandle}.} } \value{ @@ -39,10 +46,11 @@ S4 collection of TWAS weights keyed by the identity tuple across the identity-tuple columns and any present joint columns. Construct a \code{GwasFineMappingResult} DFrame-subclass - collection from per-(study, method) tuples and a list of - \code{FineMappingEntry} payloads. The collection represents one LD - block of GWAS fine-mapping fits; build a separate collection per - block when sweeping the genome. + collection from per-(study, method, region_id) tuples and a list of + \code{FineMappingEntry} payloads. The collection can represent + either a single LD block (one row per (study, method)) or a + genome-wide sweep across blocks (multiple rows per (study, method), + each tagged with its own region_id). } \section{Slots}{ diff --git a/man/assembleCtwasInputs.Rd b/man/assembleCtwasInputs.Rd new file mode 100644 index 00000000..c5bdfc4f --- /dev/null +++ b/man/assembleCtwasInputs.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ctwasPipeline.R +\name{assembleCtwasInputs} +\alias{assembleCtwasInputs} +\title{Assemble cTWAS inputs from S4 GwasSumStats / TwasWeights} +\usage{ +assembleCtwasInputs( + gwasSumStats, + twasWeights, + twasZ = NULL, + fineMappingResult = NULL, + method = NULL, + twasWeightCutoff = 0, + csMinCor = 0.8, + minPipCutoff = 0, + maxNumVariants = Inf +) +} +\arguments{ +\item{gwasSumStats}{NAMED LIST of \code{\link{GwasSumStats}} keyed +by \code{region_id} (at least two entries). Each must have +\code{getQcInfo()} non-empty.} + +\item{twasWeights}{NAMED LIST of \code{\link{TwasWeights}} keyed by +\code{region_id}. Keys must be a SUBSET of \code{gwasSumStats}'s +keys: blocks without any TWAS weights still contribute their +SNP-level signal to ctwas's joint group prior estimate (matches +the legacy whole-chromosome pattern where only a few of many LD +blocks carry gene weights).} + +\item{twasZ}{Optional \code{GRanges} of TWAS Z-scores (output of +\code{\link{causalInferencePipeline}}). When supplied, the +per-(trait, context) Z is used as the \code{z_gene} input to +\code{ctwas_sumstats} so it is not recomputed.} + +\item{fineMappingResult}{Optional \code{QtlFineMappingResult} or +\code{GwasFineMappingResult} carrying the per-variant PIP and +credible-set membership data used by the CS / PIP rescue filters +(\code{csMinCor} and \code{minPipCutoff}). When \code{NULL} +(default) the smart filters are no-ops; only the magnitude filter +(\code{twasWeightCutoff}) and the per-gene cap +(\code{maxNumVariants}, ordered by \code{|weight|}) apply.} + +\item{method}{Optional character (length 1). Picks which TWAS +method's weights to feed into ctwas for each (study, context, +trait) gene. When \code{NULL} (default): use \code{"ensemble"} if +that method is present across the TwasWeights; otherwise use the +sole method when only one is present; otherwise error. Passing +the name explicitly (e.g. \code{"mrash"}) overrides the default +resolution.} + +\item{twasWeightCutoff}{Numeric (length 1). Drop variants with +\code{|weight| < twasWeightCutoff} from each gene's weight matrix +before ctwas sees it. Default \code{0} (no filter).} + +\item{csMinCor}{Numeric (length 1). When \code{fineMappingResult} is +provided, variants belonging to any 95\% credible set with purity +(\code{min_abs_corr}) \code{>= csMinCor} are marked as must-keep +and survive the per-gene cap. Default \code{0.8}. Ignored without +a \code{fineMappingResult}.} + +\item{minPipCutoff}{Numeric (length 1). When +\code{fineMappingResult} is provided, variants with PIP greater +than \code{minPipCutoff} are marked as must-keep and survive the +per-gene cap. Default \code{0} (no PIP rescue). Ignored without a +\code{fineMappingResult}.} + +\item{maxNumVariants}{Numeric (length 1). Cap on per-gene variant +count. When the gene has more variants than this, keep all +must-keep variants and fill remaining slots by descending PIP +(when available) or descending \code{|weight|}. Default +\code{Inf} (no cap).} +} +\value{ +A list with elements \code{z_snp}, \code{z_gene} (NULL when + no \code{twasZ}), \code{weights}, \code{region_info}, + \code{snp_map}, \code{LD_map}, \code{LD_loader_fun}, + \code{snpinfo_loader_fun}, and \code{resolvedMethod}. +} +\description{ +Builds the per-block ctwas-shape input set + (\code{z_snp}, \code{weights}, \code{region_info}, \code{snp_map}, + \code{LD_map}, the LD- and SNP-info loader closures, plus optional + \code{z_gene}) that the downstream ctwas steps consume. + This is step 1 of the three-step \code{\link{ctwasPipeline}} split. +} +\details{ +The returned list is the SHARED STATE threaded through + \code{\link{estCtwasParam}} → \code{\link{screenCtwasRegions}} → + \code{\link{finemapCtwasRegions}}. Callers can short-circuit at any + step (e.g. override the estimated priors before fine-mapping) or + call \code{ctwasPipeline()} for the one-shot path. +} diff --git a/man/colocPipeline.Rd b/man/colocPipeline.Rd index ead0abcc..b1e3aff5 100644 --- a/man/colocPipeline.Rd +++ b/man/colocPipeline.Rd @@ -71,9 +71,10 @@ use. Default \code{1e-9}.} computed \code{GwasFineMappingResult} on the returned data frame as attribute \code{"gwasFineMapping"}. Default \code{FALSE}.} -\item{enrichment}{Optional data.frame of per-(gwasStudy, qtlContext) -enrichment factors with columns \code{gwasStudy}, \code{qtlContext}, -\code{enrichment}. Output of \code{\link{qtlEnrichmentPipeline}}. +\item{enrichment}{Optional data.frame of per-(gwasStudy, qtlStudy, +qtlContext) enrichment factors with columns \code{gwasStudy}, +\code{qtlStudy}, \code{qtlContext}, \code{enrichment}. Output of +\code{\link{qtlEnrichmentPipeline}}. When non-\code{NULL}, each pair's \code{p12} prior is scaled to \code{min(p12 * (1 + enrichment), p12Max)} (the enrichment-informed colocalization variant, "enloc"). Pairs without a matching diff --git a/man/ctwasPipeline.Rd b/man/ctwasPipeline.Rd index d568030e..f6c9f68d 100644 --- a/man/ctwasPipeline.Rd +++ b/man/ctwasPipeline.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/ctwasPipeline.R \name{ctwasPipeline} \alias{ctwasPipeline} -\title{Causal TWAS Pipeline (cTWAS, single LD block)} +\title{Causal TWAS Pipeline (cTWAS, multi LD block)} \usage{ ctwasPipeline( gwasSumStats, twasWeights, twasZ = NULL, fineMappingResult = NULL, - regionId = "block1", + method = NULL, thin = 0.1, niterPrefit = 3L, niter = 30L, @@ -21,15 +21,21 @@ ctwasPipeline( csMinCor = 0.8, minPipCutoff = 0, maxNumVariants = Inf, + fallbackToPrefit = FALSE, ... ) } \arguments{ -\item{gwasSumStats}{A \code{\link{GwasSumStats}} over one LD block. -Must have \code{getQcInfo()} non-empty.} +\item{gwasSumStats}{NAMED LIST of \code{\link{GwasSumStats}} keyed +by \code{region_id} (at least two entries). Each must have +\code{getQcInfo()} non-empty.} -\item{twasWeights}{A \code{\link{TwasWeights}} carrying per-(study, -context, trait, method) weights over the same LD block.} +\item{twasWeights}{NAMED LIST of \code{\link{TwasWeights}} keyed by +\code{region_id}. Keys must be a SUBSET of \code{gwasSumStats}'s +keys: blocks without any TWAS weights still contribute their +SNP-level signal to ctwas's joint group prior estimate (matches +the legacy whole-chromosome pattern where only a few of many LD +blocks carry gene weights).} \item{twasZ}{Optional \code{GRanges} of TWAS Z-scores (output of \code{\link{causalInferencePipeline}}). When supplied, the @@ -44,8 +50,13 @@ credible-set membership data used by the CS / PIP rescue filters (\code{twasWeightCutoff}) and the per-gene cap (\code{maxNumVariants}, ordered by \code{|weight|}) apply.} -\item{regionId}{Optional character (length 1) label for the LD -block. Default \code{"block1"}.} +\item{method}{Optional character (length 1). Picks which TWAS +method's weights to feed into ctwas for each (study, context, +trait) gene. When \code{NULL} (default): use \code{"ensemble"} if +that method is present across the TwasWeights; otherwise use the +sole method when only one is present; otherwise error. Passing +the name explicitly (e.g. \code{"mrash"}) overrides the default +resolution.} \item{thin, niterPrefit, niter, L}{Pass-throughs to \code{ctwas::ctwas_sumstats}.} @@ -77,6 +88,12 @@ must-keep variants and fill remaining slots by descending PIP (when available) or descending \code{|weight|}. Default \code{Inf} (no cap).} +\item{fallbackToPrefit}{Logical (length 1). Forwarded to +\code{\link{estCtwasParam}}. When \code{TRUE}, ctwas's accurate-EM +NaN failure is recovered by falling back to the prefit estimates +(mirrors the legacy ctwas_2 workaround on underpowered data). +Default \code{FALSE}.} + \item{...}{Additional arguments forwarded to \code{ctwas::ctwas_sumstats}.} } @@ -85,9 +102,9 @@ Whatever \code{ctwas::ctwas_sumstats} returns (a list with \code{susie_alpha_res}, \code{param}, and other diagnostics). } \description{ -Per-LD-block pipeline that hands a - \code{\link{GwasSumStats}} of GWAS Z-scores together with per-gene - TWAS weights and the shared LD sketch to +Pipeline that hands a per-block set of + \code{\link{GwasSumStats}} of GWAS Z-scores together with the + matching per-block per-gene TWAS weights and LD sketches to \code{ctwas::ctwas_sumstats}, producing per-gene posterior inclusion probabilities for causal genes. Optionally accepts a precomputed TWAS-Z \code{GRanges} from @@ -96,19 +113,18 @@ Per-LD-block pipeline that hands a } \section{LD block convention}{ -Each call assumes the inputs cover exactly one LD block — the user -is responsible for constructing the \code{GwasSumStats} and -\code{TwasWeights} over the block of interest before calling this -pipeline (the same convention used by -\code{\link{fineMappingPipeline}} on \code{GwasSumStats}). The -single-region \code{region_info}, \code{LD_map}, and \code{snp_map} -that \code{ctwas::ctwas_sumstats} requires are derived -automatically from the LD sketch on \code{gwasSumStats}. +Inputs are NAMED LISTS keyed by \code{region_id} +(\code{list(block1 = gss1, block2 = gss2, ...)}). Per-block +\code{region_info}, \code{LD_map}, and \code{snp_map} entries are +built automatically from each block's LD sketch and concatenated +before the call to \code{ctwas::ctwas_sumstats}. A single-block +input is rejected: cTWAS's EM cannot converge on a single region, +so callers must supply at least two blocks. } \section{LD-sketch identity check}{ -\code{getLdSketch(twasWeights)} (when non-NULL) must match -\code{getLdSketch(gwasSumStats)}. Mismatch is a hard error. +Per block: \code{getLdSketch(twasWeights)} (when non-NULL) must +match \code{getLdSketch(gwasSumStats)}. Mismatch is a hard error. } diff --git a/man/estCtwasParam.Rd b/man/estCtwasParam.Rd new file mode 100644 index 00000000..3c1142f3 --- /dev/null +++ b/man/estCtwasParam.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ctwasPipeline.R +\name{estCtwasParam} +\alias{estCtwasParam} +\title{Estimate cTWAS group prior + prior variance} +\usage{ +estCtwasParam( + inputs, + thin = 0.1, + niterPrefit = 3L, + niter = 30L, + groupPriorVarStructure = c("shared_type", "shared_context", "shared_nonSNP", + "shared_all", "independent"), + ncore = 1L, + fallbackToPrefit = FALSE, + ... +) +} +\arguments{ +\item{inputs}{A list returned by \code{\link{assembleCtwasInputs}}.} + +\item{thin, niterPrefit, niter}{Pass-throughs to +\code{ctwas::assemble_region_data} / \code{ctwas::est_param}.} + +\item{groupPriorVarStructure}{Pass-through.} + +\item{ncore}{Number of cores.} + +\item{fallbackToPrefit}{Logical (length 1). When \code{TRUE} (default +\code{FALSE}), if \code{ctwas::est_param}'s accurate EM diverges to +NaN and throws \code{"Estimated group_prior(_var)? contains NAs"}, +re-run only the prefit step via \code{ctwas:::fit_EM} and return +those (typically finite) priors as the param. Mirrors the legacy +ctwas_2 workaround on toy data where the accurate EM saturates.} + +\item{...}{Additional arguments forwarded to \code{ctwas::est_param} +(e.g. \code{min_p_single_effect}, \code{min_group_size}).} +} +\value{ +The \code{inputs} list augmented with \code{region_data}, + \code{boundary_genes}, \code{z_gene}, and \code{param}. +} +\description{ +Step 2 of the three-step \code{\link{ctwasPipeline}}: + assembles \code{region_data} from the inputs and runs + \code{ctwas::est_param} (prefit EM + accurate EM) to estimate the + group prior probabilities and prior variances. Returns the input + state plus \code{region_data}, \code{boundary_genes}, + \code{z_gene}, and \code{param}. +} diff --git a/man/finemapCtwasRegions.Rd b/man/finemapCtwasRegions.Rd new file mode 100644 index 00000000..31c42774 --- /dev/null +++ b/man/finemapCtwasRegions.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ctwasPipeline.R +\name{finemapCtwasRegions} +\alias{finemapCtwasRegions} +\title{Fine-map cTWAS regions} +\usage{ +finemapCtwasRegions(screenResult, L = 5L, ncore = 1L, ...) +} +\arguments{ +\item{screenResult}{A list returned by +\code{\link{screenCtwasRegions}}.} + +\item{L}{Pass-through.} + +\item{ncore}{Number of cores.} + +\item{...}{Additional arguments forwarded to +\code{ctwas::finemap_regions}.} +} +\value{ +A list mirroring \code{ctwas::ctwas_sumstats}'s output: + \code{z_gene}, \code{param}, \code{finemap_res}, + \code{susie_alpha_res}, \code{region_data}, \code{boundary_genes}, + \code{screen_res}. +} +\description{ +Step 4 (final) of the three-step + \code{\link{ctwasPipeline}}: runs \code{ctwas::finemap_regions} on + the screened-region set from \code{\link{screenCtwasRegions}} and + assembles the documented top-level ctwas output (\code{z_gene}, + \code{param}, \code{finemap_res}, \code{susie_alpha_res}, + \code{region_data}, \code{boundary_genes}, \code{screen_res}). +} diff --git a/man/getCs.Rd b/man/getCs.Rd index 4decf71d..1ee1fef0 100644 --- a/man/getCs.Rd +++ b/man/getCs.Rd @@ -12,7 +12,15 @@ getCs(x, ...) \S4method{getCs}{FineMappingEntry}(x, coverage = 0.95, ...) -\S4method{getCs}{GwasFineMappingResult}(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) +\S4method{getCs}{GwasFineMappingResult}( + x, + study = NULL, + context = NULL, + trait = NULL, + method = NULL, + region = NULL, + ... +) \S4method{getCs}{QtlFineMappingResult}( x, diff --git a/man/getMarginalEffects.Rd b/man/getMarginalEffects.Rd index 438ac089..98263d16 100644 --- a/man/getMarginalEffects.Rd +++ b/man/getMarginalEffects.Rd @@ -19,6 +19,7 @@ getMarginalEffects(x, maxPval = NULL, ...) context = NULL, trait = NULL, method = NULL, + region = NULL, ... ) diff --git a/man/getPip.Rd b/man/getPip.Rd index 0a4ed899..6ddc688f 100644 --- a/man/getPip.Rd +++ b/man/getPip.Rd @@ -18,6 +18,7 @@ getPip(x, ...) context = NULL, trait = NULL, method = NULL, + region = NULL, returnList = FALSE, ... ) diff --git a/man/getSusieFit.Rd b/man/getSusieFit.Rd index ddf192f5..4a037888 100644 --- a/man/getSusieFit.Rd +++ b/man/getSusieFit.Rd @@ -12,7 +12,15 @@ getSusieFit(x, ...) \S4method{getSusieFit}{FineMappingEntry}(x, ...) -\S4method{getSusieFit}{GwasFineMappingResult}(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) +\S4method{getSusieFit}{GwasFineMappingResult}( + x, + study = NULL, + context = NULL, + trait = NULL, + method = NULL, + region = NULL, + ... +) \S4method{getSusieFit}{QtlFineMappingResult}(x, study = NULL, context = NULL, trait = NULL, method = NULL, ...) } diff --git a/man/getTopLoci.Rd b/man/getTopLoci.Rd index 12045a4e..3212fa2b 100644 --- a/man/getTopLoci.Rd +++ b/man/getTopLoci.Rd @@ -20,6 +20,7 @@ getTopLoci(x, type = c("data.frame", "GRanges"), signalCutoff = 0.025, ...) context = NULL, trait = NULL, method = NULL, + region = NULL, ... ) diff --git a/man/getVariantIds.Rd b/man/getVariantIds.Rd index 7b5b1a3b..860a0f83 100644 --- a/man/getVariantIds.Rd +++ b/man/getVariantIds.Rd @@ -22,6 +22,7 @@ getVariantIds(x, ...) context = NULL, trait = NULL, method = NULL, + region = NULL, ... ) diff --git a/man/qtlEnrichmentPipeline.Rd b/man/qtlEnrichmentPipeline.Rd index 59b73418..b418a842 100644 --- a/man/qtlEnrichmentPipeline.Rd +++ b/man/qtlEnrichmentPipeline.Rd @@ -40,11 +40,12 @@ Default \code{1.0}.} \code{\link{qtlEnrichment}}.} } \value{ -A data frame with one row per (gwasStudy, qtlContext) pair - and columns \code{gwasStudy}, \code{qtlContext}, - \code{enrichment}, \code{enrichmentSe}, \code{enrichmentLogOdds}, - plus any extras the underlying estimator emits. Suitable as the - \code{enrichment} argument to \code{\link{colocPipeline}}. +A data frame with one row per (gwasStudy, qtlStudy, + qtlContext) triple and columns \code{gwasStudy}, \code{qtlStudy}, + \code{qtlContext}, \code{enrichment}, \code{enrichmentSe}, + \code{enrichmentLogOdds}, plus any extras the underlying estimator + emits. Suitable as the \code{enrichment} argument to + \code{\link{colocPipeline}} (which joins on the same triple). } \description{ Genome-wide pipeline that computes per-pair (GWAS study, diff --git a/man/screenCtwasRegions.Rd b/man/screenCtwasRegions.Rd new file mode 100644 index 00000000..481fee75 --- /dev/null +++ b/man/screenCtwasRegions.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ctwasPipeline.R +\name{screenCtwasRegions} +\alias{screenCtwasRegions} +\title{Screen cTWAS regions} +\usage{ +screenCtwasRegions(estResult, L = 5L, ncore = 1L, ...) +} +\arguments{ +\item{estResult}{A list returned by \code{\link{estCtwasParam}}.} + +\item{L}{Pass-through to \code{ctwas::screen_regions} (and downstream +fine-mapping).} + +\item{ncore}{Number of cores.} + +\item{...}{Additional arguments forwarded to +\code{ctwas::screen_regions} (e.g. \code{filter_L}, +\code{min_nonSNP_PIP}, \code{min_L}).} +} +\value{ +The \code{estResult} list augmented with + \code{screen_res} (the full ctwas output) and + \code{screened_region_data}. +} +\description{ +Step 3 of the three-step \code{\link{ctwasPipeline}}: + runs \code{ctwas::screen_regions} on the + \code{\link{estCtwasParam}} result and returns the screened-region + set. Use this entry point to substitute hand-tuned priors for the + ones estimated in step 2 (e.g. when the accurate EM diverges to + NaN and you want to recover the prefit values). +} diff --git a/man/summaryStatsQc.Rd b/man/summaryStatsQc.Rd index e2683db7..ef11e92f 100644 --- a/man/summaryStatsQc.Rd +++ b/man/summaryStatsQc.Rd @@ -6,7 +6,6 @@ \usage{ summaryStatsQc( sumstats, - useDbsnpRefCheck = FALSE, removeIndels = FALSE, removeStrandAmbiguous = TRUE, mafCutoff = 0, @@ -19,38 +18,39 @@ summaryStatsQc( alleleFlipKriging = FALSE, impute = FALSE, imputeOpts = list(rcond = 0.01, r2Threshold = 0.6, minimumLd = 5, lamb = 0.01), - convertRefGenome = NULL, matchMinProp = 0, - mungeSumstatsArgs = list() + coerceNumeric = TRUE, + normalizeChr = TRUE, + dropNonstandardChr = TRUE, + dropMissData = TRUE, + dropPOutOfRange = TRUE, + clampSmallP = TRUE, + smallPFloor = 4.94065645841247e-324, + dropZeroEffect = TRUE, + dropNonpositiveSe = TRUE ) } \arguments{ \item{sumstats}{A \code{QtlSumStats} or \code{GwasSumStats} collection.} -\item{useDbsnpRefCheck}{One-shot opt-in to MungeSumstats's -dbSNP-based reference-genome / allele-flip / biallelic-filter -checks. When \code{TRUE}, sets \code{on_ref_genome}, -\code{infer_eff_direction}, \code{allele_flip_check}, and -\code{bi_allelic_filter} all to \code{TRUE} simultaneously. -Default \code{FALSE} (trust input alleles, lighter dependency -footprint).} - \item{removeIndels}{Logical (length 1). When \code{TRUE}, drop -indels. Default \code{FALSE} (match MungeSumstats default).} +indels during panel harmonization. Default \code{FALSE}.} \item{removeStrandAmbiguous}{Logical (length 1). When \code{TRUE}, drop A/T and C/G strand-ambiguous variants. Default \code{TRUE}.} \item{mafCutoff}{Numeric (length 1). MAF threshold (variants with \code{MAF < mafCutoff} are dropped). Default 0. Requires \code{MAF} -column.} +or \code{FRQ} column when non-zero.} \item{infoCutoff}{Numeric (length 1). INFO score threshold. Default -0. Requires \code{INFO} column.} +0. Requires \code{INFO} column when non-zero.} -\item{nCutoff}{Numeric (length 1). MungeSumstats \code{N_std} value; -sample-size deviation threshold. Default 5.} +\item{nCutoff}{Numeric (length 1). Sample-size deviation threshold: +drop variants whose \code{N} is more than \code{nCutoff} +median-absolute-deviations from the median. Set to 0 to disable. +Default 5.} \item{keepVariants}{Optional character vector of variant IDs (SNP column) to retain prior to harmonization.} @@ -77,16 +77,40 @@ accepted but currently emits a warning and is skipped.)} \item{imputeOpts}{Named list of RAISS parameters.} -\item{convertRefGenome}{Optional character (\code{"GRCh37"} or -\code{"GRCh38"}) to liftover the sumstats to via MungeSumstats. -\code{NULL} (default) skips liftover.} - \item{matchMinProp}{Minimum proportion of LD panel variants that must be matched by the sumstats; default 0.} -\item{mungeSumstatsArgs}{Optional named list of pass-through args to -\code{MungeSumstats::format_sumstats()}. Any name supplied here -overrides the value the curated knobs would have set.} +\item{coerceNumeric}{Logical. Coerce signed columns +(Z/BETA/SE/OR/LOG_ODDS/SIGNED_SUMSTAT/P/MAF/FRQ/INFO/N) to numeric. +Default \code{TRUE}.} + +\item{normalizeChr}{Logical. Strip \code{"chr"} prefix, uppercase the +chromosome label, and map 23->X, 24->Y, M->MT. Default \code{TRUE}.} + +\item{dropNonstandardChr}{Logical. Drop variants whose CHR (after +normalization) is outside 1..22, X, Y, MT. Default \code{TRUE}.} + +\item{dropMissData}{Logical. Drop rows with NA in any vital column +(chrom, pos, A1, A2, and at least one of Z / BETA). Default +\code{TRUE}.} + +\item{dropPOutOfRange}{Logical. Drop rows where \code{P < 0} or +\code{P > 1}. Default \code{TRUE}.} + +\item{clampSmallP}{Logical. Floor non-negative P values below +\code{smallPFloor} to \code{smallPFloor} so \code{-log10(P)} stays +finite. Applied to both input and Z-derived P values. Default +\code{TRUE}.} + +\item{smallPFloor}{Numeric (length 1). Floor for \code{clampSmallP}. +Default \code{5e-324} (R's smallest positive double).} + +\item{dropZeroEffect}{Logical. Drop rows where any effect column is +exactly 0 (\code{BETA}, \code{LOG_ODDS}, \code{SIGNED_SUMSTAT}) or +\code{OR} is exactly 1. Default \code{TRUE}.} + +\item{dropNonpositiveSe}{Logical. Drop rows where \code{SE <= 0}. +Default \code{TRUE}.} } \value{ A new \code{QtlSumStats} / \code{GwasSumStats} with cleaned @@ -94,13 +118,16 @@ A new \code{QtlSumStats} / \code{GwasSumStats} with cleaned } \description{ Applies a single QC pass to a \code{QtlSumStats} or \code{GwasSumStats} -collection: delegates variant-content QC (column standardization, -indels, strand-ambiguous, MAF/INFO/N filters, p-value & effect-size -sanity checks, optional dbSNP / liftover) to -\code{MungeSumstats::format_sumstats()}, then runs pecotmr-specific -steps (\code{skipRegion}, optional PIP screen, panel harmonization -against the \code{ldSketch} via \code{.matchRefPanel}, optional -SLALOM/DENTIST LD-mismatch QC, optional RAISS imputation). +collection: per-row sanity checks via \code{.applySanityChecks} (drop +rows with out-of-range / zero P, BETA == 0, SE <= 0, NA in vital +columns; clamp tiny P; normalize CHR; coerce signed columns to +numeric), variant-content filters (MAF / INFO / N) via +\code{.applyContentFilters}, optional \code{skipRegion} drop, optional +PIP screen, panel-vs-sumstats allele harmonization against the +\code{ldSketch} via \code{.matchRefPanel} (which handles indels, +strand-ambiguous variants, sign / strand flips, and duplicate drops in +a single sweep), optional SLALOM/DENTIST LD-mismatch QC, and optional +RAISS imputation. No Bioconductor genome / dbSNP packages required. } \details{ The returned collection has its \code{qcInfo} slot populated with a