From 04b85e361f5d81df25940de5cfd8e2c1dc3c6167 Mon Sep 17 00:00:00 2001 From: Yining97 Date: Tue, 23 Jun 2026 16:17:21 -0400 Subject: [PATCH 1/3] Export af (not MAF) in top_loci; add medianAbsCorr; PIP screen after harmonization top_loci now exports the effect-allele frequency as an af column instead of MAF, and the GwasSumStats/QtlSumStats methods actually wire the QC'd entry frequency into it (it was never passed before, so the old MAF column was always NA). af is the harmonized, complemented frequency; NA when absent. fineMappingPipeline gains a medianAbsCorr argument (default NULL), threaded to susie_get_cs for OR-logic credible-set purity alongside minAbsCorr. The optional PIP screen now runs after panel-vs-sumstats allele harmonization, so it operates on the harmonized variant set. Tests: update the MAF->af schema assertions and add coverage for af passthrough, medianAbsCorr OR-logic, and the PIP-screen ordering. Co-Authored-By: Claude Opus 4.8 --- R/FineMappingEntry.R | 20 ++++---- R/fineMappingPipeline.R | 31 +++++++++++- R/fineMappingWrappers.R | 4 +- R/sumstatsQc.R | 18 +++---- tests/testthat/test_fineMappingWrappers.R | 59 ++++++++++++++++++++++- tests/testthat/test_sumstatsQc.R | 34 +++++++++++++ vignettes/rss-qc.Rmd | 2 +- 7 files changed, 144 insertions(+), 24 deletions(-) diff --git a/R/FineMappingEntry.R b/R/FineMappingEntry.R index 298a787a..f948d201 100644 --- a/R/FineMappingEntry.R +++ b/R/FineMappingEntry.R @@ -80,7 +80,7 @@ setClass("FineMappingEntry", #' the pipeline's \code{trim} parameter). #' @param topLoci Per-variant \code{data.frame} in canonical schema: #' identity columns (\code{variant_id, chrom, pos, A1, A2}), context -#' (\code{N, MAF}), marginal columns (\code{marginal_beta, +#' (\code{N, af}; effect-allele frequency, never MAF), marginal columns (\code{marginal_beta, #' marginal_se, marginal_z, marginal_p}), posterior columns #' (\code{pip, posterior_mean, posterior_sd, cs_*, cs_*_purity}), #' pipeline stamps (\code{method, gene, event, grange_start, @@ -268,16 +268,17 @@ setMethod("show", "FineMappingEntry", function(object) { } # Project the canonical wide topLoci to the posterior view: identity + -# N/MAF + (beta=posterior_mean, se=posterior_sd) + pip + cs_* + signal_cluster +# N/af + (beta=posterior_mean, se=posterior_sd) + pip + cs_* + signal_cluster # + pipeline stamps. Renames `posterior_mean`/`posterior_sd` to `beta`/`se`. -# Missing optional columns are NA-filled. +# Exports effect-allele frequency as `af` (never MAF). Missing optional +# columns are NA-filled. # @noRd .projectPosteriorView <- function(tl) { if (nrow(tl) == 0L) { return(data.frame( variant_id = character(0), chrom = character(0), pos = integer(0), A1 = character(0), A2 = character(0), - N = numeric(0), MAF = numeric(0), + N = numeric(0), af = numeric(0), beta = numeric(0), se = numeric(0), pip = numeric(0), stringsAsFactors = FALSE)) } @@ -288,7 +289,7 @@ setMethod("show", "FineMappingEntry", function(object) { A1 = .tlCol(tl, "A1", "character"), A2 = .tlCol(tl, "A2", "character"), N = .tlCol(tl, "N", "numeric"), - MAF = .tlCol(tl, "MAF", "numeric"), + af = .tlCol(tl, "af", "numeric"), beta = .tlCol(tl, "posterior_mean", "numeric"), se = .tlCol(tl, "posterior_sd", "numeric"), pip = .tlCol(tl, "pip", "numeric"), @@ -304,16 +305,17 @@ setMethod("show", "FineMappingEntry", function(object) { out } -# Project to the marginal view: identity + N/MAF + (beta, se, z, p) where +# Project to the marginal view: identity + N/af + (beta, se, z, p) where # beta/se/z/p are the marginal univariate columns renamed from their -# `marginal_*` storage names. Missing optional columns are NA-filled. +# `marginal_*` storage names. Exports effect-allele frequency as `af` +# (never MAF). Missing optional columns are NA-filled. # @noRd .projectMarginalView <- function(tl) { if (nrow(tl) == 0L) { return(data.frame( variant_id = character(0), chrom = character(0), pos = integer(0), A1 = character(0), A2 = character(0), - N = numeric(0), MAF = numeric(0), + N = numeric(0), af = numeric(0), beta = numeric(0), se = numeric(0), z = numeric(0), p = numeric(0), stringsAsFactors = FALSE)) } @@ -324,7 +326,7 @@ setMethod("show", "FineMappingEntry", function(object) { A1 = .tlCol(tl, "A1", "character"), A2 = .tlCol(tl, "A2", "character"), N = .tlCol(tl, "N", "numeric"), - MAF = .tlCol(tl, "MAF", "numeric"), + af = .tlCol(tl, "af", "numeric"), beta = .tlCol(tl, "marginal_beta", "numeric"), se = .tlCol(tl, "marginal_se", "numeric"), z = .tlCol(tl, "marginal_z", "numeric"), diff --git a/R/fineMappingPipeline.R b/R/fineMappingPipeline.R index 38416366..928b4a06 100644 --- a/R/fineMappingPipeline.R +++ b/R/fineMappingPipeline.R @@ -144,6 +144,10 @@ #' \code{0.025}. #' @param minAbsCorr Minimum absolute correlation for credible-set #' purity. Default \code{0.8}. +#' @param medianAbsCorr Optional median absolute correlation for +#' credible-set purity, routed to \code{susieR::susie_get_cs}. A set is +#' kept if it passes either \code{minAbsCorr} or \code{medianAbsCorr} +#' (OR-logic). Default \code{NULL} (off). #' @param fineMappingResult Optional existing \code{FineMappingResult} #' to use as a resume cache; tuples already present are not refit. #' @param jointSpecification Optional joint-fit specification (NULL by @@ -584,7 +588,8 @@ setGeneric("fineMappingPipeline", .fmPostprocessOne <- function(fit, method, dataX, dataY, coverage, secondaryCoverage, signalCutoff, minAbsCorr, csInput = NULL, af = NULL, - region = NULL, trim = NULL) { + region = NULL, trim = NULL, + medianAbsCorr = NULL) { # Inherit `trim` from the calling method's frame if not passed in # explicitly. The 10 internal call sites don't currently forward it # (they predate the trim knob) so we look it up from the caller. This @@ -594,12 +599,19 @@ setGeneric("fineMappingPipeline", trim <- tryCatch(get("trim", envir = parent.frame()), error = function(e) TRUE) } + # `medianAbsCorr` is inherited the same way (each public setMethod gains a + # `medianAbsCorr = NULL` parameter); NULL is a no-op (OR-logic purity off). + if (is.null(medianAbsCorr)) { + medianAbsCorr <- tryCatch(get("medianAbsCorr", envir = parent.frame()), + error = function(e) NULL) + } fits <- setNames(list(fit), method) post <- postprocessFinemappingFits( fits = fits, dataX = dataX, dataY = dataY, af = af, coverage = coverage, secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, + medianAbsCorr = medianAbsCorr, region = region, csInput = csInput, trim = isTRUE(trim)) out <- formatFinemappingOutput(post, primaryMethod = method) @@ -738,6 +750,7 @@ setMethod("fineMappingPipeline", "QtlDataset", secondaryCoverage = c(0.7, 0.5), signalCutoff = 0.025, minAbsCorr = 0.8, + medianAbsCorr = NULL, fineMappingResult = NULL, naAction = c("drop", "impute"), verbose = 1, @@ -1181,6 +1194,7 @@ setMethod("fineMappingPipeline", "MultiStudyQtlDataset", secondaryCoverage = c(0.7, 0.5), signalCutoff = 0.025, minAbsCorr = 0.8, + medianAbsCorr = NULL, fineMappingResult = NULL, naAction = c("drop", "impute"), verbose = 1, @@ -1313,6 +1327,7 @@ setMethod("fineMappingPipeline", "QtlSumStats", secondaryCoverage = c(0.7, 0.5), signalCutoff = 0.025, minAbsCorr = 0.8, + medianAbsCorr = NULL, fineMappingResult = NULL, verbose = 1, trim = TRUE, @@ -1406,6 +1421,11 @@ setMethod("fineMappingPipeline", "QtlSumStats", variantIds <- zn$variantIds z <- zn$z n <- zn$n + # Effect-allele frequency for export as `af` (entry MAF mcol, post-QC + # harmonized/complemented); aligned to variantIds, NULL -> af NA. + .qmc <- S4Vectors::mcols(entry) + afByVar <- if ("MAF" %in% colnames(.qmc)) + setNames(as.numeric(.qmc$MAF), as.character(.qmc$SNP))[variantIds] else NULL ldMat <- .fmLdFromSketch(ldSketch, variantIds) names(z) <- variantIds @@ -1441,6 +1461,7 @@ setMethod("fineMappingPipeline", "QtlSumStats", secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, + af = afByVar, csInput = "Xcorr") # The method column on the FineMappingResult carries the bare # token (susie / susieInf / susieAsh), independent of which @@ -1552,6 +1573,7 @@ setMethod("fineMappingPipeline", "GwasSumStats", secondaryCoverage = c(0.7, 0.5), signalCutoff = 0.025, minAbsCorr = 0.8, + medianAbsCorr = NULL, fineMappingResult = NULL, verbose = 1, trim = TRUE, @@ -1590,6 +1612,12 @@ setMethod("fineMappingPipeline", "GwasSumStats", variantIds <- zn$variantIds z <- zn$z n <- zn$n + # Effect-allele frequency for export as the `af` column (post-QC the + # entry carries the harmonized, complemented frequency in its MAF mcol). + # Aligned to `variantIds`; NULL when absent -> af exported as NA. + .gmc <- S4Vectors::mcols(gr) + afByVar <- if ("MAF" %in% colnames(.gmc)) + setNames(as.numeric(.gmc$MAF), as.character(.gmc$SNP))[variantIds] else NULL # Derive a region_id from the entry's GRanges so multi-block # genome-wide GWAS sweeps can carry one row per block without # tripping (study, method, region_id) uniqueness. Format: @@ -1651,6 +1679,7 @@ setMethod("fineMappingPipeline", "GwasSumStats", secondaryCoverage = secondaryCoverage, signalCutoff = signalCutoff, minAbsCorr = minAbsCorr, + af = afByVar, csInput = "Xcorr") pushRow(st, tk, region_id, ent) } diff --git a/R/fineMappingWrappers.R b/R/fineMappingWrappers.R index 9fb8462b..3922cbc1 100644 --- a/R/fineMappingWrappers.R +++ b/R/fineMappingWrappers.R @@ -672,7 +672,7 @@ buildTopLoci <- function(fit, csTables, variantNames, sumstats = NULL, A1 = parsed$A1, A2 = parsed$A2, N = rep(fitN, nV), - MAF = if (is.null(af)) rep(NA_real_, nV) else as.numeric(af), + af = if (is.null(af)) rep(NA_real_, nV) else as.numeric(af), marginal_beta = marginalBeta, marginal_se = marginalSe, marginal_z = marginalZ, @@ -742,7 +742,7 @@ buildTopLoci <- function(fit, csTables, variantNames, sumstats = NULL, A1 = character(), A2 = character(), N = numeric(), - MAF = numeric(), + af = numeric(), marginal_beta = numeric(), marginal_se = numeric(), marginal_z = numeric(), diff --git a/R/sumstatsQc.R b/R/sumstatsQc.R index e206731d..dc1ce961 100644 --- a/R/sumstatsQc.R +++ b/R/sumstatsQc.R @@ -2989,20 +2989,12 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, entryAudit$skipRegionDropped <- before - nrow(df) } - # 4. Optional PIP screen. - if (opts$pipCutoffToSkip != 0) { - pip <- .applyPipScreen(df, n = opts$nForPip, cutoff = opts$pipCutoffToSkip) - df <- pip$df - entryAudit$pipScreenSkipped <- isTRUE(pip$skipped) - if (isTRUE(pip$skipped)) entryAudit$pipScreenReason <- pip$reason - } - if (nrow(df) < 2L) { entryAudit$earlyExit <- "fewer than two variants after pre-harmonization QC" return(list(gr = .dfToEntryGranges(df), audit = entryAudit)) } - # 5. Panel-vs-sumstats allele harmonization. + # 4. Panel-vs-sumstats allele harmonization. nHarmIn <- nrow(df) df <- .matchAgainstSketch( df, ldSketch, @@ -3027,6 +3019,14 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, " variant(s).") } + # 5. Optional PIP screen (after harmonization, on panel-aligned variants). + if (opts$pipCutoffToSkip != 0) { + pip <- .applyPipScreen(df, n = opts$nForPip, cutoff = opts$pipCutoffToSkip) + df <- pip$df + entryAudit$pipScreenSkipped <- isTRUE(pip$skipped) + if (isTRUE(pip$skipped)) entryAudit$pipScreenReason <- pip$reason + } + # 6. Optional kriging prefilter. if (isTRUE(opts$alleleFlipKriging) && nrow(df) >= 2L) { nKrIn <- nrow(df) diff --git a/tests/testthat/test_fineMappingWrappers.R b/tests/testthat/test_fineMappingWrappers.R index 936d050f..fe5cc06b 100644 --- a/tests/testthat/test_fineMappingWrappers.R +++ b/tests/testthat/test_fineMappingWrappers.R @@ -509,7 +509,7 @@ if (!exists(".make_univariate_data", inherits = FALSE)) { .UNIFIED_TOP_LOCI_COLS <- c( "variant_id", "chrom", "pos", "A1", "A2", - "N", "MAF", + "N", "af", "marginal_beta", "marginal_se", "marginal_z", "marginal_p", "pip", "posterior_mean", "posterior_sd", "cs_95", "cs_70", "cs_50", "cs_95_purity", @@ -610,7 +610,7 @@ test_that("buildTopLoci returns the exact 22-column schema in order with stable expect_true(is.character(out$A1)) expect_true(is.character(out$A2)) expect_true(is.numeric(out$N)) - expect_true(is.numeric(out$MAF)) + expect_true(is.numeric(out$af)) expect_true(is.numeric(out$marginal_beta)) expect_true(is.numeric(out$marginal_se)) expect_true(is.numeric(out$marginal_z)) @@ -652,6 +652,61 @@ test_that("buildTopLoci emits 22 columns in the fixed order on a non-empty fit", expect_equal(unique(out$method), "susie") }) +test_that("buildTopLoci exports af (not MAF) and carries the supplied af values", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .fake_fit_and_cs( + variant_ids, + cs_at_cov = list("0.95" = list(c(1L, 2L)), "0.7" = list(c(1L, 2L)), + "0.5" = list(c(1L, 2L))), + pip = c(0.9, 0.9)) + out <- .runBuildTopLoci(inp, method = "susie", af = c(0.12, 0.87)) + expect_true("af" %in% names(out)) + expect_false("MAF" %in% names(out)) + # Directional effect-allele frequency: value passed through verbatim, + # not folded to a minor-allele frequency (0.87 retained, not 0.13). + expect_equal(out$af, c(0.12, 0.87)) +}) + +test_that("buildTopLoci sets af = NA when no af is supplied (no silent coercion)", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .fake_fit_and_cs( + variant_ids, + cs_at_cov = list("0.95" = list(c(1L, 2L)), "0.7" = list(c(1L, 2L)), + "0.5" = list(c(1L, 2L))), + pip = c(0.9, 0.9)) + out <- .runBuildTopLoci(inp, method = "susie", af = NULL) + expect_true("af" %in% names(out)) + expect_true(all(is.na(out$af))) +}) + +.n_cs95 <- function(post) + length(setdiff(unique(as.character(post$top_loci$cs_95)), c(NA, ""))) + +test_that("postprocessFinemappingFits forwards medianAbsCorr to susie_get_cs (OR-logic admits >= sets)", { + d <- .make_univariate_data(seed = 11, effect_idx = c(10, 35)) + fit <- susieR::susie(d$X, d$y, L = 5) + # A very strict min_abs_corr alone vs the same min_abs_corr OR a lenient + # median_abs_corr: OR-logic keeps at least as many credible sets. + pStrict <- postprocessFinemappingFits(list(susie = fit), dataX = d$X, dataY = d$y, + coverage = 0.95, minAbsCorr = 0.999, + medianAbsCorr = NULL) + pOr <- postprocessFinemappingFits(list(susie = fit), dataX = d$X, dataY = d$y, + coverage = 0.95, minAbsCorr = 0.999, + medianAbsCorr = 0.1) + expect_gte(.n_cs95(pOr), .n_cs95(pStrict)) +}) + +test_that("postprocessFinemappingFits with medianAbsCorr = NULL is a no-op", { + d <- .make_univariate_data(seed = 7, effect_idx = c(20)) + fit <- susieR::susie(d$X, d$y, L = 5) + p1 <- postprocessFinemappingFits(list(susie = fit), dataX = d$X, dataY = d$y, + coverage = 0.95) + p2 <- postprocessFinemappingFits(list(susie = fit), dataX = d$X, dataY = d$y, + coverage = 0.95, medianAbsCorr = NULL) + expect_equal(p1$top_loci$cs_95, p2$top_loci$cs_95) + expect_equal(p1$top_loci$af, p2$top_loci$af) +}) + test_that("cs_95 / cs_70 / cs_50 are character strings of the form '_'", { variant_ids <- c("chr1:100:A:G", "chr1:200:C:T", "chr1:300:T:C") # Variants 1 and 2 in CS 1 (at 95), variant 3 in CS 2 (at 95). All three diff --git a/tests/testthat/test_sumstatsQc.R b/tests/testthat/test_sumstatsQc.R index a7ce10c8..d3b07edd 100644 --- a/tests/testthat/test_sumstatsQc.R +++ b/tests/testthat/test_sumstatsQc.R @@ -2648,6 +2648,40 @@ test_that("summaryStatsQc: infoCutoff > 0 with no INFO column errors", { "infoCutoff > 0 requires every entry to carry an INFO column") }) +test_that("summaryStatsQc: PIP screen runs AFTER allele harmonization", { + # Entry: three weak panel-matched variants plus one STRONG signal whose + # position (9999) is absent from the LD sketch (panel BP = 100..800), so + # harmonization drops it. With the screen running *after* harmonization it + # never sees the strong off-panel signal, so the remaining weak variants + # fail the PIP screen and the region is skipped (empty). (Pre-harmonization + # screening would have kept the region on the strength of the off-panel hit.) + gr <- .ssQ_makeEntryGr(c("rs1", "rs2", "rs3", "rsX"), + positions = c(100L, 200L, 300L, 9999L)) + mc <- S4Vectors::mcols(gr) + mc$Z <- c(0.2, 0.3, 0.1, 12) # only the off-panel rsX carries signal + S4Vectors::mcols(gr) <- mc + ss <- GwasSumStats(study = "g1", entry = list(gr), + genome = "hg19", ldSketch = .ssQ_makeHandle()) + out <- summaryStatsQc(ss, pipCutoffToSkip = 0.5, nCutoff = 0) + snps <- as.character(S4Vectors::mcols(out$entry[[1L]])$SNP) + expect_false("rsX" %in% snps) # dropped by harmonization + expect_equal(length(out$entry[[1L]]), 0L) # screen (post-harmonization) skips the region +}) + +test_that("summaryStatsQc: PIP screen off leaves the harmonized set intact", { + # Same harmonized variants, screen disabled (pipCutoffToSkip = 0): the three + # panel-matched variants survive (behavior-invariance for the screen-off path). + gr <- .ssQ_makeEntryGr(c("rs1", "rs2", "rs3", "rsX"), + positions = c(100L, 200L, 300L, 9999L)) + mc <- S4Vectors::mcols(gr); mc$Z <- c(0.2, 0.3, 0.1, 12) + S4Vectors::mcols(gr) <- mc + ss <- GwasSumStats(study = "g1", entry = list(gr), + genome = "hg19", ldSketch = .ssQ_makeHandle()) + out <- summaryStatsQc(ss, pipCutoffToSkip = 0, nCutoff = 0) + snps <- as.character(S4Vectors::mcols(out$entry[[1L]])$SNP) + expect_setequal(snps, c("rs1", "rs2", "rs3")) +}) + test_that(".deriveBetaSeFromZ: derives BETA+SE when entry has Z+MAF+N only", { df <- data.frame( SNP = paste0("rs", 1:3), diff --git a/vignettes/rss-qc.Rmd b/vignettes/rss-qc.Rmd index b73cb79f..f52b020e 100644 --- a/vignettes/rss-qc.Rmd +++ b/vignettes/rss-qc.Rmd @@ -35,7 +35,7 @@ entry point: | Stage | Mechanism | |---|---| | Variant-content QC (column standardization, indels, strand-ambiguous, MAF/INFO/N filters, p-value & effect-size sanity checks, optional dbSNP / liftover) | Delegated to `MungeSumstats::format_sumstats()` from inside `summaryStatsQc()`. | -| Pecotmr-specific QC (`skipRegion`, optional PIP screen, panel harmonization against the `ldSketch`, optional SLALOM/DENTIST LD-mismatch QC, optional RAISS imputation) | Implemented inside `summaryStatsQc()`. | +| Pecotmr-specific QC (`skipRegion`, panel harmonization against the `ldSketch`, optional PIP screen on the harmonized variants, optional SLALOM/DENTIST LD-mismatch QC, optional RAISS imputation) | Implemented inside `summaryStatsQc()`. | `summaryStatsQc()` is the **only** QC entry point. It takes a `GwasSumStats` or `QtlSumStats` (a DFrame collection that already carries From 5990a5ab8b50f1ac3c203d7151d1b369ab586800 Mon Sep 17 00:00:00 2001 From: Yining97 Date: Tue, 23 Jun 2026 16:42:18 -0400 Subject: [PATCH 2/3] Skip medianAbsCorr test when susieR lacks median_abs_corr The CI's conda-forge susieR (CRAN build) has no median_abs_corr in susie_get_cs; guard the new OR-logic test so it runs only where susieR supports it (GitHub-HEAD) and skips otherwise. Co-Authored-By: Claude Opus 4.8 --- tests/testthat/test_fineMappingWrappers.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/testthat/test_fineMappingWrappers.R b/tests/testthat/test_fineMappingWrappers.R index fe5cc06b..7c3e6560 100644 --- a/tests/testthat/test_fineMappingWrappers.R +++ b/tests/testthat/test_fineMappingWrappers.R @@ -683,6 +683,11 @@ test_that("buildTopLoci sets af = NA when no af is supplied (no silent coercion) length(setdiff(unique(as.character(post$top_loci$cs_95)), c(NA, ""))) test_that("postprocessFinemappingFits forwards medianAbsCorr to susie_get_cs (OR-logic admits >= sets)", { + # medianAbsCorr is only meaningful when the installed susieR's susie_get_cs + # accepts median_abs_corr (GitHub-HEAD susieR; the CRAN/conda-forge build + # does not yet). Skip rather than error where it is unavailable. + skip_if_not("median_abs_corr" %in% names(formals(susieR::susie_get_cs)), + "installed susieR has no median_abs_corr support") d <- .make_univariate_data(seed = 11, effect_idx = c(10, 35)) fit <- susieR::susie(d$X, d$y, L = 5) # A very strict min_abs_corr alone vs the same min_abs_corr OR a lenient From 80d0fe30147173637b53ceaf5b0649fedf513674 Mon Sep 17 00:00:00 2001 From: Yining97 Date: Tue, 23 Jun 2026 17:41:42 -0400 Subject: [PATCH 3/3] Reimplement krigingOutlierQc on susieR::kriging_rss Replace the hand-rolled fixed-ridge conditional with susieR's kriging RSS diagnostic: estimate the LD-mismatch scale via estimate_s_rss() and take the per-variant conditional distribution from kriging_rss(), flagging outliers on its standardized residual. krigingOutlierQc now takes n (threaded from the QC call site) and errors clearly when susieR lacks the diagnostic; ridge is gone. Better calibrated than the fixed ridge, which over-flagged in the tails (chr21 reference: 28 -> 8 flagged, the 8 a strict subset). Co-Authored-By: Claude Opus 4.8 --- R/sumstatsQc.R | 59 +++++++++++++++++++------------- tests/testthat/test_sumstatsQc.R | 23 ++++++++++++- 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/R/sumstatsQc.R b/R/sumstatsQc.R index dc1ce961..7db0e417 100644 --- a/R/sumstatsQc.R +++ b/R/sumstatsQc.R @@ -2331,45 +2331,56 @@ ldMismatchQc <- function(zScore, R = NULL, X = NULL, nSample = NULL, #' Kriging-style LD-consistency outlier QC #' #' Flags variants whose observed z-score is inconsistent with the value -#' predicted from its LD neighbours. For \code{z ~ N(0, R)} the leave-one-out -#' conditional distribution of \code{z_i} given the rest has mean -#' \code{-(1/Omega_ii) * Omega_{i,-i} z_{-i}} and variance \code{1/Omega_ii}, -#' where \code{Omega = R^{-1}}. The standardized residual is ~\code{N(0,1)} when -#' the z-scores and LD are mutually consistent, so a large residual marks an -#' allele-flip / LD-mismatch outlier. RSS-only helper, opt-in via -#' \code{alleleFlipKriging}; never wired into \code{alleleQc()} / -#' \code{matchRefPanel()}. +#' predicted from its LD neighbours, using susieR's kriging diagnostic. +#' \code{susieR::kriging_rss()} computes the leave-one-out conditional +#' distribution of each \code{z_i} given the rest, with the LD-mismatch scale +#' \code{s} estimated by \code{susieR::estimate_s_rss()}; its standardized +#' residual (\code{z_std_diff}) is ~\code{N(0,1)} when z-scores and LD agree, so +#' a large residual marks an allele-flip / LD-mismatch outlier. RSS-only helper, +#' opt-in via \code{alleleFlipKriging}; never wired into \code{alleleQc()} / +#' \code{matchRefPanel()}. Requires a susieR that provides \code{kriging_rss()} +#' and \code{estimate_s_rss()}. #' #' @param zScore Numeric vector of harmonized z-scores. #' @param R Square LD correlation matrix aligned to \code{zScore}. +#' @param n Sample size, forwarded to \code{susieR::estimate_s_rss()} and +#' \code{susieR::kriging_rss()}. #' @param variantIds Optional variant IDs for the diagnostics table. #' @param pThreshold Two-sided p-value cutoff for flagging an outlier #' (default \code{5e-8}). -#' @param ridge Small diagonal added to \code{R} before inversion for numerical -#' stability (default \code{1e-3}). #' @return A list with \code{outlier} (logical vector) and \code{diagnostics} #' (data frame of per-variant predicted z, residual, statistic, p-value, and #' outlier flag). #' @importFrom stats pnorm #' @export -krigingOutlierQc <- function(zScore, R, variantIds = NULL, - pThreshold = 5e-8, ridge = 1e-3) { +krigingOutlierQc <- function(zScore, R, n, variantIds = NULL, + pThreshold = 5e-8) { zScore <- as.numeric(zScore) m <- length(zScore) if (is.null(R) || !is.matrix(R) || nrow(R) != m || ncol(R) != m) { stop("krigingOutlierQc requires a square LD matrix aligned to zScore.") } + if (missing(n) || length(n) != 1L || is.na(n) || !is.finite(n) || n <= 0) { + stop("krigingOutlierQc requires a single positive sample size 'n'.") + } + if (!requireNamespace("susieR", quietly = TRUE) || + !all(c("estimate_s_rss", "kriging_rss") %in% getNamespaceExports("susieR"))) { + stop("krigingOutlierQc requires a susieR that provides estimate_s_rss() and ", + "kriging_rss(); the installed susieR does not. Install a susieR with the ", + "kriging RSS diagnostic, or disable alleleFlipKriging.") + } if (is.null(variantIds)) variantIds <- rownames(R) - # Regularize so the precision matrix is well-defined for collinear panels. - Omega <- solve(R + diag(ridge, m)) - d <- diag(Omega) - omegaZ <- as.numeric(Omega %*% zScore) - condMean <- -(omegaZ - d * zScore) / d - condVar <- 1 / d - residual <- zScore - condMean - statistic <- residual / sqrt(condVar) - pValue <- 2 * pnorm(-abs(statistic)) - outlier <- !is.na(pValue) & pValue < pThreshold + # susieR's kriging RSS diagnostic: estimate the LD-mismatch scale, then take + # the per-variant conditional distribution. `z_std_diff` is the standardized + # residual we threshold (same p-value rule as before). + s <- tryCatch(susieR::estimate_s_rss(z = zScore, R = R, n = n), + error = function(e) 0) + cd <- susieR::kriging_rss(z = zScore, R = R, n = n, s = s)$conditional_dist + condMean <- as.numeric(cd$condmean) + statistic <- as.numeric(cd$z_std_diff) + residual <- zScore - condMean + pValue <- 2 * stats::pnorm(-abs(statistic)) + outlier <- !is.na(pValue) & pValue < pThreshold list( outlier = outlier, diagnostics = data.frame( @@ -3035,7 +3046,9 @@ krigingOutlierQc <- function(zScore, R, variantIds = NULL, dosage <- t(SummarizedExperiment::assay(block, "dosage")) colnames(dosage) <- df$SNP R <- computeLd(dosage, method = "sample") - kr <- krigingOutlierQc(df$Z, R, variantIds = df$SNP) + nKrig <- if (!is.null(opts$nForPip) && is.finite(opts$nForPip)) opts$nForPip + else stats::median(as.numeric(df$N), na.rm = TRUE) + kr <- krigingOutlierQc(df$Z, R, n = nKrig, variantIds = df$SNP) nKr <- sum(kr$outlier) if (nKr > 0L) df <- df[!kr$outlier, , drop = FALSE] entryAudit$krigingOutliersDropped <- nKr diff --git a/tests/testthat/test_sumstatsQc.R b/tests/testthat/test_sumstatsQc.R index d3b07edd..74bdac47 100644 --- a/tests/testthat/test_sumstatsQc.R +++ b/tests/testthat/test_sumstatsQc.R @@ -67,6 +67,8 @@ test_that(".resolveZMismatchQc rejects stale rss_qc and other invalid tokens", { # =========================================================================== test_that("krigingOutlierQc flags an LD-inconsistent variant and spares the rest", { + skip_if_not("kriging_rss" %in% getNamespaceExports("susieR"), + "installed susieR has no kriging_rss") m <- 6 rho <- 0.7 R <- matrix(rho, m, m); diag(R) <- 1 @@ -74,7 +76,7 @@ test_that("krigingOutlierQc flags an LD-inconsistent variant and spares the rest rownames(R) <- colnames(R) <- ids z <- rep(3, m) z[3] <- -8 # strongly inconsistent with its neighbours - kr <- krigingOutlierQc(z, R, variantIds = ids) + kr <- krigingOutlierQc(z, R, n = 1000, variantIds = ids) expect_true(kr$outlier[3]) expect_false(any(kr$outlier[-3])) expect_equal(nrow(kr$diagnostics), m) @@ -82,6 +84,25 @@ test_that("krigingOutlierQc flags an LD-inconsistent variant and spares the rest colnames(kr$diagnostics))) }) +test_that("krigingOutlierQc statistic matches susieR::kriging_rss z_std_diff", { + skip_if_not("kriging_rss" %in% getNamespaceExports("susieR"), + "installed susieR has no kriging_rss") + set.seed(7); m <- 10 + R <- cov2cor(crossprod(matrix(rnorm(m * m), m))) + ids <- paste0("1:", seq_len(m) * 100, ":A:G") + rownames(R) <- colnames(R) <- ids + z <- as.numeric(R %*% rnorm(m)); z[4] <- 9 + kr <- krigingOutlierQc(z, R, n = 5000, variantIds = ids) + s <- susieR::estimate_s_rss(z = z, R = R, n = 5000) + ref <- susieR::kriging_rss(z = z, R = R, n = 5000, s = s)$conditional_dist + expect_equal(kr$diagnostics$statistic, as.numeric(ref$z_std_diff), tolerance = 1e-8) + expect_equal(kr$diagnostics$predicted, as.numeric(ref$condmean), tolerance = 1e-8) +}) + +test_that("krigingOutlierQc requires a positive sample size n", { + expect_error(krigingOutlierQc(c(1, 2, 3), diag(3)), "positive sample size") +}) + test_that("alignVariantNames correctly aligns variant names", { # Test case 1: Matching variant names