From 381c4a2ab07ca43cc6c782b0365eaaab19b30289 Mon Sep 17 00:00:00 2001 From: Arshammik <79arsham@gmail.com> Date: Tue, 9 Jun 2026 15:31:36 -0400 Subject: [PATCH] feat: repeated permutation null for get_pseudo_correlation Add permutation_count (default 100) and permutation_seed arguments to get_pseudo_correlation() and SplikitObject$getPseudoCorrelation(). The null model is now built from permutation_count cell-permutations of ZDB_matrix, yielding a per-event empirical null with the new columns null_sd, n_perm_valid, emp_pvalue (two-sided, (b+1)/(m+1) corrected) and emp_padj (Benjamini-Hochberg). The full per-event draws are attached as attr(result, "null_draws"). permutation_seed seeds the permutations reproducibly and saves/restores the caller's global RNG stream, so it does not perturb downstream randomness. null_distribution is now the mean of the per-event null draws (equal to the single draw when permutation_count = 1, preserving backward compatibility); the observed pseudo_correlation is unchanged. Bump to 2.3.2; update NEWS.md, RELEASENOTES.md logbook, regenerated docs, vignettes, and add a dedicated permutation test (validity, seed reproducibility, global-RNG isolation, count=1 equivalence, input validation). --- .Rbuildignore | 1 + DESCRIPTION | 2 +- NEWS.md | 20 +++ R/SplikitObject.R | 18 ++- R/general_tools.R | 218 ++++++++++++++++++++-------- RELEASENOTES.md | 39 +++++ man/SplikitObject.Rd | 16 +- man/get_pseudo_correlation.Rd | 46 +++++- tests/testthat/test-SplikitObject.R | 8 +- tests/testthat/test-general_tools.R | 62 ++++++++ vignettes/methods.Rmd | 2 +- vignettes/splikit_manual.Rmd | 9 +- 12 files changed, 366 insertions(+), 75 deletions(-) create mode 100644 RELEASENOTES.md diff --git a/.Rbuildignore b/.Rbuildignore index 540d634..05c9f00 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,4 @@ ^build_pkgdown_site\.R$ ^CLAUDE\.md$ ^\.claude$ +^RELEASENOTES\.md$ diff --git a/DESCRIPTION b/DESCRIPTION index e50d604..8e031ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: splikit Title: Analysing RNA Splicing in Single-Cell RNA Sequencing Data -Version: 2.3.1 +Version: 2.3.2 Authors@R: person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9453-6951")) diff --git a/NEWS.md b/NEWS.md index 88dd836..1ed7220 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,23 @@ +# splikit 2.3.2 + +## New Features +* **Repeated permutation null for `get_pseudo_correlation()`.** The null model + is now built by permuting the cells (columns) of `ZDB_matrix` and recomputing + the pseudo correlation `permutation_count` times (default `100`), instead of a + single permutation. This yields a per-event empirical null and adds the + columns `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided, `(b + 1) / (m + 1)` + corrected), and `emp_padj` (Benjamini-Hochberg). The full per-event draws are + attached as `attr(result, "null_draws")`. +* **`permutation_seed` argument** for reproducible permutations. When supplied, + the RNG is seeded locally and the caller's global RNG stream is saved and + restored, so reproducibility does not perturb downstream randomness. +* Both additions are exposed on the R6 method `SplikitObject$getPseudoCorrelation()`. + +## Behavioural Notes +* `null_distribution` is now the mean of the per-event null draws; it equals the + original single null value when `permutation_count = 1`, preserving backward + compatibility. The observed `pseudo_correlation` is unchanged. + # splikit 2.3.1 ## CRAN Compliance diff --git a/R/SplikitObject.R b/R/SplikitObject.R index d31ab82..e8d7b97 100644 --- a/R/SplikitObject.R +++ b/R/SplikitObject.R @@ -220,10 +220,20 @@ SplikitObject <- R6::R6Class("SplikitObject", #' Must have same dimensions as m1. #' @param metric R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell"). #' @param suppress_warnings Suppress computation warnings (default: TRUE). + #' @param permutation_count Integer. Number of cell-permutation null draws per + #' event used to build the empirical null (default: 100L). Use 1L for the + #' original single-permutation behaviour. + #' @param permutation_seed Optional single numeric value seeding the + #' permutations for reproducibility without disturbing the global RNG + #' (default: NULL). #' - #' @return A data.table with event names, pseudo_correlation, and null_distribution. + #' @return A data.table with event names, pseudo_correlation, null_distribution, + #' null_sd, n_perm_valid, emp_pvalue, and emp_padj, plus the full null draws + #' in attr(result, "null_draws"). getPseudoCorrelation = function(ZDB_matrix, metric = "CoxSnell", - suppress_warnings = TRUE) { + suppress_warnings = TRUE, + permutation_count = 100L, + permutation_seed = NULL) { private$ensureM2Computed() @@ -249,7 +259,9 @@ SplikitObject <- R6::R6Class("SplikitObject", m1_inclusion = self$m1, m2_exclusion = self$m2, metric = metric, - suppress_warnings = suppress_warnings + suppress_warnings = suppress_warnings, + permutation_count = permutation_count, + permutation_seed = permutation_seed ) # Warn about NA removal diff --git a/R/general_tools.R b/R/general_tools.R index 1798ff9..b1bca5c 100644 --- a/R/general_tools.R +++ b/R/general_tools.R @@ -20,13 +20,31 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou #' @param metric Character string specifying which R² metric to compute. Options are "CoxSnell" (default) or "Nagelkerke". #' @param suppress_warnings Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs). #' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. +#' @param permutation_count Integer. Number of times the null model is generated by +#' permuting the cells (columns) of `ZDB_matrix` and recomputing the pseudo +#' correlation. Each permutation yields one null value per event, so the n draws +#' form a per-event empirical null. Defaults to `100L`. Use `1L` to reproduce the +#' original single-permutation behaviour. Values of 1000 or more are recommended +#' for reliable empirical FDR estimation (the smallest achievable empirical +#' p-value is approximately `1 / (permutation_count + 1)`). +#' @param permutation_seed Optional single numeric value. If supplied, the random +#' number generator is seeded with it before the permutations are drawn, making +#' the null distribution (and therefore the empirical p-values) reproducible. The +#' caller's global RNG stream is saved and restored, so passing a seed does not +#' perturb downstream randomness. Defaults to `NULL` (use the ambient RNG state). #' #' @return A `data.table` with the following columns: #' \describe{ #' \item{event}{The event names from `ZDB_matrix` rownames.} #' \item{pseudo_correlation}{The computed pseudo R² correlation values using the specified metric.} -#' \item{null_distribution}{Null correlation values from a permuted version of `ZDB_matrix`.} +#' \item{null_distribution}{Mean of the per-event null correlation values across the `permutation_count` permutations (equal to the single null draw when `permutation_count = 1`).} +#' \item{null_sd}{Standard deviation of the per-event null draws (`NA` when fewer than two valid draws are available).} +#' \item{n_perm_valid}{Number of permutations that produced a non-`NA` null value for the event.} +#' \item{emp_pvalue}{Two-sided empirical p-value, `(b + 1) / (n_perm_valid + 1)`, where `b` is the number of valid null draws whose absolute value is greater than or equal to the absolute observed correlation.} +#' \item{emp_padj}{Benjamini-Hochberg adjusted `emp_pvalue` across the retained events.} #' } +#' The full matrix of per-event null draws (events x `permutation_count`) is attached +#' as `attr(result, "null_draws")` for custom downstream analyses. #' #' @examples #' \donttest{ @@ -40,6 +58,11 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou #' eventdata <- m1_obj$event_data #' m2_exclusion <- make_m2(m1_inclusion, eventdata) #' +#' # subset to a few hundred cells to keep the example quick +#' cells <- seq_len(min(250, ncol(m1_inclusion))) +#' m1_inclusion <- m1_inclusion[, cells] +#' m2_exclusion <- m2_exclusion[, cells] +#' #' # creating a dummy ZDB #' ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7), #' nrow = nrow(m1_inclusion), @@ -50,20 +73,29 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou #' # Example with dense matrices (backward compatible) #' m1_dense <- as.matrix(m1_inclusion) #' m2_dense <- as.matrix(m2_exclusion) -#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense) +#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense, +#' permutation_count = 20, +#' permutation_seed = 1) #' print(pseudo_r_square_cox) #' #' # Example with sparse matrices (more memory efficient) -#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion) +#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, +#' permutation_count = 20, +#' permutation_seed = 1) +#' +#' # Per-event empirical p-values and the full null draws +#' head(pseudo_r_square_sparse[, c("event", "pseudo_correlation", "emp_pvalue")]) +#' dim(attr(pseudo_r_square_sparse, "null_draws")) #' #' # Example using Nagelkerke R-squared instead of Cox-Snell #' pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, -#' metric = "Nagelkerke") +#' metric = "Nagelkerke", +#' permutation_count = 20) #' print(pseudo_r_square_nagel) #' } #' #' @export -get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE) { +get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE, permutation_count = 100L, permutation_seed = NULL) { # Handle SplikitObject input if (inherits(ZDB_matrix, "SplikitObject")) { @@ -88,6 +120,19 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion # Validate metric parameter metric <- match.arg(metric, choices = c("CoxSnell", "Nagelkerke")) + # Validate permutation controls + if (!is.numeric(permutation_count) || length(permutation_count) != 1L || + is.na(permutation_count) || permutation_count < 1) { + stop("permutation_count must be a single positive integer.", call. = FALSE) + } + permutation_count <- as.integer(permutation_count) + + if (!is.null(permutation_seed) && + (!is.numeric(permutation_seed) || length(permutation_seed) != 1L || + is.na(permutation_seed))) { + stop("permutation_seed must be NULL or a single numeric value.", call. = FALSE) + } + # Check ZDB_matrix (must be dense) if (!is.matrix(ZDB_matrix)) stop("ZDB_matrix must be a dense matrix.") @@ -108,7 +153,25 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion if (verbose) message("Input dimension check passed. Proceeding with computation.") - # Define the computation function + # Seed the RNG for reproducible permutations without disturbing the caller's + # global RNG stream (state is saved here and restored on exit). + if (!is.null(permutation_seed)) { + if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) { + old_seed <- get(".Random.seed", envir = globalenv(), inherits = FALSE) + on.exit(assign(".Random.seed", old_seed, envir = globalenv()), add = TRUE) + } else { + on.exit( + if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) { + rm(".Random.seed", envir = globalenv()) + }, + add = TRUE + ) + } + set.seed(permutation_seed) + } + + # Define the computation function. Returns the observed correlations, the event + # names, and the matrix of per-event null draws (events x permutation_count). run_computation <- function() { # Determine which C++ function to call based on matrix types is_m1_sparse <- inherits(m1_inclusion, "Matrix") @@ -126,32 +189,36 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion "dMatrix") } - if (!is_m1_sparse && !is_m2_sparse) { - # Both dense - ensure they are matrices - correls <- cppBetabinPseudoR2(Z = ZDB_matrix, - m1 = as.matrix(m1_inclusion), - m2 = as.matrix(m2_exclusion), - metric = metric) - } else if (is_m1_sparse && is_m2_sparse) { - # Both sparse - correls <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix, - m1 = m1_inclusion, - m2 = m2_exclusion, - metric = metric) - } else if (is_m1_sparse && !is_m2_sparse) { - # m1 sparse, m2 dense - correls <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix, - m1 = m1_inclusion, - m2 = as.matrix(m2_exclusion), - metric = metric) - } else { - # m1 dense, m2 sparse - correls <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix, - m1 = as.matrix(m1_inclusion), - m2 = m2_exclusion, - metric = metric) + # Dispatch helper: compute the per-event pseudo correlation for a given Z, + # selecting the kernel that matches the m1/m2 matrix storage types. Shared + # by the observed pass and every null permutation. + compute_kernel <- function(Zmat) { + if (!is_m1_sparse && !is_m2_sparse) { + cppBetabinPseudoR2(Z = Zmat, + m1 = as.matrix(m1_inclusion), + m2 = as.matrix(m2_exclusion), + metric = metric) + } else if (is_m1_sparse && is_m2_sparse) { + cppBetabinPseudoR2_sparse(Z = Zmat, + m1 = m1_inclusion, + m2 = m2_exclusion, + metric = metric) + } else if (is_m1_sparse && !is_m2_sparse) { + cppBetabinPseudoR2_mixed1(Z = Zmat, + m1 = m1_inclusion, + m2 = as.matrix(m2_exclusion), + metric = metric) + } else { + cppBetabinPseudoR2_mixed2(Z = Zmat, + m1 = as.matrix(m1_inclusion), + m2 = m2_exclusion, + metric = metric) + } } + # Observed correlation (computed once; independent of the permutations). + correls <- compute_kernel(ZDB_matrix) + if (is.null(rownames(ZDB_matrix))) { warning("ZDB_matrix has no row names; assigning default event names.") events <- paste0("Event_", seq_len(nrow(ZDB_matrix))) @@ -159,44 +226,77 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion events <- rownames(ZDB_matrix) } - # Calculate null distribution with the same metric and matrix types - if (!is_m1_sparse && !is_m2_sparse) { - null_dist <- cppBetabinPseudoR2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = as.matrix(m1_inclusion), - m2 = as.matrix(m2_exclusion), - metric = metric) - } else if (is_m1_sparse && is_m2_sparse) { - null_dist <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = m1_inclusion, - m2 = m2_exclusion, - metric = metric) - } else if (is_m1_sparse && !is_m2_sparse) { - null_dist <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = m1_inclusion, - m2 = as.matrix(m2_exclusion), - metric = metric) - } else { - null_dist <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = as.matrix(m1_inclusion), - m2 = m2_exclusion, - metric = metric) + # Null distribution: permute the cells (columns) of Z to break the + # cell-to-cell pairing with splicing, recompute, and repeat + # permutation_count times. Each column of null_mat is one permutation. + n_cells <- ncol(ZDB_matrix) + null_mat <- matrix(NA_real_, nrow = nrow(ZDB_matrix), ncol = permutation_count) + for (k in seq_len(permutation_count)) { + perm_idx <- sample.int(n_cells) + null_mat[, k] <- compute_kernel(ZDB_matrix[, perm_idx, drop = FALSE]) } - data.table::data.table( - event = events, - pseudo_correlation = correls, - null_distribution = null_dist - ) + list(correls = correls, events = events, null_mat = null_mat) } # Run computation with or without warning suppression if (suppress_warnings) { - results <- suppressWarnings(run_computation()) + comp <- suppressWarnings(run_computation()) } else { - results <- run_computation() + comp <- run_computation() } - results <- na.omit(results) + correls <- comp$correls + events <- comp$events + null_mat <- comp$null_mat + + # Per-event null summaries (vectorised over events). + n_valid <- rowSums(!is.na(null_mat)) + s1 <- rowSums(null_mat, na.rm = TRUE) + s2 <- rowSums(null_mat^2, na.rm = TRUE) + + null_mean <- s1 / n_valid + null_mean[n_valid < 1L] <- NA_real_ + + null_var <- (s2 - (s1^2) / n_valid) / (n_valid - 1) + # Clamp tiny negative variances from floating-point cancellation to avoid + # spurious sqrt() NaN warnings. + null_var[is.finite(null_var) & null_var < 0] <- 0 + null_sd <- sqrt(null_var) + null_sd[n_valid < 2L] <- NA_real_ + + # Two-sided empirical p-value with the (b + 1) / (m + 1) correction, comparing + # absolute null draws to the absolute observed correlation. NA observed values + # and events with no valid null draws yield NA. + obs_abs <- abs(correls) + exceed <- rowSums(abs(null_mat) >= obs_abs, na.rm = TRUE) + emp_pvalue <- (exceed + 1) / (n_valid + 1) + emp_pvalue[is.na(correls) | n_valid < 1L] <- NA_real_ + + results <- data.table::data.table( + event = events, + pseudo_correlation = correls, + null_distribution = null_mean, + null_sd = null_sd, + n_perm_valid = as.integer(n_valid), + emp_pvalue = emp_pvalue + ) + + # Retain events with a valid observed correlation and at least one valid null + # draw (mirrors the previous na.omit() coupling of observed and null values). + keep <- !is.na(results$pseudo_correlation) & !is.na(results$null_distribution) + results <- results[keep, ] + null_mat <- null_mat[keep, , drop = FALSE] + + # Benjamini-Hochberg adjusted empirical p-values across the retained events. + results$emp_padj <- stats::p.adjust(results$emp_pvalue, method = "BH") + + # Attach the full per-event null draws for custom downstream analyses. + if (nrow(null_mat) > 0L) { + rownames(null_mat) <- results$event + colnames(null_mat) <- paste0("perm_", seq_len(permutation_count)) + } + data.table::setattr(results, "null_draws", null_mat) if (verbose) message("Computation completed successfully.") return(results) diff --git a/RELEASENOTES.md b/RELEASENOTES.md new file mode 100644 index 0000000..d4beb2e --- /dev/null +++ b/RELEASENOTES.md @@ -0,0 +1,39 @@ +# splikit — Release Logbook + +A chronological logbook of notable changes. Each entry is dated and signed by the +person who made the change, with a short summary of what was done and why. For the +canonical, CRAN-facing changelog see `NEWS.md`. + +--- + +## 2.3.2 — 2026-06-09 + +**Author:** Arsham Mikaeili Namini + +**Summary — Repeated permutation null for pseudo correlation.** + +- Extended `get_pseudo_correlation()` (and the R6 method + `SplikitObject$getPseudoCorrelation()`) to build the null model from multiple + cell-permutations instead of a single one. +- Added two arguments: + - `permutation_count` (default `100`): number of times the cells (columns) of + `ZDB_matrix` are permuted and the pseudo correlation recomputed, producing a + per-event empirical null. `permutation_count = 1` reproduces the original + single-permutation behaviour. + - `permutation_seed` (default `NULL`): seeds the permutations for reproducible + nulls and empirical p-values. The caller's global RNG stream is saved and + restored, so a seed here does not disturb downstream randomness. +- The observed correlation is computed once; only the null pass scales with + `permutation_count`. +- New output columns: `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided, + `(b + 1) / (m + 1)` corrected), `emp_padj` (Benjamini-Hochberg). The full + per-event null draws are attached as `attr(result, "null_draws")`. +- `null_distribution` is now the mean of the per-event null draws (equal to the + single draw when `permutation_count = 1`); `pseudo_correlation` is unchanged. +- Tests: added a dedicated permutation test (validity of empirical p-values, + seed reproducibility, global-RNG isolation, `permutation_count = 1` + equivalence, input validation); the full-data R6 test runs with a small + `permutation_count` to stay fast. +- Version bumped to 2.3.2; `NEWS.md` updated. + +— Arsham Mikaeili Namini diff --git a/man/SplikitObject.Rd b/man/SplikitObject.Rd index ce44ade..f3993b2 100644 --- a/man/SplikitObject.Rd +++ b/man/SplikitObject.Rd @@ -201,7 +201,9 @@ Compute pseudo-correlation between splicing and external data. \if{html}{\out{