diff --git a/.Rbuildignore b/.Rbuildignore index 540d634..05c9f00 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,4 @@ ^build_pkgdown_site\.R$ ^CLAUDE\.md$ ^\.claude$ +^RELEASENOTES\.md$ diff --git a/DESCRIPTION b/DESCRIPTION index e50d604..8e031ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: splikit Title: Analysing RNA Splicing in Single-Cell RNA Sequencing Data -Version: 2.3.1 +Version: 2.3.2 Authors@R: person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9453-6951")) diff --git a/NEWS.md b/NEWS.md index 88dd836..1ed7220 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,23 @@ +# splikit 2.3.2 + +## New Features +* **Repeated permutation null for `get_pseudo_correlation()`.** The null model + is now built by permuting the cells (columns) of `ZDB_matrix` and recomputing + the pseudo correlation `permutation_count` times (default `100`), instead of a + single permutation. This yields a per-event empirical null and adds the + columns `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided, `(b + 1) / (m + 1)` + corrected), and `emp_padj` (Benjamini-Hochberg). The full per-event draws are + attached as `attr(result, "null_draws")`. +* **`permutation_seed` argument** for reproducible permutations. When supplied, + the RNG is seeded locally and the caller's global RNG stream is saved and + restored, so reproducibility does not perturb downstream randomness. +* Both additions are exposed on the R6 method `SplikitObject$getPseudoCorrelation()`. + +## Behavioural Notes +* `null_distribution` is now the mean of the per-event null draws; it equals the + original single null value when `permutation_count = 1`, preserving backward + compatibility. The observed `pseudo_correlation` is unchanged. + # splikit 2.3.1 ## CRAN Compliance diff --git a/R/SplikitObject.R b/R/SplikitObject.R index d31ab82..e8d7b97 100644 --- a/R/SplikitObject.R +++ b/R/SplikitObject.R @@ -220,10 +220,20 @@ SplikitObject <- R6::R6Class("SplikitObject", #' Must have same dimensions as m1. #' @param metric R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell"). #' @param suppress_warnings Suppress computation warnings (default: TRUE). + #' @param permutation_count Integer. Number of cell-permutation null draws per + #' event used to build the empirical null (default: 100L). Use 1L for the + #' original single-permutation behaviour. + #' @param permutation_seed Optional single numeric value seeding the + #' permutations for reproducibility without disturbing the global RNG + #' (default: NULL). #' - #' @return A data.table with event names, pseudo_correlation, and null_distribution. + #' @return A data.table with event names, pseudo_correlation, null_distribution, + #' null_sd, n_perm_valid, emp_pvalue, and emp_padj, plus the full null draws + #' in attr(result, "null_draws"). getPseudoCorrelation = function(ZDB_matrix, metric = "CoxSnell", - suppress_warnings = TRUE) { + suppress_warnings = TRUE, + permutation_count = 100L, + permutation_seed = NULL) { private$ensureM2Computed() @@ -249,7 +259,9 @@ SplikitObject <- R6::R6Class("SplikitObject", m1_inclusion = self$m1, m2_exclusion = self$m2, metric = metric, - suppress_warnings = suppress_warnings + suppress_warnings = suppress_warnings, + permutation_count = permutation_count, + permutation_seed = permutation_seed ) # Warn about NA removal diff --git a/R/general_tools.R b/R/general_tools.R index 1798ff9..b1bca5c 100644 --- a/R/general_tools.R +++ b/R/general_tools.R @@ -20,13 +20,31 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou #' @param metric Character string specifying which R² metric to compute. Options are "CoxSnell" (default) or "Nagelkerke". #' @param suppress_warnings Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs). #' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. +#' @param permutation_count Integer. Number of times the null model is generated by +#' permuting the cells (columns) of `ZDB_matrix` and recomputing the pseudo +#' correlation. Each permutation yields one null value per event, so the n draws +#' form a per-event empirical null. Defaults to `100L`. Use `1L` to reproduce the +#' original single-permutation behaviour. Values of 1000 or more are recommended +#' for reliable empirical FDR estimation (the smallest achievable empirical +#' p-value is approximately `1 / (permutation_count + 1)`). +#' @param permutation_seed Optional single numeric value. If supplied, the random +#' number generator is seeded with it before the permutations are drawn, making +#' the null distribution (and therefore the empirical p-values) reproducible. The +#' caller's global RNG stream is saved and restored, so passing a seed does not +#' perturb downstream randomness. Defaults to `NULL` (use the ambient RNG state). #' #' @return A `data.table` with the following columns: #' \describe{ #' \item{event}{The event names from `ZDB_matrix` rownames.} #' \item{pseudo_correlation}{The computed pseudo R² correlation values using the specified metric.} -#' \item{null_distribution}{Null correlation values from a permuted version of `ZDB_matrix`.} +#' \item{null_distribution}{Mean of the per-event null correlation values across the `permutation_count` permutations (equal to the single null draw when `permutation_count = 1`).} +#' \item{null_sd}{Standard deviation of the per-event null draws (`NA` when fewer than two valid draws are available).} +#' \item{n_perm_valid}{Number of permutations that produced a non-`NA` null value for the event.} +#' \item{emp_pvalue}{Two-sided empirical p-value, `(b + 1) / (n_perm_valid + 1)`, where `b` is the number of valid null draws whose absolute value is greater than or equal to the absolute observed correlation.} +#' \item{emp_padj}{Benjamini-Hochberg adjusted `emp_pvalue` across the retained events.} #' } +#' The full matrix of per-event null draws (events x `permutation_count`) is attached +#' as `attr(result, "null_draws")` for custom downstream analyses. #' #' @examples #' \donttest{ @@ -40,6 +58,11 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou #' eventdata <- m1_obj$event_data #' m2_exclusion <- make_m2(m1_inclusion, eventdata) #' +#' # subset to a few hundred cells to keep the example quick +#' cells <- seq_len(min(250, ncol(m1_inclusion))) +#' m1_inclusion <- m1_inclusion[, cells] +#' m2_exclusion <- m2_exclusion[, cells] +#' #' # creating a dummy ZDB #' ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7), #' nrow = nrow(m1_inclusion), @@ -50,20 +73,29 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou #' # Example with dense matrices (backward compatible) #' m1_dense <- as.matrix(m1_inclusion) #' m2_dense <- as.matrix(m2_exclusion) -#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense) +#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense, +#' permutation_count = 20, +#' permutation_seed = 1) #' print(pseudo_r_square_cox) #' #' # Example with sparse matrices (more memory efficient) -#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion) +#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, +#' permutation_count = 20, +#' permutation_seed = 1) +#' +#' # Per-event empirical p-values and the full null draws +#' head(pseudo_r_square_sparse[, c("event", "pseudo_correlation", "emp_pvalue")]) +#' dim(attr(pseudo_r_square_sparse, "null_draws")) #' #' # Example using Nagelkerke R-squared instead of Cox-Snell #' pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, -#' metric = "Nagelkerke") +#' metric = "Nagelkerke", +#' permutation_count = 20) #' print(pseudo_r_square_nagel) #' } #' #' @export -get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE) { +get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE, permutation_count = 100L, permutation_seed = NULL) { # Handle SplikitObject input if (inherits(ZDB_matrix, "SplikitObject")) { @@ -88,6 +120,19 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion # Validate metric parameter metric <- match.arg(metric, choices = c("CoxSnell", "Nagelkerke")) + # Validate permutation controls + if (!is.numeric(permutation_count) || length(permutation_count) != 1L || + is.na(permutation_count) || permutation_count < 1) { + stop("permutation_count must be a single positive integer.", call. = FALSE) + } + permutation_count <- as.integer(permutation_count) + + if (!is.null(permutation_seed) && + (!is.numeric(permutation_seed) || length(permutation_seed) != 1L || + is.na(permutation_seed))) { + stop("permutation_seed must be NULL or a single numeric value.", call. = FALSE) + } + # Check ZDB_matrix (must be dense) if (!is.matrix(ZDB_matrix)) stop("ZDB_matrix must be a dense matrix.") @@ -108,7 +153,25 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion if (verbose) message("Input dimension check passed. Proceeding with computation.") - # Define the computation function + # Seed the RNG for reproducible permutations without disturbing the caller's + # global RNG stream (state is saved here and restored on exit). + if (!is.null(permutation_seed)) { + if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) { + old_seed <- get(".Random.seed", envir = globalenv(), inherits = FALSE) + on.exit(assign(".Random.seed", old_seed, envir = globalenv()), add = TRUE) + } else { + on.exit( + if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) { + rm(".Random.seed", envir = globalenv()) + }, + add = TRUE + ) + } + set.seed(permutation_seed) + } + + # Define the computation function. Returns the observed correlations, the event + # names, and the matrix of per-event null draws (events x permutation_count). run_computation <- function() { # Determine which C++ function to call based on matrix types is_m1_sparse <- inherits(m1_inclusion, "Matrix") @@ -126,32 +189,36 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion "dMatrix") } - if (!is_m1_sparse && !is_m2_sparse) { - # Both dense - ensure they are matrices - correls <- cppBetabinPseudoR2(Z = ZDB_matrix, - m1 = as.matrix(m1_inclusion), - m2 = as.matrix(m2_exclusion), - metric = metric) - } else if (is_m1_sparse && is_m2_sparse) { - # Both sparse - correls <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix, - m1 = m1_inclusion, - m2 = m2_exclusion, - metric = metric) - } else if (is_m1_sparse && !is_m2_sparse) { - # m1 sparse, m2 dense - correls <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix, - m1 = m1_inclusion, - m2 = as.matrix(m2_exclusion), - metric = metric) - } else { - # m1 dense, m2 sparse - correls <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix, - m1 = as.matrix(m1_inclusion), - m2 = m2_exclusion, - metric = metric) + # Dispatch helper: compute the per-event pseudo correlation for a given Z, + # selecting the kernel that matches the m1/m2 matrix storage types. Shared + # by the observed pass and every null permutation. + compute_kernel <- function(Zmat) { + if (!is_m1_sparse && !is_m2_sparse) { + cppBetabinPseudoR2(Z = Zmat, + m1 = as.matrix(m1_inclusion), + m2 = as.matrix(m2_exclusion), + metric = metric) + } else if (is_m1_sparse && is_m2_sparse) { + cppBetabinPseudoR2_sparse(Z = Zmat, + m1 = m1_inclusion, + m2 = m2_exclusion, + metric = metric) + } else if (is_m1_sparse && !is_m2_sparse) { + cppBetabinPseudoR2_mixed1(Z = Zmat, + m1 = m1_inclusion, + m2 = as.matrix(m2_exclusion), + metric = metric) + } else { + cppBetabinPseudoR2_mixed2(Z = Zmat, + m1 = as.matrix(m1_inclusion), + m2 = m2_exclusion, + metric = metric) + } } + # Observed correlation (computed once; independent of the permutations). + correls <- compute_kernel(ZDB_matrix) + if (is.null(rownames(ZDB_matrix))) { warning("ZDB_matrix has no row names; assigning default event names.") events <- paste0("Event_", seq_len(nrow(ZDB_matrix))) @@ -159,44 +226,77 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion events <- rownames(ZDB_matrix) } - # Calculate null distribution with the same metric and matrix types - if (!is_m1_sparse && !is_m2_sparse) { - null_dist <- cppBetabinPseudoR2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = as.matrix(m1_inclusion), - m2 = as.matrix(m2_exclusion), - metric = metric) - } else if (is_m1_sparse && is_m2_sparse) { - null_dist <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = m1_inclusion, - m2 = m2_exclusion, - metric = metric) - } else if (is_m1_sparse && !is_m2_sparse) { - null_dist <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = m1_inclusion, - m2 = as.matrix(m2_exclusion), - metric = metric) - } else { - null_dist <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = as.matrix(m1_inclusion), - m2 = m2_exclusion, - metric = metric) + # Null distribution: permute the cells (columns) of Z to break the + # cell-to-cell pairing with splicing, recompute, and repeat + # permutation_count times. Each column of null_mat is one permutation. + n_cells <- ncol(ZDB_matrix) + null_mat <- matrix(NA_real_, nrow = nrow(ZDB_matrix), ncol = permutation_count) + for (k in seq_len(permutation_count)) { + perm_idx <- sample.int(n_cells) + null_mat[, k] <- compute_kernel(ZDB_matrix[, perm_idx, drop = FALSE]) } - data.table::data.table( - event = events, - pseudo_correlation = correls, - null_distribution = null_dist - ) + list(correls = correls, events = events, null_mat = null_mat) } # Run computation with or without warning suppression if (suppress_warnings) { - results <- suppressWarnings(run_computation()) + comp <- suppressWarnings(run_computation()) } else { - results <- run_computation() + comp <- run_computation() } - results <- na.omit(results) + correls <- comp$correls + events <- comp$events + null_mat <- comp$null_mat + + # Per-event null summaries (vectorised over events). + n_valid <- rowSums(!is.na(null_mat)) + s1 <- rowSums(null_mat, na.rm = TRUE) + s2 <- rowSums(null_mat^2, na.rm = TRUE) + + null_mean <- s1 / n_valid + null_mean[n_valid < 1L] <- NA_real_ + + null_var <- (s2 - (s1^2) / n_valid) / (n_valid - 1) + # Clamp tiny negative variances from floating-point cancellation to avoid + # spurious sqrt() NaN warnings. + null_var[is.finite(null_var) & null_var < 0] <- 0 + null_sd <- sqrt(null_var) + null_sd[n_valid < 2L] <- NA_real_ + + # Two-sided empirical p-value with the (b + 1) / (m + 1) correction, comparing + # absolute null draws to the absolute observed correlation. NA observed values + # and events with no valid null draws yield NA. + obs_abs <- abs(correls) + exceed <- rowSums(abs(null_mat) >= obs_abs, na.rm = TRUE) + emp_pvalue <- (exceed + 1) / (n_valid + 1) + emp_pvalue[is.na(correls) | n_valid < 1L] <- NA_real_ + + results <- data.table::data.table( + event = events, + pseudo_correlation = correls, + null_distribution = null_mean, + null_sd = null_sd, + n_perm_valid = as.integer(n_valid), + emp_pvalue = emp_pvalue + ) + + # Retain events with a valid observed correlation and at least one valid null + # draw (mirrors the previous na.omit() coupling of observed and null values). + keep <- !is.na(results$pseudo_correlation) & !is.na(results$null_distribution) + results <- results[keep, ] + null_mat <- null_mat[keep, , drop = FALSE] + + # Benjamini-Hochberg adjusted empirical p-values across the retained events. + results$emp_padj <- stats::p.adjust(results$emp_pvalue, method = "BH") + + # Attach the full per-event null draws for custom downstream analyses. + if (nrow(null_mat) > 0L) { + rownames(null_mat) <- results$event + colnames(null_mat) <- paste0("perm_", seq_len(permutation_count)) + } + data.table::setattr(results, "null_draws", null_mat) if (verbose) message("Computation completed successfully.") return(results) diff --git a/RELEASENOTES.md b/RELEASENOTES.md new file mode 100644 index 0000000..d4beb2e --- /dev/null +++ b/RELEASENOTES.md @@ -0,0 +1,39 @@ +# splikit — Release Logbook + +A chronological logbook of notable changes. Each entry is dated and signed by the +person who made the change, with a short summary of what was done and why. For the +canonical, CRAN-facing changelog see `NEWS.md`. + +--- + +## 2.3.2 — 2026-06-09 + +**Author:** Arsham Mikaeili Namini + +**Summary — Repeated permutation null for pseudo correlation.** + +- Extended `get_pseudo_correlation()` (and the R6 method + `SplikitObject$getPseudoCorrelation()`) to build the null model from multiple + cell-permutations instead of a single one. +- Added two arguments: + - `permutation_count` (default `100`): number of times the cells (columns) of + `ZDB_matrix` are permuted and the pseudo correlation recomputed, producing a + per-event empirical null. `permutation_count = 1` reproduces the original + single-permutation behaviour. + - `permutation_seed` (default `NULL`): seeds the permutations for reproducible + nulls and empirical p-values. The caller's global RNG stream is saved and + restored, so a seed here does not disturb downstream randomness. +- The observed correlation is computed once; only the null pass scales with + `permutation_count`. +- New output columns: `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided, + `(b + 1) / (m + 1)` corrected), `emp_padj` (Benjamini-Hochberg). The full + per-event null draws are attached as `attr(result, "null_draws")`. +- `null_distribution` is now the mean of the per-event null draws (equal to the + single draw when `permutation_count = 1`); `pseudo_correlation` is unchanged. +- Tests: added a dedicated permutation test (validity of empirical p-values, + seed reproducibility, global-RNG isolation, `permutation_count = 1` + equivalence, input validation); the full-data R6 test runs with a small + `permutation_count` to stay fast. +- Version bumped to 2.3.2; `NEWS.md` updated. + +— Arsham Mikaeili Namini diff --git a/man/SplikitObject.Rd b/man/SplikitObject.Rd index ce44ade..f3993b2 100644 --- a/man/SplikitObject.Rd +++ b/man/SplikitObject.Rd @@ -201,7 +201,9 @@ Compute pseudo-correlation between splicing and external data. \if{html}{\out{
}}\preformatted{SplikitObject$getPseudoCorrelation( ZDB_matrix, metric = "CoxSnell", - suppress_warnings = TRUE + suppress_warnings = TRUE, + permutation_count = 100L, + permutation_seed = NULL )}\if{html}{\out{
}} } @@ -214,11 +216,21 @@ Must have same dimensions as m1.} \item{\code{metric}}{R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell").} \item{\code{suppress_warnings}}{Suppress computation warnings (default: TRUE).} + +\item{\code{permutation_count}}{Integer. Number of cell-permutation null draws per +event used to build the empirical null (default: 100L). Use 1L for the +original single-permutation behaviour.} + +\item{\code{permutation_seed}}{Optional single numeric value seeding the +permutations for reproducibility without disturbing the global RNG +(default: NULL).} } \if{html}{\out{}} } \subsection{Returns}{ -A data.table with event names, pseudo_correlation, and null_distribution. +A data.table with event names, pseudo_correlation, null_distribution, +null_sd, n_perm_valid, emp_pvalue, and emp_padj, plus the full null draws +in attr(result, "null_draws"). } } \if{html}{\out{
}} diff --git a/man/get_pseudo_correlation.Rd b/man/get_pseudo_correlation.Rd index 3414efa..407a935 100644 --- a/man/get_pseudo_correlation.Rd +++ b/man/get_pseudo_correlation.Rd @@ -10,7 +10,9 @@ get_pseudo_correlation( m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, - verbose = FALSE + verbose = FALSE, + permutation_count = 100L, + permutation_seed = NULL ) } \arguments{ @@ -25,14 +27,34 @@ get_pseudo_correlation( \item{suppress_warnings}{Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs).} \item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.} + +\item{permutation_count}{Integer. Number of times the null model is generated by +permuting the cells (columns) of \code{ZDB_matrix} and recomputing the pseudo +correlation. Each permutation yields one null value per event, so the n draws +form a per-event empirical null. Defaults to \code{100L}. Use \code{1L} to reproduce the +original single-permutation behaviour. Values of 1000 or more are recommended +for reliable empirical FDR estimation (the smallest achievable empirical +p-value is approximately \code{1 / (permutation_count + 1)}).} + +\item{permutation_seed}{Optional single numeric value. If supplied, the random +number generator is seeded with it before the permutations are drawn, making +the null distribution (and therefore the empirical p-values) reproducible. The +caller's global RNG stream is saved and restored, so passing a seed does not +perturb downstream randomness. Defaults to \code{NULL} (use the ambient RNG state).} } \value{ A \code{data.table} with the following columns: \describe{ \item{event}{The event names from \code{ZDB_matrix} rownames.} \item{pseudo_correlation}{The computed pseudo R² correlation values using the specified metric.} -\item{null_distribution}{Null correlation values from a permuted version of \code{ZDB_matrix}.} +\item{null_distribution}{Mean of the per-event null correlation values across the \code{permutation_count} permutations (equal to the single null draw when \code{permutation_count = 1}).} +\item{null_sd}{Standard deviation of the per-event null draws (\code{NA} when fewer than two valid draws are available).} +\item{n_perm_valid}{Number of permutations that produced a non-\code{NA} null value for the event.} +\item{emp_pvalue}{Two-sided empirical p-value, \code{(b + 1) / (n_perm_valid + 1)}, where \code{b} is the number of valid null draws whose absolute value is greater than or equal to the absolute observed correlation.} +\item{emp_padj}{Benjamini-Hochberg adjusted \code{emp_pvalue} across the retained events.} } +The full matrix of per-event null draws (events x \code{permutation_count}) is attached +as \code{attr(result, "null_draws")} for custom downstream analyses. } \description{ This function calculates a pseudo R²-like correlation metric using a beta-binomial model @@ -52,6 +74,11 @@ m1_inclusion <- m1_obj$m1_inclusion_matrix eventdata <- m1_obj$event_data m2_exclusion <- make_m2(m1_inclusion, eventdata) +# subset to a few hundred cells to keep the example quick +cells <- seq_len(min(250, ncol(m1_inclusion))) +m1_inclusion <- m1_inclusion[, cells] +m2_exclusion <- m2_exclusion[, cells] + # creating a dummy ZDB ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7), nrow = nrow(m1_inclusion), @@ -62,15 +89,24 @@ rownames(ZDB_matrix) <- rownames(m1_inclusion) # Example with dense matrices (backward compatible) m1_dense <- as.matrix(m1_inclusion) m2_dense <- as.matrix(m2_exclusion) -pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense) +pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense, + permutation_count = 20, + permutation_seed = 1) print(pseudo_r_square_cox) # Example with sparse matrices (more memory efficient) -pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion) +pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, + permutation_count = 20, + permutation_seed = 1) + +# Per-event empirical p-values and the full null draws +head(pseudo_r_square_sparse[, c("event", "pseudo_correlation", "emp_pvalue")]) +dim(attr(pseudo_r_square_sparse, "null_draws")) # Example using Nagelkerke R-squared instead of Cox-Snell pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, - metric = "Nagelkerke") + metric = "Nagelkerke", + permutation_count = 20) print(pseudo_r_square_nagel) } diff --git a/tests/testthat/test-SplikitObject.R b/tests/testthat/test-SplikitObject.R index adb8b74..d3a3adf 100644 --- a/tests/testthat/test-SplikitObject.R +++ b/tests/testthat/test-SplikitObject.R @@ -659,9 +659,13 @@ test_that("SplikitObject$getPseudoCorrelation executes correctly", { set.seed(42) ZDB_matrix <- matrix(rnorm(n_events * n_cells), nrow = n_events, ncol = n_cells) - res <- obj$getPseudoCorrelation(ZDB_matrix = ZDB_matrix, suppress_warnings = TRUE) + # Keep permutation_count small here: this runs on the full 2000 x 2000 toy data. + res <- obj$getPseudoCorrelation(ZDB_matrix = ZDB_matrix, suppress_warnings = TRUE, + permutation_count = 3, permutation_seed = 1) expect_true(data.table::is.data.table(res)) # With pure random data, some might be NA and removed, check columns - expect_true(all(c("event", "pseudo_correlation", "null_distribution") %in% names(res))) + expect_true(all(c("event", "pseudo_correlation", "null_distribution", + "null_sd", "n_perm_valid", "emp_pvalue", "emp_padj") %in% names(res))) + expect_true(all(res$emp_pvalue > 0 & res$emp_pvalue <= 1)) }) diff --git a/tests/testthat/test-general_tools.R b/tests/testthat/test-general_tools.R index 89686d4..30222b1 100644 --- a/tests/testthat/test-general_tools.R +++ b/tests/testthat/test-general_tools.R @@ -35,6 +35,68 @@ test_that("get_pseudo_correlation works on valid data", { expect_equal(res_obj$pseudo_correlation, res$pseudo_correlation) }) +test_that("get_pseudo_correlation permutation null is valid and reproducible", { + n_events <- 6 + n_cells <- 120 + set.seed(11) + + m1 <- matrix(rbinom(n_events * n_cells, size = 10, prob = 0.5), nrow = n_events) + m2 <- matrix(rbinom(n_events * n_cells, size = 10, prob = 0.5), nrow = n_events) + ZDB <- matrix(rnorm(n_events * n_cells), nrow = n_events, ncol = n_cells) + rownames(m1) <- rownames(m2) <- rownames(ZDB) <- paste0("event_", seq_len(n_events)) + + res <- get_pseudo_correlation(ZDB, m1, m2, + permutation_count = 50, permutation_seed = 123) + + # New columns are present and well formed + expect_true(all(c("null_sd", "n_perm_valid", "emp_pvalue", "emp_padj") %in% names(res))) + expect_true(all(res$emp_pvalue > 0 & res$emp_pvalue <= 1)) + expect_true(all(res$emp_padj >= 0 & res$emp_padj <= 1)) + expect_true(all(res$n_perm_valid <= 50L)) + # Smallest achievable empirical p-value is 1 / (n_perm_valid + 1) + expect_true(all(res$emp_pvalue >= 1 / (res$n_perm_valid + 1) - 1e-9)) + + # Full null draws attached as an attribute, correctly shaped + draws <- attr(res, "null_draws") + expect_true(is.matrix(draws)) + expect_equal(ncol(draws), 50L) + expect_equal(nrow(draws), nrow(res)) + + # Same seed -> identical null; observed correlation is seed-independent + res2 <- get_pseudo_correlation(ZDB, m1, m2, + permutation_count = 50, permutation_seed = 123) + expect_equal(res$null_distribution, res2$null_distribution) + expect_equal(res$emp_pvalue, res2$emp_pvalue) + expect_equal(res$pseudo_correlation, res2$pseudo_correlation) + + # Different seed -> different draws, but the same observed correlation + res3 <- get_pseudo_correlation(ZDB, m1, m2, + permutation_count = 50, permutation_seed = 999) + expect_false(isTRUE(all.equal(attr(res, "null_draws"), attr(res3, "null_draws")))) + expect_equal(res$pseudo_correlation, res3$pseudo_correlation) + + # permutation_seed must not perturb the caller's global RNG stream + set.seed(7); a <- runif(3) + invisible(get_pseudo_correlation(ZDB, m1, m2, + permutation_count = 10, permutation_seed = 42)) + set.seed(7); b <- runif(3) + expect_equal(a, b) + + # permutation_count = 1 reproduces the single-draw behaviour: the mean null + # equals the single column of draws + res1 <- get_pseudo_correlation(ZDB, m1, m2, + permutation_count = 1, permutation_seed = 5) + expect_equal(ncol(attr(res1, "null_draws")), 1L) + expect_equal(unname(res1$null_distribution), + unname(attr(res1, "null_draws")[, 1])) + + # Input validation + expect_error(get_pseudo_correlation(ZDB, m1, m2, permutation_count = 0), + "permutation_count") + expect_error(get_pseudo_correlation(ZDB, m1, m2, permutation_seed = c(1, 2)), + "permutation_seed") +}) + test_that("get_pseudo_correlation error handling works", { m1 <- matrix(1, nrow = 2, ncol = 2) m2 <- matrix(1, nrow = 2, ncol = 2) diff --git a/vignettes/methods.Rmd b/vignettes/methods.Rmd index a0ab14d..f9e47f7 100644 --- a/vignettes/methods.Rmd +++ b/vignettes/methods.Rmd @@ -25,7 +25,7 @@ Splice junctions are grouped into *local junction variants* — junctions sharin ## Event-covariate association -`get_pseudo_correlation()` fits a per-event binomial logistic GLM of the inclusion ratio on a target covariate by iteratively reweighted least squares, and reports a Cox-Snell / Nagelkerke pseudo-R-squared computed from the residual deviance. This quantifies how strongly each event tracks the covariate (e.g. a cluster label or a gene's expression). +`get_pseudo_correlation()` fits a per-event binomial logistic GLM of the inclusion ratio on a target covariate by iteratively reweighted least squares, and reports a Cox-Snell / Nagelkerke pseudo-R-squared computed from the residual deviance. This quantifies how strongly each event tracks the covariate (e.g. a cluster label or a gene's expression). A per-event empirical null is built by permuting the cells (columns) of the covariate `permutation_count` times (default 100) and recomputing the statistic; this yields a two-sided empirical p-value (`emp_pvalue`) and a Benjamini-Hochberg adjustment (`emp_padj`). The permutations can be made reproducible with `permutation_seed` without disturbing the global RNG stream. ## Implementation diff --git a/vignettes/splikit_manual.Rmd b/vignettes/splikit_manual.Rmd index 18ccda6..fdb1f6e 100644 --- a/vignettes/splikit_manual.Rmd +++ b/vignettes/splikit_manual.Rmd @@ -681,8 +681,9 @@ length(intersect(hvg_vst[1:500]$events, hvg_dev[1:500]$events)) **Function Signature**: ```r -get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, - metric = "CoxSnell", suppress_warnings = TRUE) +get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, + metric = "CoxSnell", suppress_warnings = TRUE, + permutation_count = 100L, permutation_seed = NULL) ``` **Parameters**: @@ -690,6 +691,10 @@ get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, - `m1_inclusion`, `m2_exclusion`: Can be either sparse or dense matrices - `metric`: R² metric to use - "CoxSnell" (default) or "Nagelkerke" - `suppress_warnings`: Whether to suppress computational warnings +- `permutation_count`: Number of cell-permutation null draws per event (default `100`); use `1` for the original single-permutation behaviour, and `1000`+ for reliable empirical FDR +- `permutation_seed`: Optional seed for reproducible permutations (does not disturb the global RNG) + +**Output columns**: `event`, `pseudo_correlation`, `null_distribution` (mean of the per-event null draws), `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided empirical p-value), and `emp_padj` (BH-adjusted). The full per-event null draws are attached as `attr(result, "null_draws")`. **Use Cases**: - Correlating splicing patterns with gene expression