csglab · Arshammik · Jun 9, 2026 · Jun 9, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -13,3 +13,4 @@
 ^build_pkgdown_site\.R$
 ^CLAUDE\.md$
 ^\.claude$
+^RELEASENOTES\.md$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: splikit
 Title: Analysing RNA Splicing in Single-Cell RNA Sequencing Data
-Version: 2.3.1
+Version: 2.3.2
 Authors@R:
     person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-9453-6951"))

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,23 @@
+# splikit 2.3.2
+
+## New Features
+* **Repeated permutation null for `get_pseudo_correlation()`.** The null model
+  is now built by permuting the cells (columns) of `ZDB_matrix` and recomputing
+  the pseudo correlation `permutation_count` times (default `100`), instead of a
+  single permutation. This yields a per-event empirical null and adds the
+  columns `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided, `(b + 1) / (m + 1)`
+  corrected), and `emp_padj` (Benjamini-Hochberg). The full per-event draws are
+  attached as `attr(result, "null_draws")`.
+* **`permutation_seed` argument** for reproducible permutations. When supplied,
+  the RNG is seeded locally and the caller's global RNG stream is saved and
+  restored, so reproducibility does not perturb downstream randomness.
+* Both additions are exposed on the R6 method `SplikitObject$getPseudoCorrelation()`.
+
+## Behavioural Notes
+* `null_distribution` is now the mean of the per-event null draws; it equals the
+  original single null value when `permutation_count = 1`, preserving backward
+  compatibility. The observed `pseudo_correlation` is unchanged.
+
 # splikit 2.3.1
 
 ## CRAN Compliance

diff --git a/R/SplikitObject.R b/R/SplikitObject.R
@@ -220,10 +220,20 @@ SplikitObject <- R6::R6Class("SplikitObject",
     #'   Must have same dimensions as m1.
     #' @param metric R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell").
     #' @param suppress_warnings Suppress computation warnings (default: TRUE).
+    #' @param permutation_count Integer. Number of cell-permutation null draws per
+    #'   event used to build the empirical null (default: 100L). Use 1L for the
+    #'   original single-permutation behaviour.
+    #' @param permutation_seed Optional single numeric value seeding the
+    #'   permutations for reproducibility without disturbing the global RNG
+    #'   (default: NULL).
     #'
-    #' @return A data.table with event names, pseudo_correlation, and null_distribution.
+    #' @return A data.table with event names, pseudo_correlation, null_distribution,
+    #'   null_sd, n_perm_valid, emp_pvalue, and emp_padj, plus the full null draws
+    #'   in attr(result, "null_draws").
     getPseudoCorrelation = function(ZDB_matrix, metric = "CoxSnell",
-                                     suppress_warnings = TRUE) {
+                                     suppress_warnings = TRUE,
+                                     permutation_count = 100L,
+                                     permutation_seed = NULL) {
 
       private$ensureM2Computed()
 
@@ -249,7 +259,9 @@ SplikitObject <- R6::R6Class("SplikitObject",
         m1_inclusion = self$m1,
         m2_exclusion = self$m2,
         metric = metric,
-        suppress_warnings = suppress_warnings
+        suppress_warnings = suppress_warnings,
+        permutation_count = permutation_count,
+        permutation_seed = permutation_seed
       )
 
       # Warn about NA removal

diff --git a/R/general_tools.R b/R/general_tools.R
@@ -20,13 +20,31 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou
 #' @param metric Character string specifying which R² metric to compute. Options are "CoxSnell" (default) or "Nagelkerke".
 #' @param suppress_warnings Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs).
 #' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.
+#' @param permutation_count Integer. Number of times the null model is generated by
+#'   permuting the cells (columns) of `ZDB_matrix` and recomputing the pseudo
+#'   correlation. Each permutation yields one null value per event, so the n draws
+#'   form a per-event empirical null. Defaults to `100L`. Use `1L` to reproduce the
+#'   original single-permutation behaviour. Values of 1000 or more are recommended
+#'   for reliable empirical FDR estimation (the smallest achievable empirical
+#'   p-value is approximately `1 / (permutation_count + 1)`).
+#' @param permutation_seed Optional single numeric value. If supplied, the random
+#'   number generator is seeded with it before the permutations are drawn, making
+#'   the null distribution (and therefore the empirical p-values) reproducible. The
+#'   caller's global RNG stream is saved and restored, so passing a seed does not
+#'   perturb downstream randomness. Defaults to `NULL` (use the ambient RNG state).
 #'
 #' @return A `data.table` with the following columns:
 #' \describe{
 #'   \item{event}{The event names from `ZDB_matrix` rownames.}
 #'   \item{pseudo_correlation}{The computed pseudo R² correlation values using the specified metric.}
-#'   \item{null_distribution}{Null correlation values from a permuted version of `ZDB_matrix`.}
+#'   \item{null_distribution}{Mean of the per-event null correlation values across the `permutation_count` permutations (equal to the single null draw when `permutation_count = 1`).}
+#'   \item{null_sd}{Standard deviation of the per-event null draws (`NA` when fewer than two valid draws are available).}
+#'   \item{n_perm_valid}{Number of permutations that produced a non-`NA` null value for the event.}
+#'   \item{emp_pvalue}{Two-sided empirical p-value, `(b + 1) / (n_perm_valid + 1)`, where `b` is the number of valid null draws whose absolute value is greater than or equal to the absolute observed correlation.}
+#'   \item{emp_padj}{Benjamini-Hochberg adjusted `emp_pvalue` across the retained events.}
 #' }
+#' The full matrix of per-event null draws (events x `permutation_count`) is attached
+#' as `attr(result, "null_draws")` for custom downstream analyses.
 #'
 #' @examples
 #' \donttest{
@@ -40,6 +58,11 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou
 #' eventdata <- m1_obj$event_data
 #' m2_exclusion <- make_m2(m1_inclusion, eventdata)
 #'
+#' # subset to a few hundred cells to keep the example quick
+#' cells <- seq_len(min(250, ncol(m1_inclusion)))
+#' m1_inclusion <- m1_inclusion[, cells]
+#' m2_exclusion <- m2_exclusion[, cells]
+#'
 #' # creating a dummy ZDB
 #' ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7),
 #' nrow = nrow(m1_inclusion),
@@ -50,20 +73,29 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou
 #' # Example with dense matrices (backward compatible)
 #' m1_dense <- as.matrix(m1_inclusion)
 #' m2_dense <- as.matrix(m2_exclusion)
-#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense)
+#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense,
+#'                                               permutation_count = 20,
+#'                                               permutation_seed = 1)
 #' print(pseudo_r_square_cox)
 #'
 #' # Example with sparse matrices (more memory efficient)
-#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion)
+#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
+#'                                                  permutation_count = 20,
+#'                                                  permutation_seed = 1)
+#'
+#' # Per-event empirical p-values and the full null draws
+#' head(pseudo_r_square_sparse[, c("event", "pseudo_correlation", "emp_pvalue")])
+#' dim(attr(pseudo_r_square_sparse, "null_draws"))
 #'
 #' # Example using Nagelkerke R-squared instead of Cox-Snell
 #' pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
-#'                                                 metric = "Nagelkerke")
+#'                                                 metric = "Nagelkerke",
+#'                                                 permutation_count = 20)
 #' print(pseudo_r_square_nagel)
 #' }
 #'
 #' @export
-get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE) {
+get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE, permutation_count = 100L, permutation_seed = NULL) {
 
   # Handle SplikitObject input
   if (inherits(ZDB_matrix, "SplikitObject")) {
@@ -88,6 +120,19 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
   # Validate metric parameter
   metric <- match.arg(metric, choices = c("CoxSnell", "Nagelkerke"))
 
+  # Validate permutation controls
+  if (!is.numeric(permutation_count) || length(permutation_count) != 1L ||
+      is.na(permutation_count) || permutation_count < 1) {
+    stop("permutation_count must be a single positive integer.", call. = FALSE)
+  }
+  permutation_count <- as.integer(permutation_count)
+
+  if (!is.null(permutation_seed) &&
+      (!is.numeric(permutation_seed) || length(permutation_seed) != 1L ||
+       is.na(permutation_seed))) {
+    stop("permutation_seed must be NULL or a single numeric value.", call. = FALSE)
+  }
+
   # Check ZDB_matrix (must be dense)
   if (!is.matrix(ZDB_matrix)) stop("ZDB_matrix must be a dense matrix.")
 
@@ -108,7 +153,25 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
 
   if (verbose) message("Input dimension check passed. Proceeding with computation.")
 
-  # Define the computation function
+  # Seed the RNG for reproducible permutations without disturbing the caller's
+  # global RNG stream (state is saved here and restored on exit).
+  if (!is.null(permutation_seed)) {
+    if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
+      old_seed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
+      on.exit(assign(".Random.seed", old_seed, envir = globalenv()), add = TRUE)
+    } else {
+      on.exit(
+        if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
+          rm(".Random.seed", envir = globalenv())
+        },
+        add = TRUE
+      )
+    }
+    set.seed(permutation_seed)
+  }
+
+  # Define the computation function. Returns the observed correlations, the event
+  # names, and the matrix of per-event null draws (events x permutation_count).
   run_computation <- function() {
     # Determine which C++ function to call based on matrix types
     is_m1_sparse <- inherits(m1_inclusion, "Matrix")
@@ -126,77 +189,114 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
                                   "dMatrix")
     }
 
-    if (!is_m1_sparse && !is_m2_sparse) {
-      # Both dense - ensure they are matrices
-      correls <- cppBetabinPseudoR2(Z = ZDB_matrix,
-                                    m1 = as.matrix(m1_inclusion),
-                                    m2 = as.matrix(m2_exclusion),
-                                    metric = metric)
-    } else if (is_m1_sparse && is_m2_sparse) {
-      # Both sparse
-      correls <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix,
-                                           m1 = m1_inclusion,
-                                           m2 = m2_exclusion,
-                                           metric = metric)
-    } else if (is_m1_sparse && !is_m2_sparse) {
-      # m1 sparse, m2 dense
-      correls <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix,
-                                           m1 = m1_inclusion,
-                                           m2 = as.matrix(m2_exclusion),
-                                           metric = metric)
-    } else {
-      # m1 dense, m2 sparse
-      correls <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix,
-                                           m1 = as.matrix(m1_inclusion),
-                                           m2 = m2_exclusion,
-                                           metric = metric)
+    # Dispatch helper: compute the per-event pseudo correlation for a given Z,
+    # selecting the kernel that matches the m1/m2 matrix storage types. Shared
+    # by the observed pass and every null permutation.
+    compute_kernel <- function(Zmat) {
+      if (!is_m1_sparse && !is_m2_sparse) {
+        cppBetabinPseudoR2(Z = Zmat,
+                           m1 = as.matrix(m1_inclusion),
+                           m2 = as.matrix(m2_exclusion),
+                           metric = metric)
+      } else if (is_m1_sparse && is_m2_sparse) {
+        cppBetabinPseudoR2_sparse(Z = Zmat,
+                                  m1 = m1_inclusion,
+                                  m2 = m2_exclusion,
+                                  metric = metric)
+      } else if (is_m1_sparse && !is_m2_sparse) {
+        cppBetabinPseudoR2_mixed1(Z = Zmat,
+                                  m1 = m1_inclusion,
+                                  m2 = as.matrix(m2_exclusion),
+                                  metric = metric)
+      } else {
+        cppBetabinPseudoR2_mixed2(Z = Zmat,
+                                  m1 = as.matrix(m1_inclusion),
+                                  m2 = m2_exclusion,
+                                  metric = metric)
+      }
     }
 
+    # Observed correlation (computed once; independent of the permutations).
+    correls <- compute_kernel(ZDB_matrix)
+
     if (is.null(rownames(ZDB_matrix))) {
       warning("ZDB_matrix has no row names; assigning default event names.")
       events <- paste0("Event_", seq_len(nrow(ZDB_matrix)))
     } else {
       events <- rownames(ZDB_matrix)
     }
 
-    # Calculate null distribution with the same metric and matrix types
-    if (!is_m1_sparse && !is_m2_sparse) {
-      null_dist <- cppBetabinPseudoR2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                      m1 = as.matrix(m1_inclusion),
-                                      m2 = as.matrix(m2_exclusion),
-                                      metric = metric)
-    } else if (is_m1_sparse && is_m2_sparse) {
-      null_dist <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                             m1 = m1_inclusion,
-                                             m2 = m2_exclusion,
-                                             metric = metric)
-    } else if (is_m1_sparse && !is_m2_sparse) {
-      null_dist <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                             m1 = m1_inclusion,
-                                             m2 = as.matrix(m2_exclusion),
-                                             metric = metric)
-    } else {
-      null_dist <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                             m1 = as.matrix(m1_inclusion),
-                                             m2 = m2_exclusion,
-                                             metric = metric)
+    # Null distribution: permute the cells (columns) of Z to break the
+    # cell-to-cell pairing with splicing, recompute, and repeat
+    # permutation_count times. Each column of null_mat is one permutation.
+    n_cells <- ncol(ZDB_matrix)
+    null_mat <- matrix(NA_real_, nrow = nrow(ZDB_matrix), ncol = permutation_count)
+    for (k in seq_len(permutation_count)) {
+      perm_idx <- sample.int(n_cells)
+      null_mat[, k] <- compute_kernel(ZDB_matrix[, perm_idx, drop = FALSE])
     }
 
-    data.table::data.table(
-      event = events,
-      pseudo_correlation = correls,
-      null_distribution = null_dist
-    )
+    list(correls = correls, events = events, null_mat = null_mat)
   }
 
   # Run computation with or without warning suppression
   if (suppress_warnings) {
-    results <- suppressWarnings(run_computation())
+    comp <- suppressWarnings(run_computation())
   } else {
-    results <- run_computation()
+    comp <- run_computation()
   }
 
-  results <- na.omit(results)
+  correls  <- comp$correls
+  events   <- comp$events
+  null_mat <- comp$null_mat
+
+  # Per-event null summaries (vectorised over events).
+  n_valid <- rowSums(!is.na(null_mat))
+  s1 <- rowSums(null_mat, na.rm = TRUE)
+  s2 <- rowSums(null_mat^2, na.rm = TRUE)
+
+  null_mean <- s1 / n_valid
+  null_mean[n_valid < 1L] <- NA_real_
+
+  null_var <- (s2 - (s1^2) / n_valid) / (n_valid - 1)
+  # Clamp tiny negative variances from floating-point cancellation to avoid
+  # spurious sqrt() NaN warnings.
+  null_var[is.finite(null_var) & null_var < 0] <- 0
+  null_sd <- sqrt(null_var)
+  null_sd[n_valid < 2L] <- NA_real_
+
+  # Two-sided empirical p-value with the (b + 1) / (m + 1) correction, comparing
+  # absolute null draws to the absolute observed correlation. NA observed values
+  # and events with no valid null draws yield NA.
+  obs_abs <- abs(correls)
+  exceed <- rowSums(abs(null_mat) >= obs_abs, na.rm = TRUE)
+  emp_pvalue <- (exceed + 1) / (n_valid + 1)
+  emp_pvalue[is.na(correls) | n_valid < 1L] <- NA_real_
+
+  results <- data.table::data.table(
+    event = events,
+    pseudo_correlation = correls,
+    null_distribution = null_mean,
+    null_sd = null_sd,
+    n_perm_valid = as.integer(n_valid),
+    emp_pvalue = emp_pvalue
+  )
+
+  # Retain events with a valid observed correlation and at least one valid null
+  # draw (mirrors the previous na.omit() coupling of observed and null values).
+  keep <- !is.na(results$pseudo_correlation) & !is.na(results$null_distribution)
+  results <- results[keep, ]
+  null_mat <- null_mat[keep, , drop = FALSE]
+
+  # Benjamini-Hochberg adjusted empirical p-values across the retained events.
+  results$emp_padj <- stats::p.adjust(results$emp_pvalue, method = "BH")
+
+  # Attach the full per-event null draws for custom downstream analyses.
+  if (nrow(null_mat) > 0L) {
+    rownames(null_mat) <- results$event
+    colnames(null_mat) <- paste0("perm_", seq_len(permutation_count))
+  }
+  data.table::setattr(results, "null_draws", null_mat)
 
   if (verbose) message("Computation completed successfully.")
   return(results)