From 381c4a2ab07ca43cc6c782b0365eaaab19b30289 Mon Sep 17 00:00:00 2001
From: Arshammik <79arsham@gmail.com>
Date: Tue, 9 Jun 2026 15:31:36 -0400
Subject: [PATCH] feat: repeated permutation null for get_pseudo_correlation

Add permutation_count (default 100) and permutation_seed arguments to
get_pseudo_correlation() and SplikitObject$getPseudoCorrelation(). The null
model is now built from permutation_count cell-permutations of ZDB_matrix,
yielding a per-event empirical null with the new columns null_sd, n_perm_valid,
emp_pvalue (two-sided, (b+1)/(m+1) corrected) and emp_padj (Benjamini-Hochberg).
The full per-event draws are attached as attr(result, "null_draws").

permutation_seed seeds the permutations reproducibly and saves/restores the
caller's global RNG stream, so it does not perturb downstream randomness.

null_distribution is now the mean of the per-event null draws (equal to the
single draw when permutation_count = 1, preserving backward compatibility);
the observed pseudo_correlation is unchanged.

Bump to 2.3.2; update NEWS.md, RELEASENOTES.md logbook, regenerated docs,
vignettes, and add a dedicated permutation test (validity, seed reproducibility,
global-RNG isolation, count=1 equivalence, input validation).
---
 .Rbuildignore                       |   1 +
 DESCRIPTION                         |   2 +-
 NEWS.md                             |  20 +++
 R/SplikitObject.R                   |  18 ++-
 R/general_tools.R                   | 218 ++++++++++++++++++++--------
 RELEASENOTES.md                     |  39 +++++
 man/SplikitObject.Rd                |  16 +-
 man/get_pseudo_correlation.Rd       |  46 +++++-
 tests/testthat/test-SplikitObject.R |   8 +-
 tests/testthat/test-general_tools.R |  62 ++++++++
 vignettes/methods.Rmd               |   2 +-
 vignettes/splikit_manual.Rmd        |   9 +-
 12 files changed, 366 insertions(+), 75 deletions(-)
 create mode 100644 RELEASENOTES.md

diff --git a/.Rbuildignore b/.Rbuildignore
index 540d634..05c9f00 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -13,3 +13,4 @@
 ^build_pkgdown_site\.R$
 ^CLAUDE\.md$
 ^\.claude$
+^RELEASENOTES\.md$
diff --git a/DESCRIPTION b/DESCRIPTION
index e50d604..8e031ec 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: splikit
 Title: Analysing RNA Splicing in Single-Cell RNA Sequencing Data
-Version: 2.3.1
+Version: 2.3.2
 Authors@R:
     person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-9453-6951"))
diff --git a/NEWS.md b/NEWS.md
index 88dd836..1ed7220 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,23 @@
+# splikit 2.3.2
+
+## New Features
+* **Repeated permutation null for `get_pseudo_correlation()`.** The null model
+  is now built by permuting the cells (columns) of `ZDB_matrix` and recomputing
+  the pseudo correlation `permutation_count` times (default `100`), instead of a
+  single permutation. This yields a per-event empirical null and adds the
+  columns `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided, `(b + 1) / (m + 1)`
+  corrected), and `emp_padj` (Benjamini-Hochberg). The full per-event draws are
+  attached as `attr(result, "null_draws")`.
+* **`permutation_seed` argument** for reproducible permutations. When supplied,
+  the RNG is seeded locally and the caller's global RNG stream is saved and
+  restored, so reproducibility does not perturb downstream randomness.
+* Both additions are exposed on the R6 method `SplikitObject$getPseudoCorrelation()`.
+
+## Behavioural Notes
+* `null_distribution` is now the mean of the per-event null draws; it equals the
+  original single null value when `permutation_count = 1`, preserving backward
+  compatibility. The observed `pseudo_correlation` is unchanged.
+
 # splikit 2.3.1
 
 ## CRAN Compliance
diff --git a/R/SplikitObject.R b/R/SplikitObject.R
index d31ab82..e8d7b97 100644
--- a/R/SplikitObject.R
+++ b/R/SplikitObject.R
@@ -220,10 +220,20 @@ SplikitObject <- R6::R6Class("SplikitObject",
     #'   Must have same dimensions as m1.
     #' @param metric R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell").
     #' @param suppress_warnings Suppress computation warnings (default: TRUE).
+    #' @param permutation_count Integer. Number of cell-permutation null draws per
+    #'   event used to build the empirical null (default: 100L). Use 1L for the
+    #'   original single-permutation behaviour.
+    #' @param permutation_seed Optional single numeric value seeding the
+    #'   permutations for reproducibility without disturbing the global RNG
+    #'   (default: NULL).
     #'
-    #' @return A data.table with event names, pseudo_correlation, and null_distribution.
+    #' @return A data.table with event names, pseudo_correlation, null_distribution,
+    #'   null_sd, n_perm_valid, emp_pvalue, and emp_padj, plus the full null draws
+    #'   in attr(result, "null_draws").
     getPseudoCorrelation = function(ZDB_matrix, metric = "CoxSnell",
-                                     suppress_warnings = TRUE) {
+                                     suppress_warnings = TRUE,
+                                     permutation_count = 100L,
+                                     permutation_seed = NULL) {
 
       private$ensureM2Computed()
 
@@ -249,7 +259,9 @@ SplikitObject <- R6::R6Class("SplikitObject",
         m1_inclusion = self$m1,
         m2_exclusion = self$m2,
         metric = metric,
-        suppress_warnings = suppress_warnings
+        suppress_warnings = suppress_warnings,
+        permutation_count = permutation_count,
+        permutation_seed = permutation_seed
       )
 
       # Warn about NA removal
diff --git a/R/general_tools.R b/R/general_tools.R
index 1798ff9..b1bca5c 100644
--- a/R/general_tools.R
+++ b/R/general_tools.R
@@ -20,13 +20,31 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou
 #' @param metric Character string specifying which R² metric to compute. Options are "CoxSnell" (default) or "Nagelkerke".
 #' @param suppress_warnings Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs).
 #' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.
+#' @param permutation_count Integer. Number of times the null model is generated by
+#'   permuting the cells (columns) of `ZDB_matrix` and recomputing the pseudo
+#'   correlation. Each permutation yields one null value per event, so the n draws
+#'   form a per-event empirical null. Defaults to `100L`. Use `1L` to reproduce the
+#'   original single-permutation behaviour. Values of 1000 or more are recommended
+#'   for reliable empirical FDR estimation (the smallest achievable empirical
+#'   p-value is approximately `1 / (permutation_count + 1)`).
+#' @param permutation_seed Optional single numeric value. If supplied, the random
+#'   number generator is seeded with it before the permutations are drawn, making
+#'   the null distribution (and therefore the empirical p-values) reproducible. The
+#'   caller's global RNG stream is saved and restored, so passing a seed does not
+#'   perturb downstream randomness. Defaults to `NULL` (use the ambient RNG state).
 #'
 #' @return A `data.table` with the following columns:
 #' \describe{
 #'   \item{event}{The event names from `ZDB_matrix` rownames.}
 #'   \item{pseudo_correlation}{The computed pseudo R² correlation values using the specified metric.}
-#'   \item{null_distribution}{Null correlation values from a permuted version of `ZDB_matrix`.}
+#'   \item{null_distribution}{Mean of the per-event null correlation values across the `permutation_count` permutations (equal to the single null draw when `permutation_count = 1`).}
+#'   \item{null_sd}{Standard deviation of the per-event null draws (`NA` when fewer than two valid draws are available).}
+#'   \item{n_perm_valid}{Number of permutations that produced a non-`NA` null value for the event.}
+#'   \item{emp_pvalue}{Two-sided empirical p-value, `(b + 1) / (n_perm_valid + 1)`, where `b` is the number of valid null draws whose absolute value is greater than or equal to the absolute observed correlation.}
+#'   \item{emp_padj}{Benjamini-Hochberg adjusted `emp_pvalue` across the retained events.}
 #' }
+#' The full matrix of per-event null draws (events x `permutation_count`) is attached
+#' as `attr(result, "null_draws")` for custom downstream analyses.
 #'
 #' @examples
 #' \donttest{
@@ -40,6 +58,11 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou
 #' eventdata <- m1_obj$event_data
 #' m2_exclusion <- make_m2(m1_inclusion, eventdata)
 #'
+#' # subset to a few hundred cells to keep the example quick
+#' cells <- seq_len(min(250, ncol(m1_inclusion)))
+#' m1_inclusion <- m1_inclusion[, cells]
+#' m2_exclusion <- m2_exclusion[, cells]
+#'
 #' # creating a dummy ZDB
 #' ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7),
 #' nrow = nrow(m1_inclusion),
@@ -50,20 +73,29 @@ utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "grou
 #' # Example with dense matrices (backward compatible)
 #' m1_dense <- as.matrix(m1_inclusion)
 #' m2_dense <- as.matrix(m2_exclusion)
-#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense)
+#' pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense,
+#'                                               permutation_count = 20,
+#'                                               permutation_seed = 1)
 #' print(pseudo_r_square_cox)
 #'
 #' # Example with sparse matrices (more memory efficient)
-#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion)
+#' pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
+#'                                                  permutation_count = 20,
+#'                                                  permutation_seed = 1)
+#'
+#' # Per-event empirical p-values and the full null draws
+#' head(pseudo_r_square_sparse[, c("event", "pseudo_correlation", "emp_pvalue")])
+#' dim(attr(pseudo_r_square_sparse, "null_draws"))
 #'
 #' # Example using Nagelkerke R-squared instead of Cox-Snell
 #' pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
-#'                                                 metric = "Nagelkerke")
+#'                                                 metric = "Nagelkerke",
+#'                                                 permutation_count = 20)
 #' print(pseudo_r_square_nagel)
 #' }
 #'
 #' @export
-get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE) {
+get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE, permutation_count = 100L, permutation_seed = NULL) {
 
   # Handle SplikitObject input
   if (inherits(ZDB_matrix, "SplikitObject")) {
@@ -88,6 +120,19 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
   # Validate metric parameter
   metric <- match.arg(metric, choices = c("CoxSnell", "Nagelkerke"))
 
+  # Validate permutation controls
+  if (!is.numeric(permutation_count) || length(permutation_count) != 1L ||
+      is.na(permutation_count) || permutation_count < 1) {
+    stop("permutation_count must be a single positive integer.", call. = FALSE)
+  }
+  permutation_count <- as.integer(permutation_count)
+
+  if (!is.null(permutation_seed) &&
+      (!is.numeric(permutation_seed) || length(permutation_seed) != 1L ||
+       is.na(permutation_seed))) {
+    stop("permutation_seed must be NULL or a single numeric value.", call. = FALSE)
+  }
+
   # Check ZDB_matrix (must be dense)
   if (!is.matrix(ZDB_matrix)) stop("ZDB_matrix must be a dense matrix.")
 
@@ -108,7 +153,25 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
 
   if (verbose) message("Input dimension check passed. Proceeding with computation.")
 
-  # Define the computation function
+  # Seed the RNG for reproducible permutations without disturbing the caller's
+  # global RNG stream (state is saved here and restored on exit).
+  if (!is.null(permutation_seed)) {
+    if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
+      old_seed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
+      on.exit(assign(".Random.seed", old_seed, envir = globalenv()), add = TRUE)
+    } else {
+      on.exit(
+        if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
+          rm(".Random.seed", envir = globalenv())
+        },
+        add = TRUE
+      )
+    }
+    set.seed(permutation_seed)
+  }
+
+  # Define the computation function. Returns the observed correlations, the event
+  # names, and the matrix of per-event null draws (events x permutation_count).
   run_computation <- function() {
     # Determine which C++ function to call based on matrix types
     is_m1_sparse <- inherits(m1_inclusion, "Matrix")
@@ -126,32 +189,36 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
                                   "dMatrix")
     }
 
-    if (!is_m1_sparse && !is_m2_sparse) {
-      # Both dense - ensure they are matrices
-      correls <- cppBetabinPseudoR2(Z = ZDB_matrix,
-                                    m1 = as.matrix(m1_inclusion),
-                                    m2 = as.matrix(m2_exclusion),
-                                    metric = metric)
-    } else if (is_m1_sparse && is_m2_sparse) {
-      # Both sparse
-      correls <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix,
-                                           m1 = m1_inclusion,
-                                           m2 = m2_exclusion,
-                                           metric = metric)
-    } else if (is_m1_sparse && !is_m2_sparse) {
-      # m1 sparse, m2 dense
-      correls <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix,
-                                           m1 = m1_inclusion,
-                                           m2 = as.matrix(m2_exclusion),
-                                           metric = metric)
-    } else {
-      # m1 dense, m2 sparse
-      correls <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix,
-                                           m1 = as.matrix(m1_inclusion),
-                                           m2 = m2_exclusion,
-                                           metric = metric)
+    # Dispatch helper: compute the per-event pseudo correlation for a given Z,
+    # selecting the kernel that matches the m1/m2 matrix storage types. Shared
+    # by the observed pass and every null permutation.
+    compute_kernel <- function(Zmat) {
+      if (!is_m1_sparse && !is_m2_sparse) {
+        cppBetabinPseudoR2(Z = Zmat,
+                           m1 = as.matrix(m1_inclusion),
+                           m2 = as.matrix(m2_exclusion),
+                           metric = metric)
+      } else if (is_m1_sparse && is_m2_sparse) {
+        cppBetabinPseudoR2_sparse(Z = Zmat,
+                                  m1 = m1_inclusion,
+                                  m2 = m2_exclusion,
+                                  metric = metric)
+      } else if (is_m1_sparse && !is_m2_sparse) {
+        cppBetabinPseudoR2_mixed1(Z = Zmat,
+                                  m1 = m1_inclusion,
+                                  m2 = as.matrix(m2_exclusion),
+                                  metric = metric)
+      } else {
+        cppBetabinPseudoR2_mixed2(Z = Zmat,
+                                  m1 = as.matrix(m1_inclusion),
+                                  m2 = m2_exclusion,
+                                  metric = metric)
+      }
     }
 
+    # Observed correlation (computed once; independent of the permutations).
+    correls <- compute_kernel(ZDB_matrix)
+
     if (is.null(rownames(ZDB_matrix))) {
       warning("ZDB_matrix has no row names; assigning default event names.")
       events <- paste0("Event_", seq_len(nrow(ZDB_matrix)))
@@ -159,44 +226,77 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion
       events <- rownames(ZDB_matrix)
     }
 
-    # Calculate null distribution with the same metric and matrix types
-    if (!is_m1_sparse && !is_m2_sparse) {
-      null_dist <- cppBetabinPseudoR2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                      m1 = as.matrix(m1_inclusion),
-                                      m2 = as.matrix(m2_exclusion),
-                                      metric = metric)
-    } else if (is_m1_sparse && is_m2_sparse) {
-      null_dist <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                             m1 = m1_inclusion,
-                                             m2 = m2_exclusion,
-                                             metric = metric)
-    } else if (is_m1_sparse && !is_m2_sparse) {
-      null_dist <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                             m1 = m1_inclusion,
-                                             m2 = as.matrix(m2_exclusion),
-                                             metric = metric)
-    } else {
-      null_dist <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))],
-                                             m1 = as.matrix(m1_inclusion),
-                                             m2 = m2_exclusion,
-                                             metric = metric)
+    # Null distribution: permute the cells (columns) of Z to break the
+    # cell-to-cell pairing with splicing, recompute, and repeat
+    # permutation_count times. Each column of null_mat is one permutation.
+    n_cells <- ncol(ZDB_matrix)
+    null_mat <- matrix(NA_real_, nrow = nrow(ZDB_matrix), ncol = permutation_count)
+    for (k in seq_len(permutation_count)) {
+      perm_idx <- sample.int(n_cells)
+      null_mat[, k] <- compute_kernel(ZDB_matrix[, perm_idx, drop = FALSE])
     }
 
-    data.table::data.table(
-      event = events,
-      pseudo_correlation = correls,
-      null_distribution = null_dist
-    )
+    list(correls = correls, events = events, null_mat = null_mat)
   }
 
   # Run computation with or without warning suppression
   if (suppress_warnings) {
-    results <- suppressWarnings(run_computation())
+    comp <- suppressWarnings(run_computation())
   } else {
-    results <- run_computation()
+    comp <- run_computation()
   }
 
-  results <- na.omit(results)
+  correls  <- comp$correls
+  events   <- comp$events
+  null_mat <- comp$null_mat
+
+  # Per-event null summaries (vectorised over events).
+  n_valid <- rowSums(!is.na(null_mat))
+  s1 <- rowSums(null_mat, na.rm = TRUE)
+  s2 <- rowSums(null_mat^2, na.rm = TRUE)
+
+  null_mean <- s1 / n_valid
+  null_mean[n_valid < 1L] <- NA_real_
+
+  null_var <- (s2 - (s1^2) / n_valid) / (n_valid - 1)
+  # Clamp tiny negative variances from floating-point cancellation to avoid
+  # spurious sqrt() NaN warnings.
+  null_var[is.finite(null_var) & null_var < 0] <- 0
+  null_sd <- sqrt(null_var)
+  null_sd[n_valid < 2L] <- NA_real_
+
+  # Two-sided empirical p-value with the (b + 1) / (m + 1) correction, comparing
+  # absolute null draws to the absolute observed correlation. NA observed values
+  # and events with no valid null draws yield NA.
+  obs_abs <- abs(correls)
+  exceed <- rowSums(abs(null_mat) >= obs_abs, na.rm = TRUE)
+  emp_pvalue <- (exceed + 1) / (n_valid + 1)
+  emp_pvalue[is.na(correls) | n_valid < 1L] <- NA_real_
+
+  results <- data.table::data.table(
+    event = events,
+    pseudo_correlation = correls,
+    null_distribution = null_mean,
+    null_sd = null_sd,
+    n_perm_valid = as.integer(n_valid),
+    emp_pvalue = emp_pvalue
+  )
+
+  # Retain events with a valid observed correlation and at least one valid null
+  # draw (mirrors the previous na.omit() coupling of observed and null values).
+  keep <- !is.na(results$pseudo_correlation) & !is.na(results$null_distribution)
+  results <- results[keep, ]
+  null_mat <- null_mat[keep, , drop = FALSE]
+
+  # Benjamini-Hochberg adjusted empirical p-values across the retained events.
+  results$emp_padj <- stats::p.adjust(results$emp_pvalue, method = "BH")
+
+  # Attach the full per-event null draws for custom downstream analyses.
+  if (nrow(null_mat) > 0L) {
+    rownames(null_mat) <- results$event
+    colnames(null_mat) <- paste0("perm_", seq_len(permutation_count))
+  }
+  data.table::setattr(results, "null_draws", null_mat)
 
   if (verbose) message("Computation completed successfully.")
   return(results)
diff --git a/RELEASENOTES.md b/RELEASENOTES.md
new file mode 100644
index 0000000..d4beb2e
--- /dev/null
+++ b/RELEASENOTES.md
@@ -0,0 +1,39 @@
+# splikit — Release Logbook
+
+A chronological logbook of notable changes. Each entry is dated and signed by the
+person who made the change, with a short summary of what was done and why. For the
+canonical, CRAN-facing changelog see `NEWS.md`.
+
+---
+
+## 2.3.2 — 2026-06-09
+
+**Author:** Arsham Mikaeili Namini
+
+**Summary — Repeated permutation null for pseudo correlation.**
+
+- Extended `get_pseudo_correlation()` (and the R6 method
+  `SplikitObject$getPseudoCorrelation()`) to build the null model from multiple
+  cell-permutations instead of a single one.
+- Added two arguments:
+  - `permutation_count` (default `100`): number of times the cells (columns) of
+    `ZDB_matrix` are permuted and the pseudo correlation recomputed, producing a
+    per-event empirical null. `permutation_count = 1` reproduces the original
+    single-permutation behaviour.
+  - `permutation_seed` (default `NULL`): seeds the permutations for reproducible
+    nulls and empirical p-values. The caller's global RNG stream is saved and
+    restored, so a seed here does not disturb downstream randomness.
+- The observed correlation is computed once; only the null pass scales with
+  `permutation_count`.
+- New output columns: `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided,
+  `(b + 1) / (m + 1)` corrected), `emp_padj` (Benjamini-Hochberg). The full
+  per-event null draws are attached as `attr(result, "null_draws")`.
+- `null_distribution` is now the mean of the per-event null draws (equal to the
+  single draw when `permutation_count = 1`); `pseudo_correlation` is unchanged.
+- Tests: added a dedicated permutation test (validity of empirical p-values,
+  seed reproducibility, global-RNG isolation, `permutation_count = 1`
+  equivalence, input validation); the full-data R6 test runs with a small
+  `permutation_count` to stay fast.
+- Version bumped to 2.3.2; `NEWS.md` updated.
+
+— Arsham Mikaeili Namini
diff --git a/man/SplikitObject.Rd b/man/SplikitObject.Rd
index ce44ade..f3993b2 100644
--- a/man/SplikitObject.Rd
+++ b/man/SplikitObject.Rd
@@ -201,7 +201,9 @@ Compute pseudo-correlation between splicing and external data.
 \if{html}{\out{<div class="r">}}\preformatted{SplikitObject$getPseudoCorrelation(
   ZDB_matrix,
   metric = "CoxSnell",
-  suppress_warnings = TRUE
+  suppress_warnings = TRUE,
+  permutation_count = 100L,
+  permutation_seed = NULL
 )}\if{html}{\out{</div>}}
 }
 
@@ -214,11 +216,21 @@ Must have same dimensions as m1.}
 \item{\code{metric}}{R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell").}
 
 \item{\code{suppress_warnings}}{Suppress computation warnings (default: TRUE).}
+
+\item{\code{permutation_count}}{Integer. Number of cell-permutation null draws per
+event used to build the empirical null (default: 100L). Use 1L for the
+original single-permutation behaviour.}
+
+\item{\code{permutation_seed}}{Optional single numeric value seeding the
+permutations for reproducibility without disturbing the global RNG
+(default: NULL).}
 }
 \if{html}{\out{</div>}}
 }
 \subsection{Returns}{
-A data.table with event names, pseudo_correlation, and null_distribution.
+A data.table with event names, pseudo_correlation, null_distribution,
+null_sd, n_perm_valid, emp_pvalue, and emp_padj, plus the full null draws
+in attr(result, "null_draws").
 }
 }
 \if{html}{\out{<hr>}}
diff --git a/man/get_pseudo_correlation.Rd b/man/get_pseudo_correlation.Rd
index 3414efa..407a935 100644
--- a/man/get_pseudo_correlation.Rd
+++ b/man/get_pseudo_correlation.Rd
@@ -10,7 +10,9 @@ get_pseudo_correlation(
   m2_exclusion = NULL,
   metric = "CoxSnell",
   suppress_warnings = TRUE,
-  verbose = FALSE
+  verbose = FALSE,
+  permutation_count = 100L,
+  permutation_seed = NULL
 )
 }
 \arguments{
@@ -25,14 +27,34 @@ get_pseudo_correlation(
 \item{suppress_warnings}{Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs).}
 
 \item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.}
+
+\item{permutation_count}{Integer. Number of times the null model is generated by
+permuting the cells (columns) of \code{ZDB_matrix} and recomputing the pseudo
+correlation. Each permutation yields one null value per event, so the n draws
+form a per-event empirical null. Defaults to \code{100L}. Use \code{1L} to reproduce the
+original single-permutation behaviour. Values of 1000 or more are recommended
+for reliable empirical FDR estimation (the smallest achievable empirical
+p-value is approximately \code{1 / (permutation_count + 1)}).}
+
+\item{permutation_seed}{Optional single numeric value. If supplied, the random
+number generator is seeded with it before the permutations are drawn, making
+the null distribution (and therefore the empirical p-values) reproducible. The
+caller's global RNG stream is saved and restored, so passing a seed does not
+perturb downstream randomness. Defaults to \code{NULL} (use the ambient RNG state).}
 }
 \value{
 A \code{data.table} with the following columns:
 \describe{
 \item{event}{The event names from \code{ZDB_matrix} rownames.}
 \item{pseudo_correlation}{The computed pseudo R² correlation values using the specified metric.}
-\item{null_distribution}{Null correlation values from a permuted version of \code{ZDB_matrix}.}
+\item{null_distribution}{Mean of the per-event null correlation values across the \code{permutation_count} permutations (equal to the single null draw when \code{permutation_count = 1}).}
+\item{null_sd}{Standard deviation of the per-event null draws (\code{NA} when fewer than two valid draws are available).}
+\item{n_perm_valid}{Number of permutations that produced a non-\code{NA} null value for the event.}
+\item{emp_pvalue}{Two-sided empirical p-value, \code{(b + 1) / (n_perm_valid + 1)}, where \code{b} is the number of valid null draws whose absolute value is greater than or equal to the absolute observed correlation.}
+\item{emp_padj}{Benjamini-Hochberg adjusted \code{emp_pvalue} across the retained events.}
 }
+The full matrix of per-event null draws (events x \code{permutation_count}) is attached
+as \code{attr(result, "null_draws")} for custom downstream analyses.
 }
 \description{
 This function calculates a pseudo R²-like correlation metric using a beta-binomial model
@@ -52,6 +74,11 @@ m1_inclusion <- m1_obj$m1_inclusion_matrix
 eventdata <- m1_obj$event_data
 m2_exclusion <- make_m2(m1_inclusion, eventdata)
 
+# subset to a few hundred cells to keep the example quick
+cells <- seq_len(min(250, ncol(m1_inclusion)))
+m1_inclusion <- m1_inclusion[, cells]
+m2_exclusion <- m2_exclusion[, cells]
+
 # creating a dummy ZDB
 ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7),
 nrow = nrow(m1_inclusion),
@@ -62,15 +89,24 @@ rownames(ZDB_matrix) <- rownames(m1_inclusion)
 # Example with dense matrices (backward compatible)
 m1_dense <- as.matrix(m1_inclusion)
 m2_dense <- as.matrix(m2_exclusion)
-pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense)
+pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense,
+                                              permutation_count = 20,
+                                              permutation_seed = 1)
 print(pseudo_r_square_cox)
 
 # Example with sparse matrices (more memory efficient)
-pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion)
+pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
+                                                 permutation_count = 20,
+                                                 permutation_seed = 1)
+
+# Per-event empirical p-values and the full null draws
+head(pseudo_r_square_sparse[, c("event", "pseudo_correlation", "emp_pvalue")])
+dim(attr(pseudo_r_square_sparse, "null_draws"))
 
 # Example using Nagelkerke R-squared instead of Cox-Snell
 pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
-                                                metric = "Nagelkerke")
+                                                metric = "Nagelkerke",
+                                                permutation_count = 20)
 print(pseudo_r_square_nagel)
 }
 
diff --git a/tests/testthat/test-SplikitObject.R b/tests/testthat/test-SplikitObject.R
index adb8b74..d3a3adf 100644
--- a/tests/testthat/test-SplikitObject.R
+++ b/tests/testthat/test-SplikitObject.R
@@ -659,9 +659,13 @@ test_that("SplikitObject$getPseudoCorrelation executes correctly", {
   set.seed(42)
   ZDB_matrix <- matrix(rnorm(n_events * n_cells), nrow = n_events, ncol = n_cells)
   
-  res <- obj$getPseudoCorrelation(ZDB_matrix = ZDB_matrix, suppress_warnings = TRUE)
+  # Keep permutation_count small here: this runs on the full 2000 x 2000 toy data.
+  res <- obj$getPseudoCorrelation(ZDB_matrix = ZDB_matrix, suppress_warnings = TRUE,
+                                  permutation_count = 3, permutation_seed = 1)
   expect_true(data.table::is.data.table(res))
   # With pure random data, some might be NA and removed, check columns
-  expect_true(all(c("event", "pseudo_correlation", "null_distribution") %in% names(res)))
+  expect_true(all(c("event", "pseudo_correlation", "null_distribution",
+                    "null_sd", "n_perm_valid", "emp_pvalue", "emp_padj") %in% names(res)))
+  expect_true(all(res$emp_pvalue > 0 & res$emp_pvalue <= 1))
 })
 
diff --git a/tests/testthat/test-general_tools.R b/tests/testthat/test-general_tools.R
index 89686d4..30222b1 100644
--- a/tests/testthat/test-general_tools.R
+++ b/tests/testthat/test-general_tools.R
@@ -35,6 +35,68 @@ test_that("get_pseudo_correlation works on valid data", {
   expect_equal(res_obj$pseudo_correlation, res$pseudo_correlation)
 })
 
+test_that("get_pseudo_correlation permutation null is valid and reproducible", {
+  n_events <- 6
+  n_cells <- 120
+  set.seed(11)
+
+  m1 <- matrix(rbinom(n_events * n_cells, size = 10, prob = 0.5), nrow = n_events)
+  m2 <- matrix(rbinom(n_events * n_cells, size = 10, prob = 0.5), nrow = n_events)
+  ZDB <- matrix(rnorm(n_events * n_cells), nrow = n_events, ncol = n_cells)
+  rownames(m1) <- rownames(m2) <- rownames(ZDB) <- paste0("event_", seq_len(n_events))
+
+  res <- get_pseudo_correlation(ZDB, m1, m2,
+                                permutation_count = 50, permutation_seed = 123)
+
+  # New columns are present and well formed
+  expect_true(all(c("null_sd", "n_perm_valid", "emp_pvalue", "emp_padj") %in% names(res)))
+  expect_true(all(res$emp_pvalue > 0 & res$emp_pvalue <= 1))
+  expect_true(all(res$emp_padj >= 0 & res$emp_padj <= 1))
+  expect_true(all(res$n_perm_valid <= 50L))
+  # Smallest achievable empirical p-value is 1 / (n_perm_valid + 1)
+  expect_true(all(res$emp_pvalue >= 1 / (res$n_perm_valid + 1) - 1e-9))
+
+  # Full null draws attached as an attribute, correctly shaped
+  draws <- attr(res, "null_draws")
+  expect_true(is.matrix(draws))
+  expect_equal(ncol(draws), 50L)
+  expect_equal(nrow(draws), nrow(res))
+
+  # Same seed -> identical null; observed correlation is seed-independent
+  res2 <- get_pseudo_correlation(ZDB, m1, m2,
+                                 permutation_count = 50, permutation_seed = 123)
+  expect_equal(res$null_distribution, res2$null_distribution)
+  expect_equal(res$emp_pvalue, res2$emp_pvalue)
+  expect_equal(res$pseudo_correlation, res2$pseudo_correlation)
+
+  # Different seed -> different draws, but the same observed correlation
+  res3 <- get_pseudo_correlation(ZDB, m1, m2,
+                                 permutation_count = 50, permutation_seed = 999)
+  expect_false(isTRUE(all.equal(attr(res, "null_draws"), attr(res3, "null_draws"))))
+  expect_equal(res$pseudo_correlation, res3$pseudo_correlation)
+
+  # permutation_seed must not perturb the caller's global RNG stream
+  set.seed(7); a <- runif(3)
+  invisible(get_pseudo_correlation(ZDB, m1, m2,
+                                   permutation_count = 10, permutation_seed = 42))
+  set.seed(7); b <- runif(3)
+  expect_equal(a, b)
+
+  # permutation_count = 1 reproduces the single-draw behaviour: the mean null
+  # equals the single column of draws
+  res1 <- get_pseudo_correlation(ZDB, m1, m2,
+                                 permutation_count = 1, permutation_seed = 5)
+  expect_equal(ncol(attr(res1, "null_draws")), 1L)
+  expect_equal(unname(res1$null_distribution),
+               unname(attr(res1, "null_draws")[, 1]))
+
+  # Input validation
+  expect_error(get_pseudo_correlation(ZDB, m1, m2, permutation_count = 0),
+               "permutation_count")
+  expect_error(get_pseudo_correlation(ZDB, m1, m2, permutation_seed = c(1, 2)),
+               "permutation_seed")
+})
+
 test_that("get_pseudo_correlation error handling works", {
   m1 <- matrix(1, nrow = 2, ncol = 2)
   m2 <- matrix(1, nrow = 2, ncol = 2)
diff --git a/vignettes/methods.Rmd b/vignettes/methods.Rmd
index a0ab14d..f9e47f7 100644
--- a/vignettes/methods.Rmd
+++ b/vignettes/methods.Rmd
@@ -25,7 +25,7 @@ Splice junctions are grouped into *local junction variants* — junctions sharin
 
 ## Event-covariate association
 
-`get_pseudo_correlation()` fits a per-event binomial logistic GLM of the inclusion ratio on a target covariate by iteratively reweighted least squares, and reports a Cox-Snell / Nagelkerke pseudo-R-squared computed from the residual deviance. This quantifies how strongly each event tracks the covariate (e.g. a cluster label or a gene's expression).
+`get_pseudo_correlation()` fits a per-event binomial logistic GLM of the inclusion ratio on a target covariate by iteratively reweighted least squares, and reports a Cox-Snell / Nagelkerke pseudo-R-squared computed from the residual deviance. This quantifies how strongly each event tracks the covariate (e.g. a cluster label or a gene's expression). A per-event empirical null is built by permuting the cells (columns) of the covariate `permutation_count` times (default 100) and recomputing the statistic; this yields a two-sided empirical p-value (`emp_pvalue`) and a Benjamini-Hochberg adjustment (`emp_padj`). The permutations can be made reproducible with `permutation_seed` without disturbing the global RNG stream.
 
 ## Implementation
 
diff --git a/vignettes/splikit_manual.Rmd b/vignettes/splikit_manual.Rmd
index 18ccda6..fdb1f6e 100644
--- a/vignettes/splikit_manual.Rmd
+++ b/vignettes/splikit_manual.Rmd
@@ -681,8 +681,9 @@ length(intersect(hvg_vst[1:500]$events, hvg_dev[1:500]$events))
 
 **Function Signature**:
 ```r
-get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, 
-                      metric = "CoxSnell", suppress_warnings = TRUE)
+get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
+                      metric = "CoxSnell", suppress_warnings = TRUE,
+                      permutation_count = 100L, permutation_seed = NULL)
 ```
 
 **Parameters**:
@@ -690,6 +691,10 @@ get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion,
 - `m1_inclusion`, `m2_exclusion`: Can be either sparse or dense matrices
 - `metric`: R² metric to use - "CoxSnell" (default) or "Nagelkerke"
 - `suppress_warnings`: Whether to suppress computational warnings
+- `permutation_count`: Number of cell-permutation null draws per event (default `100`); use `1` for the original single-permutation behaviour, and `1000`+ for reliable empirical FDR
+- `permutation_seed`: Optional seed for reproducible permutations (does not disturb the global RNG)
+
+**Output columns**: `event`, `pseudo_correlation`, `null_distribution` (mean of the per-event null draws), `null_sd`, `n_perm_valid`, `emp_pvalue` (two-sided empirical p-value), and `emp_padj` (BH-adjusted). The full per-event null draws are attached as `attr(result, "null_draws")`.
 
 **Use Cases**:
 - Correlating splicing patterns with gene expression