DavisLaboratory · Gene233 · Apr 22, 2026 · Apr 20, 2026 · Apr 21, 2026 · Apr 22, 2026
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ inst/doc
 .Rhistory
 .DS_Store
 *.Rproj
+tests/testthat/testdata
diff --git a/NEWS.md b/NEWS.md
@@ -45,3 +45,48 @@
 # smartid 1.7.2
 
 * Fix dplyr defunct.
+
+# smartid (development version)
+
+* Major memory and performance optimization for `cal_score()` and
+  `top_markers()`. On a 20,000 gene x 100,000 cell sparse input peak
+  memory drops from roughly 100 GB to a few GB, and `top_markers()`
+  with the default `gaussian()` family runs in seconds instead of
+  hours.
+* **Breaking change**: `cal_score()` no longer stores the intermediate
+  `tf`, `idf` and `iae` matrices in `metadata()` by default. Callers
+  that relied on `metadata(se)$tf / $idf / $iae` (introduced in v1.1.1)
+  must now pass `return.intermediate = TRUE`. When the flag is `TRUE`,
+  the stored `idf` / `iae` for labelled methods (`prob`, `rf`) are now
+  compact `G x K` matrices (columns = unique labels); expand with
+  `md$idf[, as.character(label)]` to recover the legacy per-cell form.
+* Internal refactor: labelled `idf_prob`, `idf_rf`, `iae_prob` and
+  `iae_rf` helpers now return a compact `G x K` matrix. `cal_score()`
+  composes the final score through per-group column-block
+  multiplication, avoiding the materialisation of full `G x N`
+  intermediates.
+* `cal_score()` no longer forces dense conversion of `dgCMatrix`
+  inputs; the score assay stays sparse throughout the pipeline when
+  the input is sparse.
+* `tf()`, `idf_hdb()`, `iae_hdb()` and all IAE helpers now preserve
+  `dgCMatrix` sparsity by routing column scaling through
+  `Matrix::Diagonal` and replacing the densifying
+  `x[x < 0] <- 0` pattern with `pmax0_offset()`.
+* `top_markers_glm()` has a vectorised closed-form least-squares fast
+  path for the default `gaussian()` + identity link; non-gaussian
+  families or rank-deficient designs automatically fall back to the
+  legacy per-gene `glm()` loop with no behaviour change.
+* `top_markers_abs()` aggregates directly on the scored matrix via
+  `sparseMatrixStats::rowMeans2 / rowMedians / rowMads`, removing the
+  intermediate wide `data.frame` that previously reached tens of GB.
+* `scale_mgm()` caches per-group column indices and collapses the
+  two-step `(expr - mgm) / (sds + 1e-8)` into a single broadcast.
+* The `multi = TRUE` branch of the labelled IDF/IAE helpers switched
+  from an O(G * K^2) `apply()` to an O(G * K) top-1 + top-2 trick via
+  the new `rowwise_notin_max()` helper.
+* New `inst/bench/benchmark_smartid.R` micro-benchmark script; new
+  `tests/testthat/test-numerical-equivalence.R` pins `cal_score()` and
+  `top_markers()` outputs to a frozen pre-refactor snapshot at 1e-10
+  tolerance.
+* No new dependencies; the refactor relies entirely on `Matrix`,
+  `sparseMatrixStats` and base R.
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
@@ -20,6 +20,12 @@
 #'     optional, default 'counts'
 #' @param new.slot a character, specify the name of slot to save score in se object,
 #'     optional, default 'score'
+#' @param return.intermediate logical, if TRUE also return or store the
+#'     intermediate `tf`, `idf` and `iae` matrices. Defaults to FALSE since
+#'     these objects have the same dimension as the input expression matrix
+#'     and can dominate memory usage on large datasets. Set to TRUE to
+#'     restore the pre-1.7.3 behavior where intermediates were kept in
+#'     `metadata()` of the SummarizedExperiment output.
 #'
 #' @return A list of matrices or se object containing combined score
 #'
@@ -42,7 +48,8 @@ setGeneric(
            slot = "counts",
            new.slot = "score",
            par.idf = NULL,
-           par.iae = NULL) {
+           par.iae = NULL,
+           return.intermediate = FALSE) {
     standardGeneric("cal_score")
   }
 )

diff --git a/R/scale_mgm.R b/R/scale_mgm.R
@@ -17,51 +17,45 @@
 #' @examples
 #' scale_mgm(matrix(rnorm(100), 10), label = rep(letters[1:2], 5))
 scale_mgm <- function(expr, label, pooled.sd = FALSE) {
-  if (pooled.sd) {
-    ## compute pooled sds
-    sds <- row_pool_sds(expr, label)
-  } else {
-    ## compute overall sds
-    sds <- sparseMatrixStats::rowSds(expr, na.rm = TRUE)
+  ## Cache column indices per group once; the group-mean and pooled-SD
+  ## paths previously recomputed `label == i` inside every `vapply`
+  ## iteration, which is O(K * N) in scan cost.
+  idx_by_grp <- split(seq_len(ncol(expr)), label)
 
-    # ## compute group sds
-    # sds <- vapply(unique(label), \(i)
-    #               sparseMatrixStats::rowSds(expr[, label == i, drop = FALSE],
-    #                                         na.rm = TRUE),
-    #               rep(1, nrow(expr))
-    #        ) # get sds of each group
-    # sds <- sparseMatrixStats::rowMeans2(sds)
+  sds <- if (isTRUE(pooled.sd)) {
+    row_pool_sds_from_idx(expr, idx_by_grp)
+  } else {
+    sparseMatrixStats::rowSds(expr, na.rm = TRUE)
   }
 
-  ## compute group means
-  mgm <- vapply(
-    unique(label), \(i)
-    sparseMatrixStats::rowMeans2(expr[, label == i, drop = FALSE],
-      na.rm = TRUE
-    ),
-    rep(1, nrow(expr))
-  ) |> # get mean of each group
-    rowMeans(na.rm = TRUE) # get mean of group mean
-
-  ## scale
-  expr <- (expr - mgm) / (sds + 1e-8)
+  # Compute per-group means, then average them to get the mean of group means (MGM).
+  group_means <- vapply(idx_by_grp, function(cols)
+    sparseMatrixStats::rowMeans2(expr[, cols, drop = FALSE], na.rm = TRUE),
+    numeric(nrow(expr)))
+  mgm <- rowMeans(group_means, na.rm = TRUE) # mean of per-group means
+
+  # scale
+  ## Single broadcast + single allocation: `(expr - mgm) * inv_sd`
+  ## collapses the prior two temporaries ((expr - mgm), then divide) into
+  ## one. Division-by-zero is guarded by the additive epsilon.
+  inv_sd <- 1 / (sds + 1e-8)
+  (expr - mgm) * inv_sd
+}
 
-  # expr[is.na(expr)] <- 0 # assign 0 to NA when sd = 0
 
-  return(expr)
+## Row-wise pooled SDs given pre-computed per-group column indices.
+row_pool_sds_from_idx <- function(expr, idx_by_grp) {
+  # Compute per-group variances, then combine them with the group sizes to get the pooled SD.
+  vars <- vapply(idx_by_grp, function(cols)
+    sparseMatrixStats::rowVars(expr[, cols, drop = FALSE], na.rm = TRUE),
+    numeric(nrow(expr)))
+  # Group sizes: number of columns in each group.
+  ng <- lengths(idx_by_grp)
+  as.numeric(sqrt((vars %*% cbind(ng - 1)) / sum(ng - 1)))
 }
 
-
-## row-wise pooled SDs
+## Back-compat shim: preserves the old `row_pool_sds(expr, label)`
+## internal call signature in case any caller still uses it.
 row_pool_sds <- function(expr, label) {
-  sds <- vapply(
-    unique(label), \(i)
-    sparseMatrixStats::rowVars(expr[, label == i, drop = FALSE], na.rm = TRUE),
-    rep(1, nrow(expr))
-  ) # get vars of each group
-  ng <- table(label)[unique(label)] # get group sizes in the same order
-  sds <- sds %*% cbind(ng - 1)
-  sds <- as.numeric(sqrt(sds / sum(ng - 1)))
-
-  return(sds)
+  row_pool_sds_from_idx(expr, split(seq_len(ncol(expr)), label))
 }
diff --git a/R/score-methods.R b/R/score-methods.R
@@ -11,14 +11,16 @@ setMethod(
            idf = "prob",
            iae = "prob",
            par.idf = NULL,
-           par.iae = NULL) {
+           par.iae = NULL,
+           return.intermediate = FALSE) {
     score <- cal_score_init(
-      expr = as.matrix(data),
+      expr = data,
       tf = tf,
       idf = idf,
       iae = iae,
       par.idf = par.idf,
-      par.iae = par.iae
+      par.iae = par.iae,
+      return.intermediate = return.intermediate
     )
 
     return(score)
@@ -37,7 +39,8 @@ setMethod(
            slot = "counts",
            new.slot = "score",
            par.idf = NULL,
-           par.iae = NULL) {
+           par.iae = NULL,
+           return.intermediate = FALSE) {
     ## get expr
     expr <- SummarizedExperiment::assay(data, i = slot)
     ## get label
@@ -54,13 +57,24 @@ setMethod(
       idf = idf,
       iae = iae,
       par.idf = par.idf,
-      par.iae = par.iae
+      par.iae = par.iae,
+      return.intermediate = return.intermediate
     )
 
     SummarizedExperiment::assay(data, i = new.slot) <- res$score
-    slot(data, "metadata")$tf <- res$tf
-    slot(data, "metadata")$idf <- res$idf
-    slot(data, "metadata")$iae <- res$iae
+
+    ## Store or clear intermediate matrices depending on caller request.
+    ## Explicitly clearing stale values prevents old metadata from leaking
+    ## when `cal_score()` is re-run on an object that previously held them.
+    if (isTRUE(return.intermediate)) {
+      slot(data, "metadata")$tf <- res$tf
+      slot(data, "metadata")$idf <- res$idf
+      slot(data, "metadata")$iae <- res$iae
+    } else {
+      slot(data, "metadata")$tf <- NULL
+      slot(data, "metadata")$idf <- NULL
+      slot(data, "metadata")$iae <- NULL
+    }
 
     return(data)
   }

diff --git a/R/score.R b/R/score.R
@@ -57,8 +57,13 @@ idf_iae_methods <- function() {
 #'     be accessed using [idf_iae_methods()]
 #' @param par.idf other parameters for specified IDF methods
 #' @param par.iae other parameters for specified IAE methods
+#' @param return.intermediate logical, if TRUE the returned list also contains
+#'     the intermediate `tf`, `idf` and `iae` objects. Default `FALSE` keeps
+#'     only the combined `score` to avoid the memory overhead of three extra
+#'     feature-by-cell matrices on large inputs.
 #'
-#' @return a list of combined score, tf, idf and iae
+#' @return a list always containing `score`; when `return.intermediate = TRUE`
+#'     the list additionally contains `tf`, `idf` and `iae`.
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -69,14 +74,17 @@ idf_iae_methods <- function() {
 #' )
 cal_score_init <- function(expr, tf = c("logtf", "tf"),
                            idf = "prob", iae = "prob",
-                           par.idf = NULL, par.iae = NULL) {
+                           par.idf = NULL, par.iae = NULL,
+                           return.intermediate = FALSE) {
   ## check
   tf <- match.arg(tf)
   idf <- match.arg(idf, choices = idf_iae_methods())
   iae <- match.arg(iae, choices = idf_iae_methods())
   stopifnot(
     "par.idf must be a named list or NULL" = is.null(par.idf) | is.list(par.idf),
-    "par.iae must be a named list or NULL" = is.null(par.iae) | is.list(par.iae)
+    "par.iae must be a named list or NULL" = is.null(par.iae) | is.list(par.iae),
+    "return.intermediate must be a single logical" =
+      is.logical(return.intermediate) && length(return.intermediate) == 1L
   )
 
   ## compute tf
@@ -98,8 +106,74 @@ cal_score_init <- function(expr, tf = c("logtf", "tf"),
     iae <- do.call(iae, c(list(expr = expr), par.iae))
   }
 
-  ## combined score
-  score <- tf * idf * iae
+  ## combined score via per-group column-block broadcast; avoids
+  ## materialising the full G x N copy that a naive `tf * idf * iae`
+  ## would trigger when either factor is a G x K compact matrix.
+  score <- combine_tf_idf_iae(tf, idf, iae,
+                              label_idf = par.idf$label,
+                              label_iae = par.iae$label)
 
-  return(list(score = score, tf = tf, idf = idf, iae = iae))
+  if (isTRUE(return.intermediate)) {
+    return(list(score = score, tf = tf, idf = idf, iae = iae))
+  }
+  list(score = score)
+}
+
+## Per-group column-block composition of score = tf * idf * iae.
+##
+## Each factor can be one of:
+##   * scalar 1 (the "null" path),
+##   * a G-vector (cell-independent, e.g. idf/iae/idf_sd/iae_sd/idf_igm/iae_igm),
+##   * a G x K compact matrix with colnames = unique labels (idf_prob, idf_rf,
+##     iae_prob, iae_rf after Phase B),
+##   * a full G x N matrix (idf_m, iae_m, idf_hdb, iae_hdb — the latter two
+##     expand internally to preserve their legacy contract).
+##
+## When at least one factor is compact we loop over groups and broadcast
+## only the active slice into the corresponding columns of `score`. When
+## both factors are cell-independent or full-cell we fall back to the
+## direct algebraic form.
+combine_tf_idf_iae <- function(tf_mat, idf_obj, iae_obj,
+                               label_idf = NULL, label_iae = NULL) {
+  N <- ncol(tf_mat)
+  is_compact <- function(x) {
+    if (is.null(dim(x))) return(FALSE)
+    !is.null(colnames(x)) && ncol(x) < N
+  }
+  idf_is_gk <- is_compact(idf_obj)
+  iae_is_gk <- is_compact(iae_obj)
+
+  if (!idf_is_gk && !iae_is_gk) {
+    ## no compact factor; direct algebra preserves sparsity for scalar /
+    ## G-vector factors and only densifies when a factor is already G x N.
+    return(tf_mat * idf_obj * iae_obj)
+  }
+
+  label <- label_idf %||% label_iae
+  stopifnot(
+    "par.idf$label or par.iae$label is required when idf/iae return compact G x K matrices" =
+      !is.null(label),
+    "length(label) must equal ncol(expr)" = length(label) == N
+  )
+  label_ch <- as.character(label)
+
+  ## Slice a factor for a given (column subset, group name) pair.
+  slice_factor <- function(x, cols, group_name) {
+    if (is.null(dim(x))) return(x)             # scalar or G-vector
+    if (ncol(x) == N) return(x[, cols, drop = FALSE])  # G x N full
+    x[, group_name]                             # G x K compact
+  }
+
+  score <- tf_mat
+  for (g in unique(label_ch)) {
+    cols <- which(label_ch == g)
+    if (!length(cols)) next
+    idf_g <- slice_factor(idf_obj, cols, g)
+    iae_g <- slice_factor(iae_obj, cols, g)
+    score[, cols] <- score[, cols, drop = FALSE] * idf_g * iae_g
+  }
+  score
 }
+
+## Null-coalescing helper (kept local to avoid a new Imports).
+`%||%` <- function(x, y) if (is.null(x)) y else x