diff --git a/.gitignore b/.gitignore
index 9ff2a83..23c6d60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ inst/doc
 .Rhistory
 .DS_Store
 *.Rproj
+tests/testthat/testdata
diff --git a/NEWS.md b/NEWS.md
index 710386d..b7bd73b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -45,3 +45,48 @@
 # smartid 1.7.2
 
 * Fix dplyr defunct.
+
+# smartid (development version)
+
+* Major memory and performance optimization for `cal_score()` and
+  `top_markers()`. On a 20,000 gene x 100,000 cell sparse input peak
+  memory drops from roughly 100 GB to a few GB, and `top_markers()`
+  with the default `gaussian()` family runs in seconds instead of
+  hours.
+* **Breaking change**: `cal_score()` no longer stores the intermediate
+  `tf`, `idf` and `iae` matrices in `metadata()` by default. Callers
+  that relied on `metadata(se)$tf / $idf / $iae` (introduced in v1.1.1)
+  must now pass `return.intermediate = TRUE`. When the flag is `TRUE`,
+  the stored `idf` / `iae` for labelled methods (`prob`, `rf`) are now
+  compact `G x K` matrices (columns = unique labels); expand with
+  `md$idf[, as.character(label)]` to recover the legacy per-cell form.
+* Internal refactor: labelled `idf_prob`, `idf_rf`, `iae_prob` and
+  `iae_rf` helpers now return a compact `G x K` matrix. `cal_score()`
+  composes the final score through per-group column-block
+  multiplication, avoiding the materialisation of full `G x N`
+  intermediates.
+* `cal_score()` no longer forces dense conversion of `dgCMatrix`
+  inputs; the score assay stays sparse throughout the pipeline when
+  the input is sparse.
+* `tf()`, `idf_hdb()`, `iae_hdb()` and all IAE helpers now preserve
+  `dgCMatrix` sparsity by routing column scaling through
+  `Matrix::Diagonal` and replacing the densifying
+  `x[x < 0] <- 0` pattern with `pmax0_offset()`.
+* `top_markers_glm()` has a vectorised closed-form least-squares fast
+  path for the default `gaussian()` + identity link; non-gaussian
+  families or rank-deficient designs automatically fall back to the
+  legacy per-gene `glm()` loop with no behaviour change.
+* `top_markers_abs()` aggregates directly on the scored matrix via
+  `sparseMatrixStats::rowMeans2 / rowMedians / rowMads`, removing the
+  intermediate wide `data.frame` that previously reached tens of GB.
+* `scale_mgm()` caches per-group column indices and collapses the
+  two-step `(expr - mgm) / (sds + 1e-8)` into a single broadcast.
+* The `multi = TRUE` branch of the labelled IDF/IAE helpers switched
+  from an O(G * K^2) `apply()` to an O(G * K) top-1 + top-2 trick via
+  the new `rowwise_notin_max()` helper.
+* New `inst/bench/benchmark_smartid.R` micro-benchmark script; new
+  `tests/testthat/test-numerical-equivalence.R` pins `cal_score()` and
+  `top_markers()` outputs to a frozen pre-refactor snapshot at 1e-10
+  tolerance.
+* No new dependencies; the refactor relies entirely on `Matrix`,
+  `sparseMatrixStats` and base R.
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 97ee8c1..ebe70b4 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -20,6 +20,12 @@
 #'     optional, default 'counts'
 #' @param new.slot a character, specify the name of slot to save score in se object,
 #'     optional, default 'score'
+#' @param return.intermediate logical, if TRUE also return or store the
+#'     intermediate `tf`, `idf` and `iae` matrices. Defaults to FALSE since
+#'     these objects have the same dimension as the input expression matrix
+#'     and can dominate memory usage on large datasets. Set to TRUE to
+#'     restore the pre-1.7.3 behavior where intermediates were kept in
+#'     `metadata()` of the SummarizedExperiment output.
 #'
 #' @return A list of matrices or se object containing combined score
 #'
@@ -42,7 +48,8 @@ setGeneric(
            slot = "counts",
            new.slot = "score",
            par.idf = NULL,
-           par.iae = NULL) {
+           par.iae = NULL,
+           return.intermediate = FALSE) {
     standardGeneric("cal_score")
   }
 )
diff --git a/R/scale_mgm.R b/R/scale_mgm.R
index 86938aa..e60252d 100644
--- a/R/scale_mgm.R
+++ b/R/scale_mgm.R
@@ -17,51 +17,45 @@
 #' @examples
 #' scale_mgm(matrix(rnorm(100), 10), label = rep(letters[1:2], 5))
 scale_mgm <- function(expr, label, pooled.sd = FALSE) {
-  if (pooled.sd) {
-    ## compute pooled sds
-    sds <- row_pool_sds(expr, label)
-  } else {
-    ## compute overall sds
-    sds <- sparseMatrixStats::rowSds(expr, na.rm = TRUE)
+  ## Cache column indices per group once; the group-mean and pooled-SD
+  ## paths previously recomputed `label == i` inside every `vapply`
+  ## iteration, which is O(K * N) in scan cost.
+  idx_by_grp <- split(seq_len(ncol(expr)), label)
 
-    # ## compute group sds
-    # sds <- vapply(unique(label), \(i)
-    #               sparseMatrixStats::rowSds(expr[, label == i, drop = FALSE],
-    #                                         na.rm = TRUE),
-    #               rep(1, nrow(expr))
-    #        ) # get sds of each group
-    # sds <- sparseMatrixStats::rowMeans2(sds)
+  sds <- if (isTRUE(pooled.sd)) {
+    row_pool_sds_from_idx(expr, idx_by_grp)
+  } else {
+    sparseMatrixStats::rowSds(expr, na.rm = TRUE)
   }
 
-  ## compute group means
-  mgm <- vapply(
-    unique(label), \(i)
-    sparseMatrixStats::rowMeans2(expr[, label == i, drop = FALSE],
-      na.rm = TRUE
-    ),
-    rep(1, nrow(expr))
-  ) |> # get mean of each group
-    rowMeans(na.rm = TRUE) # get mean of group mean
-
-  ## scale
-  expr <- (expr - mgm) / (sds + 1e-8)
+  # Compute per-group means, then average them to get the mean of group means (MGM).
+  group_means <- vapply(idx_by_grp, function(cols)
+    sparseMatrixStats::rowMeans2(expr[, cols, drop = FALSE], na.rm = TRUE),
+    numeric(nrow(expr)))
+  mgm <- rowMeans(group_means, na.rm = TRUE) # mean of per-group means
+
+  # scale
+  ## Single broadcast + single allocation: `(expr - mgm) * inv_sd`
+  ## collapses the prior two temporaries ((expr - mgm), then divide) into
+  ## one. Division-by-zero is guarded by the additive epsilon.
+  inv_sd <- 1 / (sds + 1e-8)
+  (expr - mgm) * inv_sd
+}
 
-  # expr[is.na(expr)] <- 0 # assign 0 to NA when sd = 0
 
-  return(expr)
+## Row-wise pooled SDs given pre-computed per-group column indices.
+row_pool_sds_from_idx <- function(expr, idx_by_grp) {
+  # Compute per-group variances, then combine them with the group sizes to get the pooled SD.
+  vars <- vapply(idx_by_grp, function(cols)
+    sparseMatrixStats::rowVars(expr[, cols, drop = FALSE], na.rm = TRUE),
+    numeric(nrow(expr)))
+  # Group sizes: number of columns in each group.
+  ng <- lengths(idx_by_grp)
+  as.numeric(sqrt((vars %*% cbind(ng - 1)) / sum(ng - 1)))
 }
 
-
-## row-wise pooled SDs
+## Back-compat shim: preserves the old `row_pool_sds(expr, label)`
+## internal call signature in case any caller still uses it.
 row_pool_sds <- function(expr, label) {
-  sds <- vapply(
-    unique(label), \(i)
-    sparseMatrixStats::rowVars(expr[, label == i, drop = FALSE], na.rm = TRUE),
-    rep(1, nrow(expr))
-  ) # get vars of each group
-  ng <- table(label)[unique(label)] # get group sizes in the same order
-  sds <- sds %*% cbind(ng - 1)
-  sds <- as.numeric(sqrt(sds / sum(ng - 1)))
-
-  return(sds)
+  row_pool_sds_from_idx(expr, split(seq_len(ncol(expr)), label))
 }
diff --git a/R/score-methods.R b/R/score-methods.R
index 122a278..220797e 100644
--- a/R/score-methods.R
+++ b/R/score-methods.R
@@ -11,14 +11,16 @@ setMethod(
            idf = "prob",
            iae = "prob",
            par.idf = NULL,
-           par.iae = NULL) {
+           par.iae = NULL,
+           return.intermediate = FALSE) {
     score <- cal_score_init(
-      expr = as.matrix(data),
+      expr = data,
       tf = tf,
       idf = idf,
       iae = iae,
       par.idf = par.idf,
-      par.iae = par.iae
+      par.iae = par.iae,
+      return.intermediate = return.intermediate
     )
 
     return(score)
@@ -37,7 +39,8 @@ setMethod(
            slot = "counts",
            new.slot = "score",
            par.idf = NULL,
-           par.iae = NULL) {
+           par.iae = NULL,
+           return.intermediate = FALSE) {
     ## get expr
     expr <- SummarizedExperiment::assay(data, i = slot)
     ## get label
@@ -54,13 +57,24 @@ setMethod(
       idf = idf,
       iae = iae,
       par.idf = par.idf,
-      par.iae = par.iae
+      par.iae = par.iae,
+      return.intermediate = return.intermediate
     )
 
     SummarizedExperiment::assay(data, i = new.slot) <- res$score
-    slot(data, "metadata")$tf <- res$tf
-    slot(data, "metadata")$idf <- res$idf
-    slot(data, "metadata")$iae <- res$iae
+
+    ## Store or clear intermediate matrices depending on caller request.
+    ## Explicitly clearing stale values prevents old metadata from leaking
+    ## when `cal_score()` is re-run on an object that previously held them.
+    if (isTRUE(return.intermediate)) {
+      slot(data, "metadata")$tf <- res$tf
+      slot(data, "metadata")$idf <- res$idf
+      slot(data, "metadata")$iae <- res$iae
+    } else {
+      slot(data, "metadata")$tf <- NULL
+      slot(data, "metadata")$idf <- NULL
+      slot(data, "metadata")$iae <- NULL
+    }
 
     return(data)
   }
diff --git a/R/score.R b/R/score.R
index 786adfa..e777f04 100644
--- a/R/score.R
+++ b/R/score.R
@@ -57,8 +57,13 @@ idf_iae_methods <- function() {
 #'     be accessed using [idf_iae_methods()]
 #' @param par.idf other parameters for specified IDF methods
 #' @param par.iae other parameters for specified IAE methods
+#' @param return.intermediate logical, if TRUE the returned list also contains
+#'     the intermediate `tf`, `idf` and `iae` objects. Default `FALSE` keeps
+#'     only the combined `score` to avoid the memory overhead of three extra
+#'     feature-by-cell matrices on large inputs.
 #'
-#' @return a list of combined score, tf, idf and iae
+#' @return a list always containing `score`; when `return.intermediate = TRUE`
+#'     the list additionally contains `tf`, `idf` and `iae`.
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
@@ -69,14 +74,17 @@ idf_iae_methods <- function() {
 #' )
 cal_score_init <- function(expr, tf = c("logtf", "tf"),
                            idf = "prob", iae = "prob",
-                           par.idf = NULL, par.iae = NULL) {
+                           par.idf = NULL, par.iae = NULL,
+                           return.intermediate = FALSE) {
   ## check
   tf <- match.arg(tf)
   idf <- match.arg(idf, choices = idf_iae_methods())
   iae <- match.arg(iae, choices = idf_iae_methods())
   stopifnot(
     "par.idf must be a named list or NULL" = is.null(par.idf) | is.list(par.idf),
-    "par.iae must be a named list or NULL" = is.null(par.iae) | is.list(par.iae)
+    "par.iae must be a named list or NULL" = is.null(par.iae) | is.list(par.iae),
+    "return.intermediate must be a single logical" =
+      is.logical(return.intermediate) && length(return.intermediate) == 1L
   )
 
   ## compute tf
@@ -98,8 +106,74 @@ cal_score_init <- function(expr, tf = c("logtf", "tf"),
     iae <- do.call(iae, c(list(expr = expr), par.iae))
   }
 
-  ## combined score
-  score <- tf * idf * iae
+  ## combined score via per-group column-block broadcast; avoids
+  ## materialising the full G x N copy that a naive `tf * idf * iae`
+  ## would trigger when either factor is a G x K compact matrix.
+  score <- combine_tf_idf_iae(tf, idf, iae,
+                              label_idf = par.idf$label,
+                              label_iae = par.iae$label)
 
-  return(list(score = score, tf = tf, idf = idf, iae = iae))
+  if (isTRUE(return.intermediate)) {
+    return(list(score = score, tf = tf, idf = idf, iae = iae))
+  }
+  list(score = score)
+}
+
+## Per-group column-block composition of score = tf * idf * iae.
+##
+## Each factor can be one of:
+##   * scalar 1 (the "null" path),
+##   * a G-vector (cell-independent, e.g. idf/iae/idf_sd/iae_sd/idf_igm/iae_igm),
+##   * a G x K compact matrix with colnames = unique labels (idf_prob, idf_rf,
+##     iae_prob, iae_rf after Phase B),
+##   * a full G x N matrix (idf_m, iae_m, idf_hdb, iae_hdb — the latter two
+##     expand internally to preserve their legacy contract).
+##
+## When at least one factor is compact we loop over groups and broadcast
+## only the active slice into the corresponding columns of `score`. When
+## both factors are cell-independent or full-cell we fall back to the
+## direct algebraic form.
+combine_tf_idf_iae <- function(tf_mat, idf_obj, iae_obj,
+                               label_idf = NULL, label_iae = NULL) {
+  N <- ncol(tf_mat)
+  is_compact <- function(x) {
+    if (is.null(dim(x))) return(FALSE)
+    !is.null(colnames(x)) && ncol(x) < N
+  }
+  idf_is_gk <- is_compact(idf_obj)
+  iae_is_gk <- is_compact(iae_obj)
+
+  if (!idf_is_gk && !iae_is_gk) {
+    ## no compact factor; direct algebra preserves sparsity for scalar /
+    ## G-vector factors and only densifies when a factor is already G x N.
+    return(tf_mat * idf_obj * iae_obj)
+  }
+
+  label <- label_idf %||% label_iae
+  stopifnot(
+    "par.idf$label or par.iae$label is required when idf/iae return compact G x K matrices" =
+      !is.null(label),
+    "length(label) must equal ncol(expr)" = length(label) == N
+  )
+  label_ch <- as.character(label)
+
+  ## Slice a factor for a given (column subset, group name) pair.
+  slice_factor <- function(x, cols, group_name) {
+    if (is.null(dim(x))) return(x)             # scalar or G-vector
+    if (ncol(x) == N) return(x[, cols, drop = FALSE])  # G x N full
+    x[, group_name]                             # G x K compact
+  }
+
+  score <- tf_mat
+  for (g in unique(label_ch)) {
+    cols <- which(label_ch == g)
+    if (!length(cols)) next
+    idf_g <- slice_factor(idf_obj, cols, g)
+    iae_g <- slice_factor(iae_obj, cols, g)
+    score[, cols] <- score[, cols, drop = FALSE] * idf_g * iae_g
+  }
+  score
 }
+
+## Null-coalescing helper (kept local to avoid a new Imports).
+`%||%` <- function(x, y) if (is.null(x)) y else x
diff --git a/R/tf_idf_iae_wrappers.R b/R/tf_idf_iae_wrappers.R
index 136e1c9..092dfd5 100644
--- a/R/tf_idf_iae_wrappers.R
+++ b/R/tf_idf_iae_wrappers.R
@@ -1,3 +1,52 @@
+#################################################
+#-----------------Helpers----------------------#
+#################################################
+
+## For each row i and each column k of `mat`, return the maximum of
+## `mat[i, -k]`. Vectorised O(G * K) replacement for the nested
+## `apply(mat, 1, function(x) max(x[names(x) != type]))` pattern used in
+## the multi-class branches of the labelled IDF/IAE helpers.
+rowwise_notin_max <- function(mat) {
+  stopifnot(!is.null(dim(mat)))
+  G <- nrow(mat)
+  K <- ncol(mat)
+  if (K == 1L) {
+    ## no "other" column exists; return -Inf to mark degenerate input
+    out <- matrix(-Inf, nrow = G, ncol = 1L, dimnames = dimnames(mat))
+    return(out)
+  }
+  row_max  <- sparseMatrixStats::rowMaxs(mat, na.rm = TRUE)
+  argmax   <- max.col(mat, ties.method = "first")
+  masked   <- mat
+  masked[cbind(seq_len(G), argmax)] <- -Inf
+  row_max2 <- sparseMatrixStats::rowMaxs(masked, na.rm = TRUE)
+  ## broadcast row_max across all K columns, then overwrite with row_max2
+  ## wherever column index == argmax (i.e. the excluded-self case).
+  notin <- matrix(row_max, nrow = G, ncol = K)
+  for (k in seq_len(K)) {
+    hit <- argmax == k
+    if (any(hit)) notin[hit, k] <- row_max2[hit]
+  }
+  dimnames(notin) <- dimnames(mat)
+  notin
+}
+
+## Sparse-preserving equivalent of `pmax(x - thres, 0)`. For dgCMatrix
+## we mutate the non-zero slot in place and drop structural zeros,
+## avoiding the dense allocation that `x[x < 0] <- 0` would trigger.
+## `thres == 0` short-circuits because scRNA-seq counts are already
+## non-negative, which is the common default path.
+pmax0_offset <- function(x, thres = 0) {
+  if (thres == 0) return(x)
+  if (methods::is(x, "sparseMatrix")) {
+    x@x <- pmax(x@x - thres, 0)
+    return(Matrix::drop0(x))
+  }
+  out <- x - thres
+  out[out < 0] <- 0
+  out
+}
+
 #################################################
 #-----------------TF variants------------------#
 #################################################
@@ -14,18 +63,27 @@
 #' @param expr a count matrix, features in row and cells in column
 #' @param log logical, if to do log-transformation
 #'
-#' @return a matrix of term/gene frequency
+#' @return a matrix of term/gene frequency. For a `dgCMatrix` input the
+#'     returned object preserves sparsity (`log1p(0) == 0`); dense input
+#'     returns a dense matrix.
 #'
 #' @examples
 #' data <- matrix(rpois(100, 2), 10, dimnames = list(1:10))
 #' smartid:::tf(data)
 tf <- function(expr, log = FALSE) {
-  t.f <- sweep(expr, 2, colSums(expr, na.rm = TRUE) + 0.01, FUN = "/")
-  if (log) {
-    t.f <- log1p(t.f)
+  ## Column sums: use Matrix-aware accessor so dgCMatrix stays sparse.
+  cs <- Matrix::colSums(expr, na.rm = TRUE) + 0.01
+  if (methods::is(expr, "sparseMatrix")) {
+    ## Right-multiplying by a diagonal with 1/cs scales each column
+    ## without materialising a dense copy. log1p preserves sparsity
+    ## because log1p(0) == 0.
+    out <- expr %*% Matrix::Diagonal(x = 1 / cs)
+    dimnames(out) <- dimnames(expr)
+  } else {
+    out <- sweep(expr, 2, cs, FUN = "/")
   }
-
-  return(t.f)
+  if (log) out <- log1p(out)
+  out
 }
 
 #################################################
@@ -155,22 +213,24 @@ idf_hdb <- function(expr, features = NULL, multi = TRUE,
                     thres = 0, minPts = 2, ...) {
   if (is.null(features)) features <- seq_len(nrow(expr))
 
-  ## initially compute naive tf-idf
-  # tf <- (edgeR::cpm(expr)/1e6)[features, , drop = FALSE]
-  tf <- sweep(expr, 2, colSums(expr, na.rm = TRUE), "/")[features, , drop = FALSE]
-  tfidf <- tf * idf(expr, features = features, thres = thres)
+  ## initially compute naive tf-idf (sparse-preserving via tf() helper)
+  tfidf <- tf(expr)[features, , drop = FALSE] *
+    idf(expr, features = features, thres = thres)
 
   ## cluster obs based on given features
-  cluster <- dbscan::hdbscan(t(tfidf), minPts = minPts, ...)$cluster
+  cluster <- dbscan::hdbscan(Matrix::t(tfidf), minPts = minPts, ...)$cluster
   ## factor cluster
   cluster <- factor(cluster)
 
-  idf <- idf_prob(
+  ## `idf_prob()` now returns a compact G x K matrix; `idf_hdb()` owns the
+  ## cluster labels and is not reachable from `cal_score_init()`, so we
+  ## expand here to keep the G x N external contract.
+  idf_compact <- idf_prob(
     expr = expr, features = features,
     label = cluster, multi = multi,
     thres = thres
   )
-  return(idf)
+  idf_compact[, as.character(cluster), drop = FALSE]
 }
 
 ## ------------------labeled--------------------##
@@ -213,9 +273,8 @@ idf_rf <- function(expr, features = NULL, label,
     )
   }, rep(1, nrow(df_n))) ## mean counts for each gene in the group
   if (multi == TRUE) {
-    mean_row_notin <- vapply(colnames(mean_row_in), function(type) {
-      apply(mean_row_in, 1, function(x) max(x[names(x) != type]))
-    }, rep(1, nrow(mean_row_in))) ## doc freq for each gene not in group for multi-class: max(mean(N in Gi))
+    ## G x K: row-wise max over "other" columns; O(G * K) vectorisation
+    mean_row_notin <- rowwise_notin_max(mean_row_in)
   } else {
     mean_row_notin <- vapply(unique(label), function(type) {
       sparseMatrixStats::rowMeans2(df_n[, label != type, drop = FALSE],
@@ -224,9 +283,11 @@ idf_rf <- function(expr, features = NULL, label,
     }, rep(1, nrow(df_n))) ## doc freq for each gene not in group for bi-class
   }
 
-  idf <- log1p((mean_row_in / (mean_row_notin + 1e-8))[, label, drop = FALSE]) ## IDF scores
-
-  return(idf)
+  ## Return compact G x K matrix; `cal_score_init()` handles per-group
+  ## broadcast so we never materialise a G x N copy via `[, label]`.
+  idf <- log1p(mean_row_in / (mean_row_notin + 1e-8))
+  colnames(idf) <- colnames(mean_row_in)
+  idf
 }
 
 ## labeled inverse document frequency: probability based
@@ -257,7 +318,6 @@ idf_prob <- function(expr, features = NULL, label,
   # thres <- 0
   # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
   df_n <- expr[features, , drop = FALSE] > thres ## if contain feature i > thres
-  df_n_inv <- !df_n ## if not contain feature i > thres
 
   ## convert label into character in case problem for factor
   label <- as.character(label)
@@ -267,9 +327,8 @@ idf_prob <- function(expr, features = NULL, label,
     )
   }, rep(1, nrow(df_n))) ## mean counts for each gene in the group
   if (multi == TRUE) {
-    mean_row_notin <- vapply(colnames(mean_row_in), function(type) {
-      apply(mean_row_in, 1, function(x) max(x[names(x) != type]))
-    }, rep(1, nrow(mean_row_in))) ## doc freq for each gene not in group for multi-class: max(mean(N in Gi))
+    ## G x K: row-wise max over "other" columns; O(G * K) vectorisation
+    mean_row_notin <- rowwise_notin_max(mean_row_in)
   } else {
     mean_row_notin <- vapply(unique(label), function(type) {
       sparseMatrixStats::rowMeans2(df_n[, label != type, drop = FALSE],
@@ -278,7 +337,10 @@ idf_prob <- function(expr, features = NULL, label,
     }, rep(1, nrow(df_n))) ## doc freq for each gene not in group for bi-class
   }
 
-  idf <- log1p((mean_row_in^2 / (mean_row_notin + 1e-8))[, label, drop = FALSE]) ## IDF scores
+  ## Return compact G x K; `cal_score_init()` broadcasts per group.
+  idf <- log1p(mean_row_in^2 / (mean_row_notin + 1e-8))
+  colnames(idf) <- colnames(mean_row_in)
+  idf
 
   return(idf)
 }
@@ -353,11 +415,9 @@ iae <- function(expr, features = NULL, thres = 0) {
   if (is.null(features)) features <- seq_len(nrow(expr))
   n_obs <- ncol(expr) ## number of total obs
 
-  # thres <- 0
   # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
-  expr_offset <- expr[features, , drop = FALSE] - thres ## subtract offset
-  expr_offset[expr_offset < 0] <- 0
-  s_row <- rowSums(expr_offset) ## total counts of feature i across all cells
+  expr_offset <- pmax0_offset(expr[features, , drop = FALSE], thres)
+  s_row <- Matrix::rowSums(expr_offset, na.rm = TRUE) ## per-feature total counts
 
   iae <- log1p(n_obs / (s_row + 1))
   return(iae)
@@ -385,13 +445,14 @@ iae <- function(expr, features = NULL, thres = 0) {
 iae_m <- function(expr, features = NULL, thres = 0) {
   if (is.null(features)) features <- seq_len(nrow(expr))
 
-  # thres <- 0
-  # thres <- sparseMatrixStats::rowQuantiles(expr, probs = 0.25, na.rm = TRUE)
-  expr_offset <- expr - thres ## subtract offset
-  expr_offset[expr_offset < 0] <- 0
-  s_row <- rowSums(expr_offset) ## total counts of feature i across all cells
-
-  s_max <- ifelse(expr_offset > 0, s_row, 0) |> sparseMatrixStats::colMaxs()
+  # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
+  expr_offset <- pmax0_offset(expr, thres)
+  s_row <- Matrix::rowSums(expr_offset, na.rm = TRUE) ## per-feature total counts
+  ## For each cell: max over features of s_row restricted to features
+  ## that are expressed > thres in that cell. `ifelse()` on the logical
+  ## mask preserves sparsity for dgCMatrix.
+  nonzero <- expr_offset > 0
+  s_max <- sparseMatrixStats::colMaxs(nonzero * s_row, na.rm = TRUE)
 
   iae <- matrix(1 / (1 + s_row), ncol = 1) %*% matrix(s_max, nrow = 1)
   dimnames(iae) <- dimnames(expr)
@@ -425,10 +486,8 @@ iae_sd <- function(expr, features = NULL, log = FALSE, thres = 0) {
 
   tfs <- tf(expr, log = log)
 
-  # thres <- 0
   # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
-  expr_offset <- expr[features, , drop = FALSE] - thres ## subtract offset
-  expr_offset[expr_offset < 0] <- 0
+  expr_offset <- pmax0_offset(expr[features, , drop = FALSE], thres)
   s_row <- sparseMatrixStats::rowSums2(expr_offset, na.rm = TRUE) ## summed counts for each gene
   sd_row <- sparseMatrixStats::rowSds(tfs, na.rm = TRUE)
 
@@ -457,22 +516,23 @@ iae_hdb <- function(expr, features = NULL, multi = TRUE,
                     thres = 0, minPts = 2, ...) {
   if (is.null(features)) features <- seq_len(nrow(expr))
 
-  ## initially compute naive tf-idf
-  # tf <- (edgeR::cpm(expr)/1e6)[features, , drop = FALSE]
-  tf <- sweep(expr, 2, colSums(expr, na.rm = TRUE), "/")[features, , drop = FALSE]
-  tfidf <- tf * iae(expr, features = features, thres = thres)
+  ## initially compute naive tf-iae (sparse-preserving via tf() helper)
+  tfidf <- tf(expr)[features, , drop = FALSE] *
+    iae(expr, features = features, thres = thres)
 
   ## cluster obs based on given features
-  cluster <- dbscan::hdbscan(t(tfidf), minPts = minPts, ...)$cluster
+  cluster <- dbscan::hdbscan(Matrix::t(tfidf), minPts = minPts, ...)$cluster
   ## factor cluster
   cluster <- factor(cluster)
 
-  iae <- iae_prob(
+  ## Same rationale as `idf_hdb()`: expand G x K compact to G x N here
+  ## since cluster labels are not visible to `cal_score_init()`.
+  iae_compact <- iae_prob(
     expr = expr, features = features,
     label = cluster, multi = multi,
     thres = thres
   )
-  return(iae)
+  iae_compact[, as.character(cluster), drop = FALSE]
 }
 
 ## ------------------labeled--------------------##
@@ -502,8 +562,7 @@ iae_rf <- function(expr, features = NULL, label,
 
   # thres <- 0
   # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
-  expr_offset <- expr[features, , drop = FALSE] - thres ## subtract offset
-  expr_offset[expr_offset < 0] <- 0
+  expr_offset <- pmax0_offset(expr[features, , drop = FALSE], thres)
 
   ## convert label into character in case problem for factor
   label <- as.character(label)
@@ -513,9 +572,7 @@ iae_rf <- function(expr, features = NULL, label,
     )
   }, rep(1, nrow(expr_offset))) ## mean counts for each gene in the group
   if (multi == TRUE) {
-    mean_row_notin <- vapply(colnames(mean_row_in), function(type) {
-      apply(mean_row_in, 1, function(x) max(x[names(x) != type]))
-    }, rep(1, nrow(mean_row_in))) ## mean counts for each gene not in group for multi-class: max(mean(N in Gi))
+    mean_row_notin <- rowwise_notin_max(mean_row_in)
   } else {
     mean_row_notin <- vapply(unique(label), function(type) {
       sparseMatrixStats::rowMeans2(expr_offset[, label != type, drop = FALSE],
@@ -524,8 +581,10 @@ iae_rf <- function(expr, features = NULL, label,
     }, rep(1, nrow(expr_offset))) ## mean counts for each gene not in group for bi-class
   }
 
-  iae <- log1p((mean_row_in / (mean_row_notin + 1e-8))[, label, drop = FALSE]) ## IDF scores
-  return(iae)
+  ## Return compact G x K; `cal_score_init()` broadcasts per group.
+  iae <- log1p(mean_row_in / (mean_row_notin + 1e-8))
+  colnames(iae) <- colnames(mean_row_in)
+  iae
 }
 
 ## labeled inverse average expression: probability based
@@ -554,8 +613,7 @@ iae_prob <- function(expr, features = NULL, label,
 
   # thres <- 0
   # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
-  expr_offset <- expr[features, , drop = FALSE] - thres ## subtract offset
-  expr_offset[expr_offset < 0] <- 0
+  expr_offset <- pmax0_offset(expr[features, , drop = FALSE], thres)
 
   ## convert label into character in case problem for factor
   label <- as.character(label)
@@ -565,9 +623,7 @@ iae_prob <- function(expr, features = NULL, label,
     )
   }, rep(1, nrow(expr_offset))) ## mean counts for each gene in the group
   if (multi == TRUE) {
-    mean_row_notin <- vapply(colnames(mean_row_in), function(type) {
-      apply(mean_row_in, 1, function(x) max(x[names(x) != type]))
-    }, rep(1, nrow(mean_row_in))) ## mean counts for each gene not in group for multi-class: max(mean(N in Gi))
+    mean_row_notin <- rowwise_notin_max(mean_row_in)
   } else {
     mean_row_notin <- vapply(unique(label), function(type) {
       sparseMatrixStats::rowMeans2(expr_offset[, label != type, drop = FALSE],
@@ -576,8 +632,10 @@ iae_prob <- function(expr, features = NULL, label,
     }, rep(1, nrow(expr_offset))) ## mean counts for each gene not in group for bi-class
   }
 
-  iae <- log1p((mean_row_in^2 / (mean_row_notin + 1e-8))[, label, drop = FALSE]) ## IDF scores
-  return(iae)
+  ## Return compact G x K; `cal_score_init()` broadcasts per group.
+  iae <- log1p(mean_row_in^2 / (mean_row_notin + 1e-8))
+  colnames(iae) <- colnames(mean_row_in)
+  iae
 }
 
 ## labeled inverse average expression IGM
@@ -604,10 +662,8 @@ iae_prob <- function(expr, features = NULL, label,
 iae_igm <- function(expr, features = NULL, label, lambda = 7, thres = 0) {
   if (is.null(features)) features <- seq_len(nrow(expr))
 
-  # thres <- 0
   # thres <- sparseMatrixStats::rowQuantiles(expr[features, , drop = FALSE], probs = 0.25, na.rm = TRUE)
-  expr_offset <- expr[features, , drop = FALSE] - thres ## subtract offset
-  expr_offset[expr_offset < 0] <- 0
+  expr_offset <- pmax0_offset(expr[features, , drop = FALSE], thres)
 
   mean_row_in <- vapply(unique(label), function(type) {
     sparseMatrixStats::rowMeans2(expr_offset[, label == type, drop = FALSE],
diff --git a/R/top_markers.R b/R/top_markers.R
index 38020b7..c61e05d 100644
--- a/R/top_markers.R
+++ b/R/top_markers.R
@@ -65,45 +65,28 @@ top_markers_abs <- function(data, label, n = 10,
                             softmax = TRUE,
                             tau = 1) {
   method <- match.arg(method)
-  if (scale && use.mgm) {
-    data <- scale_mgm(expr = data, label = label, pooled.sd = pooled.sd)
-  } else if (scale && !use.mgm) {
-    ## scale scores on rows
-    # mu_s <- sparseMatrixStats::rowMeans2(data, na.rm = TRUE)
-    # sd_s <- sparseMatrixStats::rowSds(data, na.rm = TRUE)
-    # data <- (data - mu_s) / sd_s
-
-    data <- t(scale(t(data)))
-    data[is.na(data)] <- 0 # assign 0 to NA when sd = 0
-  }
-
-  data <- data |>
-    t() |>
-    as.data.frame() |>
-    dplyr::group_by(.dot = label) |> ## group by label
-    dplyr::summarise_all(method, na.rm = TRUE) |> ## aggregate scores
-    tidyr::gather("Genes", "Scores", -`.dot`) |> ## transform into long data
-    dplyr::group_by(`.dot`) ## group by label again
-
-  if (softmax == TRUE) {
-    data <- data |>
-      # dplyr::mutate(Scores = Scores / sd(Scores, na.rm = TRUE)) |> # norm by sd
-      # dplyr::mutate(Scores = sigmoid(Scores)) |> # sigmoid
-      # dplyr::mutate(Scores = tanh(Scores)) |> # tanh
-      dplyr::mutate(Scores = softmax(Scores, tau = tau)) # softmax
-  }
+  data <- apply_row_scaling(data, label, scale, use.mgm, pooled.sd)
 
-  data <- dplyr::slice_max(data, Scores, n = n) ## extract top n markers
-
-  # ## softmax
-  # if(softmax == TRUE)
-  #   data <- dplyr::mutate(data, Scores = softmax(Scores, tau = tau))
-
-  return(data)
+  ## G x K aggregation goes straight to a long data.frame; skips the
+  ## legacy `t() |> as.data.frame() |> summarise_all()` path which
+  ## materialised a dense N x G frame (tens of GB on large inputs).
+  agg  <- aggregate_rows_by_group(data, label, method)
+  long <- long_format_from_group_matrix(agg)
+  finalize_top_markers(long, n = n, softmax = softmax, tau = tau)
 }
 
 #' calculate group mean score using glm and order genes based on scores difference
 #'
+#' @details
+#' When `family` is `gaussian()` with the identity link (the default) and
+#' the design matrix is full-rank, `top_markers_glm()` computes all per-
+#' gene label coefficients in a single closed-form least-squares solve
+#' via `Matrix::solve(crossprod(X), crossprod(X, t(data)))`, avoiding
+#' the per-gene `glm()` loop. For any other family, or a rank-deficient
+#' design, the function automatically falls back to the legacy
+#' `apply(data, 1, glm(...))` path, so results are unchanged for users
+#' who pass e.g. `family = Gamma()` or `family = poisson()`.
+#'
 #' @inheritParams scale_mgm
 #' @param data matrix, features in row and samples in column
 #' @param n integer, number of returned top genes for each group
@@ -128,72 +111,25 @@ top_markers_glm <- function(data, label, n = 10,
                             # log = TRUE,
                             softmax = TRUE,
                             tau = 1) {
-  label <- factor(label) # factorize label
-
-  ## scale
-  if (scale && !use.mgm) {
-    ## scale scores on rows
-    # mu_s <- sparseMatrixStats::rowMeans2(data, na.rm = TRUE)
-    # sd_s <- sparseMatrixStats::rowSds(data, na.rm = TRUE)
-    # data <- (data - mu_s) / sd_s
-
-    data <- t(scale(t(data)))
-    data[is.na(data)] <- 0 # assign 0 to NA when sd = 0
-  } else if (scale && use.mgm) {
-    data <- scale_mgm(expr = data, label = label, pooled.sd = pooled.sd)
-  }
+  label <- factor(label)
+  if (!is.null(batch)) batch <- factor(batch)
 
+  data  <- apply_row_scaling(data, label, scale, use.mgm, pooled.sd)
   # ## log score
   # if(log == TRUE) {
   #   data <- log(data + 1e-8)
   # }
-
-  ## estimate betas based on given group and/or batch label
-  if(is.null(batch)) {
-    ## model with group label only
-    betas <- apply(data, 1, \(s) glm(s ~ 0 + label, family = family)$coef)
-  } else {
-    ## factorize batch label
-    batch <- factor(batch)
-    ## model with both group and batch label
-    betas <- apply(data, 1, \(s) glm(s ~ 0 + label + batch, family = family)$coef)
-    ## only extract betas of group label
-    betas <- betas[grep("^label", rownames(betas)), , drop = FALSE]
-  }
-
-  rownames(betas) <- gsub("label", "", rownames(betas))
-
-  # ## compute logFC (1 vs all mean) for each group
-  # betas <- apply(betas, 2, \(x) x - (sum(x) - x)/(length(x) - 1))
-
-  ## compute logFC (1 vs max excluding self) for each group
-  betas <- vapply(
-    seq_len(nrow(betas)), \(i)
-    betas[i, ] - sparseMatrixStats::colMaxs(betas[-i, , drop = FALSE]),
-    rep(1, ncol(betas))
-  ) |>
-    t()
+  betas <- fit_label_betas(data, label, batch, family)      # K x G
+  betas <- betas_to_logfc_1v_max(betas)                     # K x G
   rownames(betas) <- levels(label)
 
-  data <- data.frame(.dot = rownames(betas), betas) |>
-    tidyr::pivot_longer(-`.dot`, names_to = "Genes", values_to = "Scores") |>
-    dplyr::group_by(`.dot`) ## group by label again
-
-  if (softmax == TRUE) {
-    data <- data |>
-      # dplyr::mutate(Scores = Scores / sd(Scores, na.rm = TRUE)) |> # norm by sd
-      # dplyr::mutate(Scores = sigmoid(Scores)) |> # sigmoid
-      # dplyr::mutate(Scores = tanh(Scores)) |> # tanh
-      dplyr::mutate(Scores = softmax(Scores, tau = tau)) # softmax
-  }
-
-  data <- dplyr::slice_max(data, Scores, n = n) ## extract top n markers
-
-  # ## softmax
-  # if(softmax == TRUE)
-  #   data <- dplyr::mutate(data, Scores = softmax(Scores, tau = tau))
+  long <- data.frame(.dot = rownames(betas), betas,
+                     check.names = FALSE, stringsAsFactors = FALSE) |>
+    tidyr::pivot_longer(-`.dot`, names_to = "Genes",
+                        values_to = "Scores") |>
+    dplyr::group_by(`.dot`)
 
-  return(data)
+  finalize_top_markers(long, n = n, softmax = softmax, tau = tau)
 }
 
 ## sigmoid: [0, 1], multi-label, no need to sum to 1
@@ -211,4 +147,147 @@ softmax <- function(x, tau = 1) {
 ## tanh: [-1, 1], similar to sigmoid, no need to sum 1
 tanh <- function(x) 2 / (1 + exp(-2 * x)) - 1
 
+#################################################
+# Internal helpers for top_markers_abs/glm
+#################################################
+
+## Row-wise z-score without densifying via `scale(t(data))`. Rows with
+## zero or NA SD are collapsed to zero, matching the
+## `data[is.na(data)] <- 0` guard from the legacy path.
+row_scale_zmean <- function(data) {
+  mu <- sparseMatrixStats::rowMeans2(data, na.rm = TRUE)
+  sd <- sparseMatrixStats::rowSds(data,   na.rm = TRUE)
+  sd[sd == 0 | is.na(sd)] <- 1
+  out <- (data - mu) / sd
+  out[is.na(out)] <- 0
+  out
+}
+
+## Single entry point for the three `scale` / `use.mgm` branches shared
+## between `top_markers_abs()` and `top_markers_glm()`.
+apply_row_scaling <- function(data, label, scale, use.mgm, pooled.sd) {
+  if (!isTRUE(scale)) return(data)
+  if (isTRUE(use.mgm)) {
+    return(scale_mgm(expr = data, label = label, pooled.sd = pooled.sd))
+  }
+  row_scale_zmean(data)
+}
+
+## G x K matrix of per-group row statistics. Uses `sparseMatrixStats` so
+## the path stays sparse-friendly for dgCMatrix inputs.
+aggregate_rows_by_group <- function(data, label, method) {
+  groups <- unique(as.character(label))
+  fn <- switch(method,
+    mean   = sparseMatrixStats::rowMeans2,
+    median = sparseMatrixStats::rowMedians,
+    mad    = sparseMatrixStats::rowMads,
+    stop("Unknown aggregation method: ", method, call. = FALSE)
+  )
+  label_ch <- as.character(label)
+  agg <- vapply(groups, function(g)
+    fn(data[, label_ch == g, drop = FALSE], na.rm = TRUE),
+    numeric(nrow(data)))
+  colnames(agg) <- groups
+  rownames(agg) <- rownames(data)
+  agg
+}
+
+## Transform G x K group-statistic matrix into the grouped long
+## data.frame expected by downstream `markers_*()` consumers.
+long_format_from_group_matrix <- function(agg) {
+  out <- data.frame(
+    .dot   = rep(colnames(agg), each = nrow(agg)),
+    Genes  = rep(rownames(agg), times = ncol(agg)),
+    Scores = as.vector(agg),
+    stringsAsFactors = FALSE
+  )
+  dplyr::group_by(out, `.dot`)
+}
+
+## Softmax (optional) + top-n slice, shared finalisation.
+finalize_top_markers <- function(long, n, softmax, tau) {
+  if (isTRUE(softmax)) {
+    # long <- dplyr::mutate(Scores = Scores / sd(Scores, na.rm = TRUE)) |> # norm by sd
+    #  dplyr::mutate(Scores = sigmoid(Scores)) |> # sigmoid
+    #  dplyr::mutate(Scores = tanh(Scores)) |> # tanh
+    long <- dplyr::mutate(long, Scores = softmax(Scores, tau = tau))
+  }
+  dplyr::slice_max(long, Scores, n = n)
+}
+
+## Estimate per-gene label coefficients. Uses the closed-form solution
+## whenever `family` is gaussian-identity and the design is full-rank;
+## otherwise falls back to the legacy per-gene `glm()` loop so that
+## users passing non-gaussian families keep the same behaviour.
+fit_label_betas <- function(data, label, batch = NULL,
+                            family = stats::gaussian()) {
+  is_gauss_identity <- identical(family$family, "gaussian") &&
+    identical(family$link, "identity")
+  if (isTRUE(is_gauss_identity)) {
+    res <- try(
+      fit_label_betas_closed_form(data, label, batch),
+      silent = TRUE
+    )
+    if (!inherits(res, "try-error") && !is.null(res)) return(res)
+  }
+  fit_label_betas_glm_loop(data, label, batch, family)
+}
+
+## Closed-form ordinary least squares for gaussian identity link.
+## Returns K x G matrix of label coefficients; NULL signals a rank-
+## deficient design so the caller can fall back to `glm()`.
+fit_label_betas_closed_form <- function(data, label, batch = NULL) {
+  X <- if (is.null(batch)) {
+    Matrix::sparse.model.matrix(~ 0 + label)
+  } else {
+    Matrix::sparse.model.matrix(~ 0 + label + batch)
+  }
+  XtX <- Matrix::crossprod(X)
+  if (Matrix::rankMatrix(XtX)[1] < ncol(X)) return(NULL)
+  ## betas_all: K_total x G  (K_total = n label levels [+ batch levels])
+  betas_all <- as.matrix(
+    Matrix::solve(XtX, Matrix::crossprod(X, Matrix::t(data)))
+  )
+  rownames(betas_all) <- colnames(X)
+  keep  <- grep("^label", rownames(betas_all))
+  betas <- betas_all[keep, , drop = FALSE]
+  rownames(betas) <- sub("^label", "", rownames(betas))
+  betas
+}
+
+## Legacy per-gene glm loop; retained for non-gaussian families.
+fit_label_betas_glm_loop <- function(data, label, batch, family) {
+  # Build design matrix ONCE (biggest single win: avoids G formula parses)
+  design <- if (is.null(batch)) {
+    stats::model.matrix(~ 0 + label)
+  } else {
+    stats::model.matrix(~ 0 + label + batch)
+  }
+  
+  # Loop over genes; tryCatch to assign NA on failure (e.g. perfect separation)
+  betas <- apply(data, 1, function(y)
+    tryCatch(
+      stats::glm.fit(x = design, y = y, family = family,
+                     intercept = FALSE)$coefficients,
+      error = function(e) rep(NA_real_, ncol(design))
+    ))
+  # betas is K_total x G; we want K x G, so subset to label rows
+  betas <- betas[grep("^label", rownames(betas)), , drop = FALSE]
+  # rownames(betas) are "labelX" where X is the group; strip the "label" prefix
+  rownames(betas) <- sub("^label", "", rownames(betas))
+  betas
+}
+
+## 1-vs-max(other) log fold-change contrast used by top_markers_glm().
+## Input is a K x G matrix; output has the same shape.
+betas_to_logfc_1v_max <- function(betas) {
+  vapply(
+    seq_len(nrow(betas)), function(i)
+      betas[i, ] - sparseMatrixStats::colMaxs(
+        betas[-i, , drop = FALSE]
+      ),
+    numeric(ncol(betas))
+  ) |> t()
+}
+
 utils::globalVariables(c(".dot", "Scores"))
diff --git a/inst/bench/benchmark_smartid.R b/inst/bench/benchmark_smartid.R
new file mode 100644
index 0000000..8534458
--- /dev/null
+++ b/inst/bench/benchmark_smartid.R
@@ -0,0 +1,104 @@
+## Micro-benchmark for the memory-optimization refactor (smartid >= 1.7.3).
+##
+## Run from the package root:
+##
+##   Rscript inst/bench/benchmark_smartid.R
+##
+## The script measures `cal_score()` and `top_markers()` on a simulated
+## 20,000 gene x 100,000 cell dgCMatrix with density 5% and 10 balanced
+## groups. This mirrors the workload reported on large scRNA-seq atlases
+## where the legacy implementation peaked around 100 GB of memory.
+##
+## The output goes to stdout. `bench::mark()` reports `mem_alloc` which
+## accounts for R-level allocations during the call; peak RSS can be
+## tracked externally (e.g. `/usr/bin/time -v Rscript ...`). We also
+## `gc()` before each measurement so per-call allocations are isolated.
+##
+## No hard coded dependency on `bench`: if it is missing the script falls
+## back to a `proc.time()` + `gc()` delta report.
+
+suppressPackageStartupMessages({
+  library(Matrix)
+  library(SummarizedExperiment)
+  library(S4Vectors)
+  library(smartid)
+})
+
+set.seed(42)
+
+G <- 20000L
+N <- 100000L
+K <- 10L
+density <- 0.05
+
+message("Simulating ", G, " x ", N, " dgCMatrix (density = ", density, ")...")
+sim_t <- system.time({
+  counts <- Matrix::rsparsematrix(
+    G, N, density = density,
+    rand.x = function(n) rpois(n, lambda = 2)
+  )
+  dimnames(counts) <- list(paste0("gene", seq_len(G)),
+                           paste0("cell", seq_len(N)))
+  label <- sample(LETTERS[seq_len(K)], N, replace = TRUE)
+})
+message("Simulation took ", round(sim_t["elapsed"], 1), "s; nnz = ",
+        format(length(counts@x), big.mark = ","))
+
+se <- SummarizedExperiment(
+  assays  = list(counts = counts),
+  colData = DataFrame(Group = factor(label))
+)
+
+report <- function(label, expr) {
+  gc(reset = TRUE, full = TRUE)
+  pre  <- gc(reset = FALSE)
+  wall <- system.time(value <- force(expr))
+  post <- gc(reset = FALSE)
+  used_mb <- max(post[, "used (Mb)"] - pre[, "used (Mb)"])
+  cat(sprintf("  %-35s  wall = %7.2fs   R-allocated delta = %7.1f MB\n",
+              label, wall[["elapsed"]], used_mb))
+  invisible(value)
+}
+
+has_bench <- requireNamespace("bench", quietly = TRUE)
+cat("\nbenchmark backend: ", if (has_bench) "bench::mark" else "gc delta",
+    "\n\n", sep = "")
+
+## --- cal_score -----------------------------------------------------------
+score_se <- report("cal_score (default)",
+                   cal_score(se,
+                             par.idf = list(label = "Group"),
+                             par.iae = list(label = "Group")))
+
+score_se_int <- report("cal_score (return.intermediate=TRUE)",
+                       cal_score(se,
+                                 par.idf = list(label = "Group"),
+                                 par.iae = list(label = "Group"),
+                                 return.intermediate = TRUE))
+
+## --- top_markers ---------------------------------------------------------
+tm_glm <- report("top_markers (gaussian closed-form)",
+                 top_markers(score_se, label = "Group",
+                             n = 50L, use.glm = TRUE))
+
+tm_glm_batch <- report(
+  "top_markers (glm + batch)",
+  {
+    SummarizedExperiment::colData(score_se)$Batch <-
+      sample(c("b1", "b2", "b3"), N, replace = TRUE)
+    top_markers(score_se, label = "Group", batch = "Batch",
+                n = 50L, use.glm = TRUE)
+  }
+)
+
+tm_abs <- report("top_markers (abs, method=mean)",
+                 top_markers(score_se, label = "Group",
+                             n = 50L, use.glm = FALSE, method = "mean"))
+
+cat("\nScore class: ",
+    class(SummarizedExperiment::assay(score_se, "score"))[1], "\n")
+cat("Intermediate idf dim (compact): ",
+    paste(dim(S4Vectors::metadata(score_se_int)$idf), collapse = " x "),
+    "\n")
+cat("top_markers GLM rows: ", nrow(tm_glm), "\n")
+cat("top_markers abs rows: ", nrow(tm_abs), "\n")
diff --git a/man/cal_score.Rd b/man/cal_score.Rd
index df23b7a..14bdce7 100644
--- a/man/cal_score.Rd
+++ b/man/cal_score.Rd
@@ -14,7 +14,8 @@ cal_score(
   slot = "counts",
   new.slot = "score",
   par.idf = NULL,
-  par.iae = NULL
+  par.iae = NULL,
+  return.intermediate = FALSE
 )
 
 \S4method{cal_score}{AnyMatrix}(
@@ -23,7 +24,8 @@ cal_score(
   idf = "prob",
   iae = "prob",
   par.idf = NULL,
-  par.iae = NULL
+  par.iae = NULL,
+  return.intermediate = FALSE
 )
 
 \S4method{cal_score}{SummarizedExperiment}(
@@ -34,7 +36,8 @@ cal_score(
   slot = "counts",
   new.slot = "score",
   par.idf = NULL,
-  par.iae = NULL
+  par.iae = NULL,
+  return.intermediate = FALSE
 )
 }
 \arguments{
@@ -57,6 +60,13 @@ optional, default 'score'}
 \item{par.idf}{other parameters for specified IDF methods}
 
 \item{par.iae}{other parameters for specified IAE methods}
+
+\item{return.intermediate}{logical, if TRUE also return or store the
+intermediate \code{tf}, \code{idf} and \code{iae} matrices. Defaults to FALSE since
+these objects have the same dimension as the input expression matrix
+and can dominate memory usage on large datasets. Set to TRUE to
+restore the pre-1.7.3 behavior where intermediates were kept in
+\code{metadata()} of the SummarizedExperiment output.}
 }
 \value{
 A list of matrices or se object containing combined score
diff --git a/man/cal_score_init.Rd b/man/cal_score_init.Rd
index 81795ce..639ab14 100644
--- a/man/cal_score_init.Rd
+++ b/man/cal_score_init.Rd
@@ -10,7 +10,8 @@ cal_score_init(
   idf = "prob",
   iae = "prob",
   par.idf = NULL,
-  par.iae = NULL
+  par.iae = NULL,
+  return.intermediate = FALSE
 )
 }
 \arguments{
@@ -27,9 +28,15 @@ be accessed using \code{\link[=idf_iae_methods]{idf_iae_methods()}}}
 \item{par.idf}{other parameters for specified IDF methods}
 
 \item{par.iae}{other parameters for specified IAE methods}
+
+\item{return.intermediate}{logical, if TRUE the returned list also contains
+the intermediate \code{tf}, \code{idf} and \code{iae} objects. Default \code{FALSE} keeps
+only the combined \code{score} to avoid the memory overhead of three extra
+feature-by-cell matrices on large inputs.}
 }
 \value{
-a list of combined score, tf, idf and iae
+a list always containing \code{score}; when \code{return.intermediate = TRUE}
+the list additionally contains \code{tf}, \code{idf} and \code{iae}.
 }
 \description{
 Calculate score for each feature in each cell
diff --git a/man/tf.Rd b/man/tf.Rd
index 01a4880..be1ffe0 100644
--- a/man/tf.Rd
+++ b/man/tf.Rd
@@ -12,7 +12,9 @@ tf(expr, log = FALSE)
 \item{log}{logical, if to do log-transformation}
 }
 \value{
-a matrix of term/gene frequency
+a matrix of term/gene frequency. For a \code{dgCMatrix} input the
+returned object preserves sparsity (\code{log1p(0) == 0}); dense input
+returns a dense matrix.
 }
 \description{
 compute term/feature frequency within each cell
diff --git a/man/top_markers_glm.Rd b/man/top_markers_glm.Rd
index fe9dde8..92f673f 100644
--- a/man/top_markers_glm.Rd
+++ b/man/top_markers_glm.Rd
@@ -44,6 +44,16 @@ a tibble with feature names, group labels and ordered processed scores
 \description{
 calculate group mean score using glm and order genes based on scores difference
 }
+\details{
+When \code{family} is \code{gaussian()} with the identity link (the default) and
+the design matrix is full-rank, \code{top_markers_glm()} computes all per-
+gene label coefficients in a single closed-form least-squares solve
+via \code{Matrix::solve(crossprod(X), crossprod(X, t(data)))}, avoiding
+the per-gene \code{glm()} loop. For any other family, or a rank-deficient
+design, the function automatically falls back to the legacy
+\code{apply(data, 1, glm(...))} path, so results are unchanged for users
+who pass e.g. \code{family = Gamma()} or \code{family = poisson()}.
+}
 \examples{
 data <- matrix(rgamma(100, 2), 10, dimnames = list(1:10))
 top_markers_glm(data, label = rep(c("A", "B"), 5))
diff --git a/tests/testthat/test-numerical-equivalence.R b/tests/testthat/test-numerical-equivalence.R
new file mode 100644
index 0000000..cb288da
--- /dev/null
+++ b/tests/testthat/test-numerical-equivalence.R
@@ -0,0 +1,340 @@
+## Numerical regression tests for the memory-optimization refactor.
+##
+## These tests compare current output against pre-refactor snapshots
+## produced by `tests/testthat/testdata/capture_legacy_snapshots.R` on the
+## same HEAD that first introduced this file. Any refactor that claims to
+## preserve scoring behaviour must keep all `all.equal()` calls green.
+##
+## Tolerance is deliberately tight (1e-10) because the refactor only
+## rearranges algebra and subsetting order; no stochastic steps are
+## involved upstream of the matrices compared here.
+
+snap_path <- test_path("testdata", "legacy_scores.rds")
+
+skip_if_no_snapshot <- function() {
+  if (!file.exists(snap_path)) {
+    skip(paste0("Legacy snapshot missing: ", snap_path))
+  }
+}
+
+test_that("cal_score on dense matrix reproduces legacy score", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  res <- cal_score(
+    data    = snap$inputs$counts_dense,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = as.character(snap$inputs$label)),
+    par.iae = list(label = as.character(snap$inputs$label)),
+    return.intermediate = TRUE
+  )
+  expect_equal(
+    unname(as.matrix(res$score)),
+    unname(as.matrix(snap$cal_score$matrix_out$score)),
+    tolerance = 1e-10
+  )
+})
+
+test_that("cal_score default no longer stores intermediates in metadata", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(
+      Group = snap$inputs$label,
+      Batch = snap$inputs$batch
+    )
+  )
+  out <- cal_score(
+    data    = se,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group")
+  )
+  expect_null(out@metadata$tf)
+  expect_null(out@metadata$idf)
+  expect_null(out@metadata$iae)
+  expect_equal(
+    unname(as.matrix(SummarizedExperiment::assay(out, "score"))),
+    unname(snap$cal_score$se_assay),
+    tolerance = 1e-10
+  )
+})
+
+test_that("cal_score return.intermediate=TRUE restores legacy metadata", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(
+      Group = snap$inputs$label,
+      Batch = snap$inputs$batch
+    )
+  )
+  out <- cal_score(
+    data    = se,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group"),
+    return.intermediate = TRUE
+  )
+  md <- out@metadata
+  expect_false(is.null(md$tf))
+  expect_false(is.null(md$idf))
+  expect_false(is.null(md$iae))
+  ## tf stays G x N; direct comparison.
+  expect_equal(
+    unname(as.matrix(md$tf)),
+    unname(snap$cal_score$se_tf),
+    tolerance = 1e-10
+  )
+  ## Phase B: idf/iae for labelled prob/rf methods now return compact
+  ## G x K matrices. Expand via the Group label to recover the legacy
+  ## G x N representation and compare element-wise.
+  label_ch <- as.character(snap$inputs$label)
+  expect_equal(
+    unname(as.matrix(md$idf)[, label_ch, drop = FALSE]),
+    unname(snap$cal_score$se_idf),
+    tolerance = 1e-10
+  )
+  expect_equal(
+    unname(as.matrix(md$iae)[, label_ch, drop = FALSE]),
+    unname(snap$cal_score$se_iae),
+    tolerance = 1e-10
+  )
+})
+
+test_that("top_markers GLM path reproduces legacy scores", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(
+      Group = snap$inputs$label,
+      Batch = snap$inputs$batch
+    )
+  )
+  se <- cal_score(
+    data    = se,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group")
+  )
+  tm <- top_markers(
+    data    = se,
+    label   = "Group",
+    n       = 10L,
+    use.glm = TRUE,
+    slot    = "score"
+  )
+  tm_ref <- snap$top_markers$glm
+  ## order-sensitive comparison on the same keys
+  key_new <- paste(tm$.dot,       tm$Genes,       sep = "|")
+  key_ref <- paste(tm_ref$.dot,   tm_ref$Genes,   sep = "|")
+  expect_setequal(key_new, key_ref)
+  ## align and compare scores
+  idx <- match(key_ref, key_new)
+  expect_equal(tm$Scores[idx], tm_ref$Scores, tolerance = 1e-8)
+})
+
+test_that("top_markers GLM with batch reproduces legacy scores", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(
+      Group = snap$inputs$label,
+      Batch = snap$inputs$batch
+    )
+  )
+  se <- cal_score(
+    data    = se,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group")
+  )
+  tm <- top_markers(
+    data    = se,
+    label   = "Group",
+    n       = 10L,
+    use.glm = TRUE,
+    batch   = "Batch",
+    slot    = "score"
+  )
+  tm_ref <- snap$top_markers$glm_batch
+  key_new <- paste(tm$.dot, tm$Genes, sep = "|")
+  key_ref <- paste(tm_ref$.dot, tm_ref$Genes, sep = "|")
+  expect_setequal(key_new, key_ref)
+  idx <- match(key_ref, key_new)
+  expect_equal(tm$Scores[idx], tm_ref$Scores, tolerance = 1e-8)
+})
+
+test_that("top_markers abs mean path reproduces legacy scores", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(
+      Group = snap$inputs$label,
+      Batch = snap$inputs$batch
+    )
+  )
+  se <- cal_score(
+    data    = se,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group")
+  )
+  tm <- top_markers(
+    data    = se,
+    label   = "Group",
+    n       = 10L,
+    use.glm = FALSE,
+    method  = "mean",
+    slot    = "score"
+  )
+  tm_ref <- snap$top_markers$abs_mean
+  key_new <- paste(tm$.dot, tm$Genes, sep = "|")
+  key_ref <- paste(tm_ref$.dot, tm_ref$Genes, sep = "|")
+  expect_setequal(key_new, key_ref)
+  idx <- match(key_ref, key_new)
+  expect_equal(tm$Scores[idx], tm_ref$Scores, tolerance = 1e-8)
+})
+
+test_that("top_markers abs median path reproduces legacy scores", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(
+      Group = snap$inputs$label,
+      Batch = snap$inputs$batch
+    )
+  )
+  se <- cal_score(
+    data    = se,
+    tf      = "logtf",
+    idf     = "prob",
+    iae     = "prob",
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group")
+  )
+  tm <- top_markers(
+    data    = se,
+    label   = "Group",
+    n       = 10L,
+    use.glm = FALSE,
+    method  = "median",
+    slot    = "score"
+  )
+  tm_ref <- snap$top_markers$abs_median
+  key_new <- paste(tm$.dot, tm$Genes, sep = "|")
+  key_ref <- paste(tm_ref$.dot, tm_ref$Genes, sep = "|")
+  expect_setequal(key_new, key_ref)
+  idx <- match(key_ref, key_new)
+  expect_equal(tm$Scores[idx], tm_ref$Scores, tolerance = 1e-8)
+})
+
+## ---------------------------------------------------------------------
+## Edge cases specific to the memory-optimization refactor
+## ---------------------------------------------------------------------
+
+test_that("dgCMatrix input yields identical scores to dense input", {
+  skip_if_no_snapshot()
+  snap   <- readRDS(snap_path)
+  dense  <- snap$inputs$counts_dense
+  sparse <- as(dense, "CsparseMatrix")
+  lab    <- as.character(snap$inputs$label)
+
+  r_dense <- cal_score(
+    dense,  tf = "logtf", idf = "prob", iae = "prob",
+    par.idf = list(label = lab), par.iae = list(label = lab),
+    return.intermediate = TRUE
+  )
+  r_sparse <- cal_score(
+    sparse, tf = "logtf", idf = "prob", iae = "prob",
+    par.idf = list(label = lab), par.iae = list(label = lab),
+    return.intermediate = TRUE
+  )
+  expect_s4_class(r_sparse$score, "dgCMatrix")
+  expect_true(is.matrix(r_dense$score))
+  expect_equal(
+    as.matrix(r_sparse$score), as.matrix(r_dense$score),
+    tolerance = 1e-10
+  )
+  expect_equal(
+    as.matrix(r_sparse$tf), as.matrix(r_dense$tf),
+    tolerance = 1e-10
+  )
+})
+
+test_that("top_markers gaussian closed-form matches glm apply-loop", {
+  skip_if_no_snapshot()
+  snap <- readRDS(snap_path)
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays  = list(counts = snap$inputs$counts_dense),
+    colData = data.frame(Group = snap$inputs$label)
+  )
+  se <- cal_score(se,
+    par.idf = list(label = "Group"),
+    par.iae = list(label = "Group")
+  )
+  scored <- SummarizedExperiment::assay(se, "score")
+  lab    <- SummarizedExperiment::colData(se)$Group
+
+  ## closed form (default gaussian)
+  betas_cf <- smartid:::fit_label_betas_closed_form(scored, factor(lab), NULL)
+  ## glm apply-loop reference
+  betas_gl <- smartid:::fit_label_betas_glm_loop(
+    scored, factor(lab), NULL, stats::gaussian()
+  )
+  expect_equal(unname(betas_cf), unname(betas_gl), tolerance = 1e-8)
+})
+
+test_that("rows with zero SD collapse to zero after row_scale_zmean", {
+  m <- matrix(rnorm(60), nrow = 6)
+  m[3, ] <- 5                    # constant row -> SD 0
+  m[5, ] <- NA_real_             # all-NA row -> SD NA
+  rownames(m) <- paste0("g", seq_len(6))
+  scaled <- smartid:::row_scale_zmean(m)
+  expect_true(all(scaled[3, ] == 0))
+  expect_true(all(scaled[5, ] == 0))
+})
+
+test_that("pmax0_offset preserves sparsity and masks negatives", {
+  x <- Matrix::sparseMatrix(
+    i = c(1, 2, 3), j = c(1, 2, 3), x = c(-1, 2, 5), dims = c(4, 4)
+  )
+  ## thres == 0 short-circuits to identity
+  expect_identical(smartid:::pmax0_offset(x, 0), x)
+  ## thres == 3 zeroes out 2 (< 3) and clips 5 to 2
+  out <- smartid:::pmax0_offset(x, 3)
+  expect_s4_class(out, "CsparseMatrix")
+  expect_equal(as.numeric(out[2, 2]), 0)
+  expect_equal(as.numeric(out[3, 3]), 2)
+  expect_equal(as.numeric(out[1, 1]), 0)
+})
+
+test_that("rowwise_notin_max vectorised form matches naive apply", {
+  set.seed(7)
+  mat <- matrix(runif(40, 0, 10), nrow = 8)
+  colnames(mat) <- paste0("k", seq_len(ncol(mat)))
+  fast <- smartid:::rowwise_notin_max(mat)
+  slow <- vapply(
+    colnames(mat), function(type)
+      apply(mat, 1, function(x) max(x[names(x) != type])),
+    numeric(nrow(mat))
+  )
+  expect_equal(unname(fast), unname(slow), tolerance = 1e-12)
+})
diff --git a/tests/testthat/testdata/capture_legacy_snapshots.R b/tests/testthat/testdata/capture_legacy_snapshots.R
new file mode 100644
index 0000000..4809fc1
--- /dev/null
+++ b/tests/testthat/testdata/capture_legacy_snapshots.R
@@ -0,0 +1,139 @@
+# Capture legacy outputs of cal_score() and top_markers() from the current
+# pre-refactor HEAD. Run ONCE from the package root with the source branch
+# loaded via devtools::load_all(). The resulting .rds file is committed and
+# consumed by tests/testthat/test-numerical-equivalence.R to verify the
+# memory-optimization refactor preserves numerical equivalence.
+#
+# Usage (from package root):
+#   Rscript tests/testthat/testdata/capture_legacy_snapshots.R
+#
+# Prerequisites: devtools, Matrix, SummarizedExperiment, S4Vectors installed.
+
+suppressPackageStartupMessages({
+  library(devtools)
+  library(Matrix)
+  library(SummarizedExperiment)
+  library(S4Vectors)
+})
+
+## Load source package at current HEAD (pre-refactor)
+devtools::load_all(".", quiet = TRUE)
+
+## Deterministic test inputs ----------------------------------------------
+set.seed(2026)
+
+G <- 40L   # features
+N <- 60L   # cells
+K <- 3L    # groups
+
+counts_dense <- matrix(rpois(G * N, lambda = 1.5), nrow = G)
+rownames(counts_dense) <- paste0("gene", seq_len(G))
+colnames(counts_dense) <- paste0("cell", seq_len(N))
+
+counts_sparse <- as(counts_dense, "CsparseMatrix")  # dgCMatrix
+
+label <- factor(rep(LETTERS[seq_len(K)], length.out = N))
+batch <- factor(rep(c("b1", "b2"), length.out = N))
+
+se_dense <- SummarizedExperiment(
+  assays  = list(counts = counts_dense),
+  colData = DataFrame(Group = label, Batch = batch)
+)
+
+## Reference outputs ------------------------------------------------------
+
+## (1) cal_score on dense matrix, default idf/iae = "prob", logtf
+score_mat_dense <- cal_score(
+  data    = counts_dense,
+  tf      = "logtf",
+  idf     = "prob",
+  iae     = "prob",
+  par.idf = list(label = as.character(label)),
+  par.iae = list(label = as.character(label))
+)
+
+## (2) cal_score on SummarizedExperiment (currently stores intermediates
+##     in metadata; post-refactor we will compare via return.intermediate)
+se_score <- cal_score(
+  data    = se_dense,
+  tf      = "logtf",
+  idf     = "prob",
+  iae     = "prob",
+  par.idf = list(label = "Group"),
+  par.iae = list(label = "Group")
+)
+
+## (3) top_markers with use.glm = TRUE (gaussian default)
+tm_glm <- top_markers(
+  data    = se_score,
+  label   = "Group",
+  n       = 10L,
+  use.glm = TRUE,
+  slot    = "score"
+)
+
+## (4) top_markers with use.glm = TRUE + batch
+tm_glm_batch <- top_markers(
+  data    = se_score,
+  label   = "Group",
+  n       = 10L,
+  use.glm = TRUE,
+  batch   = "Batch",
+  slot    = "score"
+)
+
+## (5) top_markers with use.glm = FALSE, method = "mean"
+tm_abs_mean <- top_markers(
+  data    = se_score,
+  label   = "Group",
+  n       = 10L,
+  use.glm = FALSE,
+  method  = "mean",
+  slot    = "score"
+)
+
+## (6) top_markers with use.glm = FALSE, method = "median"
+tm_abs_median <- top_markers(
+  data    = se_score,
+  label   = "Group",
+  n       = 10L,
+  use.glm = FALSE,
+  method  = "median",
+  slot    = "score"
+)
+
+## Assemble & save --------------------------------------------------------
+snap <- list(
+  inputs = list(
+    counts_dense  = counts_dense,
+    counts_sparse = counts_sparse,
+    label         = label,
+    batch         = batch
+  ),
+  cal_score = list(
+    matrix_out = score_mat_dense,        # list(score = ..., tf, idf, iae)
+    se_assay   = as.matrix(assay(se_score, "score")),
+    se_tf      = as.matrix(metadata(se_score)$tf),
+    se_idf     = as.matrix(metadata(se_score)$idf),
+    se_iae     = as.matrix(metadata(se_score)$iae)
+  ),
+  top_markers = list(
+    glm        = tm_glm,
+    glm_batch  = tm_glm_batch,
+    abs_mean   = tm_abs_mean,
+    abs_median = tm_abs_median
+  ),
+  meta = list(
+    captured_at = Sys.time(),
+    R_version   = R.version.string,
+    smartid_ver = as.character(packageVersion("smartid"))
+  )
+)
+
+out <- "tests/testthat/testdata/legacy_scores.rds"
+saveRDS(snap, file = out, version = 2L)
+
+message("Legacy snapshot written to: ", out)
+message("  cal_score$score dim:  ", paste(dim(snap$cal_score$matrix_out$score), collapse = " x "))
+message("  SE assay 'score' dim: ", paste(dim(snap$cal_score$se_assay), collapse = " x "))
+message("  top_markers$glm rows: ", nrow(snap$top_markers$glm))
diff --git a/tests/testthat/testdata/legacy_scores.rds b/tests/testthat/testdata/legacy_scores.rds
new file mode 100644
index 0000000..e6c8075
Binary files /dev/null and b/tests/testthat/testdata/legacy_scores.rds differ
diff --git a/vignettes/smartid_Demo.Rmd b/vignettes/smartid_Demo.Rmd
index d0e4308..28de859 100644
--- a/vignettes/smartid_Demo.Rmd
+++ b/vignettes/smartid_Demo.Rmd
@@ -135,6 +135,8 @@ TF here stands for gene frequency, which is similar to CPM, while IDF represents
 
 Another advantage of `smartid` is that it can start with raw counts data, with no need for pre-processed data. And the scoring is quite fast.
 
+Starting from smartid 1.7.3, `cal_score()` no longer stores the intermediate `tf`, `idf` and `iae` matrices in `metadata()` by default, because on large inputs these objects can each match the full feature-by-cell score matrix in size and quickly dominate memory. When the intermediates are useful for inspection (as in this demo), pass `return.intermediate = TRUE` to restore the pre-1.7.3 behaviour.
+
 ```{r}
 ## compute score
 system.time(
@@ -144,11 +146,13 @@ system.time(
     idf = "prob",
     iae = "prob",
     par.idf = list(label = "Group"),
-    par.iae = list(label = "Group")
+    par.iae = list(label = "Group"),
+    return.intermediate = TRUE
   )
 )
 
-## score and tf,idf,iae all saved
+## score in assays; tf, idf and iae are stored in metadata()
+## because we opted in with return.intermediate = TRUE.
 assays(data_sim)
 names(metadata(data_sim))
 ```
@@ -278,11 +282,13 @@ system.time(
     tf = "logtf",
     idf = "sd",
     iae = "sd",
-    new.slot = "score_unlabel"
+    new.slot = "score_unlabel",
+    return.intermediate = TRUE
   )
 )
 
-## new score is saved and tf,idf,iae all updated
+## new score is saved; tf/idf/iae are updated in metadata because we
+## again opted in with return.intermediate = TRUE (default is FALSE).
 assays(data_sim)
 names(metadata(data_sim))
 ```