From c42290abf947a1c06331093428c6af7598fda75e Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 15 Apr 2026 10:24:59 +0200 Subject: [PATCH 1/8] fixing vignettes nomenclatures --- vignettes/SpaceTrooper_utilities.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/SpaceTrooper_utilities.Rmd b/vignettes/SpaceTrooper_utilities.Rmd index ee68496..d44e414 100644 --- a/vignettes/SpaceTrooper_utilities.Rmd +++ b/vignettes/SpaceTrooper_utilities.Rmd @@ -9,7 +9,7 @@ output: toc: true toc_float: true vignette: > - %\VignetteIndexEntry{Imaging-based Spatial Transcriptomics Data Quality Control with SpaceTrooper} + %\VignetteIndexEntry{SpaceTrooper utilities overview} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: From ddff68f5a255b364a01f094b3e0ea74a13910adc Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 6 May 2026 10:41:18 +0200 Subject: [PATCH 2/8] transfer/eval coeff between datasets --- inst/scripts/evaluate_qs_split_kfold.R | 726 ++++++++++++++++++++++++ inst/scripts/transfer_qs_coefficients.R | 665 ++++++++++++++++++++++ 2 files changed, 1391 insertions(+) create mode 100644 inst/scripts/evaluate_qs_split_kfold.R create mode 100644 inst/scripts/transfer_qs_coefficients.R diff --git a/inst/scripts/evaluate_qs_split_kfold.R b/inst/scripts/evaluate_qs_split_kfold.R new file mode 100644 index 0000000..faa4d1d --- /dev/null +++ b/inst/scripts/evaluate_qs_split_kfold.R @@ -0,0 +1,726 @@ +library(SpaceTrooper) +library(SummarizedExperiment) +library(S4Vectors) + +# Apply a fitted ridge-logistic QS model to a new SpatialExperiment subset. +# +# The function rebuilds the design matrix from `colData`, using the same +# formula employed during training, and stores predicted probabilities in the +# `QC_score` column. +score_subset <- function(spe_subset, fit, lambda, model_formula) { + # Convert per-cell metadata to a plain data.frame so model.matrix can use it. + df_subset <- as.data.frame(SummarizedExperiment::colData(spe_subset)) + + # Recreate the exact predictor matrix used by glmnet. + x_subset <- stats::model.matrix(stats::as.formula(model_formula), + data=df_subset) + + # Predict the probability of being a good-quality cell. + spe_subset$QC_score <- as.vector( + stats::predict(fit, s=lambda, newx=x_subset, type="response") + ) + spe_subset +} + +# Compute AUROC from ranks, without requiring extra packages. +# +# `label_positive` must be a 0/1 vector, where 1 indicates the positive class. +# In this script we use it both for bad-vs-good and for derived summaries. +auc_rank <- function(score, label_positive) { + # Coerce labels to integer to avoid issues with logical/factor inputs. + label_positive <- as.integer(label_positive) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + # AUROC is undefined if one of the two classes is missing. + if (n_pos == 0L || n_neg == 0L) { + return(NA_real_) + } + + # Mann-Whitney / rank-based computation of AUROC. + ranks <- rank(score, ties.method="average") + (sum(ranks[label_positive == 1L]) - n_pos * (n_pos + 1) / 2) / + (n_pos * n_neg) +} + +# Build pseudo-reference labels on a subset, following the same SpaceTrooper +# logic used internally to define training examples. +# +# Returned labels are: +# - `qcscore_train = 0`: low-quality examples +# - `qcscore_train = 1`: good-quality examples +build_reference_labels <- function(spe_subset, verbose=FALSE) { + # Detect outliers for the metrics used by QS. + spe_ref <- computeOutliersQCScore(spe_subset) + + # Remove variables that do not have enough outliers to be informative. + spe_ref <- checkOutliers(spe_ref, verbose=verbose) + + # Build a balanced table of bad and good examples. + df_ref <- computeTrainDF( + colData=SummarizedExperiment::colData(spe_ref), + formulaVars=S4Vectors::metadata(spe_ref)$formula_variables, + tech=S4Vectors::metadata(spe_ref)$technology, + verbose=verbose + ) + + # Keep only the columns needed for downstream evaluation. + df_ref[, c("cell_id", "qcscore_train")] +} + +# Safe wrapper around `build_reference_labels()`. +# +# On small or unlucky splits/folds, SpaceTrooper may not find enough outliers +# to define a reference set. Instead of stopping the whole script, this returns +# the error message so the caller can inspect it. +safe_reference_labels <- function(spe_subset, verbose=FALSE) { + tryCatch( + list(data=build_reference_labels(spe_subset, verbose=verbose), + error=NULL), + error=function(e) list(data=NULL, error=conditionMessage(e)) + ) +} + +# Safely extract an optional scalar character field from a list. +# +# This is used when collecting fold-level error messages at the end of k-fold +# evaluation. Missing, NULL, or NA entries are converted to `NA_character_`. +extract_optional_chr1 <- function(x, name) { + value <- x[[name]] + + if (is.null(value) || length(value) == 0L || all(is.na(value))) { + return(NA_character_) + } + + as.character(value[[1]]) +} + +# Add good/bad/unlabelled labels to a SpatialExperiment subset for plotting. +# +# Labels are derived from the evaluation table: +# - `qcscore_train = 0` -> bad +# - `qcscore_train = 1` -> good +# - missing label -> unlabelled +append_eval_labels <- function( + spe_subset, + df_eval, + label_col="eval_label", + colour_col="eval_label_color" +) { + labels <- rep("unlabelled", ncol(spe_subset)) + + if (!is.null(df_eval) && nrow(df_eval) > 0L) { + idx <- match(spe_subset$cell_id, df_eval$cell_id) + keep <- !is.na(idx) + labels[keep] <- ifelse(df_eval$qcscore_train[idx[keep]] == 0, + "bad", "good") + } + + spe_subset[[label_col]] <- factor(labels, + levels=c("bad", "good", "unlabelled")) + + palette <- c( + bad="#D55E00", + good="#009E73", + unlabelled="#BDBDBD" + ) + spe_subset[[colour_col]] <- unname(palette[as.character(spe_subset[[label_col]])]) + spe_subset +} + +# Select either the whole result object (single split) or one fold result. +# +# This helper keeps the plotting functions compact and makes the API uniform +# across single-split and k-fold outputs. +get_result_level <- function(result, fold=NULL) { + if (identical(result$split_type, "k_fold_cv")) { + if (is.null(fold)) { + stop("Please provide `fold` when plotting a k-fold result") + } + if (length(fold) != 1L || is.na(fold) || fold < 1 || fold > length(result$folds)) { + stop("`fold` must be an integer between 1 and the number of folds") + } + return(result$folds[[fold]]) + } + + result +} + +# Plot cells coloured as good, bad, or unlabelled. +# +# By default the function uses centroids because they are always available in +# the current workflow. For k-fold results, choose which fold to display. +plot_labelled_cells <- function( + result, + dataset=c("train", "test"), + fold=NULL, + size=0.05, + alpha=0.8, + scaleBar=TRUE +) { + dataset <- match.arg(dataset) + level_result <- get_result_level(result, fold=fold) + + spe_subset <- switch(dataset, + train=level_result$spe_train, + test=level_result$spe_test + ) + df_eval <- switch(dataset, + train=level_result$train_eval, + test=level_result$test_eval + ) + + spe_subset <- append_eval_labels(spe_subset, df_eval) + + title_suffix <- if (identical(result$split_type, "k_fold_cv")) { + paste0(" - fold ", fold) + } else { + "" + } + + plotCentroids( + spe_subset, + colourBy="eval_label", + palette="eval_label_color", + size=size, + alpha=alpha, + scaleBar=scaleBar + ) + + ggplot2::labs( + title=paste0("Labelled cells (", dataset, ")", title_suffix), + colour="Reference label", + fill="Reference label" + ) +} + +# Compute the points of a ROC curve from evaluation labels and QS values. +# +# The positive class is the bad-quality group, so the curve is built using +# `1 - QC_score` as the decision score. +build_roc_df <- function(df_eval) { + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + label_positive <- as.integer(df_eval$qcscore_train == 0) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + if (n_pos == 0L || n_neg == 0L) { + return(data.frame()) + } + + score_bad <- 1 - df_eval$QC_score + ord <- order(score_bad, decreasing=TRUE) + score_bad <- score_bad[ord] + label_positive <- label_positive[ord] + + tp <- cumsum(label_positive == 1L) + fp <- cumsum(label_positive == 0L) + change <- c(score_bad[-1] != score_bad[-length(score_bad)], TRUE) + + data.frame( + fpr=c(0, fp[change] / n_neg), + tpr=c(0, tp[change] / n_pos) + ) +} + +# Plot ROC curves for a single split or across folds. +# +# In k-fold mode the function overlays one ROC curve per fold and reports the +# average AUC in the subtitle. +plot_roc_eval <- function(result, dataset=c("train", "test")) { + dataset <- match.arg(dataset) + + if (identical(result$split_type, "k_fold_cv")) { + roc_list <- lapply(seq_along(result$folds), function(i) { + df_eval <- switch(dataset, + train=result$folds[[i]]$train_eval, + test=result$folds[[i]]$test_eval + ) + roc_df <- build_roc_df(df_eval) + if (nrow(roc_df) == 0L) { + return(NULL) + } + roc_df$fold <- i + roc_df + }) + roc_list <- Filter(Negate(is.null), roc_list) + + if (length(roc_list) == 0L) { + stop("No ROC curves available for the selected dataset") + } + + roc_df <- do.call(rbind, roc_list) + auc_mean <- switch(dataset, + train=result$train_summary_average$auc_bad_vs_good, + test=result$test_summary_average$auc_bad_vs_good + ) + + return( + ggplot2::ggplot(roc_df, + ggplot2::aes(x=fpr, y=tpr, colour=factor(fold), group=fold)) + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::geom_path(linewidth=0.7, alpha=0.8) + + ggplot2::coord_fixed() + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("ROC curves across folds (", dataset, ")"), + subtitle=paste0("Mean AUC = ", round(auc_mean, 4)), + x="False positive rate", + y="True positive rate", + colour="Fold" + ) + ) + } + + df_eval <- switch(dataset, + train=result$train_eval, + test=result$test_eval + ) + roc_df <- build_roc_df(df_eval) + + if (nrow(roc_df) == 0L) { + stop("No ROC curve available for the selected dataset") + } + + auc_value <- switch(dataset, + train=result$train_summary$auc_bad_vs_good, + test=result$test_summary$auc_bad_vs_good + ) + + ggplot2::ggplot(roc_df, ggplot2::aes(x=fpr, y=tpr)) + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::geom_path(linewidth=0.9, colour="#0072B2") + + ggplot2::coord_fixed() + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("ROC curve (", dataset, ")"), + subtitle=paste0("AUC = ", round(auc_value, 4)), + x="False positive rate", + y="True positive rate" + ) +} + +# Plot AUC values for each fold. +# +# In single-split mode the function returns a one-point summary. In k-fold mode +# it shows one point per fold and a dashed line for the average AUC. +plot_auc_across_folds <- function(result, dataset=c("train", "test")) { + dataset <- match.arg(dataset) + + if (identical(result$split_type, "k_fold_cv")) { + auc_df <- switch(dataset, + train=result$train_summary_per_fold, + test=result$test_summary_per_fold + ) + auc_mean <- switch(dataset, + train=result$train_summary_average$auc_bad_vs_good, + test=result$test_summary_average$auc_bad_vs_good + ) + + return( + ggplot2::ggplot(auc_df, + ggplot2::aes(x=fold, y=auc_bad_vs_good)) + + ggplot2::geom_hline(yintercept=auc_mean, linetype=2, + colour="#D55E00") + + ggplot2::geom_line(colour="#0072B2") + + ggplot2::geom_point(size=2.2, colour="#0072B2") + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("AUC across folds (", dataset, ")"), + subtitle=paste0("Dashed line = mean AUC = ", round(auc_mean, 4)), + x="Fold", + y="AUC" + ) + ) + } + + auc_value <- switch(dataset, + train=result$train_summary$auc_bad_vs_good, + test=result$test_summary$auc_bad_vs_good + ) + auc_df <- data.frame(split=1, auc_bad_vs_good=auc_value) + + ggplot2::ggplot(auc_df, ggplot2::aes(x=split, y=auc_bad_vs_good)) + + ggplot2::geom_point(size=3, colour="#0072B2") + + ggplot2::theme_bw() + + ggplot2::scale_x_continuous(breaks=1, labels="split") + + ggplot2::labs( + title=paste0("AUC summary (", dataset, ")"), + subtitle=paste0("AUC = ", round(auc_value, 4)), + x=NULL, + y="AUC" + ) +} + +# Summarise QS performance against pseudo-reference labels. +# +# The summary reports: +# - number of labelled cells +# - number of bad/good examples +# - median QS in bad and good examples +# - AUROC for separating bad from good +# - sensitivity/specificity at the chosen threshold +summarise_eval <- function(df_eval, threshold=0.5) { + # Return an empty data.frame if the evaluation table is missing. + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + # Derive boolean masks for the pseudo-reference classes. + is_bad <- df_eval$qcscore_train == 0 + is_good <- df_eval$qcscore_train == 1 + + # A cell is predicted as bad if its QS is below the chosen threshold. + pred_bad <- df_eval$QC_score < threshold + + data.frame( + n_labeled=nrow(df_eval), + n_bad=sum(is_bad), + n_good=sum(is_good), + median_qs_bad=if (sum(is_bad) > 0) { + stats::median(df_eval$QC_score[is_bad]) + } else { + NA_real_ + }, + median_qs_good=if (sum(is_good) > 0) { + stats::median(df_eval$QC_score[is_good]) + } else { + NA_real_ + }, + # `1 - QC_score` is used so that larger values correspond to worse cells. + auc_bad_vs_good=auc_rank(1 - df_eval$QC_score, as.integer(is_bad)), + sensitivity_bad_qs_lt_threshold=if (sum(is_bad) > 0) { + mean(pred_bad[is_bad]) + } else { + NA_real_ + }, + specificity_good_qs_ge_threshold=if (sum(is_good) > 0) { + mean(!pred_bad[is_good]) + } else { + NA_real_ + } + ) +} + +# Create train/test indices for a single random split. +# +# `train_fraction` is only used when `k_folds = 1`. It controls the proportion +# of cells assigned to the training set, allowing e.g. 50/50, 70/30, 80/20. +make_random_split <- function(n_cells, train_fraction, seed) { + stopifnot(length(n_cells) == 1L, n_cells > 1L) + stopifnot(length(train_fraction) == 1L, !is.na(train_fraction)) + + if (train_fraction <= 0 || train_fraction >= 1) { + stop("train_fraction must be strictly between 0 and 1") + } + + set.seed(seed) + + # Sample the training cells at random. + n_train <- floor(n_cells * train_fraction) + if (n_train < 1L || n_train >= n_cells) { + stop("train_fraction produces an empty training or test set") + } + + train_idx <- sample(seq_len(n_cells), size=n_train, replace=FALSE) + test_idx <- setdiff(seq_len(n_cells), train_idx) + + list(train_idx=train_idx, test_idx=test_idx) +} + +# Assign each cell to one of the k folds used in cross-validation. +# +# Each fold acts as test set once, while the remaining folds are used for +# training. This function is only used when `k_folds > 1`. +make_fold_ids <- function(n_cells, k_folds, seed) { + stopifnot(length(n_cells) == 1L, n_cells > 1L) + stopifnot(length(k_folds) == 1L, !is.na(k_folds), k_folds >= 1) + + k_folds <- as.integer(k_folds) + if (k_folds < 1L) { + stop("k_folds must be >= 1") + } + + set.seed(seed) + + if (k_folds > n_cells) { + stop("k_folds cannot be greater than the number of cells") + } + + # Shuffle cells and then distribute them approximately evenly across folds. + shuffled <- sample(seq_len(n_cells)) + fold_id <- integer(n_cells) + fold_id[shuffled] <- rep(seq_len(k_folds), length.out=n_cells) + fold_id +} + +# Train the QS model on one training split/fold and evaluate it on both train +# and test cells. +# +# This is the core routine used by both single-split validation and k-fold CV. +run_one_fold <- function(spe, test_idx, threshold=0.5, verbose=TRUE) { + # Partition the input SpatialExperiment into training and test subsets. + spe_train <- spe[, -test_idx] + spe_test <- spe[, test_idx] + + # Compute outliers only on training cells, so the model is fitted without + # information leakage from the test set. + spe_train <- computeOutliersQCScore(spe_train) + spe_train <- checkOutliers(spe_train, verbose=verbose) + + # Extract the training formula selected by SpaceTrooper. + formula_vars <- S4Vectors::metadata(spe_train)$formula_variables + model_formula <- getModelFormula(formula_vars, verbose=verbose) + + # Create the balanced training table of pseudo-bad and pseudo-good cells. + df_train <- computeTrainDF( + colData=SummarizedExperiment::colData(spe_train), + formulaVars=formula_vars, + tech=S4Vectors::metadata(spe_train)$technology, + verbose=verbose + ) + + # Build the design matrix and estimate the ridge penalty. + x_train <- stats::model.matrix(stats::as.formula(model_formula), data=df_train) + best_lambda <- computeLambda(df_train, model_formula) + + # Fit the ridge logistic model. + fit <- trainModel(x_train, df_train) + + # Score both the training and test subsets using the trained model. + spe_train <- score_subset(spe_train, fit, best_lambda, model_formula) + spe_test <- score_subset(spe_test, fit, best_lambda, model_formula) + + # Training evaluation uses the very same labelled examples used for fitting. + train_eval <- merge( + data.frame(cell_id=spe_train$cell_id, QC_score=spe_train$QC_score), + df_train[, c("cell_id", "qcscore_train")], + by="cell_id" + ) + + # On the test set, build pseudo-reference labels independently. + test_ref <- safe_reference_labels(spe_test, verbose=verbose) + test_eval <- if (is.null(test_ref$data)) { + NULL + } else { + merge( + data.frame(cell_id=spe_test$cell_id, QC_score=spe_test$QC_score), + test_ref$data, + by="cell_id" + ) + } + + # Return both summaries and raw objects/tables for further inspection. + list( + formula_variables=formula_vars, + model_formula=model_formula, + lambda=best_lambda, + train_class_balance=table(df_train$qcscore_train), + train_label_error=NA_character_, + train_summary=summarise_eval(train_eval, threshold=threshold), + test_summary=summarise_eval(test_eval, threshold=threshold), + test_label_error=test_ref$error, + spe_train=spe_train, + spe_test=spe_test, + train_eval=train_eval, + test_eval=test_eval + ) +} + +# Combine per-fold summaries into a single table and compute their mean. +# +# This is only used in k-fold mode. The output contains: +# - `per_fold`: one row per fold +# - `average`: average across folds for each metric +combine_fold_summaries <- function(fold_results, summary_name) { + summaries <- lapply(seq_along(fold_results), function(i) { + x <- fold_results[[i]][[summary_name]] + if (is.null(x) || nrow(x) == 0L) { + return(NULL) + } + x$fold <- i + x + }) + summaries <- Filter(Negate(is.null), summaries) + + if (length(summaries) == 0L) { + return(list(per_fold=data.frame(), average=data.frame())) + } + + per_fold <- do.call(rbind, summaries) + metric_cols <- setdiff(colnames(per_fold), "fold") + + # Average each numeric summary metric across folds. + average <- as.data.frame( + lapply(per_fold[, metric_cols, drop=FALSE], function(x) mean(x, na.rm=TRUE)) + ) + + list(per_fold=per_fold, average=average) +} + +# Main entry point for evaluating SpaceTrooper QS on a CosMx dataset. +# +# Modes: +# - `k_folds = 1`: single random split using `train_fraction` +# - `k_folds > 1`: k-fold cross-validation +# +# Important notes: +# - `train_fraction` is ignored when `k_folds > 1` +# - `threshold` is used only for classification summaries, not for model fitting +evaluate_qs_split <- function( + data_dir="/Users/inzirio/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2", + sample_name="CosMx_Case2", + seed=1713, + threshold=0.5, + k_folds=1, + train_fraction=0.5, + verbose=TRUE +) { + # Validate the high-level input parameters. + stopifnot(length(k_folds) == 1L, !is.na(k_folds), k_folds >= 1) + k_folds <- as.integer(k_folds) + stopifnot(length(train_fraction) == 1L, !is.na(train_fraction)) + + if (train_fraction <= 0 || train_fraction >= 1) { + stop("train_fraction must be strictly between 0 and 1") + } + + # Read the CosMx dataset from disk. + spe <- readCosmxSPE(data_dir, sampleName=sample_name) + + # Compute the per-cell QC metrics required by the QS model. + spe <- spatialPerCellQC( + spe, + rmZeros=TRUE, + negProbList=c("NegPrb", "Negative", "SystemControl") + ) + + # Count the number of cells remaining after QC preprocessing. + n_cells <- ncol(spe) + + if (k_folds == 1L) { + # Single random split: build train/test indices according to the + # requested training fraction. + split_idx <- make_random_split( + n_cells=n_cells, + train_fraction=train_fraction, + seed=seed + ) + + res <- run_one_fold( + spe=spe, + test_idx=split_idx$test_idx, + threshold=threshold, + verbose=verbose + ) + + return(c( + list( + split_type="random_split", + k_folds=k_folds, + seed=seed, + threshold=threshold, + train_fraction=train_fraction, + test_fraction=1 - train_fraction, + n_cells=n_cells, + n_train=length(split_idx$train_idx), + n_test=length(split_idx$test_idx) + ), + res + )) + } + + # K-fold mode: assign each cell to a fold once. + fold_id <- make_fold_ids(n_cells=n_cells, k_folds=k_folds, seed=seed) + + fold_results <- lapply(seq_len(k_folds), function(i) { + if (verbose) { + message("Running fold ", i, " of ", k_folds) + } + + # Fold i is the test set; all other folds are the training set. + run_one_fold( + spe=spe, + test_idx=which(fold_id == i), + threshold=threshold, + verbose=verbose + ) + }) + + train_combined <- combine_fold_summaries(fold_results, "train_summary") + test_combined <- combine_fold_summaries(fold_results, "test_summary") + test_label_errors <- vapply( + fold_results, + extract_optional_chr1, + character(1), + name = "test_label_error" + ) + + train_label_errors <- vapply( + fold_results, + extract_optional_chr1, + character(1), + name = "train_label_error" + ) + + # if you only want real errors later, drop missing values explicitly + test_label_errors <- stats::na.omit(test_label_errors) + train_label_errors <- stats::na.omit(train_label_errors) + + list( + split_type="k_fold_cv", + k_folds=k_folds, + seed=seed, + threshold=threshold, + n_cells=n_cells, + fold_assignment=fold_id, + train_summary_per_fold=train_combined$per_fold, + train_summary_average=train_combined$average, + test_summary_per_fold=test_combined$per_fold, + test_summary_average=test_combined$average, + test_label_errors=test_label_errors, + train_label_errors=train_label_errors, + folds=fold_results + ) +} + +# Esempi di utilizzo: +# source("inst/scripts/evaluate_qs_split_kfold.R") +# +# Split singolo 50/50 +# res_split_50_50 <- evaluate_qs_split(k_folds=1, train_fraction=0.50) +# res_split_50_50$test_summary +# +# Split singolo 70/30 +# res_split_70_30 <- evaluate_qs_split(k_folds=1, train_fraction=0.70) +# res_split_70_30$test_summary +res_split <- evaluate_qs_split(k_folds=1, train_fraction=0.70) + +plot_labelled_cells(res_split, dataset="train", size=0.08) +plot_labelled_cells(res_split, dataset="test", size=0.08) + +plot_roc_eval(res_split, dataset="test") +plot_auc_across_folds(res_split, dataset="test") +# +# Split singolo 80/20 +# res_split_80_20 <- evaluate_qs_split(k_folds=1, train_fraction=0.80) +# res_split_80_20$test_summary +# +# 5-fold cross-validation +res_kfold_5 <- evaluate_qs_split(k_folds=5) +res_kfold_5$test_summary_per_fold +res_kfold_5$test_summary_average +res_kfold_5$test_label_errors +# +# Celle colorate good/bad sul test set del fold 1 +plot_labelled_cells(res_kfold_5, dataset="test", fold=1, size=0.1) +# +# ROC curve across folds +plot_roc_eval(res_kfold_5, dataset="test") +# +# AUC across folds +plot_auc_across_folds(res_kfold_5, dataset="test") diff --git a/inst/scripts/transfer_qs_coefficients.R b/inst/scripts/transfer_qs_coefficients.R new file mode 100644 index 0000000..27b4c3a --- /dev/null +++ b/inst/scripts/transfer_qs_coefficients.R @@ -0,0 +1,665 @@ +library(SpaceTrooper) +library(SummarizedExperiment) +library(S4Vectors) + +# Read a dataset with the appropriate SpaceTrooper reader according to the +# selected technology. +# +# Supported values are: +# - "cosmx" +# - "cosmx_protein" +# - "xenium" +read_dataset_for_transfer <- function( + dir_name, + technology=c("cosmx", "cosmx_protein", "xenium"), + sample_name="sample01" +) { + technology <- match.arg(technology) + + switch( + technology, + cosmx=readCosmxSPE(dir_name, sampleName=sample_name), + cosmx_protein=readCosmxProteinSPE(dir_name, sampleName=sample_name), + xenium=readXeniumSPE( + dir_name, + sampleName=sample_name, + computeMissingMetrics=TRUE, + keepPolygons=FALSE + ) + ) +} + +# Compute the per-cell QC metrics required before training or applying QS. +# +# The same preprocessing is applied to both source and target datasets so that +# transferred coefficients operate on harmonised metric columns. +prepare_dataset_for_transfer <- function( + spe, + rm_zeros=TRUE, + neg_prob_list=c("NegPrb", "Negative", "SystemControl") +) { + spatialPerCellQC( + spe, + rmZeros=rm_zeros, + negProbList=neg_prob_list + ) +} + +# Identify the metric set that can be transferred from source to target. +# +# For heterogeneous transfers, the safest common metrics are: +# - log2SignalDensity +# - Area_um +# - log2Ctrl_total_ratio +# +# The border-dependent aspect-ratio term is included only if both datasets have +# `log2AspectRatio` and `dist_border`, which is typically the case for CosMx to +# CosMx transfers. +get_transferable_metrics <- function(source_spe, target_spe) { + source_cols <- colnames(SummarizedExperiment::colData(source_spe)) + target_cols <- colnames(SummarizedExperiment::colData(target_spe)) + shared_cols <- intersect(source_cols, target_cols) + + metrics <- intersect( + c("log2SignalDensity", "Area_um", "log2Ctrl_total_ratio"), + shared_cols + ) + + if (all(c("log2AspectRatio", "dist_border") %in% source_cols) && + all(c("log2AspectRatio", "dist_border") %in% target_cols)) { + metrics <- c(metrics, "log2AspectRatio") + } + + unique(metrics) +} + +# Extract a readable coefficient table from a fitted glmnet model. +# +# Coefficients are returned at the selected lambda, so they are exactly the ones +# transferred to the target dataset. +extract_qs_coefficients <- function(fit, lambda) { + coef_mat <- as.matrix(stats::predict(fit, s=lambda, type="coefficients")) + data.frame( + term=rownames(coef_mat), + coefficient=as.numeric(coef_mat[, 1]), + row.names=NULL + ) +} + +# Apply a fitted QS model to a new SpatialExperiment object. +# +# The design matrix is rebuilt from the target `colData` using the compatible +# transfer formula trained on the source dataset. +score_target_dataset <- function(spe, fit, lambda, model_formula) { + df <- as.data.frame(SummarizedExperiment::colData(spe)) + x <- stats::model.matrix(stats::as.formula(model_formula), data=df) + spe$QC_score <- as.vector( + stats::predict(fit, s=lambda, newx=x, type="response") + ) + spe +} + +# Compute AUROC with a rank-based formula, avoiding extra package dependencies. +auc_rank <- function(score, label_positive) { + label_positive <- as.integer(label_positive) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + if (n_pos == 0L || n_neg == 0L) { + return(NA_real_) + } + + ranks <- rank(score, ties.method="average") + (sum(ranks[label_positive == 1L]) - n_pos * (n_pos + 1) / 2) / + (n_pos * n_neg) +} + +# Build pseudo-reference labels on a dataset using the same metric subset used +# for transfer. +# +# This is useful for evaluating how well transferred coefficients separate +# pseudo-bad from pseudo-good cells on the target dataset. +build_reference_labels_transfer <- function(spe, metric_list, verbose=FALSE) { + spe_ref <- computeOutliersQCScore(spe, metricList=metric_list) + spe_ref <- checkOutliers(spe_ref, verbose=verbose) + + df_ref <- computeTrainDF( + colData=SummarizedExperiment::colData(spe_ref), + formulaVars=S4Vectors::metadata(spe_ref)$formula_variables, + tech=S4Vectors::metadata(spe_ref)$technology, + verbose=verbose + ) + + df_ref[, c("cell_id", "qcscore_train")] +} + +# Safe wrapper for pseudo-reference label generation. +safe_reference_labels_transfer <- function(spe, metric_list, verbose=FALSE) { + tryCatch( + list( + data=build_reference_labels_transfer( + spe, + metric_list=metric_list, + verbose=verbose + ), + error=NULL + ), + error=function(e) list(data=NULL, error=conditionMessage(e)) + ) +} + +# Summarise the separation between pseudo-bad and pseudo-good cells. +# +# The summary is computed on either source or target after scoring with the +# transferred model. +summarise_transfer_eval <- function(df_eval, threshold=0.5) { + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + is_bad <- df_eval$qcscore_train == 0 + is_good <- df_eval$qcscore_train == 1 + pred_bad <- df_eval$QC_score < threshold + + data.frame( + n_labeled=nrow(df_eval), + n_bad=sum(is_bad), + n_good=sum(is_good), + median_qs_bad=if (sum(is_bad) > 0) { + stats::median(df_eval$QC_score[is_bad]) + } else { + NA_real_ + }, + median_qs_good=if (sum(is_good) > 0) { + stats::median(df_eval$QC_score[is_good]) + } else { + NA_real_ + }, + auc_bad_vs_good=auc_rank(1 - df_eval$QC_score, as.integer(is_bad)), + sensitivity_bad_qs_lt_threshold=if (sum(is_bad) > 0) { + mean(pred_bad[is_bad]) + } else { + NA_real_ + }, + specificity_good_qs_ge_threshold=if (sum(is_good) > 0) { + mean(!pred_bad[is_good]) + } else { + NA_real_ + } + ) +} + +# Add good/bad/unlabelled labels to a SpatialExperiment object for plotting. +# +# Labels are derived from the evaluation table built for either the source or +# the target dataset after scoring with transferred coefficients. +append_transfer_labels <- function( + spe, + df_eval, + label_col="transfer_label", + colour_col="transfer_label_color" +) { + labels <- rep("unlabelled", ncol(spe)) + + if (!is.null(df_eval) && nrow(df_eval) > 0L) { + idx <- match(spe$cell_id, df_eval$cell_id) + keep <- !is.na(idx) + labels[keep] <- ifelse(df_eval$qcscore_train[idx[keep]] == 0, + "bad", "good") + } + + spe[[label_col]] <- factor(labels, levels=c("bad", "good", "unlabelled")) + palette <- c(bad="#D55E00", good="#009E73", unlabelled="#BDBDBD") + spe[[colour_col]] <- unname(palette[as.character(spe[[label_col]])]) + spe +} + +# Build a ROC data.frame from pseudo-reference labels and transferred QS. +build_transfer_roc_df <- function(df_eval) { + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + label_positive <- as.integer(df_eval$qcscore_train == 0) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + if (n_pos == 0L || n_neg == 0L) { + return(data.frame()) + } + + score_bad <- 1 - df_eval$QC_score + ord <- order(score_bad, decreasing=TRUE) + score_bad <- score_bad[ord] + label_positive <- label_positive[ord] + + tp <- cumsum(label_positive == 1L) + fp <- cumsum(label_positive == 0L) + change <- c(score_bad[-1] != score_bad[-length(score_bad)], TRUE) + + data.frame( + fpr=c(0, fp[change] / n_neg), + tpr=c(0, tp[change] / n_pos) + ) +} + +# Plot source or target cells coloured as pseudo-good / pseudo-bad after +# scoring with transferred coefficients. +plot_transfer_labelled_cells <- function( + result, + dataset=c("source", "target"), + size=0.05, + alpha=0.8, + scaleBar=TRUE +) { + dataset <- match.arg(dataset) + + spe <- switch(dataset, + source=result$source_spe, + target=result$target_spe + ) + df_eval <- switch(dataset, + source=result$source_eval, + target=result$target_eval + ) + + spe <- append_transfer_labels(spe, df_eval) + + plotCentroids( + spe, + colourBy="transfer_label", + palette="transfer_label_color", + size=size, + alpha=alpha, + scaleBar=scaleBar + ) + + ggplot2::labs( + title=paste0("Transferred QS labels on ", dataset, " dataset"), + colour="Reference label", + fill="Reference label" + ) +} + +# Plot the spatial distribution of transferred QS on source or target. +plot_transfer_qs <- function( + result, + dataset=c("source", "target"), + size=0.05, + alpha=0.8, + scaleBar=TRUE +) { + dataset <- match.arg(dataset) + + spe <- switch(dataset, + source=result$source_spe, + target=result$target_spe + ) + + plotCentroids( + spe, + colourBy="QC_score", + size=size, + alpha=alpha, + scaleBar=scaleBar + ) + + ggplot2::labs(title=paste0("Transferred QC score on ", dataset, " dataset")) +} + +# Plot ROC curves for the source or target transfer evaluation. +plot_transfer_roc <- function(result, dataset=c("source", "target")) { + dataset <- match.arg(dataset) + + df_eval <- switch(dataset, + source=result$source_eval, + target=result$target_eval + ) + roc_df <- build_transfer_roc_df(df_eval) + + if (nrow(roc_df) == 0L) { + stop("No ROC curve available for the selected dataset") + } + + auc_value <- switch(dataset, + source=result$source_summary$auc_bad_vs_good, + target=result$target_summary$auc_bad_vs_good + ) + + ggplot2::ggplot(roc_df, ggplot2::aes(x=fpr, y=tpr)) + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::geom_path(linewidth=0.9, colour="#0072B2") + + ggplot2::coord_fixed() + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("ROC curve for transferred QS (", dataset, ")"), + subtitle=paste0("AUC = ", round(auc_value, 4)), + x="False positive rate", + y="True positive rate" + ) +} + +# Compare AUC between source and target after transfer. +plot_transfer_auc_summary <- function(result) { + auc_df <- data.frame( + dataset=c("source", "target"), + auc_bad_vs_good=c( + result$source_summary$auc_bad_vs_good, + if (nrow(result$target_summary) == 0L) NA_real_ else result$target_summary$auc_bad_vs_good + ) + ) + + ggplot2::ggplot(auc_df, ggplot2::aes(x=dataset, y=auc_bad_vs_good, fill=dataset)) + + ggplot2::geom_col(width=0.65, alpha=0.85, show.legend=FALSE) + + ggplot2::geom_text(ggplot2::aes(label=round(auc_bad_vs_good, 4)), + vjust=-0.4, na.rm=TRUE) + + ggplot2::theme_bw() + + ggplot2::labs( + title="AUC summary for transferred QS", + x=NULL, + y="AUC" + ) +} + +# Train a native QS model directly on the target dataset, so it can be compared +# against the transferred QS. +# +# By default it uses the full SpaceTrooper native workflow on the target. +# If `use_transfer_metrics = TRUE`, it restricts the native training to the same +# metric subset used by the transferred model. +compute_native_target_qs <- function( + result, + use_transfer_metrics=FALSE, + verbose=TRUE +) { + target_native <- result$target_spe + target_native$QC_score_transfer <- target_native$QC_score + + if (isTRUE(use_transfer_metrics)) { + target_native <- computeOutliersQCScore( + target_native, + metricList=result$transfer_metrics + ) + target_native <- checkOutliers(target_native, verbose=verbose) + + formula_vars <- S4Vectors::metadata(target_native)$formula_variables + model_formula <- getModelFormula(formula_vars, verbose=verbose) + train_df <- computeTrainDF( + colData=SummarizedExperiment::colData(target_native), + formulaVars=formula_vars, + tech=S4Vectors::metadata(target_native)$technology, + verbose=verbose + ) + x_target <- stats::model.matrix(stats::as.formula(model_formula), data=train_df) + best_lambda <- computeLambda(train_df, model_formula) + fit_native <- trainModel(x_target, train_df) + target_native$QC_score_native <- as.vector( + stats::predict( + fit_native, + s=best_lambda, + newx=stats::model.matrix( + stats::as.formula(model_formula), + data=as.data.frame(SummarizedExperiment::colData(target_native)) + ), + type="response" + ) + ) + return(target_native) + } + + target_native <- computeQCScore(target_native, verbose=verbose) + target_native$QC_score_native <- target_native$QC_score + target_native$QC_score <- target_native$QC_score_transfer + target_native +} + +# Compare transferred QS with a native QS trained directly on the target. +# +# The function reports correlations and agreement on low-quality calls. +compare_transferred_vs_native <- function( + result, + threshold=0.5, + low_fraction=0.1, + use_transfer_metrics=FALSE, + verbose=TRUE +) { + stopifnot(length(low_fraction) == 1L, low_fraction > 0, low_fraction < 1) + + target_native <- compute_native_target_qs( + result, + use_transfer_metrics=use_transfer_metrics, + verbose=verbose + ) + + df_compare <- data.frame( + cell_id=target_native$cell_id, + qc_transfer=target_native$QC_score_transfer, + qc_native=target_native$QC_score_native + ) + + transfer_cut <- stats::quantile(df_compare$qc_transfer, probs=low_fraction, na.rm=TRUE) + native_cut <- stats::quantile(df_compare$qc_native, probs=low_fraction, na.rm=TRUE) + + comparison_summary <- data.frame( + pearson=stats::cor(df_compare$qc_transfer, df_compare$qc_native, + method="pearson", use="complete.obs"), + spearman=stats::cor(df_compare$qc_transfer, df_compare$qc_native, + method="spearman", use="complete.obs"), + low_qc_agreement_at_threshold=mean( + (df_compare$qc_transfer < threshold) == (df_compare$qc_native < threshold), + na.rm=TRUE + ), + overlap_lowest_fraction=mean( + (df_compare$qc_transfer <= transfer_cut) & + (df_compare$qc_native <= native_cut), + na.rm=TRUE + ) / low_fraction + ) + + list( + summary=comparison_summary, + df_compare=df_compare, + target_native_spe=target_native, + threshold=threshold, + low_fraction=low_fraction, + use_transfer_metrics=use_transfer_metrics + ) +} + +# Scatter plot of transferred QS vs native QS on the target dataset. +plot_transfer_vs_native_scatter <- function(comparison_result) { + ggplot2::ggplot( + comparison_result$df_compare, + ggplot2::aes(x=qc_native, y=qc_transfer) + ) + + ggplot2::geom_point(alpha=0.4, size=0.6, colour="#0072B2") + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::theme_bw() + + ggplot2::labs( + title="Transferred QS vs native QS on target", + subtitle=paste0( + "Spearman = ", + round(comparison_result$summary$spearman, 4), + ", agreement@", + comparison_result$threshold, + " = ", + round(comparison_result$summary$low_qc_agreement_at_threshold, 4) + ), + x="Native target QS", + y="Transferred QS" + ) +} + +# Train a transferable QS model on one dataset and apply it to another. +# +# Workflow: +# 1. read source and target datasets +# 2. compute QC metrics on both +# 3. identify the metric subset that exists in both datasets +# 4. train the ridge-logistic QS model on the source dataset using only those metrics +# 5. transfer the fitted coefficients to the target dataset +# 6. optionally evaluate the transfer on pseudo-reference labels in the target +transfer_qs_coefficients <- function( + source_dir, + target_dir, + source_technology=c("cosmx", "cosmx_protein", "xenium"), + target_technology=c("cosmx", "cosmx_protein", "xenium"), + source_sample_name="source_sample", + target_sample_name="target_sample", + threshold=0.5, + evaluate_target=TRUE, + verbose=TRUE +) { + source_technology <- match.arg(source_technology) + target_technology <- match.arg(target_technology) + + # Read the source dataset, from which coefficients will be learned. + source_spe <- read_dataset_for_transfer( + dir_name=source_dir, + technology=source_technology, + sample_name=source_sample_name + ) + + # Read the target dataset, on which coefficients will be transferred. + target_spe <- read_dataset_for_transfer( + dir_name=target_dir, + technology=target_technology, + sample_name=target_sample_name + ) + + # Harmonise per-cell QC metrics in both datasets. + source_spe <- prepare_dataset_for_transfer(source_spe) + target_spe <- prepare_dataset_for_transfer(target_spe) + + # Keep only the subset of QS metrics that is available in both datasets. + transfer_metrics <- get_transferable_metrics(source_spe, target_spe) + + if (!"log2SignalDensity" %in% transfer_metrics) { + stop("log2SignalDensity is required for transferable QS training") + } + if (length(transfer_metrics) < 2L) { + stop("At least two shared metrics are recommended for coefficient transfer") + } + + if (verbose) { + message("Transferable metrics: ", paste(transfer_metrics, collapse=", ")) + } + + # Compute source outliers only on the transferable metric subset. + source_spe <- computeOutliersQCScore(source_spe, metricList=transfer_metrics) + source_spe <- checkOutliers(source_spe, verbose=verbose) + + formula_vars <- S4Vectors::metadata(source_spe)$formula_variables + if (!"log2SignalDensity" %in% names(formula_vars)) { + stop("The transferable source formula lost log2SignalDensity after outlier filtering") + } + + # Build the compatible model formula and the balanced source training table. + model_formula <- getModelFormula(formula_vars, verbose=verbose) + source_train_df <- computeTrainDF( + colData=SummarizedExperiment::colData(source_spe), + formulaVars=formula_vars, + tech=S4Vectors::metadata(source_spe)$technology, + verbose=verbose + ) + + # Fit the ridge-logistic QS model on the source dataset. + x_source <- stats::model.matrix(stats::as.formula(model_formula), + data=source_train_df) + best_lambda <- computeLambda(source_train_df, model_formula) + fit <- trainModel(x_source, source_train_df) + coefficient_table <- extract_qs_coefficients(fit, best_lambda) + + # Score the full source and target datasets with the same transferred model. + source_spe <- score_target_dataset(source_spe, fit, best_lambda, model_formula) + target_spe <- score_target_dataset(target_spe, fit, best_lambda, model_formula) + + # Build the labelled source evaluation set from the source training examples. + source_eval <- merge( + data.frame(cell_id=source_spe$cell_id, QC_score=source_spe$QC_score), + source_train_df[, c("cell_id", "qcscore_train")], + by="cell_id" + ) + source_summary <- summarise_transfer_eval(source_eval, threshold=threshold) + + target_eval <- NULL + target_summary <- data.frame() + target_label_error <- NA_character_ + + if (isTRUE(evaluate_target)) { + # Build pseudo-reference labels independently on the target dataset, + # using the same transferable metric subset. + target_ref <- safe_reference_labels_transfer( + target_spe, + metric_list=transfer_metrics, + verbose=verbose + ) + target_label_error <- target_ref$error + + if (!is.null(target_ref$data)) { + target_eval <- merge( + data.frame(cell_id=target_spe$cell_id, QC_score=target_spe$QC_score), + target_ref$data, + by="cell_id" + ) + target_summary <- summarise_transfer_eval(target_eval, + threshold=threshold) + } + } + + list( + source_technology=source_technology, + target_technology=target_technology, + transfer_metrics=transfer_metrics, + model_formula=model_formula, + lambda=best_lambda, + coefficient_table=coefficient_table, + source_summary=source_summary, + target_summary=target_summary, + target_label_error=target_label_error, + source_train_df=source_train_df, + source_eval=source_eval, + target_eval=target_eval, + source_spe=source_spe, + target_spe=target_spe, + fit=fit + ) +} + +# Esempi di utilizzo: +# source("inst/scripts/transfer_qs_coefficients.R") +# +# Train su CosMx e applica a Xenium +# res_transfer_cx <- transfer_qs_coefficients( +# source_dir="/path/to/cosmx_dataset", +# source_technology="cosmx", +# target_dir="/path/to/xenium_dataset", +# target_technology="xenium", +# source_sample_name="CosMx_source", +# target_sample_name="Xenium_target" +# ) +# res_transfer_cx$coefficient_table +# res_transfer_cx$target_summary +# plot_transfer_labelled_cells(res_transfer_cx, dataset="target", size=0.08) +# plot_transfer_qs(res_transfer_cx, dataset="target", size=0.08) +# plot_transfer_roc(res_transfer_cx, dataset="target") +# plot_transfer_auc_summary(res_transfer_cx) +# +# cmp_transfer_cx <- compare_transferred_vs_native( +# res_transfer_cx, +# threshold=0.5, +# low_fraction=0.1 +# ) +# cmp_transfer_cx$summary +# plot_transfer_vs_native_scatter(cmp_transfer_cx) +# +# Train su Xenium e applica a CosMx +# res_transfer_xc <- transfer_qs_coefficients( +# source_dir="/path/to/xenium_dataset", +# source_technology="xenium", +# target_dir="/path/to/cosmx_dataset", +# target_technology="cosmx" +# ) +# res_transfer_xc$coefficient_table +# res_transfer_xc$target_summary +# res_transfer_xc$target_summary From a4bdc6c997c8f8a43e83837357de424e6236b204 Mon Sep 17 00:00:00 2001 From: Dario Date: Fri, 15 May 2026 16:13:02 +0200 Subject: [PATCH 3/8] adding functions for model transfer across datasets --- R/QC.R | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 225 insertions(+), 18 deletions(-) diff --git a/R/QC.R b/R/QC.R index 14ef5ed..6d69efd 100644 --- a/R/QC.R +++ b/R/QC.R @@ -370,7 +370,17 @@ computeThresholdFlags <- function(spe, totalThreshold=0, #' @export computeLambda <- function(trainDF, modelFormula) { # model_formula <- getModelFormula(technology) + train_ok <- .filterCompleteModelCases( + df=trainDF, + modelFormula=modelFormula, + response=trainDF$qcscore_train, + context="training cells for lambda selection" + ) + + trainDF <- trainDF[train_ok, , drop=FALSE] + model_matrix <- model.matrix(as.formula(modelFormula), data=trainDF) + model_matrix <- .dropModelIntercept(model_matrix) ridge_cv <- cv.glmnet(model_matrix, trainDF$qcscore_train, family="binomial", alpha=0, lambda=NULL) bestLambda <- ridge_cv$lambda.min @@ -442,7 +452,7 @@ computeLambda <- function(trainDF, modelFormula) { #' set.seed(1998) #' spe <- computeQCScore(spe) #' summary(spe$QC_score) -computeQCScore <- function(spe, bestLambda=NULL, verbose=FALSE) { +computeQCScore <- function(spe, bestLambda=NULL, modelFormula=NULL, verbose=FALSE) { stopifnot(is(spe, "SpatialExperiment")) if (dim(spe[,spe$total == 0])[2] != 0) { warning(paste0(dim(spe[,spe$total == 0])[2], @@ -458,34 +468,71 @@ computeQCScore <- function(spe, bestLambda=NULL, verbose=FALSE) { train_df <- computeTrainDF(df, out_var, tech, verbose) - model_formula <- getModelFormula(out_var, verbose) + if (is.null(modelFormula)) { + model_formula <- getModelFormula(out_var, verbose) + } else { + model_formula <- modelFormula + if (verbose) { + message("Using user-supplied model formula:") + message(model_formula) + } + } + train_ok <- .filterCompleteModelCases( + df=train_df, + modelFormula=model_formula, + response=train_df$qcscore_train, + context="training cells" + ) + + train_df <- train_df[train_ok, , drop=FALSE] model_matrix <- model.matrix(as.formula(model_formula), data=train_df) + model_matrix <- .dropModelIntercept(model_matrix) model <- trainModel(model_matrix, train_df) if(is.null(bestLambda)) { bestLambda <- computeLambda(train_df, model_formula) } + coefs <- coef(model, s=bestLambda) + if (verbose) { message("Model coefficients for every term used in the formula:") - message(paste(round(predict(model, s=bestLambda, type="coefficients"), - 2), - collapse = " ")) + message( paste( rownames(coefs), round(as.numeric(coefs), 2), sep="=", + collapse=" ")) } - full_matrix <- model.matrix(as.formula(model_formula), data=df) - cd <- colData(spe) - cd$QC_score <- as.vector(predict(model, s=bestLambda, - newx = full_matrix, - type = "response")) - spe$QC_score <- cd$QC_score - # train_identity <- rep("TEST", dim(spe)[2]) - # train_bad <- train_df$cell_id[train_df$qcscore_train==0] - # train_good <- train_df$cell_id[train_df$qcscore_train==1] - # spe$training_status <- dplyr::case_when( - # spe$cell_id %in% train_bad ~ "BAD", - # spe$cell_id %in% train_good ~ "GOOD", - # TRUE ~ train_identity) + full_ok <- .filterCompleteModelCases(df=df, modelFormula=model_formula, + context="cells") + + full_matrix <- model.matrix(as.formula(model_formula), + data=df[full_ok, , drop=FALSE]) + + full_matrix <- .dropModelIntercept(full_matrix) + full_matrix <- full_matrix[, colnames(model_matrix), drop=FALSE] + + ## NAs may come from model variables used in `model_formula`, e.g. cells with + ## missing `log2AspectRatio` when aspect ratio cannot be computed from polygons. + ## Predict only complete cases; incomplete cells keep QC_score = NA. + qc_score <- rep(NA_real_, nrow(df)) + qc_score[full_ok] <- as.vector( + predict(model, s=bestLambda, newx=full_matrix, type="response") + ) + + spe$QC_score <- qc_score + + metadata(spe)$QCScore_model <- list( + model=model, + bestLambda=bestLambda, + model_formula=model_formula, + formula_variables=out_var, + model_matrix_colnames=colnames(model_matrix), + coefficients=coefs, + coefficients_table=data.frame( + term=rownames(coefs), + coefficient=as.numeric(coefs), + row.names=NULL + ) + ) return(spe) } @@ -1148,3 +1195,163 @@ checkOutliers <- function(spe, verbose=FALSE) { return(list(df=df, out_var=out_var, tech=tech)) } +#' applyQCScoreModel +#' +#' @description +#' Apply a previously trained QC score model to a new SpatialExperiment object. +#' +#' @param spe A `SpatialExperiment` object with QC metrics already computed. +#' @param qcModel A QC score model object, usually stored in +#' `metadata(spe)$QCScore_model`. +#' @param scoreName Name of the output column in `colData`. +#' +#' @return A `SpatialExperiment` object with added QC score in `colData`. +#' +#' @export +#' @examples +#' +#' example(readCosmxSPE) +#' +#' ## Compute per-cell quality control metrics +#' spe <- spatialPerCellQC(spe) +#' +#' ## Split the object only for example purposes +#' set.seed(1998) +#' idx <- sample(seq_len(ncol(spe)), floor(ncol(spe) / 2)) +#' spe_train <- spe[, idx] +#' spe_test <- spe[, -idx] +#' +#' ## Train the Quality Control (QC) score model on one dataset +#' spe_train <- computeQCScore(spe_train) +#' qc_model <- metadata(spe_train)$QCScore_model +#' +#' ## Apply the trained model to another dataset +#' spe_test <- applyQCScoreModel( +#' spe=spe_test, +#' qcModel=qc_model, +#' scoreName="QC_score_transferred" +#' ) +#' +#' summary(spe_test$QC_score_transferred) +applyQCScoreModel <- function(spe, qcModel, scoreName="QC_score") { + stopifnot(is(spe, "SpatialExperiment")) + + required <- c( + "model", + "bestLambda", + "model_formula", + "model_matrix_colnames" + ) + + stopifnot( + "qcModel is missing required fields" = + all(required %in% names(qcModel)) + ) + + df <- as.data.frame(colData(spe)) + + ok <- .filterCompleteModelCases( + df=df, + modelFormula=qcModel$model_formula, + context="cells" + ) + + new_matrix <- model.matrix( + as.formula(qcModel$model_formula), + data=df[ok, , drop=FALSE] + ) + + new_matrix <- .dropModelIntercept(new_matrix) + + missing_cols <- setdiff( + qcModel$model_matrix_colnames, + colnames(new_matrix) + ) + + if (length(missing_cols) > 0L) { + zero_matrix <- matrix( + 0, + nrow=nrow(new_matrix), + ncol=length(missing_cols), + dimnames=list(rownames(new_matrix), missing_cols) + ) + + new_matrix <- cbind(new_matrix, zero_matrix) + } + + extra_cols <- setdiff( + colnames(new_matrix), + qcModel$model_matrix_colnames + ) + + if (length(extra_cols) > 0L) { + warning( + "New model matrix contains columns not present in the ", + "training matrix. These columns will be ignored: ", + paste(extra_cols, collapse=", ") + ) + } + + new_matrix <- new_matrix[, qcModel$model_matrix_colnames, drop=FALSE] + + score <- rep(NA_real_, nrow(df)) + + score[ok] <- as.vector( + predict( + qcModel$model, + s=qcModel$bestLambda, + newx=new_matrix, + type="response" + ) + ) + + cd <- colData(spe) + cd[[scoreName]] <- score + colData(spe) <- cd + + metadata(spe)$QCScore_model_applied <- qcModel + + return(spe) +} + +.dropModelIntercept <- function(modelMatrix) { + if ("(Intercept)" %in% colnames(modelMatrix)) { + modelMatrix <- modelMatrix[, colnames(modelMatrix)!="(Intercept)", + drop=FALSE] + } + + return(modelMatrix) +} + +.filterCompleteModelCases <- function(df, modelFormula, response=NULL, + context="cells") { + + vars <- all.vars(as.formula(modelFormula)) + + missing_vars <- setdiff(vars, colnames(df)) + + if (length(missing_vars) > 0L) { + stop( + "Missing variables required by model formula: ", + paste(missing_vars, collapse=", ") + ) + } + ## Missing values may come from variables used in `modelFormula`, for + ## example `log2AspectRatio` when aspect ratio cannot be computed from + ## polygons. + ok <- complete.cases(df[, vars, drop=FALSE]) + + if (!is.null(response)) { + ok <- ok & !is.na(response) + } + + if (!all(ok)) { + warning( + sum(!ok), + " ", context, + " contain missing model variables." + ) + } + + return(ok) +} From 8aeee801dc2e480af7be5867e8499013ab06102b Mon Sep 17 00:00:00 2001 From: Dario Date: Fri, 15 May 2026 16:13:13 +0200 Subject: [PATCH 4/8] script for model transfer --- inst/scripts/transfer_coeffs.R | 126 +++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 inst/scripts/transfer_coeffs.R diff --git a/inst/scripts/transfer_coeffs.R b/inst/scripts/transfer_coeffs.R new file mode 100644 index 0000000..19812b4 --- /dev/null +++ b/inst/scripts/transfer_coeffs.R @@ -0,0 +1,126 @@ +library(SpaceTrooper) +library(SummarizedExperiment) +library(S4Vectors) + +set.seed(1998) + +common_formula <- "~(log2SignalDensity + Area_um + log2AspectRatio)^2" + +## ----------------------------- +## Read Xenium data +## ----------------------------- + +dbkx_path <- "~/Downloads/Xenium_data/db_kero_xen" + +spexen <- readXeniumSPE(dbkx_path) + +spexen <- spatialPerCellQC(spexen) + +spexen <- computeQCScore( + spexen, + modelFormula=common_formula, + verbose=TRUE +) + +xen_model <- metadata(spexen)$QCScore_model + + +## ----------------------------- +## Read CosMx data +## ----------------------------- + +cosm_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" + +specosm <- readCosmxSPE(cosm_path) + +specosm <- spatialPerCellQC(specosm) + +specosm <- computeQCScore( + specosm, + modelFormula=common_formula, + verbose=TRUE +) + +cosmx_model <- metadata(specosm)$QCScore_model + + +## ----------------------------- +## Save native scores before transfer +## ----------------------------- + +spexen$QC_score_xenium_native <- spexen$QC_score +specosm$QC_score_cosmx_native <- specosm$QC_score + + +## ----------------------------- +## Apply transferred models +## ----------------------------- + +specosm <- applyQCScoreModel( + spe=specosm, + qcModel=xen_model, + scoreName="QC_score_xenium_model" +) + +spexen <- applyQCScoreModel( + spe=spexen, + qcModel=cosmx_model, + scoreName="QC_score_cosmx_model" +) + + +## ----------------------------- +## Summaries +## ----------------------------- + +message("Xenium native score:") +print(summary(spexen$QC_score_xenium_native)) + +message("Xenium scored with CosMx model:") +print(summary(spexen$QC_score_cosmx_model)) + +message("CosMx native score:") +print(summary(specosm$QC_score_cosmx_native)) + +message("CosMx scored with Xenium model:") +print(summary(specosm$QC_score_xenium_model)) + + +## ----------------------------- +## Correlations +## ----------------------------- + +message("Correlation in Xenium: native vs CosMx model") +print( + cor( + spexen$QC_score_xenium_native, + spexen$QC_score_cosmx_model, + use="complete.obs" + ) +) + +message("Correlation in CosMx: native vs Xenium model") +print( + cor( + specosm$QC_score_cosmx_native, + specosm$QC_score_xenium_model, + use="complete.obs" + ) +) + + +## ----------------------------- +## Model details +## ----------------------------- + +message("Xenium model formula:") +print(metadata(spexen)$QCScore_model$model_formula) + +message("CosMx model formula:") +print(metadata(specosm)$QCScore_model$model_formula) + +message("Xenium model coefficients:") +print(metadata(spexen)$QCScore_model$coefficients_table) + +message("CosMx model coefficients:") +print(metadata(specosm)$QCScore_model$coefficients_table) From c0cf042355c3f3cc610199b34350af8742aea875 Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 20 May 2026 15:40:37 +0200 Subject: [PATCH 5/8] adding scripts for reviewing process --- R/plotUtils.R | 54 ++ inst/scripts/ablation_study.R | 484 ++++++++++++ inst/scripts/compare_qs_general.R | 79 ++ inst/scripts/create_cosmx_rds.R | 39 + inst/scripts/run_qs_general.R | 275 +++++++ inst/scripts/transfer_coeffs_cosmx_cosmx.R | 730 ++++++++++++++++++ ...oeffs.R => transfer_coeffs_cosmx_xenium.R} | 82 +- inst/scripts/transfer_coeffs_xenium_xenium.R | 348 +++++++++ 8 files changed, 2087 insertions(+), 4 deletions(-) create mode 100644 inst/scripts/ablation_study.R create mode 100644 inst/scripts/compare_qs_general.R create mode 100644 inst/scripts/create_cosmx_rds.R create mode 100644 inst/scripts/run_qs_general.R create mode 100644 inst/scripts/transfer_coeffs_cosmx_cosmx.R rename inst/scripts/{transfer_coeffs.R => transfer_coeffs_cosmx_xenium.R} (56%) create mode 100644 inst/scripts/transfer_coeffs_xenium_xenium.R diff --git a/R/plotUtils.R b/R/plotUtils.R index 21d6a8c..9fe56bd 100644 --- a/R/plotUtils.R +++ b/R/plotUtils.R @@ -200,3 +200,57 @@ firstFlagPalette <- c( "> logged aspect ratio higher thr." = "greenyellow" ) +# To be decided if this should be moved to a separate file or included in the main package +# plot_qc_score_comparison <- function(x, y, x_label="x", y_label="y", +# title="QC score comparison", threshold=0.75, +# cor_method="spearman", output_file=NULL) { + +# stopifnot(length(x) == length(y)) + +# df <- data.frame(x=x, y=y) + +# df$quadrant <- with(df, ifelse( +# x < threshold & y < threshold, "low / low", +# ifelse(x >= threshold & y < threshold, "high x / low y", +# ifelse(x < threshold & y >= threshold, "low x / high y", +# "high / high")) +# )) + +# cor_val <- cor(df$x, df$y, use="complete.obs", method=cor_method) + +# p <- ggplot(df, aes(x=x, y=y, color=quadrant)) + +# geom_point(alpha=0.35, size=0.6) + +# geom_abline(slope=1, intercept=0, linetype="dashed", color="red") + +# geom_hline(yintercept=threshold) + +# geom_vline(xintercept=threshold) + +# scale_color_manual(values=c( +# "low / low"="grey60", +# "high x / low y"="orange", +# "low x / high y"="dodgerblue", +# "high / high"="black" +# )) + +# labs(x=x_label, y=y_label, title=title, color="Quadrant") + +# theme_minimal() + +# annotate( +# "text", +# x=quantile(df$x, 0.99, na.rm=TRUE), +# y=quantile(df$y, 0.01, na.rm=TRUE), +# label=paste0(cor_method, " r = ", +# formatC(cor_val, digits=3, format="f")), +# hjust=1, +# vjust=0, +# color="black" +# ) + +# if (!is.null(output_file)) { +# ggsave(output_file, p, device="pdf", +# width=11.69, height=8.27, units="in") +# } + +# return(list( +# plot=p, +# correlation=cor_val, +# quadrant_table=table(df$quadrant) +# )) +# } + diff --git a/inst/scripts/ablation_study.R b/inst/scripts/ablation_study.R new file mode 100644 index 0000000..9cd3777 --- /dev/null +++ b/inst/scripts/ablation_study.R @@ -0,0 +1,484 @@ +## ============================================================ +## 0. Libraries and output directory +## ============================================================ + +library(SpaceTrooper) +library(ggplot2) + +out_dir <- "~/SpaceTrooper_qs_check" + +if (!dir.exists(out_dir)) { + dir.create(out_dir, recursive=TRUE) +} + + +## ============================================================ +## 1. Define ablation formulas +## ============================================================ +## The full model contains all Quality Score (QS) components. +## Each ablated model removes one component at a time. + +formulas <- list( + full="~(log2SignalDensity + Area_um + log2AspectRatio + log2Ctrl_total_ratio)^2", + + no_log2SignalDensity= + "~(Area_um + log2AspectRatio + log2Ctrl_total_ratio)^2", + + no_Area_um= + "~(log2SignalDensity + log2AspectRatio + log2Ctrl_total_ratio)^2", + + no_log2AspectRatio= + "~(log2SignalDensity + Area_um + log2Ctrl_total_ratio)^2", + + no_log2Ctrl_total_ratio= + "~(log2SignalDensity + Area_um + log2AspectRatio)^2" +) + + +## ============================================================ +## 2. Function to run QS ablation models +## ============================================================ +## For each formula, computeQCScore() is run with a forced model formula. +## The output stores: +## - the computed QS vector +## - the trained QS model stored in metadata(spe)$QCScore_model + +run_qs_ablation <- function(spe, formulas, verbose=FALSE) { + out <- list() + + for (nm in names(formulas)) { + message("Running model: ", nm) + + spe_i <- computeQCScore( + spe=spe, + modelFormula=formulas[[nm]], + verbose=verbose + ) + + out[[nm]] <- list( + spe=spe_i, + score=spe_i$QC_score, + model=metadata(spe_i)$QCScore_model + ) + } + + return(out) +} + + +## ============================================================ +## 3. Read and preprocess CosMx data +## ============================================================ +## The object is read, then per-cell QC metrics are computed. +## These metrics are required by computeQCScore(). + +cosm_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" + +specosm <- readCosmxSPE(cosm_path) + +specosm <- spatialPerCellQC(specosm) + + +## ============================================================ +## 4. Run ablation study on CosMx +## ============================================================ +## This computes the full QS and one QS for each ablated formula. + +set.seed(1998) + +abl_cosmx <- run_qs_ablation( + spe=specosm, + formulas=formulas, + verbose=TRUE +) + + +## ============================================================ +## 5. Functions to summarize ablation results +## ============================================================ +## bottom_overlap() compares the low-QS tails of two score vectors. +## By default q=0.10 means the bottom 10% of cells. +## +## summarize_qs_ablation() compares every ablated QS against the full QS +## using: +## - Pearson correlation +## - Spearman correlation +## - absolute score differences +## - overlap of bottom-tail low-quality cells + +bottom_overlap <- function(x, y, q=0.10) { + x_bad <- x <= quantile(x, probs=q, na.rm=TRUE) + y_bad <- y <= quantile(y, probs=q, na.rm=TRUE) + + overlap <- sum(x_bad & y_bad, na.rm=TRUE) + union <- sum(x_bad | y_bad, na.rm=TRUE) + + data.frame( + quantile=q, + overlap=overlap, + union=union, + jaccard=overlap / union, + prop_ref_recovered=overlap / sum(x_bad, na.rm=TRUE) + ) +} + + +summarize_qs_ablation <- function(ablation_results, reference="full", q=0.10) { + ref_score <- ablation_results[[reference]]$score + + res <- lapply(names(ablation_results), function(nm) { + score <- ablation_results[[nm]]$score + + ov <- bottom_overlap(ref_score, score, q=q) + + data.frame( + model=nm, + pearson=cor( + ref_score, + score, + use="complete.obs", + method="pearson" + ), + spearman=cor( + ref_score, + score, + use="complete.obs", + method="spearman" + ), + mean_abs_diff=mean(abs(ref_score - score), na.rm=TRUE), + median_abs_diff=median(abs(ref_score - score), na.rm=TRUE), + max_abs_diff=max(abs(ref_score - score), na.rm=TRUE), + bottom_jaccard=ov$jaccard, + bottom_ref_recovered=ov$prop_ref_recovered, + n_na=sum(is.na(score)), + stringsAsFactors=FALSE + ) + }) + + do.call(rbind, res) +} + + +## ============================================================ +## 6. Build ablation summary table +## ============================================================ + +tab_cosmx <- summarize_qs_ablation( + ablation_results=abl_cosmx, + reference="full", + q=0.10 +) + +tab_cosmx + + +## ============================================================ +## 7. Pairwise plots: full QS vs each ablated QS +## ============================================================ +## This assumes plot_qc_score_comparison() is already available. +## Each plot is also saved as an A4 landscape PDF. + +ablation_names <- setdiff(names(abl_cosmx), "full") + +plots_cosmx <- lapply(ablation_names, function(nm) { + plot_qc_score_comparison( + x=abl_cosmx$full$score, + y=abl_cosmx[[nm]]$score, + x_label="Full QS", + y_label=paste0("QS: ", nm), + title=paste0("CosMx: full QS vs ", nm), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("cosmx_full_vs_", nm, "_A4_landscape.pdf") + ) + ) +}) + +names(plots_cosmx) <- ablation_names + +## Print plots +lapply(plots_cosmx, function(x) print(x$plot)) + + +## ============================================================ +## 8. Heatmap: complete ablation summary +## ============================================================ +## This heatmap includes correlations, score differences, and bottom-tail +## overlap metrics. + +heat_df <- tab_cosmx[tab_cosmx$model != "full", ] + +heat_df <- heat_df[, c( + "model", + "pearson", + "spearman", + "mean_abs_diff", + "median_abs_diff", + "bottom_jaccard", + "bottom_ref_recovered" +)] + +heat_long <- reshape( + heat_df, + varying=names(heat_df)[names(heat_df) != "model"], + v.names="value", + timevar="metric", + times=names(heat_df)[names(heat_df) != "model"], + direction="long" +) + +rownames(heat_long) <- NULL + +p_heat <- ggplot( + heat_long, + aes(x=metric, y=model, fill=value) +) + + geom_tile() + + geom_text( + aes(label=formatC(value, digits=3, format="f")), + color="white", + size=3 + ) + + labs( + title="CosMx QS ablation summary", + x="Metric", + y="Ablation model", + fill="Value" + ) + + theme_minimal() + + theme( + axis.text.x=element_text(angle=45, hjust=1) + ) + +p_heat + +ggsave( + filename=file.path( + out_dir, + "cosmx_ablation_summary_heatmap_A4_landscape.pdf" + ), + plot=p_heat, + device="pdf", + width=11.69, + height=8.27, + units="in" +) + + +## ============================================================ +## 9. Heatmap: correlation-only summary +## ============================================================ +## Cleaner figure showing only Pearson and Spearman correlations +## between the full QS and each ablated QS. + +cor_heat_df <- tab_cosmx[ + tab_cosmx$model != "full", + c("model", "pearson", "spearman") +] + +cor_heat_long <- reshape( + cor_heat_df, + varying=c("pearson", "spearman"), + v.names="correlation", + timevar="metric", + times=c("pearson", "spearman"), + direction="long" +) + +rownames(cor_heat_long) <- NULL + +p_cor_heat <- ggplot( + cor_heat_long, + aes(x=metric, y=model, fill=correlation) +) + + geom_tile() + + geom_text( + aes(label=formatC(correlation, digits=3, format="f")), + color="white", + size=4 + ) + + labs( + title="CosMx QS ablation correlation with full model", + x="Correlation metric", + y="Ablation model", + fill="Correlation" + ) + + theme_minimal() + +p_cor_heat + +ggsave( + filename=file.path( + out_dir, + "cosmx_ablation_correlation_heatmap.pdf" + ), + plot=p_cor_heat, + device="pdf", + width=8, + height=5, + units="in" +) + + +## ============================================================ +## 10. Optional: save tabular results +## ============================================================ + +write.csv( + tab_cosmx, + file=file.path(out_dir, "cosmx_ablation_summary_table.csv"), + row.names=FALSE +) + + +plot_ablation_colored <- function(ablation_results, ablation_name, + reference="full", color_col=NULL, + x_label="Full QS", y_label=NULL, title=NULL, + cor_method="spearman", output_file=NULL) { + + stopifnot(reference %in% names(ablation_results)) + stopifnot(ablation_name %in% names(ablation_results)) + + ref_score <- ablation_results[[reference]]$score + abl_score <- ablation_results[[ablation_name]]$score + + stopifnot(length(ref_score) == length(abl_score)) + + spe_ref <- ablation_results[[reference]]$spe + cd <- as.data.frame(colData(spe_ref)) + + df <- data.frame( + full_qs=ref_score, + ablated_qs=abl_score + ) + + if (!is.null(color_col)) { + stopifnot(color_col %in% colnames(cd)) + df$color_var <- cd[[color_col]] + } + + cor_val <- cor( + df$full_qs, + df$ablated_qs, + use="complete.obs", + method=cor_method + ) + + if (is.null(y_label)) { + y_label <- paste0("QS: ", ablation_name) + } + + if (is.null(title)) { + title <- paste0("Full QS vs ", ablation_name) + } + + if (is.null(color_col)) { + p <- ggplot(df, aes(x=full_qs, y=ablated_qs)) + + geom_point(alpha=0.35, size=0.6) + } else { + p <- ggplot(df, aes(x=full_qs, y=ablated_qs, color=color_var)) + + geom_point(alpha=0.35, size=0.6) + } + + p <- p + + geom_abline( + slope=1, + intercept=0, + linetype="dashed", + color="red" + ) + + labs( + x=x_label, + y=y_label, + title=title, + color=color_col + ) + + theme_minimal() + + annotate( + "text", + x=quantile(df$full_qs, 0.99, na.rm=TRUE), + y=quantile(df$ablated_qs, 0.01, na.rm=TRUE), + label=paste0( + cor_method, + " r = ", + formatC(cor_val, digits=3, format="f") + ), + hjust=1, + vjust=0, + color="black" + ) + + if (!is.null(output_file)) { + ggsave( + filename=output_file, + plot=p, + device="pdf", + width=11.69, + height=8.27, + units="in" + ) + } + + return(list( + plot=p, + correlation=cor_val, + data=df + )) +} + +p_no_signal_col <- plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2SignalDensity", + color_col="log2Ctrl_total_ratio", + title="CosMx: full QS vs QS without log2SignalDensity", + y_label="QS without log2SignalDensity", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_log2Ctrl_total_ratiocolored.pdf" +) +p_no_signal_col_area <- plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2SignalDensity", + color_col="Area_um", + title="CosMx: full QS vs QS without log2SignalDensity", + y_label="QS without log2SignalDensity", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_Area_um_colored.pdf" +) + +p_no_signal_col_aspect <- plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2SignalDensity", + color_col="log2AspectRatio", + title="CosMx: full QS vs QS without log2SignalDensity", + y_label="QS without log2SignalDensity", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_log2AspectRatio_colored.pdf" +) + +p_no_signal_col_aspect$plot +p_no_signal_col$plot + +plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2Ctrl_total_ratio", + color_col="log2SignalDensity", + title="CosMx: full QS vs QS without log2Ctrl_total_ratio", + y_label="QS without log2Ctrl_total_ratio", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2ctrl_total_ratio_log2SignalDensitycolored.pdf" +)$plot + +plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2Ctrl_total_ratio", + color_col="Area_um", + title="CosMx: full QS vs QS without log2Ctrl_total_ratio", + y_label="QS without log2Ctrl_total_ratio", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2Ctrl_total_ratio_Area_um_colored.pdf" +)$plot +plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2Ctrl_total_ratio", + color_col="log2AspectRatio", + title="CosMx: full QS vs QS without log2Ctrl_total_ratio", + y_label="QS without log2Ctrl_total_ratio", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2Ctrl_total_ratio_log2AspectRatio_colored.pdf" +)$plot diff --git a/inst/scripts/compare_qs_general.R b/inst/scripts/compare_qs_general.R new file mode 100644 index 0000000..60dd790 --- /dev/null +++ b/inst/scripts/compare_qs_general.R @@ -0,0 +1,79 @@ +args <- commandArgs(trailingOnly=TRUE) + +if (length(args) != 2) { + stop("Usage: Rscript compare_qs_general.R before.rds current.rds") +} + +before <- readRDS(args[[1]]) +current <- readRDS(args[[2]]) + +qs_before <- before$quality_score +qs_current <- current$quality_score + +cat("BEFORE\n") +cat("Branch:", before$git_branch, "\n") +cat("Commit:", before$git_hash, "\n") +cat("QScore column:", before$qscore_column, "\n") +cat("Detection:", before$qscore_detection_method, "\n\n") + +cat("CURRENT\n") +cat("Branch:", current$git_branch, "\n") +cat("Commit:", current$git_hash, "\n") +cat("QScore column:", current$qscore_column, "\n") +cat("Detection:", current$qscore_detection_method, "\n\n") + +cat("Lengths:\n") +cat("before:", length(qs_before), "\n") +cat("current:", length(qs_current), "\n\n") + +if (length(qs_before) != length(qs_current)) { + stop("Different number of Quality Score values. Cannot compare directly.") +} + +comparison <- data.frame( + index=seq_along(qs_before), + quality_score_before=qs_before, + quality_score_current=qs_current, + difference=qs_current - qs_before +) + +cat("Summary before:\n") +print(summary(qs_before)) + +cat("\nSummary current:\n") +print(summary(qs_current)) + +cat("\nSummary difference current - before:\n") +print(summary(comparison$difference)) + +cat("\nall.equal:\n") +print(all.equal(qs_before, qs_current)) + +cat("\nExactly different values:\n") +print(sum(qs_before != qs_current, na.rm=TRUE)) + +cat("\nDifferent above tolerance 1e-8:\n") +print(sum(abs(qs_current - qs_before) > 1e-8, na.rm=TRUE)) + +cat("\nDifferent above tolerance 1e-6:\n") +print(sum(abs(qs_current - qs_before) > 1e-6, na.rm=TRUE)) + +cat("\nDifferent above tolerance 1e-4:\n") +print(sum(abs(qs_current - qs_before) > 1e-4, na.rm=TRUE)) + +out_csv <- file.path( + dirname(args[[2]]), + "quality_score_comparison.csv" +) + +out_rds <- file.path( + dirname(args[[2]]), + "quality_score_comparison.rds" +) + +write.csv(comparison, out_csv, row.names=FALSE) +saveRDS(comparison, out_rds) + +cat("\nSaved:\n") +cat(out_csv, "\n") +cat(out_rds, "\n") diff --git a/inst/scripts/create_cosmx_rds.R b/inst/scripts/create_cosmx_rds.R new file mode 100644 index 0000000..152d9f8 --- /dev/null +++ b/inst/scripts/create_cosmx_rds.R @@ -0,0 +1,39 @@ +suppressPackageStartupMessages({ + library(devtools) + library(SpatialExperiment) + library(SummarizedExperiment) +}) + + +input_dir <- "/Users/inzirio/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2" + +output_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_spe_raw.rds" +output_rds <- path.expand(output_rds) + + +library(SpaceTrooper) + +if (!dir.exists(input_dir)) { + stop("Input directory does not exist: ", input_dir) +} + +message("Reading CosMx dataset from:") +message(input_dir) + +spe <- readCosmxSPE(input_dir) + +message("Object class:") +print(class(spe)) + +message("Object dimensions:") +print(dim(spe)) + +message("colData columns:") +print(colnames(SummarizedExperiment::colData(spe))) + +message("Saving RDS to:") +message(output_rds) + +saveRDS(spe, output_rds) + +message("Done.") diff --git a/inst/scripts/run_qs_general.R b/inst/scripts/run_qs_general.R new file mode 100644 index 0000000..607cf52 --- /dev/null +++ b/inst/scripts/run_qs_general.R @@ -0,0 +1,275 @@ + +package_path <- "/Users/inzirio/My Drive/works/coding/SpaceTrooper" +input_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_spe_raw.rds" +output_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_qs_result_interaction.rds" + + +message("Package path: ", package_path) +message("Input RDS: ", input_rds) +message("Output RDS: ", output_rds) + +library(devtools) +library(SpatialExperiment) + +devtools::load_all(package_path, quiet=FALSE) + + +obj <- readRDS(input_rds) +metadata(obj)$formula_variables <- c(log2CountArea="log2CountArea", + log2AspectRatio="log2AspectRatio") +obj <- spatialPerCellQC(obj) + +metadata(obj) +obj <- computeQCScore(obj) + +qc_nointeract <- obj$QC_score +qc_interact <- obj$QC_score +saveRDS(qc_nointeract, "~/SpaceTrooper_qs_check/qc_nointeract.rds") +saveRDS(qc_interact, "~/SpaceTrooper_qs_check/qc_interact.rds") + +library(ggplot2) + +# assume qc_nointeract and qc_interact exist in the environment +df <- data.frame(qc_nointeract = qc_nointeract, qc_interact = qc_interact) + +# correlation +cor_val <- cor(df$qc_nointeract, df$qc_interact, use = "complete.obs", method = "pearson") + +# plot: use 2D binning for large datasets, points otherwise; add y=x line and annotation +plot_qc <- ggplot(df, aes(x = qc_nointeract, y = qc_interact)) + + geom_point(alpha = 0.35, size = 0.6) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + +# scale_fill_gradient(low = "white", high = "steelblue") + + labs( + x = "QC score (no interaction)", + y = "QC score (with interaction)", + title = "QC: no-interaction vs interaction" + ) + + theme_minimal() + + geom_hline(yintercept = 0.75) + + geom_vline(xintercept = 0.75) + + annotate( + "text", + x = quantile(df$qc_nointeract, 0.99, na.rm = TRUE), + y = quantile(df$qc_interact, 0.01, na.rm = TRUE), + label = paste0("r = ", formatC(cor_val, digits = 3, format = "f")), + hjust = 1, vjust = 0 + ) + +# return the plot object +plot_qc + + +library(ggplot2) + +df <- data.frame( + qc_nointeract=qc_nointeract, + qc_interact=qc_interact +) + +threshold <- 0.75 + +df$quadrant <- with( + df, + ifelse( + qc_nointeract < threshold & qc_interact < threshold, + "low / low", + ifelse( + qc_nointeract >= threshold & qc_interact < threshold, + "high no-interaction / low interaction", + ifelse( + qc_nointeract < threshold & qc_interact >= threshold, + "low no-interaction / high interaction", + "high / high" + ) + ) + ) +) + +table(df$quadrant) + +cor_val <- cor( + df$qc_nointeract, + df$qc_interact, + use="complete.obs", + method="pearson" +) + +plot_qc <- ggplot(df, aes(x=qc_nointeract, y=qc_interact, color=quadrant)) + + geom_point(alpha=0.35, size=0.6) + + geom_abline( + slope=1, + intercept=0, + linetype="dashed", + color="red" + ) + + geom_hline(yintercept=threshold) + + geom_vline(xintercept=threshold) + + labs( + x="QC score (no interaction)", + y="QC score (with interaction)", + title="QC: no-interaction vs interaction", + color="Quadrant" + ) + + theme_minimal() + + annotate( + "text", + x=quantile(df$qc_nointeract, 0.99, na.rm=TRUE), + y=quantile(df$qc_interact, 0.01, na.rm=TRUE), + label=paste0("r = ", formatC(cor_val, digits=3, format="f")), + hjust=1, + vjust=0, + color="black" + ) + +plot_qc + +plot_qc <- plot_qc + + scale_color_manual( + values=c( + "low / low"="grey60", + "high no-interaction / low interaction"="orange", + "low no-interaction / high interaction"="dodgerblue", + "high / high"="black" + ) + ) + +ggsave( + filename="~/SpaceTrooper_qs_check/qc_nointeraction_vs_interaction_A4_landscape.pdf", + plot=plot_qc, + device="pdf", + width=11.69, + height=8.27, + units="in" +) +# get_coldata_df <- function(x) { +# cd <- SummarizedExperiment::colData(x) +# as.data.frame(cd) +# } + +# get_qscore_from_coldata <- function(x) { +# cd <- get_coldata_df(x) + +# candidate_names <- c( +# "quality_score", +# "QualityScore", +# "qualityScore", +# "qscore", +# "QScore", +# "Q_score", +# "q_score", +# "score", +# "flag_score", +# "qc_score", +# "QC_score", +# "QCScore", +# "cell_quality_score", +# "spatial_quality_score" +# ) + +# exact_hits <- intersect(candidate_names, colnames(cd)) + +# if (length(exact_hits) > 0) { +# selected <- exact_hits[1] +# return(list( +# values=cd[[selected]], +# column=selected, +# method="exact_name_match", +# all_candidates=exact_hits, +# coldata=cd +# )) +# } + +# pattern_hits <- grep( +# "quality|qscore|q_score|qc_score|flag_score|score", +# colnames(cd), +# ignore.case=TRUE, +# value=TRUE +# ) + +# numeric_pattern_hits <- pattern_hits[ +# vapply(cd[pattern_hits], is.numeric, logical(1)) +# ] + +# if (length(numeric_pattern_hits) > 0) { +# selected <- numeric_pattern_hits[1] +# return(list( +# values=cd[[selected]], +# column=selected, +# method="pattern_numeric_match", +# all_candidates=numeric_pattern_hits, +# coldata=cd +# )) +# } + +# stop( +# "Could not automatically identify Quality Score column.\n", +# "Available colData columns are:\n", +# paste(colnames(cd), collapse=", ") +# ) +# } + +# call_function_if_exists <- function(fun_name, x) { +# if (!exists(fun_name, mode="function")) { +# stop("Function not found in loaded SpaceTrooper branch: ", fun_name) +# } + +# fun <- get(fun_name, mode="function") +# fun(x) +# } + +# obj_before <- obj + +# if (run_spatial_qc) { +# message("Running spatial QC function...") +# obj <- call_function_if_exists(spatial_qc_fun, obj) +# } + +# message("Running Quality Score function...") +# obj_after <- call_function_if_exists(qscore_fun, obj) + +# message("Extracting Quality Score...") +# qs_info <- get_qscore_from_coldata(obj_after) + +# qs <- qs_info$values + +# if (!is.numeric(qs)) { +# warning("Detected Quality Score column is not numeric: ", qs_info$column) +# } + +# out <- list( +# git_branch=git_branch, +# git_hash=git_hash, +# package_path=normalizePath(package_path), +# input_rds=normalizePath(input_rds), +# spatial_qc_fun=spatial_qc_fun, +# qscore_fun=qscore_fun, +# run_spatial_qc=run_spatial_qc, +# qscore_column=qs_info$column, +# qscore_detection_method=qs_info$method, +# qscore_candidates=qs_info$all_candidates, +# n=length(qs), +# object_class=class(obj_after), +# object_dim=tryCatch(dim(obj_after), error=function(e) NULL), +# object_colnames=tryCatch(colnames(obj_after), error=function(e) NULL), +# quality_score=qs, +# quality_score_summary=summary(qs), +# colData=qs_info$coldata, +# session_info=sessionInfo() +# ) + +# saveRDS(out, output_rds) + +# message("Saved result to: ", output_rds) +# message("Detected Quality Score column: ", qs_info$column) +# message("Detection method: ", qs_info$method) +# message("Number of values: ", length(qs)) +# print(summary(qs)) + + +# computeQCScore(obj) + +# # model_formula <- paste0("~ log2CountArea + I(abs(log2AspectRatio) ", +# # "* as.numeric(dist_border<50)) + ", +# # " log2CountArea:I(abs(log2AspectRatio)", +# # "* as.numeric(dist_border<50))") #for cosmx diff --git a/inst/scripts/transfer_coeffs_cosmx_cosmx.R b/inst/scripts/transfer_coeffs_cosmx_cosmx.R new file mode 100644 index 0000000..d6d0aea --- /dev/null +++ b/inst/scripts/transfer_coeffs_cosmx_cosmx.R @@ -0,0 +1,730 @@ +## ============================================================ +## 0. Libraries and output directory +## ============================================================ + +devtools::load_all() +library(ggplot2) + +out_dir <- "~/SpaceTrooper_qs_check" + +set.seed(1998) + + +## ============================================================ +## 1. Define common CosMx model formula +## ============================================================ +## Since both datasets are CosMx, we can use the CosMx-specific formula, +## including the border-adjusted aspect ratio term and control ratio. + +cosmx_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "I(abs(log2AspectRatio) * as.numeric(dist_border < 50)) + ", + "log2Ctrl_total_ratio", + ")^2" +) + + +## ============================================================ +## 2. Read and preprocess CosMx Breast dataset +## ============================================================ + +cosmx_breast_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" + +specosm_breast <- readCosmxSPE(cosmx_breast_path) + +specosm_breast <- spatialPerCellQC(specosm_breast) + + +## ============================================================ +## 3. Read and preprocess CosMx Pancreas dataset +## ============================================================ + +cosmx_pancreas_path <- "/Users/inzirio/Downloads/CosMx_data/Pancreas-CosMx-WTx" + +specosm_pancreas <- readCosmxSPE(cosmx_pancreas_path) + +specosm_pancreas <- spatialPerCellQC(specosm_pancreas) + + +## ============================================================ +## 4. Train native QS models on both datasets +## ============================================================ + +specosm_breast <- computeQCScore( + spe=specosm_breast, + modelFormula=cosmx_formula, + verbose=TRUE +) + +specosm_pancreas <- computeQCScore( + spe=specosm_pancreas, + modelFormula=cosmx_formula, + verbose=TRUE +) + + +## ============================================================ +## 5. Store native QS scores and trained models +## ============================================================ + +specosm_breast$QC_score_breast_native <- specosm_breast$QC_score +specosm_pancreas$QC_score_pancreas_native <- specosm_pancreas$QC_score + +breast_model <- metadata(specosm_breast)$QCScore_model +pancreas_model <- metadata(specosm_pancreas)$QCScore_model + + +## ============================================================ +## 6. Apply transferred models +## ============================================================ +## Apply Pancreas-trained model to Breast. +## Apply Breast-trained model to Pancreas. + +specosm_breast <- applyQCScoreModel( + spe=specosm_breast, + qcModel=pancreas_model, + scoreName="QC_score_pancreas_model" +) + +specosm_pancreas <- applyQCScoreModel( + spe=specosm_pancreas, + qcModel=breast_model, + scoreName="QC_score_breast_model" +) + + +## ============================================================ +## 7. Basic checks +## ============================================================ + +summary(specosm_breast$QC_score_breast_native) +summary(specosm_breast$QC_score_pancreas_model) + +summary(specosm_pancreas$QC_score_pancreas_native) +summary(specosm_pancreas$QC_score_breast_model) + +range(specosm_breast$QC_score_breast_native, na.rm=TRUE) +range(specosm_breast$QC_score_pancreas_model, na.rm=TRUE) + +range(specosm_pancreas$QC_score_pancreas_native, na.rm=TRUE) +range(specosm_pancreas$QC_score_breast_model, na.rm=TRUE) + +sum(is.na(specosm_breast$QC_score_breast_native)) +sum(is.na(specosm_breast$QC_score_pancreas_model)) + +sum(is.na(specosm_pancreas$QC_score_pancreas_native)) +sum(is.na(specosm_pancreas$QC_score_breast_model)) + + +## ============================================================ +## 8. Correlations: native vs transferred +## ============================================================ + +cor_breast_spearman <- cor( + specosm_breast$QC_score_breast_native, + specosm_breast$QC_score_pancreas_model, + use="complete.obs", + method="spearman" +) + +cor_breast_pearson <- cor( + specosm_breast$QC_score_breast_native, + specosm_breast$QC_score_pancreas_model, + use="complete.obs", + method="pearson" +) + +cor_pancreas_spearman <- cor( + specosm_pancreas$QC_score_pancreas_native, + specosm_pancreas$QC_score_breast_model, + use="complete.obs", + method="spearman" +) + +cor_pancreas_pearson <- cor( + specosm_pancreas$QC_score_pancreas_native, + specosm_pancreas$QC_score_breast_model, + use="complete.obs", + method="pearson" +) + +cor_breast_spearman +cor_breast_pearson + +cor_pancreas_spearman +cor_pancreas_pearson + + +## ============================================================ +## 9. Plot: Breast native QS vs Pancreas-trained QS +## ============================================================ + +res_breast <- plot_qc_score_comparison( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_breast_native_vs_pancreas_model_A4_landscape.pdf" + ) +) + +res_breast$plot +res_breast$correlation +res_breast$quadrant_table + + +## ============================================================ +## 10. Plot: Pancreas native QS vs Breast-trained QS +## ============================================================ + +res_pancreas <- plot_qc_score_comparison( + x=specosm_pancreas$QC_score_pancreas_native, + y=specosm_pancreas$QC_score_breast_model, + x_label="Pancreas native QS", + y_label="Breast-trained QS", + title="CosMx Pancreas: native QS vs Breast-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_pancreas_native_vs_breast_model_A4_landscape.pdf" + ) +) + +res_pancreas$plot +res_pancreas$correlation +res_pancreas$quadrant_table + + +## ============================================================ +## 11. Compare model formulas and coefficients +## ============================================================ + +metadata(specosm_breast)$QCScore_model$model_formula +metadata(specosm_pancreas)$QCScore_model$model_formula + +metadata(specosm_breast)$QCScore_model$model_matrix_colnames +metadata(specosm_pancreas)$QCScore_model$model_matrix_colnames + +metadata(specosm_breast)$QCScore_model$coefficients_table +metadata(specosm_pancreas)$QCScore_model$coefficients_table + + +## ============================================================ +## 12. Save summary table +## ============================================================ + +cosmx_transfer_summary <- data.frame( + comparison=c( + "Breast native vs Pancreas-trained", + "Pancreas native vs Breast-trained" + ), + spearman=c( + cor_breast_spearman, + cor_pancreas_spearman + ), + pearson=c( + cor_breast_pearson, + cor_pancreas_pearson + ), + stringsAsFactors=FALSE +) + +cosmx_transfer_summary + +write.csv( + cosmx_transfer_summary, + file=file.path(out_dir, "cosmx_breast_pancreas_transfer_summary.csv"), + row.names=FALSE +) + +plot_qc_score_coldata <- function(x, y, spe, color_col, + x_label="x", y_label="y", + title="QC score comparison", + threshold=0.75, + cor_method="spearman", + output_file=NULL) { + + stopifnot(length(x) == length(y)) + stopifnot(ncol(spe) == length(x)) + stopifnot(color_col %in% colnames(colData(spe))) + + df <- data.frame( + x=x, + y=y, + color_var=as.data.frame(colData(spe))[[color_col]] + ) + + cor_val <- cor( + df$x, + df$y, + use="complete.obs", + method=cor_method + ) + + p <- ggplot(df, aes(x=x, y=y, color=color_var)) + + geom_point(alpha=0.35, size=0.6) + + geom_abline( + slope=1, + intercept=0, + linetype="dashed", + color="red" + ) + + geom_hline(yintercept=threshold) + + geom_vline(xintercept=threshold) + + labs( + x=x_label, + y=y_label, + title=title, + color=color_col + ) + + theme_minimal() + + annotate( + "text", + x=quantile(df$x, 0.99, na.rm=TRUE), + y=quantile(df$y, 0.01, na.rm=TRUE), + label=paste0( + cor_method, + " r = ", + formatC(cor_val, digits=3, format="f") + ), + hjust=1, + vjust=0, + color="black" + ) + + if (!is.null(output_file)) { + ggsave( + filename=output_file, + plot=p, + device="pdf", + width=11.69, + height=8.27, + units="in" + ) + } + + return(list( + plot=p, + correlation=cor_val, + data=df + )) +} + + +res_breast_signal <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="log2SignalDensity", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2SignalDensity.pdf" +) + +res_breast_signal$plot + +res_breast_area <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="Area_um", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_Area_um.pdf" +) + +res_breast_area$plot + +res_breast_ctrl <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="log2Ctrl_total_ratio", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2Ctrl_total_ratio.pdf" +) + +res_breast_ctrl$plot + +res_breast_aspect <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="log2AspectRatio", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2AspectRatio.pdf" +) + +res_breast_aspect$plot + +SpaceTrooper::plotMetricHist(spe = specosm_breast, metric="log2Ctrl_total_ratio") +SpaceTrooper::plotMetricHist(spe = specosm_pancreas, metric="log2Ctrl_total_ratio") + +############################################################ +specosm_mb1 <- readCosmxSPE("/Users/inzirio/Downloads/CosMx_data/CosMx1k_MouseBrain1") +specosm_mb2 <- readCosmxSPE("/Users/inzirio/Downloads/CosMx_data/CosMx1k_MouseBrain2") + + +specosm_mb1 <- spatialPerCellQC(specosm_mb1) +specosm_mb2 <- spatialPerCellQC(specosm_mb2) + +cosmx_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "I(abs(log2AspectRatio) * as.numeric(dist_border < 50)) + ", + "log2Ctrl_total_ratio", + ")^2" +) + +specosm_mb1 <- computeQCScore( + spe=specosm_mb1, + modelFormula=cosmx_formula, + verbose=TRUE +) + +specosm_mb2 <- computeQCScore( + spe=specosm_mb2, + modelFormula=cosmx_formula, + verbose=TRUE +) + +specosm_mb1$QC_score_mb1_native <- specosm_mb1$QC_score +specosm_mb2$QC_score_mb2_native <- specosm_mb2$QC_score + +mb1_model <- metadata(specosm_mb1)$QCScore_model +mb2_model <- metadata(specosm_mb2)$QCScore_model + +specosm_mb1 <- applyQCScoreModel( + spe=specosm_mb1, + qcModel=mb2_model, + scoreName="QC_score_mb2_model" +) + +specosm_mb2 <- applyQCScoreModel( + spe=specosm_mb2, + qcModel=mb1_model, + scoreName="QC_score_mb1_model" +) + +cor_mb1_spearman <- cor( + specosm_mb1$QC_score_mb1_native, + specosm_mb1$QC_score_mb2_model, + use="complete.obs", + method="spearman" +) + +cor_mb1_pearson <- cor( + specosm_mb1$QC_score_mb1_native, + specosm_mb1$QC_score_mb2_model, + use="complete.obs", + method="pearson" +) + +cor_mb2_spearman <- cor( + specosm_mb2$QC_score_mb2_native, + specosm_mb2$QC_score_mb1_model, + use="complete.obs", + method="spearman" +) + +cor_mb2_pearson <- cor( + specosm_mb2$QC_score_mb2_native, + specosm_mb2$QC_score_mb1_model, + use="complete.obs", + method="pearson" +) + +res_mb1 <- plot_qc_score_comparison( + x=specosm_mb1$QC_score_mb1_native, + y=specosm_mb1$QC_score_mb2_model, + x_label="MouseBrain1 native QS", + y_label="MouseBrain2-trained QS", + title="CosMx MouseBrain1: native QS vs MouseBrain2-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_mousebrain1_native_vs_mousebrain2_model_A4_landscape.pdf" + ) +) + +res_mb2 <- plot_qc_score_comparison( + x=specosm_mb2$QC_score_mb2_native, + y=specosm_mb2$QC_score_mb1_model, + x_label="MouseBrain2 native QS", + y_label="MouseBrain1-trained QS", + title="CosMx MouseBrain2: native QS vs MouseBrain1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_mousebrain2_native_vs_mousebrain1_model_A4_landscape.pdf" + ) +) + +res_mb1$plot +res_mb2$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +mb1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=specosm_mb1$QC_score_mb1_native, + y=specosm_mb1$QC_score_mb2_model, + spe=specosm_mb1, + color_col=v, + x_label="MouseBrain1 native QS", + y_label="MouseBrain2-trained QS", + title=paste0("MouseBrain1 native vs MouseBrain2-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("cosmx_mousebrain1_native_vs_mousebrain2_model_", v, ".pdf") + ) + ) +}) +names(mb1_colored_plots) <- color_vars + +mb2_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=specosm_mb2$QC_score_mb2_native, + y=specosm_mb2$QC_score_mb1_model, + spe=specosm_mb2, + color_col=v, + x_label="MouseBrain2 native QS", + y_label="MouseBrain1-trained QS", + title=paste0("MouseBrain2 native vs MouseBrain1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("cosmx_mousebrain2_native_vs_mousebrain1_model_", v, ".pdf") + ) + ) +}) +names(mb2_colored_plots) <- color_vars +mb2_colored_plots +cosmx_mousebrain_transfer_summary <- data.frame( + comparison=c( + "MouseBrain1 native vs MouseBrain2-trained", + "MouseBrain2 native vs MouseBrain1-trained" + ), + spearman=c(cor_mb1_spearman, cor_mb2_spearman), + pearson=c(cor_mb1_pearson, cor_mb2_pearson), + stringsAsFactors=FALSE +) + +cosmx_mousebrain_transfer_summary + +write.csv( + cosmx_mousebrain_transfer_summary, + file=file.path(out_dir, "cosmx_mousebrain1_mousebrain2_transfer_summary.csv"), + row.names=FALSE +) + +metadata(specosm_mb1)$QCScore_model$model_matrix_colnames +metadata(specosm_mb2)$QCScore_model$model_matrix_colnames + +#################### + +spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") +spexen_bc2 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast2_Janesick") + + +spexen_bc1 <- spatialPerCellQC(spexen_bc1) +spexen_bc2 <- spatialPerCellQC(spexen_bc2) + +xenium_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "log2AspectRatio + ", + "log2Ctrl_total_ratio", + ")^2" +) + +spexen_bc1 <- computeQCScore( + spe=spexen_bc1, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc2 <- computeQCScore( + spe=spexen_bc2, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score +spexen_bc2$QC_score_bc2_native <- spexen_bc2$QC_score + +bc1_model <- metadata(spexen_bc1)$QCScore_model +bc2_model <- metadata(spexen_bc2)$QCScore_model + +spexen_bc1 <- applyQCScoreModel( + spe=spexen_bc1, + qcModel=bc2_model, + scoreName="QC_score_bc2_model" +) + +spexen_bc2 <- applyQCScoreModel( + spe=spexen_bc2, + qcModel=bc1_model, + scoreName="QC_score_bc1_model" +) + +cor_bc1_spearman <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="spearman" +) + +cor_bc1_pearson <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="pearson" +) + +cor_bc2_spearman <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="spearman" +) + +cor_bc2_pearson <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="pearson" +) + +res_bc1 <- plot_qc_score_comparison( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title="Xenium Breast Cancer 1: native QS vs Breast Cancer 2-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast1_native_vs_breast2_model_A4_landscape.pdf" + ) +) + +res_bc2 <- plot_qc_score_comparison( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title="Xenium Breast Cancer 2: native QS vs Breast Cancer 1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast2_native_vs_breast1_model_A4_landscape.pdf" + ) +) + +res_bc1$plot +res_bc2$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +bc1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + spe=spexen_bc1, + color_col=v, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title=paste0("Breast Cancer 1 native vs Breast Cancer 2-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast1_native_vs_breast2_model_", v, ".pdf") + ) + ) +}) +names(bc1_colored_plots) <- color_vars + +bc2_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + spe=spexen_bc2, + color_col=v, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title=paste0("Breast Cancer 2 native vs Breast Cancer 1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast2_native_vs_breast1_model_", v, ".pdf") + ) + ) +}) +names(bc2_colored_plots) <- color_vars + +xenium_breast_transfer_summary <- data.frame( + comparison=c( + "Breast Cancer 1 native vs Breast Cancer 2-trained", + "Breast Cancer 2 native vs Breast Cancer 1-trained" + ), + spearman=c(cor_bc1_spearman, cor_bc2_spearman), + pearson=c(cor_bc1_pearson, cor_bc2_pearson), + stringsAsFactors=FALSE +) + +xenium_breast_transfer_summary + +write.csv( + xenium_breast_transfer_summary, + file=file.path(out_dir, "xenium_breast1_breast2_transfer_summary.csv"), + row.names=FALSE +) + +metadata(spexen_bc1)$QCScore_model$model_matrix_colnames +metadata(spexen_bc2)$QCScore_model$model_matrix_colnames + +######################## diff --git a/inst/scripts/transfer_coeffs.R b/inst/scripts/transfer_coeffs_cosmx_xenium.R similarity index 56% rename from inst/scripts/transfer_coeffs.R rename to inst/scripts/transfer_coeffs_cosmx_xenium.R index 19812b4..ae260f5 100644 --- a/inst/scripts/transfer_coeffs.R +++ b/inst/scripts/transfer_coeffs_cosmx_xenium.R @@ -1,6 +1,6 @@ library(SpaceTrooper) -library(SummarizedExperiment) -library(S4Vectors) +# library(SummarizedExperiment) +# library(S4Vectors) set.seed(1998) @@ -22,7 +22,19 @@ spexen <- computeQCScore( verbose=TRUE ) -xen_model <- metadata(spexen)$QCScore_model +sum(is.na(spexen$QC_score)) +colSums(is.na(as.data.frame(colData(spexen))[ + , c("log2SignalDensity", "Area_um", "log2AspectRatio") +])) + +length(spexen$QC_score) == ncol(spexen) + +sum(is.na(spexen$QC_score)) +summary(spexen$QC_score) + +metadata(spexen)$QCScore_model$model_formula +metadata(spexen)$QCScore_model$model_matrix_colnames +metadata(spexen)$QCScore_model$coefficients_table ## ----------------------------- @@ -41,8 +53,14 @@ specosm <- computeQCScore( verbose=TRUE ) -cosmx_model <- metadata(specosm)$QCScore_model +length(specosm$QC_score) == ncol(specosm) +sum(is.na(specosm$QC_score)) +summary(specosm$QC_score) + +metadata(specosm)$QCScore_model$model_formula +metadata(specosm)$QCScore_model$model_matrix_colnames +metadata(specosm)$QCScore_model$coefficients_table ## ----------------------------- ## Save native scores before transfer @@ -51,6 +69,8 @@ cosmx_model <- metadata(specosm)$QCScore_model spexen$QC_score_xenium_native <- spexen$QC_score specosm$QC_score_cosmx_native <- specosm$QC_score +xen_model <- metadata(spexen)$QCScore_model +cosmx_model <- metadata(specosm)$QCScore_model ## ----------------------------- ## Apply transferred models @@ -124,3 +144,57 @@ print(metadata(spexen)$QCScore_model$coefficients_table) message("CosMx model coefficients:") print(metadata(specosm)$QCScore_model$coefficients_table) + + +## ----------------------------- +## Correlations of scores with model variables +## ----------------------------- +cor( + spexen$QC_score_xenium_native, + spexen$QC_score_cosmx_model, + use="complete.obs", + method="spearman" +) + +cor( + specosm$QC_score_cosmx_native, + specosm$QC_score_xenium_model, + use="complete.obs", + method="spearman" +) + +range(spexen$QC_score_cosmx_model, na.rm=TRUE) +sum(is.na(spexen$QC_score_cosmx_model)) + +range(specosm$QC_score_xenium_model, na.rm=TRUE) +sum(is.na(specosm$QC_score_xenium_model)) + +res_xen <- plot_qc_score_comparison( + x=spexen$QC_score_xenium_native, + y=spexen$QC_score_cosmx_model, + x_label="Native Xenium QC score", + y_label="CosMx-trained QC score", + title="Xenium: native vs CosMx-trained QC score", + cor_method = "pearson", + threshold=0.25, + output_file="~/SpaceTrooper_qs_check/xenium_native_vs_cosmx_model.pdf" +) + +res_xen$plot +res_xen$correlation +res_xen$quadrant_table + +res_cosmx <- plot_qc_score_comparison( + x=specosm$QC_score_cosmx_native, + y=specosm$QC_score_xenium_model, + x_label="Native CosMx QC score", + y_label="Xenium-trained QC score", + title="CosMx: native vs Xenium-trained QC score", + cor_method = "pearson", + threshold=0.75, + output_file="~/SpaceTrooper_qs_check/cosmx_native_vs_xenium_model.pdf" +) + +res_cosmx$plot +res_cosmx$correlation +res_cosmx$quadrant_table diff --git a/inst/scripts/transfer_coeffs_xenium_xenium.R b/inst/scripts/transfer_coeffs_xenium_xenium.R new file mode 100644 index 0000000..7ed919e --- /dev/null +++ b/inst/scripts/transfer_coeffs_xenium_xenium.R @@ -0,0 +1,348 @@ +spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") +spexen_bc2 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast2_Janesick") + + +spexen_bc1 <- spatialPerCellQC(spexen_bc1) +spexen_bc2 <- spatialPerCellQC(spexen_bc2) + +xenium_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "log2AspectRatio + ", + "log2Ctrl_total_ratio", + ")^2" +) + +spexen_bc1 <- computeQCScore( + spe=spexen_bc1, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc2 <- computeQCScore( + spe=spexen_bc2, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score +spexen_bc2$QC_score_bc2_native <- spexen_bc2$QC_score + +bc1_model <- metadata(spexen_bc1)$QCScore_model +bc2_model <- metadata(spexen_bc2)$QCScore_model + +spexen_bc1 <- applyQCScoreModel( + spe=spexen_bc1, + qcModel=bc2_model, + scoreName="QC_score_bc2_model" +) + +spexen_bc2 <- applyQCScoreModel( + spe=spexen_bc2, + qcModel=bc1_model, + scoreName="QC_score_bc1_model" +) + +cor_bc1_spearman <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="spearman" +) + +cor_bc1_pearson <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="pearson" +) + +cor_bc2_spearman <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="spearman" +) + +cor_bc2_pearson <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="pearson" +) + +res_bc1 <- plot_qc_score_comparison( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title="Xenium Breast Cancer 1: native QS vs Breast Cancer 2-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast1_native_vs_breast2_model_A4_landscape.pdf" + ) +) + +res_bc2 <- plot_qc_score_comparison( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title="Xenium Breast Cancer 2: native QS vs Breast Cancer 1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast2_native_vs_breast1_model_A4_landscape.pdf" + ) +) + +res_bc1$plot +res_bc2$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +bc1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + spe=spexen_bc1, + color_col=v, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title=paste0("Breast Cancer 1 native vs Breast Cancer 2-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast1_native_vs_breast2_model_", v, ".pdf") + ) + ) +}) +names(bc1_colored_plots) <- color_vars + +bc2_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + spe=spexen_bc2, + color_col=v, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title=paste0("Breast Cancer 2 native vs Breast Cancer 1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast2_native_vs_breast1_model_", v, ".pdf") + ) + ) +}) +names(bc2_colored_plots) <- color_vars + +xenium_breast_transfer_summary <- data.frame( + comparison=c( + "Breast Cancer 1 native vs Breast Cancer 2-trained", + "Breast Cancer 2 native vs Breast Cancer 1-trained" + ), + spearman=c(cor_bc1_spearman, cor_bc2_spearman), + pearson=c(cor_bc1_pearson, cor_bc2_pearson), + stringsAsFactors=FALSE +) + +xenium_breast_transfer_summary + +write.csv( + xenium_breast_transfer_summary, + file=file.path(out_dir, "xenium_breast1_breast2_transfer_summary.csv"), + row.names=FALSE +) + +metadata(spexen_bc1)$QCScore_model$model_matrix_colnames +metadata(spexen_bc2)$QCScore_model$model_matrix_colnames + +######################## + +dbkx_path <- "~/Downloads/Xenium_data/db_kero_xen" + +spexen <- readXeniumSPE(dbkx_path) +spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") + + + +spexen <- spatialPerCellQC(spexen) +spexen_bc1 <- spatialPerCellQC(spexen_bc1) + +xenium_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "log2AspectRatio + ", + "log2Ctrl_total_ratio", + ")^2" +) + +spexen <- computeQCScore( + spe=spexen, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc1 <- computeQCScore( + spe=spexen_bc1, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen$QC_score_dbk_native <- spexen$QC_score +spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score + +dbk_model <- metadata(spexen)$QCScore_model +bc1_model <- metadata(spexen_bc1)$QCScore_model + +spexen <- applyQCScoreModel( + spe=spexen, + qcModel=bc1_model, + scoreName="QC_score_bc1_model" +) + +spexen_bc1 <- applyQCScoreModel( + spe=spexen_bc1, + qcModel=dbk_model, + scoreName="QC_score_dbk_model" +) + +cor_dbk_spearman <- cor( + spexen$QC_score_dbk_native, + spexen$QC_score_bc1_model, + use="complete.obs", + method="spearman" +) + +cor_dbk_pearson <- cor( + spexen$QC_score_dbk_native, + spexen$QC_score_bc1_model, + use="complete.obs", + method="pearson" +) + +cor_bc1_spearman <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_dbk_model, + use="complete.obs", + method="spearman" +) + +cor_bc1_pearson <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_dbk_model, + use="complete.obs", + method="pearson" +) + +res_dbk <- plot_qc_score_comparison( + x=spexen$QC_score_dbk_native, + y=spexen$QC_score_bc1_model, + x_label="DBK Xenium native QS", + y_label="Breast 1-trained QS", + title="Xenium DBK: native QS vs Breast 1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_dbk_native_vs_breast1_model_A4_landscape.pdf" + ) +) + +res_bc1 <- plot_qc_score_comparison( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_dbk_model, + x_label="Breast 1 native QS", + y_label="DBK-trained QS", + title="Xenium Breast 1: native QS vs DBK-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast1_native_vs_dbk_model_A4_landscape.pdf" + ) +) + +res_dbk$plot +res_bc1$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +dbk_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen$QC_score_dbk_native, + y=spexen$QC_score_bc1_model, + spe=spexen, + color_col=v, + x_label="DBK Xenium native QS", + y_label="Breast 1-trained QS", + title=paste0("DBK Xenium native vs Breast 1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_dbk_native_vs_breast1_model_", v, ".pdf") + ) + ) +}) +names(dbk_colored_plots) <- color_vars + +bc1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_dbk_model, + spe=spexen_bc1, + color_col=v, + x_label="Breast 1 native QS", + y_label="DBK-trained QS", + title=paste0("Breast 1 native vs DBK-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast1_native_vs_dbk_model_", v, ".pdf") + ) + ) +}) +names(bc1_colored_plots) <- color_vars + +xenium_dbk_breast1_transfer_summary <- data.frame( + comparison=c( + "DBK Xenium native vs Breast 1-trained", + "Breast 1 native vs DBK-trained" + ), + spearman=c(cor_dbk_spearman, cor_bc1_spearman), + pearson=c(cor_dbk_pearson, cor_bc1_pearson), + stringsAsFactors=FALSE +) + +xenium_dbk_breast1_transfer_summary + +write.csv( + xenium_dbk_breast1_transfer_summary, + file=file.path(out_dir, "xenium_dbk_breast1_transfer_summary.csv"), + row.names=FALSE +) + +metadata(spexen)$QCScore_model$model_matrix_colnames +metadata(spexen_bc1)$QCScore_model$model_matrix_colnames + +######################### From faa1af5f52a0c2a49be3800372ec1c9b76d3550c Mon Sep 17 00:00:00 2001 From: Dario Date: Mon, 25 May 2026 11:25:32 +0200 Subject: [PATCH 6/8] adding citation file --- inst/CITATION | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 inst/CITATION diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..e0a7e38 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,23 @@ +bibentry( + bibtype="Article", + title="SpaceTrooper: a quality control framework for imaging-based spatial omics data", + author=c( + person("Benedetta", "Banzi"), + person("Dario", "Righelli"), + person("Matteo", "Marchionni"), + person("Oriana", "Romano"), + person("Mattia", "Forcato"), + person("Davide", "Risso"), + person("Silvio", "Bicciato") + ), + journal="bioRxiv", + year="2025", + doi="10.64898/2025.12.24.696336", + url="https://doi.org/10.64898/2025.12.24.696336", + textVersion=paste( + "Banzi B, Righelli D, Marchionni M, Romano O, Forcato M,", + "Risso D, Bicciato S. SpaceTrooper: a quality control framework", + "for imaging-based spatial omics data. bioRxiv, 2025.", + "doi:10.64898/2025.12.24.696336" + ) +) From c732c22608ef5f364488da5602db50f2dd277fe7 Mon Sep 17 00:00:00 2001 From: Dario Date: Mon, 25 May 2026 11:35:10 +0200 Subject: [PATCH 7/8] news version 1.1.8 --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index b2c76b3..27c1dce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# Changes in version 1.1.8 + +* fixing naming of of spacetrooper utilities vignette +* adding functions for QC model transfer across datasets +* several new scripts for QC tranferring and comparison +* adding citation file with biorxiv paper + # Changes in version 1.1.7 * fixing author name typo From 031dc0cdd5d1a5c0f42f95ac50703161188c864f Mon Sep 17 00:00:00 2001 From: Dario Date: Mon, 25 May 2026 15:04:37 +0200 Subject: [PATCH 8/8] fixing errors on check --- NAMESPACE | 3 +++ R/QC.R | 11 ++++++++- man/applyQCScoreModel.Rd | 48 ++++++++++++++++++++++++++++++++++++++++ man/computeQCScore.Rd | 11 ++++++++- 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 man/applyQCScoreModel.Rd diff --git a/NAMESPACE b/NAMESPACE index 4e9db72..cc92c05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(.getActiveGeometryName) export(.renameGeometry) export(.setActiveGeometry) export(addPolygonsToSPE) +export(applyQCScoreModel) export(checkOutliers) export(computeAreaFromPolygons) export(computeAspectRatioFromPolygons) @@ -126,6 +127,8 @@ importFrom(sf,st_sf) importFrom(sf,st_sfc) importFrom(sfheaders,sf_polygon) importFrom(stats,as.formula) +importFrom(stats,coef) +importFrom(stats,complete.cases) importFrom(stats,model.matrix) importFrom(stats,predict) importFrom(stats,quantile) diff --git a/R/QC.R b/R/QC.R index 6d69efd..78f5798 100644 --- a/R/QC.R +++ b/R/QC.R @@ -441,12 +441,20 @@ computeLambda <- function(trainDF, modelFormula) { #' @param spe A `SpatialExperiment` object with spatial transcriptomics data. #' @param verbose logical for having a verbose output. Default is FALSE. #' @param bestLambda the best lambda typically computed using `computeLambda`. +#' @param modelFormula a character string representing the model formula to be +#' used for training the model. If NULL, the formula is automatically generated +#' based on the available metrics and outliers in the dataset. +#' See Details for more information. +#' Note that the automatically generated formula will include interaction +#' terms between the metrics, and will exclude metrics with insufficient +#' outliers (< 0.1% of the dataset). If a custom `modelFormula` is provided, +#' it will be used as is without modification or checks for outlier counts. #' #' @return The `SpatialExperiment` object with added QC score in `colData`. #' @export #' @importFrom dplyr case_when filter mutate distinct pull #' @importFrom glmnet glmnet cv.glmnet -#' @importFrom stats as.formula model.matrix quantile predict +#' @importFrom stats as.formula model.matrix quantile predict coef #' @examples #' example(spatialPerCellQC) #' set.seed(1998) @@ -1323,6 +1331,7 @@ applyQCScoreModel <- function(spe, qcModel, scoreName="QC_score") { return(modelMatrix) } +#' @importFrom stats complete.cases .filterCompleteModelCases <- function(df, modelFormula, response=NULL, context="cells") { diff --git a/man/applyQCScoreModel.Rd b/man/applyQCScoreModel.Rd new file mode 100644 index 0000000..9bea5d2 --- /dev/null +++ b/man/applyQCScoreModel.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/QC.R +\name{applyQCScoreModel} +\alias{applyQCScoreModel} +\title{applyQCScoreModel} +\usage{ +applyQCScoreModel(spe, qcModel, scoreName = "QC_score") +} +\arguments{ +\item{spe}{A `SpatialExperiment` object with QC metrics already computed.} + +\item{qcModel}{A QC score model object, usually stored in +`metadata(spe)$QCScore_model`.} + +\item{scoreName}{Name of the output column in `colData`.} +} +\value{ +A `SpatialExperiment` object with added QC score in `colData`. +} +\description{ +Apply a previously trained QC score model to a new SpatialExperiment object. +} +\examples{ + +example(readCosmxSPE) + +## Compute per-cell quality control metrics +spe <- spatialPerCellQC(spe) + +## Split the object only for example purposes +set.seed(1998) +idx <- sample(seq_len(ncol(spe)), floor(ncol(spe) / 2)) +spe_train <- spe[, idx] +spe_test <- spe[, -idx] + +## Train the Quality Control (QC) score model on one dataset +spe_train <- computeQCScore(spe_train) +qc_model <- metadata(spe_train)$QCScore_model + +## Apply the trained model to another dataset +spe_test <- applyQCScoreModel( + spe=spe_test, + qcModel=qc_model, + scoreName="QC_score_transferred" +) + +summary(spe_test$QC_score_transferred) +} diff --git a/man/computeQCScore.Rd b/man/computeQCScore.Rd index a9fce89..0af8503 100644 --- a/man/computeQCScore.Rd +++ b/man/computeQCScore.Rd @@ -4,13 +4,22 @@ \alias{computeQCScore} \title{computeQCScore} \usage{ -computeQCScore(spe, bestLambda = NULL, verbose = FALSE) +computeQCScore(spe, bestLambda = NULL, modelFormula = NULL, verbose = FALSE) } \arguments{ \item{spe}{A `SpatialExperiment` object with spatial transcriptomics data.} \item{bestLambda}{the best lambda typically computed using `computeLambda`.} +\item{modelFormula}{a character string representing the model formula to be +used for training the model. If NULL, the formula is automatically generated +based on the available metrics and outliers in the dataset. +See Details for more information. +Note that the automatically generated formula will include interaction +terms between the metrics, and will exclude metrics with insufficient +outliers (< 0.1% of the dataset). If a custom `modelFormula` is provided, +it will be used as is without modification or checks for outlier counts.} + \item{verbose}{logical for having a verbose output. Default is FALSE.} } \value{