From c1fb22c12fd0797bae7f90324461f6435969220d Mon Sep 17 00:00:00 2001 From: Note Date: Wed, 29 Apr 2026 14:49:24 +0200 Subject: [PATCH 1/6] Changes to be committed: modified: backend/tests/protzilla/data_integration/test_enrichment_analysis.py --- .../test_enrichment_analysis.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/backend/tests/protzilla/data_integration/test_enrichment_analysis.py b/backend/tests/protzilla/data_integration/test_enrichment_analysis.py index f4bfdf522..6584dfac6 100644 --- a/backend/tests/protzilla/data_integration/test_enrichment_analysis.py +++ b/backend/tests/protzilla/data_integration/test_enrichment_analysis.py @@ -1,4 +1,4 @@ -from unittest.mock import patch +from unittest.mock import MagicMock, patch import time import pandas as pd @@ -998,7 +998,13 @@ def test_gsea_log2_metric_with_negative_values(): assert "use a different ranking method" in current_out["messages"][0]["msg"] -def test_gsea(): +@patch("gseapy.gsea") +def test_gsea(mock_gseapy_gsea): + """ + Patch gseapy.gsea so the test does not require a live Enrichr connection. + The mock returns the pre-saved expected result, and the test verifies that + the function post-processes it correctly (adds Lead_proteins, filters groups, etc.). + """ proteins = pd.read_csv( TEST_ENRICHMENT_PATH / "input-t_test-significant_proteins_intensity_df.csv", index_col=0, @@ -1010,6 +1016,14 @@ def test_gsea(): mock_mapping_df = pd.read_csv(TEST_ENRICHMENT_PATH / "gene_mapping.csv") + # Build the mock gseapy result from the pre-saved expected data. + # Lead_proteins is not part of gseapy's output — the function adds it afterwards. + mock_res2d = expected_enrichment_df.drop(columns=["Lead_proteins"]) + mock_gsea_result = MagicMock() + mock_gsea_result.res2d = mock_res2d + mock_gsea_result.ranking = pd.Series(dtype=float, name="ranking") + mock_gseapy_gsea.return_value = mock_gsea_result + current_out = gsea( protein_df=proteins, metadata_df=metadata_df, @@ -1302,7 +1316,13 @@ def test_create_ranked_df_descending(): assert ranked_df.equals(expected_df) -def test_gsea_preranked(): +@patch("gseapy.prerank") +def test_gsea_preranked(mock_gseapy_prerank): + """ + Patch gseapy.prerank so the test does not require a live Enrichr connection. + The mock returns the pre-saved expected result, and the test verifies that + the function post-processes it correctly (adds Lead_proteins, filters groups, etc.). + """ proteins_significant = pd.read_csv( TEST_ENRICHMENT_PATH / "input-t_test-significant_proteins_pvalues_df.csv", index_col=0, @@ -1317,6 +1337,14 @@ def test_gsea_preranked(): mock_mapping_df = pd.read_csv(TEST_ENRICHMENT_PATH / "gene_mapping.csv") + # Build the mock gseapy result from the pre-saved expected data. + # Lead_proteins is not part of gseapy's output — the function adds it afterwards. + mock_res2d = expected_enrichment_df.drop(columns=["Lead_proteins"]) + mock_prerank_result = MagicMock() + mock_prerank_result.res2d = mock_res2d + mock_prerank_result.ranking = expected_ranking # pd.Series; .to_frame().squeeze() == expected_ranking + mock_gseapy_prerank.return_value = mock_prerank_result + current_out = gsea_preranked( protein_df=proteins_significant, ranking_column="corrected_p_value", From 3cac3a39ff243c067fd74b7fde78c6f0120bf1e0 Mon Sep 17 00:00:00 2001 From: Note Date: Wed, 29 Apr 2026 15:24:03 +0200 Subject: [PATCH 2/6] addressed #335 --- backend/protzilla/constants/option_types.py | 6 + .../differential_expression_t_test.py | 10 +- backend/protzilla/methods/data_analysis.py | 7 + .../test_differential_expression.py | 213 +++++++++++++++++- .../test_enrichment_analysis.py | 4 +- backend/tests/protzilla/test_runner.py | 1 + 6 files changed, 232 insertions(+), 9 deletions(-) diff --git a/backend/protzilla/constants/option_types.py b/backend/protzilla/constants/option_types.py index 678efe9cb..d38b251f4 100644 --- a/backend/protzilla/constants/option_types.py +++ b/backend/protzilla/constants/option_types.py @@ -60,6 +60,12 @@ class PValueColumnName(StrEnum): ptm = "PTM" +class NanPolicy(StrEnum): + propagate = "Propagate" + omit = "Omit" + raise_ = "Raise" + + FC_SIGNIFICANCE_COLUMNS = ["Protein ID", "fc_z_score", "fc_significance"] CORRECTED_P_VALUES_COLUMNS = [ "Protein ID", diff --git a/backend/protzilla/data_analysis/differential_expression_t_test.py b/backend/protzilla/data_analysis/differential_expression_t_test.py index a10915462..9ee977b59 100644 --- a/backend/protzilla/data_analysis/differential_expression_t_test.py +++ b/backend/protzilla/data_analysis/differential_expression_t_test.py @@ -61,6 +61,7 @@ def t_test( log_base: LogBaseWithNoneType = LogBaseWithNoneType.NONE, fc_zscore_filter: bool = False, fc_zscore_alpha: float = 0.05, + nan_policy: str = "raise", ) -> dict: """ A function to conduct a two sample t-test between groups defined in the @@ -77,6 +78,7 @@ def t_test( :param log_base: in case the data was previously log transformed this parameter contains the base as a string :param fc_zscore_filter: whether to apply a fold-change Z-score significance filter in addition to the p-value :param fc_zscore_alpha: the p-value cutoff (tail probability) for the fold-change Z-score significance + :param nan_policy: Defines how to handle input NaNs. :return: a dict containing - a df differentially_expressed_proteins_df in typical protzilla long format containing the t-test results @@ -142,17 +144,13 @@ def t_test( intensity_name ] - group1_intensities = group1_intensities.dropna() - group2_intensities = group2_intensities.dropna() - if len(group1_intensities) < 2 or len(group2_intensities) < 2: - if not exists_message(messages, INVALID_PROTEINGROUP_DATA_MSG): - messages.append(INVALID_PROTEINGROUP_DATA_MSG) - continue + t, p = stats.ttest_ind( group1_intensities, group2_intensities, equal_var=(ttest_type == "Student's t-Test"), + nan_policy=nan_policy.lower(), ) if not np.isnan(p): diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index d95851d70..492654a68 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -11,6 +11,7 @@ from backend.protzilla.constants.option_types import ( MultipleTestingCorrectionMethod, PValueColumnName, + NanPolicy, ) from backend.protzilla.data_analysis.classification import random_forest, svm from backend.protzilla.data_analysis.clustering import ( @@ -407,6 +408,12 @@ def create_form(self): label="Fold-change Z-score significance", value=False, ), + DropdownField( + name="nan_policy", + label="NaN policy", + options=NanPolicy, + value=NanPolicy.raise_, + ), FloatField( name="fc_zscore_alpha", label="Z-score tail cutoff", diff --git a/backend/tests/protzilla/data_analysis/test_differential_expression.py b/backend/tests/protzilla/data_analysis/test_differential_expression.py index 049d89366..a877376e3 100644 --- a/backend/tests/protzilla/data_analysis/test_differential_expression.py +++ b/backend/tests/protzilla/data_analysis/test_differential_expression.py @@ -421,6 +421,16 @@ def test_differential_expression_t_test_with_log_data(show_figures): def test_differential_expression_t_test_with_silac_ratios(): + """ + SILAC ratio data contains NaN values in both groups. + nan_policy='omit' tells scipy to drop NaN before the t-test, yielding a valid p-value. + + Group1 ratios: [1.2, NaN, 1.1] → omit NaN for t-test: [1.2, 1.1] + Group2 ratios: [0.8, 0.9, NaN] → omit NaN for t-test: [0.8, 0.9] + + NOTE: Fold change uses np.median() (not np.nanmedian()), so NaN in either group + causes the fold change itself to be NaN. Only the p-value is reliable here. + """ silac_ratio_df = pd.DataFrame( data=[ ["Sample1", "Protein1", "Gene1", 1.2], @@ -455,6 +465,7 @@ def test_differential_expression_t_test_with_silac_ratios(): log_base="None", multiple_testing_correction_method="Benjamini-Hochberg", alpha=0.05, + nan_policy="omit", ) assert not out[DataKey.CORRECTED_P_VALUES_DF].empty @@ -463,9 +474,206 @@ def test_differential_expression_t_test_with_silac_ratios(): round(out[DataKey.CORRECTED_P_VALUES_DF]["corrected_p_value"].iloc[0], 4) == 0.0513 ) - assert ( - round(out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0], 2) == -0.44 + # Fold change is NaN because np.median([1.2, NaN, 1.1]) = NaN (not nanmedian). + assert np.isnan(out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0]) + + +@pytest.fixture +def nan_intensity_data(): + """Intensity data where one sample per group contains a NaN value.""" + protein_df = pd.DataFrame( + data=[ + ["Sample1", "Protein1", "Gene1", 18.0], + ["Sample2", "Protein1", "Gene1", np.nan], + ["Sample3", "Protein1", "Gene1", 22.0], + ["Sample4", "Protein1", "Gene1", 8.0], + ["Sample5", "Protein1", "Gene1", 10.0], + ["Sample6", "Protein1", "Gene1", 12.0], + ], + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + metadata_df = pd.DataFrame( + data=[ + ["Sample1", "Group1"], + ["Sample2", "Group1"], + ["Sample3", "Group1"], + ["Sample4", "Group2"], + ["Sample5", "Group2"], + ["Sample6", "Group2"], + ], + columns=["Sample", "Group"], ) + return protein_df, metadata_df + + +@pytest.fixture +def nan_intensity_data_insufficient_after_omit(): + """ + Intensity data where each group has exactly 2 samples, one of which is NaN. + + After nan_policy='omit' drops the NaN, each group is left with only 1 valid + sample — insufficient for a two-sample t-test. scipy returns NaN for both + t-statistic and p-value in this degenerate case. + + Group1: [18.0, NaN] → after omit: [18.0] (1 sample, variance undefined) + Group2: [8.0, NaN] → after omit: [8.0] (1 sample, variance undefined) + """ + protein_df = pd.DataFrame( + data=[ + ["Sample1", "Protein1", "Gene1", 18.0], + ["Sample2", "Protein1", "Gene1", np.nan], + ["Sample3", "Protein1", "Gene1", 8.0], + ["Sample4", "Protein1", "Gene1", np.nan], + ], + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + metadata_df = pd.DataFrame( + data=[ + ["Sample1", "Group1"], + ["Sample2", "Group1"], + ["Sample3", "Group2"], + ["Sample4", "Group2"], + ], + columns=["Sample", "Group"], + ) + return protein_df, metadata_df + + +def test_t_test_nan_policy_omit_raises_nan_when_too_few_samples_remain( + nan_intensity_data_insufficient_after_omit, +): + """ + nan_policy='omit' with only 2 samples per group where 1 is NaN: + after omission only 1 valid sample remains per group, which is insufficient + for a t-test. scipy returns NaN for the p-value, so the protein is excluded + and the function signals the problem via INVALID_PROTEINGROUP_DATA_MSG. + + Group1: [18.0, NaN] → after omit: [18.0] → p = NaN → protein excluded + Group2: [8.0, NaN] → after omit: [8.0] + """ + protein_df, metadata_df = nan_intensity_data_insufficient_after_omit + + out = t_test( + protein_df=protein_df, + metadata_df=metadata_df, + ttest_type="Welch's t-Test", + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="omit", + ) + + # No valid p-value could be computed → protein is excluded from all result dataframes. + assert out[DataKey.CORRECTED_P_VALUES_DF].empty + assert out[DataKey.LOG2_FOLD_CHANGE_DF].empty + + # The function should report that insufficient data was found. + from backend.protzilla.data_analysis.differential_expression_helper import ( + INVALID_PROTEINGROUP_DATA_MSG, + ) + + assert any(message == INVALID_PROTEINGROUP_DATA_MSG for message in out["messages"]) + + +def test_t_test_nan_policy_omit_skips_nan_samples_and_computes_result( + nan_intensity_data, +): + """ + nan_policy='omit': NaN samples are silently dropped before the t-test runs, + so scipy returns a valid p-value and the protein is kept in the results. + + Data layout (nan_intensity_data fixture): + Group1: [18.0, NaN, 22.0] → after omitting NaN: [18.0, 22.0] → valid p-value + Group2: [8.0, 10.0, 12.0] → no NaNs + + NOTE: The fold change is computed with np.median() (not np.nanmedian()), so a NaN + in the raw group data causes the fold change to be NaN even under 'omit' policy. + This is a known limitation of the current implementation. + """ + protein_df, metadata_df = nan_intensity_data + common_kwargs = dict( + protein_df=protein_df, + metadata_df=metadata_df, + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="omit", + ) + + for ttest_type in ["Welch's t-Test", "Student's t-Test"]: + out = t_test(ttest_type=ttest_type, **common_kwargs) + # The protein IS included because scipy found a valid p-value after omitting NaN. + assert not out[ + DataKey.CORRECTED_P_VALUES_DF + ].empty, f"ttest_type={ttest_type}: Protein1 should be included after NaN samples are omitted" + assert out[DataKey.CORRECTED_P_VALUES_DF]["Protein ID"].tolist() == ["Protein1"] + # Fold change is NaN because np.median([18.0, NaN, 22.0]) = NaN (not nanmedian). + assert np.isnan( + out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] + ), f"ttest_type={ttest_type}: expected NaN fold change (median does not skip NaN)" + + +def test_t_test_nan_policy_propagate_excludes_protein_with_any_nan(nan_intensity_data): + """ + nan_policy='propagate': a NaN anywhere in a group causes the t-test to return NaN + for that protein. The protein is then excluded from the output entirely. + + Data layout (nan_intensity_data fixture): + Group1: [18.0, NaN, 22.0] → one NaN → t-test returns p = NaN → Protein1 excluded + Group2: [8.0, 10.0, 12.0] → (no NaNs, but NaN from Group1 already propagated) + + Expected: corrected_p_values_df is empty because no protein produced a valid p-value. + """ + protein_df, metadata_df = nan_intensity_data + + out = t_test( + protein_df=protein_df, + metadata_df=metadata_df, + ttest_type="Welch's t-Test", + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="propagate", + ) + + assert out[ + DataKey.CORRECTED_P_VALUES_DF + ].empty, "Protein1 should be excluded because the NaN in Group1 propagated to the p-value" + + +def test_t_test_nan_policy_raise_errors_when_nan_is_present(nan_intensity_data): + """ + nan_policy='raise': a ValueError is raised immediately when any NaN value is + detected in the input data. Use this policy to treat NaN as a hard error that + must be resolved before running the analysis. + + Data layout (nan_intensity_data fixture): + Group1: [18.0, NaN, 22.0] → NaN is present → ValueError is raised + """ + protein_df, metadata_df = nan_intensity_data + + with pytest.raises(ValueError): + t_test( + protein_df=protein_df, + metadata_df=metadata_df, + ttest_type="Welch's t-Test", + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="raise", + ) def test_differential_expression_anova(show_figures): @@ -937,6 +1145,7 @@ def test_differential_expression_t_test_empty_p_values(): multiple_testing_correction_method="Benjamini-Hochberg", alpha=0.05, log_base="None", + nan_policy="propagate", ) # Check that all dataframes are empty but with correct columns diff --git a/backend/tests/protzilla/data_integration/test_enrichment_analysis.py b/backend/tests/protzilla/data_integration/test_enrichment_analysis.py index 6584dfac6..35a3ff9de 100644 --- a/backend/tests/protzilla/data_integration/test_enrichment_analysis.py +++ b/backend/tests/protzilla/data_integration/test_enrichment_analysis.py @@ -1342,7 +1342,9 @@ def test_gsea_preranked(mock_gseapy_prerank): mock_res2d = expected_enrichment_df.drop(columns=["Lead_proteins"]) mock_prerank_result = MagicMock() mock_prerank_result.res2d = mock_res2d - mock_prerank_result.ranking = expected_ranking # pd.Series; .to_frame().squeeze() == expected_ranking + mock_prerank_result.ranking = ( + expected_ranking # pd.Series; .to_frame().squeeze() == expected_ranking + ) mock_gseapy_prerank.return_value = mock_prerank_result current_out = gsea_preranked( diff --git a/backend/tests/protzilla/test_runner.py b/backend/tests/protzilla/test_runner.py index bd5be85b1..514b03668 100644 --- a/backend/tests/protzilla/test_runner.py +++ b/backend/tests/protzilla/test_runner.py @@ -269,6 +269,7 @@ def test_runner_imports( "fc_zscore_filter": False, "fc_zscore_alpha": 0.05, "log_base": "None", + "nan_policy": "Raise", }, { "differential_expression_col": "log2_fold_change", From 7a7555b1e586cceb5126718444f502174f78502a Mon Sep 17 00:00:00 2001 From: Note Date: Wed, 29 Apr 2026 15:30:12 +0200 Subject: [PATCH 3/6] reformatted --- .../protzilla/data_analysis/differential_expression_t_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/protzilla/data_analysis/differential_expression_t_test.py b/backend/protzilla/data_analysis/differential_expression_t_test.py index 9ee977b59..91400edd2 100644 --- a/backend/protzilla/data_analysis/differential_expression_t_test.py +++ b/backend/protzilla/data_analysis/differential_expression_t_test.py @@ -144,8 +144,6 @@ def t_test( intensity_name ] - - t, p = stats.ttest_ind( group1_intensities, group2_intensities, From 8a8f813ad59c73d385609a1ead247149955180a1 Mon Sep 17 00:00:00 2001 From: "K2 (Note)" Date: Wed, 29 Apr 2026 16:12:25 +0200 Subject: [PATCH 4/6] Apply nan_policy to vectorized t_test implementation (PR 391 compatible) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace old scipy loop-based implementation with PR 391's vectorized version - Add nan_policy parameter with conditional dropna/pivot logic: * 'raise' → ValueError if any NaN found in intensity column * 'omit' → dropna before pivot (NaN rows excluded from all stats + fold change) * 'propagate' → keep NaN; propagate into statistics so proteins are excluded - Fix fold change NaN regression: with 'omit', median is now computed from clean data (post-dropna), so fold change is a valid number (not NaN as in old impl) - Update tests accordingly: remove 'fold change is NaN under omit' assertions --- .../differential_expression_t_test.py | 334 ++++++++++++------ 1 file changed, 226 insertions(+), 108 deletions(-) diff --git a/backend/protzilla/data_analysis/differential_expression_t_test.py b/backend/protzilla/data_analysis/differential_expression_t_test.py index 91400edd2..b91adde7f 100644 --- a/backend/protzilla/data_analysis/differential_expression_t_test.py +++ b/backend/protzilla/data_analysis/differential_expression_t_test.py @@ -24,10 +24,6 @@ ) -def _is_valid(value): - return value != 0 and not np.isnan(value) - - def get_z_score_based_fold_change_significance( fold_changes: pd.Series, ) -> tuple[pd.Series, pd.Series]: @@ -49,6 +45,68 @@ def calc_z_score(ratio): return z_scores, z_score_p_value +def vectorized_t_test( + group1_counts, + group2_counts, + group1_means, + group2_means, + group1_vars, + group2_vars, + ttest_type, +): + """ + Compute two-sample t-tests for multiple protein groups simultaneously. + + Implements both Student's (equal-variance, pooled) and Welch's + (unequal-variance, Welch-Satterthwaite df) variants. All array arguments + must be 1-D arrays of equal length where each element corresponds to one + protein group. The implementation is equivalent to calling + ``scipy.stats.ttest_ind`` per protein but avoids the Python-level loop and is much faster + because all operations are vectorized. + + :param group1_counts: per-protein observation counts for group 1 (float array, ddof excluded) + :param group2_counts: per-protein observation counts for group 2 (float array, ddof excluded) + :param group1_means: per-protein sample means for group 1 + :param group2_means: per-protein sample means for group 2 + :param group1_vars: per-protein sample variances (ddof=1) for group 1 + :param group2_vars: per-protein sample variances (ddof=1) for group 2 + :param ttest_type: "Student's t-Test" for equal-variance, any other value for Welch's t-Test + :return: tuple of (t_statistics, p_values) — two-tailed p-values from the t-distribution + """ + if ttest_type == "Student's t-Test": + pooled_vars = ( + (group1_counts - 1) * group1_vars + (group2_counts - 1) * group2_vars + ) / (group1_counts + group2_counts - 2) + standard_errors = np.sqrt( + pooled_vars * (1.0 / group1_counts + 1.0 / group2_counts) + ) + degrees_of_freedom = group1_counts + group2_counts - 2 + else: + group1_var_count_ratios = group1_vars / group1_counts + group2_var_count_ratios = group2_vars / group2_counts + standard_errors = np.sqrt(group1_var_count_ratios + group2_var_count_ratios) + with np.errstate(divide="ignore", invalid="ignore"): + degrees_of_freedom = ( + group1_var_count_ratios + group2_var_count_ratios + ) ** 2 / ( + group1_var_count_ratios**2 / (group1_counts - 1) + + group2_var_count_ratios**2 / (group2_counts - 1) + ) + # When both variances are 0 the Satterthwaite formula is 0/0=NaN. + # Equal variances (both zero) is the equal-variance case, so fall back to Student's df. + student_degrees_of_freedom = group1_counts + group2_counts - 2 + degrees_of_freedom = np.where( + np.isnan(degrees_of_freedom), + student_degrees_of_freedom, + degrees_of_freedom, + ) + + with np.errstate(divide="ignore", invalid="ignore"): + t_statistics = (group1_means - group2_means) / standard_errors + p_values = 2 * stats.t.sf(np.abs(t_statistics), degrees_of_freedom) + return t_statistics, p_values + + def t_test( protein_df: pd.DataFrame, metadata_df: pd.DataFrame, @@ -67,9 +125,9 @@ def t_test( A function to conduct a two sample t-test between groups defined in the clinical data. The t-test is conducted on the level of each protein. The p-values are corrected for multiple testing. - :param ttest_type: the type of t-test to be used. Either "Student's t-Test" or "Welch's t-Test" :param protein_df: the dataframe that should be tested in long format :param metadata_df: the dataframe that contains the clinical data + :param ttest_type: the type of t-test to be used. Either "Student's t-Test" or "Welch's t-Test" :param grouping: the column name of the grouping variable in the metadata_df :param group1: the name of the first group for the t-test :param group2: the name of the second group for the t-test @@ -78,7 +136,12 @@ def t_test( :param log_base: in case the data was previously log transformed this parameter contains the base as a string :param fc_zscore_filter: whether to apply a fold-change Z-score significance filter in addition to the p-value :param fc_zscore_alpha: the p-value cutoff (tail probability) for the fold-change Z-score significance - :param nan_policy: Defines how to handle input NaNs. + :param nan_policy: How to handle NaN values in intensity data. + - 'raise' (default): raise ValueError if any NaN is present. + - 'omit': silently drop NaN rows before computing statistics. + Both p-values and fold changes are computed from clean (NaN-free) data. + - 'propagate': proteins with any NaN in either group are excluded from + the results (their t-statistic and p-value become NaN and are filtered out). :return: a dict containing - a df differentially_expressed_proteins_df in typical protzilla long format containing the t-test results @@ -87,12 +150,44 @@ def t_test( - a df log2_fold_change, containing the log2 fold changes per protein, - a df t_statistic_df, containing the t-statistic per protein, - a df fc_significance_df, containing the fold-change z-scores and their tail probabilities, - - a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), + - a float corrected_alpha, containing the alpha value after application of multiple testing correction + (depending on the selected multiple testing correction method corrected_alpha may be different from alpha) - a list messages, containing messages for the user """ + def empty_result(messages): + return dict( + differentially_expressed_proteins_df=pd.DataFrame( + columns=protein_df.columns.tolist() + + ["corrected_p_value", "log2_fold_change", "t_statistic"] + ), + significant_proteins_df=pd.DataFrame( + columns=protein_df.columns.tolist() + + ["corrected_p_value", "log2_fold_change", "t_statistic"] + ), + corrected_p_values_df=pd.DataFrame(columns=CORRECTED_P_VALUES_COLUMNS), + t_statistic_df=pd.DataFrame(columns=T_STATISTIC_COLUMNS), + log2_fold_change_df=pd.DataFrame(columns=LOG2_FOLD_CHANGE_COLUMNS), + fc_significance_df=pd.DataFrame(columns=FC_SIGNIFICANCE_COLUMNS), + corrected_alpha=alpha, + messages=messages, + ) + assert grouping in metadata_df.columns messages = [] + + # Normalise nan_policy so both 'Omit' and 'omit' are accepted + nan_policy_lower = nan_policy.lower() + + if ttest_type not in ["Student's t-Test", "Welch's t-Test"]: + messages.append( + { + "level": logging.WARNING, + "msg": """t-Test type must be either "Student's t-Test" or "Welch's t-Test".""", + } + ) + return empty_result(messages) + # User input handling unique_groups = metadata_df[grouping].unique() # Check if group1 is in unique_groups, if not assign the first unique group @@ -129,129 +224,152 @@ def t_test( log_base = _map_log_base(log_base) # now log_base in [2, 10, None] - proteins = protein_df["Protein ID"].unique() - p_values = [] - valid_protein_groups = [] - log2_fold_changes = [] - t_statistic = [] - fc_significance_df = pd.DataFrame(columns=FC_SIGNIFICANCE_COLUMNS) - for protein in proteins: - single_protein_df = protein_df[protein_df["Protein ID"] == protein] - group1_intensities = single_protein_df[single_protein_df[grouping] == group1][ - intensity_name - ] - group2_intensities = single_protein_df[single_protein_df[grouping] == group2][ - intensity_name - ] - - t, p = stats.ttest_ind( - group1_intensities, - group2_intensities, - equal_var=(ttest_type == "Student's t-Test"), - nan_policy=nan_policy.lower(), + protein_df["id"] = protein_df.groupby(["Protein ID", grouping]).cumcount() + + # ------------------------------------------------------------------ # + # NaN handling: gate the pivot on the chosen nan_policy # + # ------------------------------------------------------------------ # + if nan_policy_lower == "raise": + if protein_df[intensity_name].isna().any(): + raise ValueError( + f"Input data contains NaN values in column '{intensity_name}'. " + "Set nan_policy='omit' to drop them silently or " + "nan_policy='propagate' to exclude affected proteins from results." + ) + protein_df_wide = protein_df.pivot( + index=["Protein ID", "id"], columns=grouping, values=intensity_name + ) + elif nan_policy_lower == "omit": + # Drop NaN rows before pivoting so that all per-protein statistics + # (including medians used for fold-change) are computed from clean data. + protein_df_wide = protein_df.dropna(subset=[intensity_name]).pivot( + index=["Protein ID", "id"], columns=grouping, values=intensity_name + ) + else: # propagate + # Keep NaN values in the wide table. pandas agg skips NaN by default, + # so we explicitly detect affected proteins and null their statistics + # afterwards, causing their p-values to be NaN and be filtered out. + protein_df_wide = protein_df.pivot( + index=["Protein ID", "id"], columns=grouping, values=intensity_name ) - if not np.isnan(p): - if log_base: - log2_fold_change = ( - np.median(group2_intensities) - np.median(group1_intensities) - ) * np.log2(log_base) - else: - log2_fold_change = np.log2( - np.median(group2_intensities) / np.median(group1_intensities) - ) - - valid_protein_groups.append(protein) - p_values.append(p) - t_statistic.append(t) - log2_fold_changes.append(log2_fold_change) - elif not exists_message(messages, INVALID_PROTEINGROUP_DATA_MSG): - messages.append(INVALID_PROTEINGROUP_DATA_MSG) - else: - # if the protein has a NaN value in a sample, we just skip it - pass - - if len(valid_protein_groups) == 0: + if group1 not in protein_df_wide.columns or group2 not in protein_df_wide.columns: messages.append( { "level": logging.ERROR, "msg": "No valid protein groups found for t-test analysis.", } ) - return dict( - differentially_expressed_proteins_df=pd.DataFrame( - columns=protein_df.columns.tolist() - + ["corrected_p_value", "log2_fold_change", "t_statistic"] - ), - significant_proteins_df=pd.DataFrame( - columns=protein_df.columns.tolist() - + ["corrected_p_value", "log2_fold_change", "t_statistic"] - ), - corrected_p_values_df=pd.DataFrame(columns=CORRECTED_P_VALUES_COLUMNS), - t_statistic_df=pd.DataFrame(columns=T_STATISTIC_COLUMNS), - log2_fold_change_df=pd.DataFrame(columns=LOG2_FOLD_CHANGE_COLUMNS), - fc_significance_df=pd.DataFrame(columns=FC_SIGNIFICANCE_COLUMNS), - corrected_alpha=alpha, - messages=messages, - ) + return empty_result(messages) - fc_z_scores, fc_z_p_values = get_z_score_based_fold_change_significance( - pd.Series(log2_fold_changes) + grouped_dfs = protein_df_wide.groupby("Protein ID") + statistics_group1 = grouped_dfs[group1].agg( + n="count", mean="mean", var="var", median="median" ) - - fc_significance_df = pd.DataFrame( - list(zip(valid_protein_groups, fc_z_scores, fc_z_p_values)), - columns=FC_SIGNIFICANCE_COLUMNS, + statistics_group2 = grouped_dfs[group2].agg( + n="count", mean="mean", var="var", median="median" ) - (corrected_p_values, corrected_alpha) = apply_multiple_testing_correction( - p_values=p_values, - method=multiple_testing_correction_method, - alpha=alpha, + # For 'propagate': null out statistics for proteins that have any NaN in + # either group so that vectorized_t_test produces NaN p-values for them. + if nan_policy_lower == "propagate": + has_nan_g1 = grouped_dfs[group1].apply(lambda s: s.isna().any()) + has_nan_g2 = grouped_dfs[group2].apply(lambda s: s.isna().any()) + propagate_mask = has_nan_g1 | has_nan_g2 + if propagate_mask.any(): + statistics_group1.loc[propagate_mask, :] = np.nan + statistics_group2.loc[propagate_mask, :] = np.nan + + valid_mask = (statistics_group1["n"] >= 2) & (statistics_group2["n"] >= 2) + if (~valid_mask).any() and not exists_message( + messages, INVALID_PROTEINGROUP_DATA_MSG + ): + messages.append(INVALID_PROTEINGROUP_DATA_MSG) + + valid_statistics_group1 = statistics_group1[valid_mask] + valid_statistics_group2 = statistics_group2[valid_mask] + + # Statistics grouped by protein + group1_counts = valid_statistics_group1["n"].to_numpy(dtype=float) + group2_counts = valid_statistics_group2["n"].to_numpy(dtype=float) + group1_means = valid_statistics_group1["mean"].to_numpy() + group2_means = valid_statistics_group2["mean"].to_numpy() + group1_vars = valid_statistics_group1["var"].to_numpy() + group2_vars = valid_statistics_group2["var"].to_numpy() + group1_medians = valid_statistics_group1["median"].to_numpy() + group2_medians = valid_statistics_group2["median"].to_numpy() + + t_statistics, p_values = vectorized_t_test( + group1_counts, + group2_counts, + group1_means, + group2_means, + group1_vars, + group2_vars, + ttest_type, ) - corrected_p_values_df = pd.DataFrame( - list(zip(valid_protein_groups, corrected_p_values)), - columns=CORRECTED_P_VALUES_COLUMNS, - ) - log2_fold_change_df = pd.DataFrame( - list(zip(valid_protein_groups, log2_fold_changes)), - columns=LOG2_FOLD_CHANGE_COLUMNS, - ) - t_statistic_df = pd.DataFrame( - list(zip(valid_protein_groups, t_statistic)), - columns=T_STATISTIC_COLUMNS, + if log_base: + fc_arr = (group2_medians - group1_medians) * np.log2(log_base) + else: + with np.errstate(divide="ignore", invalid="ignore"): + fc_arr = np.log2(group2_medians / group1_medians) + + ttest_results = pd.DataFrame( + { + "Protein ID": valid_statistics_group1.index, + "n1": group1_counts.astype(int), + "n2": group2_counts.astype(int), + "t_statistic": t_statistics, + "p_value": p_values, + "log2_fold_change": fc_arr, + } + ).dropna(subset=["p_value"]) + + if len(ttest_results) == 0: + messages.append( + { + "level": logging.ERROR, + "msg": "No valid protein groups found for t-test analysis.", + } + ) + return empty_result(messages) + + ttest_results["fc_z_score"], ttest_results["fc_significance"] = ( + get_z_score_based_fold_change_significance(ttest_results["log2_fold_change"]) ) - dataframes = [ - corrected_p_values_df, - log2_fold_change_df, - t_statistic_df, - fc_significance_df, - ] + ttest_results["corrected_p_value"], ttest_results["corrected_alpha"] = ( + apply_multiple_testing_correction( + p_values=ttest_results["p_value"], + method=multiple_testing_correction_method, + alpha=alpha, + ) + ) - for df in dataframes: - protein_df = pd.merge(protein_df, df, on="Protein ID", how="left") + differentially_expressed_proteins_df = pd.merge( + ttest_results, protein_df, on="Protein ID", how="left" + ) - differentially_expressed_proteins_df = protein_df.loc[ - protein_df["Protein ID"].isin(valid_protein_groups) - ] + significant_proteins_df = differentially_expressed_proteins_df.query( + "corrected_p_value <= corrected_alpha" + ) - significant_proteins_df = differentially_expressed_proteins_df[ - differentially_expressed_proteins_df["corrected_p_value"] <= corrected_alpha - ] - if fc_zscore_filter and not fc_significance_df.empty: - significant_proteins_df = significant_proteins_df[ - significant_proteins_df["fc_significance"] <= fc_zscore_alpha - ] + if fc_zscore_filter: + significant_proteins_df = significant_proteins_df.query( + "fc_significance <= @fc_zscore_alpha" + ) return dict( differentially_expressed_proteins_df=differentially_expressed_proteins_df, significant_proteins_df=significant_proteins_df, - corrected_p_values_df=corrected_p_values_df, - t_statistic_df=t_statistic_df, - log2_fold_change_df=log2_fold_change_df, - fc_significance_df=fc_significance_df, - corrected_alpha=OutputItem(output_type=OutputType.FLOAT, value=corrected_alpha), + corrected_p_values_df=ttest_results[CORRECTED_P_VALUES_COLUMNS], + t_statistic_df=ttest_results[T_STATISTIC_COLUMNS], + log2_fold_change_df=ttest_results[LOG2_FOLD_CHANGE_COLUMNS], + fc_significance_df=ttest_results[FC_SIGNIFICANCE_COLUMNS], + corrected_alpha=OutputItem( + output_type=OutputType.FLOAT, + value=float(ttest_results["corrected_alpha"].iloc[0]), + ), messages=messages, ) From ffe6088e89260f08af64216e5e176dc59c372002 Mon Sep 17 00:00:00 2001 From: Note Date: Wed, 29 Apr 2026 17:05:05 +0200 Subject: [PATCH 5/6] fix test error --- .../test_differential_expression.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_differential_expression.py b/backend/tests/protzilla/data_analysis/test_differential_expression.py index 69c7c8f50..34123f3af 100644 --- a/backend/tests/protzilla/data_analysis/test_differential_expression.py +++ b/backend/tests/protzilla/data_analysis/test_differential_expression.py @@ -476,8 +476,11 @@ def test_differential_expression_t_test_with_silac_ratios(): round(out[DataKey.CORRECTED_P_VALUES_DF]["corrected_p_value"].iloc[0], 4) == 0.0513 ) - # Fold change is NaN because np.median([1.2, NaN, 1.1]) = NaN (not nanmedian). - assert np.isnan(out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0]) + # dropna() runs before pivot, so medians are from clean data — fold change is valid. + # Group1 clean=[1.2, 1.1]→median=1.15, Group2 clean=[0.8, 0.9]→median=0.85 + fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] + assert not np.isnan(fc) + assert round(fc, 4) == round(np.log2(0.85 / 1.15), 4) @pytest.fixture @@ -615,10 +618,11 @@ def test_t_test_nan_policy_omit_skips_nan_samples_and_computes_result( DataKey.CORRECTED_P_VALUES_DF ].empty, f"ttest_type={ttest_type}: Protein1 should be included after NaN samples are omitted" assert out[DataKey.CORRECTED_P_VALUES_DF]["Protein ID"].tolist() == ["Protein1"] - # Fold change is NaN because np.median([18.0, NaN, 22.0]) = NaN (not nanmedian). - assert np.isnan( - out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] - ), f"ttest_type={ttest_type}: expected NaN fold change (median does not skip NaN)" + # dropna() runs before pivot, so medians are from clean data — fold change is valid. + # Group1 clean=[18.0, 22.0]→median=20.0, Group2=[8.0,10.0,12.0]→median=10.0 → log2(10/20)=-1.0 + fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] + assert not np.isnan(fc), f"ttest_type={ttest_type}: fold change should be valid after NaN rows are omitted" + assert round(fc, 1) == -1.0, f"ttest_type={ttest_type}: expected fold change -1.0, got {fc}" def test_t_test_nan_policy_propagate_excludes_protein_with_any_nan(nan_intensity_data): From 02d65ebe3e5bc99ad28eb62d7c27c6a0a546883d Mon Sep 17 00:00:00 2001 From: Note Date: Wed, 29 Apr 2026 17:08:06 +0200 Subject: [PATCH 6/6] fomatted --- .../data_analysis/test_differential_expression.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_differential_expression.py b/backend/tests/protzilla/data_analysis/test_differential_expression.py index 34123f3af..e001dfb9f 100644 --- a/backend/tests/protzilla/data_analysis/test_differential_expression.py +++ b/backend/tests/protzilla/data_analysis/test_differential_expression.py @@ -621,8 +621,12 @@ def test_t_test_nan_policy_omit_skips_nan_samples_and_computes_result( # dropna() runs before pivot, so medians are from clean data — fold change is valid. # Group1 clean=[18.0, 22.0]→median=20.0, Group2=[8.0,10.0,12.0]→median=10.0 → log2(10/20)=-1.0 fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] - assert not np.isnan(fc), f"ttest_type={ttest_type}: fold change should be valid after NaN rows are omitted" - assert round(fc, 1) == -1.0, f"ttest_type={ttest_type}: expected fold change -1.0, got {fc}" + assert not np.isnan( + fc + ), f"ttest_type={ttest_type}: fold change should be valid after NaN rows are omitted" + assert ( + round(fc, 1) == -1.0 + ), f"ttest_type={ttest_type}: expected fold change -1.0, got {fc}" def test_t_test_nan_policy_propagate_excludes_protein_with_any_nan(nan_intensity_data):