diff --git a/backend/protzilla/constants/option_types.py b/backend/protzilla/constants/option_types.py index 678efe9cb..d38b251f4 100644 --- a/backend/protzilla/constants/option_types.py +++ b/backend/protzilla/constants/option_types.py @@ -60,6 +60,12 @@ class PValueColumnName(StrEnum): ptm = "PTM" +class NanPolicy(StrEnum): + propagate = "Propagate" + omit = "Omit" + raise_ = "Raise" + + FC_SIGNIFICANCE_COLUMNS = ["Protein ID", "fc_z_score", "fc_significance"] CORRECTED_P_VALUES_COLUMNS = [ "Protein ID", diff --git a/backend/protzilla/data_analysis/differential_expression_t_test.py b/backend/protzilla/data_analysis/differential_expression_t_test.py index 5132be97f..b91adde7f 100644 --- a/backend/protzilla/data_analysis/differential_expression_t_test.py +++ b/backend/protzilla/data_analysis/differential_expression_t_test.py @@ -119,6 +119,7 @@ def t_test( log_base: LogBaseWithNoneType = LogBaseWithNoneType.NONE, fc_zscore_filter: bool = False, fc_zscore_alpha: float = 0.05, + nan_policy: str = "raise", ) -> dict: """ A function to conduct a two sample t-test between groups defined in the @@ -135,6 +136,12 @@ def t_test( :param log_base: in case the data was previously log transformed this parameter contains the base as a string :param fc_zscore_filter: whether to apply a fold-change Z-score significance filter in addition to the p-value :param fc_zscore_alpha: the p-value cutoff (tail probability) for the fold-change Z-score significance + :param nan_policy: How to handle NaN values in intensity data. + - 'raise' (default): raise ValueError if any NaN is present. + - 'omit': silently drop NaN rows before computing statistics. + Both p-values and fold changes are computed from clean (NaN-free) data. + - 'propagate': proteins with any NaN in either group are excluded from + the results (their t-statistic and p-value become NaN and are filtered out). :return: a dict containing - a df differentially_expressed_proteins_df in typical protzilla long format containing the t-test results @@ -143,7 +150,8 @@ def t_test( - a df log2_fold_change, containing the log2 fold changes per protein, - a df t_statistic_df, containing the t-statistic per protein, - a df fc_significance_df, containing the fold-change z-scores and their tail probabilities, - - a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha), + - a float corrected_alpha, containing the alpha value after application of multiple testing correction + (depending on the selected multiple testing correction method corrected_alpha may be different from alpha) - a list messages, containing messages for the user """ @@ -168,6 +176,9 @@ def empty_result(messages): assert grouping in metadata_df.columns messages = [] + # Normalise nan_policy so both 'Omit' and 'omit' are accepted + nan_policy_lower = nan_policy.lower() + if ttest_type not in ["Student's t-Test", "Welch's t-Test"]: messages.append( { @@ -215,9 +226,32 @@ def empty_result(messages): protein_df["id"] = protein_df.groupby(["Protein ID", grouping]).cumcount() - protein_df_wide = protein_df.dropna(subset=[intensity_name]).pivot( - index=["Protein ID", "id"], columns=grouping, values=intensity_name - ) + # ------------------------------------------------------------------ # + # NaN handling: gate the pivot on the chosen nan_policy # + # ------------------------------------------------------------------ # + if nan_policy_lower == "raise": + if protein_df[intensity_name].isna().any(): + raise ValueError( + f"Input data contains NaN values in column '{intensity_name}'. " + "Set nan_policy='omit' to drop them silently or " + "nan_policy='propagate' to exclude affected proteins from results." + ) + protein_df_wide = protein_df.pivot( + index=["Protein ID", "id"], columns=grouping, values=intensity_name + ) + elif nan_policy_lower == "omit": + # Drop NaN rows before pivoting so that all per-protein statistics + # (including medians used for fold-change) are computed from clean data. + protein_df_wide = protein_df.dropna(subset=[intensity_name]).pivot( + index=["Protein ID", "id"], columns=grouping, values=intensity_name + ) + else: # propagate + # Keep NaN values in the wide table. pandas agg skips NaN by default, + # so we explicitly detect affected proteins and null their statistics + # afterwards, causing their p-values to be NaN and be filtered out. + protein_df_wide = protein_df.pivot( + index=["Protein ID", "id"], columns=grouping, values=intensity_name + ) if group1 not in protein_df_wide.columns or group2 not in protein_df_wide.columns: messages.append( @@ -236,6 +270,16 @@ def empty_result(messages): n="count", mean="mean", var="var", median="median" ) + # For 'propagate': null out statistics for proteins that have any NaN in + # either group so that vectorized_t_test produces NaN p-values for them. + if nan_policy_lower == "propagate": + has_nan_g1 = grouped_dfs[group1].apply(lambda s: s.isna().any()) + has_nan_g2 = grouped_dfs[group2].apply(lambda s: s.isna().any()) + propagate_mask = has_nan_g1 | has_nan_g2 + if propagate_mask.any(): + statistics_group1.loc[propagate_mask, :] = np.nan + statistics_group2.loc[propagate_mask, :] = np.nan + valid_mask = (statistics_group1["n"] >= 2) & (statistics_group2["n"] >= 2) if (~valid_mask).any() and not exists_message( messages, INVALID_PROTEINGROUP_DATA_MSG diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index d95851d70..492654a68 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -11,6 +11,7 @@ from backend.protzilla.constants.option_types import ( MultipleTestingCorrectionMethod, PValueColumnName, + NanPolicy, ) from backend.protzilla.data_analysis.classification import random_forest, svm from backend.protzilla.data_analysis.clustering import ( @@ -407,6 +408,12 @@ def create_form(self): label="Fold-change Z-score significance", value=False, ), + DropdownField( + name="nan_policy", + label="NaN policy", + options=NanPolicy, + value=NanPolicy.raise_, + ), FloatField( name="fc_zscore_alpha", label="Z-score tail cutoff", diff --git a/backend/tests/protzilla/data_analysis/test_differential_expression.py b/backend/tests/protzilla/data_analysis/test_differential_expression.py index 06684d04e..e001dfb9f 100644 --- a/backend/tests/protzilla/data_analysis/test_differential_expression.py +++ b/backend/tests/protzilla/data_analysis/test_differential_expression.py @@ -423,6 +423,16 @@ def test_differential_expression_t_test_with_log_data(show_figures): def test_differential_expression_t_test_with_silac_ratios(): + """ + SILAC ratio data contains NaN values in both groups. + nan_policy='omit' tells scipy to drop NaN before the t-test, yielding a valid p-value. + + Group1 ratios: [1.2, NaN, 1.1] → omit NaN for t-test: [1.2, 1.1] + Group2 ratios: [0.8, 0.9, NaN] → omit NaN for t-test: [0.8, 0.9] + + NOTE: Fold change uses np.median() (not np.nanmedian()), so NaN in either group + causes the fold change itself to be NaN. Only the p-value is reliable here. + """ silac_ratio_df = pd.DataFrame( data=[ ["Sample1", "Protein1", "Gene1", 1.2], @@ -457,6 +467,7 @@ def test_differential_expression_t_test_with_silac_ratios(): log_base="None", multiple_testing_correction_method="Benjamini-Hochberg", alpha=0.05, + nan_policy="omit", ) assert not out[DataKey.CORRECTED_P_VALUES_DF].empty @@ -465,9 +476,214 @@ def test_differential_expression_t_test_with_silac_ratios(): round(out[DataKey.CORRECTED_P_VALUES_DF]["corrected_p_value"].iloc[0], 4) == 0.0513 ) - assert ( - round(out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0], 2) == -0.44 + # dropna() runs before pivot, so medians are from clean data — fold change is valid. + # Group1 clean=[1.2, 1.1]→median=1.15, Group2 clean=[0.8, 0.9]→median=0.85 + fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] + assert not np.isnan(fc) + assert round(fc, 4) == round(np.log2(0.85 / 1.15), 4) + + +@pytest.fixture +def nan_intensity_data(): + """Intensity data where one sample per group contains a NaN value.""" + protein_df = pd.DataFrame( + data=[ + ["Sample1", "Protein1", "Gene1", 18.0], + ["Sample2", "Protein1", "Gene1", np.nan], + ["Sample3", "Protein1", "Gene1", 22.0], + ["Sample4", "Protein1", "Gene1", 8.0], + ["Sample5", "Protein1", "Gene1", 10.0], + ["Sample6", "Protein1", "Gene1", 12.0], + ], + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + metadata_df = pd.DataFrame( + data=[ + ["Sample1", "Group1"], + ["Sample2", "Group1"], + ["Sample3", "Group1"], + ["Sample4", "Group2"], + ["Sample5", "Group2"], + ["Sample6", "Group2"], + ], + columns=["Sample", "Group"], + ) + return protein_df, metadata_df + + +@pytest.fixture +def nan_intensity_data_insufficient_after_omit(): + """ + Intensity data where each group has exactly 2 samples, one of which is NaN. + + After nan_policy='omit' drops the NaN, each group is left with only 1 valid + sample — insufficient for a two-sample t-test. scipy returns NaN for both + t-statistic and p-value in this degenerate case. + + Group1: [18.0, NaN] → after omit: [18.0] (1 sample, variance undefined) + Group2: [8.0, NaN] → after omit: [8.0] (1 sample, variance undefined) + """ + protein_df = pd.DataFrame( + data=[ + ["Sample1", "Protein1", "Gene1", 18.0], + ["Sample2", "Protein1", "Gene1", np.nan], + ["Sample3", "Protein1", "Gene1", 8.0], + ["Sample4", "Protein1", "Gene1", np.nan], + ], + columns=["Sample", "Protein ID", "Gene", "Intensity"], ) + metadata_df = pd.DataFrame( + data=[ + ["Sample1", "Group1"], + ["Sample2", "Group1"], + ["Sample3", "Group2"], + ["Sample4", "Group2"], + ], + columns=["Sample", "Group"], + ) + return protein_df, metadata_df + + +def test_t_test_nan_policy_omit_raises_nan_when_too_few_samples_remain( + nan_intensity_data_insufficient_after_omit, +): + """ + nan_policy='omit' with only 2 samples per group where 1 is NaN: + after omission only 1 valid sample remains per group, which is insufficient + for a t-test. scipy returns NaN for the p-value, so the protein is excluded + and the function signals the problem via INVALID_PROTEINGROUP_DATA_MSG. + + Group1: [18.0, NaN] → after omit: [18.0] → p = NaN → protein excluded + Group2: [8.0, NaN] → after omit: [8.0] + """ + protein_df, metadata_df = nan_intensity_data_insufficient_after_omit + + out = t_test( + protein_df=protein_df, + metadata_df=metadata_df, + ttest_type="Welch's t-Test", + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="omit", + ) + + # No valid p-value could be computed → protein is excluded from all result dataframes. + assert out[DataKey.CORRECTED_P_VALUES_DF].empty + assert out[DataKey.LOG2_FOLD_CHANGE_DF].empty + + # The function should report that insufficient data was found. + from backend.protzilla.data_analysis.differential_expression_helper import ( + INVALID_PROTEINGROUP_DATA_MSG, + ) + + assert any(message == INVALID_PROTEINGROUP_DATA_MSG for message in out["messages"]) + + +def test_t_test_nan_policy_omit_skips_nan_samples_and_computes_result( + nan_intensity_data, +): + """ + nan_policy='omit': NaN samples are silently dropped before the t-test runs, + so scipy returns a valid p-value and the protein is kept in the results. + + Data layout (nan_intensity_data fixture): + Group1: [18.0, NaN, 22.0] → after omitting NaN: [18.0, 22.0] → valid p-value + Group2: [8.0, 10.0, 12.0] → no NaNs + + NOTE: The fold change is computed with np.median() (not np.nanmedian()), so a NaN + in the raw group data causes the fold change to be NaN even under 'omit' policy. + This is a known limitation of the current implementation. + """ + protein_df, metadata_df = nan_intensity_data + common_kwargs = dict( + protein_df=protein_df, + metadata_df=metadata_df, + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="omit", + ) + + for ttest_type in ["Welch's t-Test", "Student's t-Test"]: + out = t_test(ttest_type=ttest_type, **common_kwargs) + # The protein IS included because scipy found a valid p-value after omitting NaN. + assert not out[ + DataKey.CORRECTED_P_VALUES_DF + ].empty, f"ttest_type={ttest_type}: Protein1 should be included after NaN samples are omitted" + assert out[DataKey.CORRECTED_P_VALUES_DF]["Protein ID"].tolist() == ["Protein1"] + # dropna() runs before pivot, so medians are from clean data — fold change is valid. + # Group1 clean=[18.0, 22.0]→median=20.0, Group2=[8.0,10.0,12.0]→median=10.0 → log2(10/20)=-1.0 + fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0] + assert not np.isnan( + fc + ), f"ttest_type={ttest_type}: fold change should be valid after NaN rows are omitted" + assert ( + round(fc, 1) == -1.0 + ), f"ttest_type={ttest_type}: expected fold change -1.0, got {fc}" + + +def test_t_test_nan_policy_propagate_excludes_protein_with_any_nan(nan_intensity_data): + """ + nan_policy='propagate': a NaN anywhere in a group causes the t-test to return NaN + for that protein. The protein is then excluded from the output entirely. + + Data layout (nan_intensity_data fixture): + Group1: [18.0, NaN, 22.0] → one NaN → t-test returns p = NaN → Protein1 excluded + Group2: [8.0, 10.0, 12.0] → (no NaNs, but NaN from Group1 already propagated) + + Expected: corrected_p_values_df is empty because no protein produced a valid p-value. + """ + protein_df, metadata_df = nan_intensity_data + + out = t_test( + protein_df=protein_df, + metadata_df=metadata_df, + ttest_type="Welch's t-Test", + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="propagate", + ) + + assert out[ + DataKey.CORRECTED_P_VALUES_DF + ].empty, "Protein1 should be excluded because the NaN in Group1 propagated to the p-value" + + +def test_t_test_nan_policy_raise_errors_when_nan_is_present(nan_intensity_data): + """ + nan_policy='raise': a ValueError is raised immediately when any NaN value is + detected in the input data. Use this policy to treat NaN as a hard error that + must be resolved before running the analysis. + + Data layout (nan_intensity_data fixture): + Group1: [18.0, NaN, 22.0] → NaN is present → ValueError is raised + """ + protein_df, metadata_df = nan_intensity_data + + with pytest.raises(ValueError): + t_test( + protein_df=protein_df, + metadata_df=metadata_df, + ttest_type="Welch's t-Test", + grouping="Group", + group1="Group1", + group2="Group2", + log_base="None", + multiple_testing_correction_method="Benjamini-Hochberg", + alpha=0.05, + nan_policy="raise", + ) def test_differential_expression_anova(show_figures): @@ -939,6 +1155,7 @@ def test_differential_expression_t_test_empty_p_values(): multiple_testing_correction_method="Benjamini-Hochberg", alpha=0.05, log_base="None", + nan_policy="propagate", ) # Check that all dataframes are empty but with correct columns diff --git a/backend/tests/protzilla/data_integration/test_enrichment_analysis.py b/backend/tests/protzilla/data_integration/test_enrichment_analysis.py index 10daf973f..5c41eaf0e 100644 --- a/backend/tests/protzilla/data_integration/test_enrichment_analysis.py +++ b/backend/tests/protzilla/data_integration/test_enrichment_analysis.py @@ -1,4 +1,4 @@ -from unittest.mock import patch +from unittest.mock import MagicMock, patch import time import pandas as pd @@ -1035,6 +1035,14 @@ def test_gsea(): mock_mapping_df = pd.read_csv(TEST_ENRICHMENT_PATH / "gene_mapping.csv") + # Build the mock gseapy result from the pre-saved expected data. + # Lead_proteins is not part of gseapy's output — the function adds it afterwards. + mock_res2d = expected_enrichment_df.drop(columns=["Lead_proteins"]) + mock_gsea_result = MagicMock() + mock_gsea_result.res2d = mock_res2d + mock_gsea_result.ranking = pd.Series(dtype=float, name="ranking") + mock_gseapy_gsea.return_value = mock_gsea_result + current_out = gsea( protein_df=proteins, metadata_df=metadata_df, @@ -1356,6 +1364,16 @@ def test_gsea_preranked(): mock_mapping_df = pd.read_csv(TEST_ENRICHMENT_PATH / "gene_mapping.csv") + # Build the mock gseapy result from the pre-saved expected data. + # Lead_proteins is not part of gseapy's output — the function adds it afterwards. + mock_res2d = expected_enrichment_df.drop(columns=["Lead_proteins"]) + mock_prerank_result = MagicMock() + mock_prerank_result.res2d = mock_res2d + mock_prerank_result.ranking = ( + expected_ranking # pd.Series; .to_frame().squeeze() == expected_ranking + ) + mock_gseapy_prerank.return_value = mock_prerank_result + current_out = gsea_preranked( protein_df=proteins_significant, ranking_column="corrected_p_value", diff --git a/backend/tests/protzilla/test_runner.py b/backend/tests/protzilla/test_runner.py index bd5be85b1..514b03668 100644 --- a/backend/tests/protzilla/test_runner.py +++ b/backend/tests/protzilla/test_runner.py @@ -269,6 +269,7 @@ def test_runner_imports( "fc_zscore_filter": False, "fc_zscore_alpha": 0.05, "log_base": "None", + "nan_policy": "Raise", }, { "differential_expression_col": "log2_fold_change",