Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions backend/protzilla/constants/option_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ class PValueColumnName(StrEnum):
ptm = "PTM"


class NanPolicy(StrEnum):
propagate = "Propagate"
omit = "Omit"
raise_ = "Raise"


FC_SIGNIFICANCE_COLUMNS = ["Protein ID", "fc_z_score", "fc_significance"]
CORRECTED_P_VALUES_COLUMNS = [
"Protein ID",
Expand Down
52 changes: 48 additions & 4 deletions backend/protzilla/data_analysis/differential_expression_t_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def t_test(
log_base: LogBaseWithNoneType = LogBaseWithNoneType.NONE,
fc_zscore_filter: bool = False,
fc_zscore_alpha: float = 0.05,
nan_policy: str = "raise",
) -> dict:
"""
A function to conduct a two sample t-test between groups defined in the
Expand All @@ -135,6 +136,12 @@ def t_test(
:param log_base: in case the data was previously log transformed this parameter contains the base as a string
:param fc_zscore_filter: whether to apply a fold-change Z-score significance filter in addition to the p-value
:param fc_zscore_alpha: the p-value cutoff (tail probability) for the fold-change Z-score significance
:param nan_policy: How to handle NaN values in intensity data.
- 'raise' (default): raise ValueError if any NaN is present.
- 'omit': silently drop NaN rows before computing statistics.
Both p-values and fold changes are computed from clean (NaN-free) data.
- 'propagate': proteins with any NaN in either group are excluded from
the results (their t-statistic and p-value become NaN and are filtered out).

:return: a dict containing
- a df differentially_expressed_proteins_df in typical protzilla long format containing the t-test results
Expand All @@ -143,7 +150,8 @@ def t_test(
- a df log2_fold_change, containing the log2 fold changes per protein,
- a df t_statistic_df, containing the t-statistic per protein,
- a df fc_significance_df, containing the fold-change z-scores and their tail probabilities,
- a float corrected_alpha, containing the alpha value after application of multiple testing correction (depending on the selected multiple testing correction method corrected_alpha may be equal to alpha),
- a float corrected_alpha, containing the alpha value after application of multiple testing correction
(depending on the selected multiple testing correction method corrected_alpha may be different from alpha)
- a list messages, containing messages for the user
"""

Expand All @@ -168,6 +176,9 @@ def empty_result(messages):
assert grouping in metadata_df.columns
messages = []

# Normalise nan_policy so both 'Omit' and 'omit' are accepted
nan_policy_lower = nan_policy.lower()

if ttest_type not in ["Student's t-Test", "Welch's t-Test"]:
messages.append(
{
Expand Down Expand Up @@ -215,9 +226,32 @@ def empty_result(messages):

protein_df["id"] = protein_df.groupby(["Protein ID", grouping]).cumcount()

protein_df_wide = protein_df.dropna(subset=[intensity_name]).pivot(
index=["Protein ID", "id"], columns=grouping, values=intensity_name
)
# ------------------------------------------------------------------ #
# NaN handling: gate the pivot on the chosen nan_policy #
# ------------------------------------------------------------------ #
if nan_policy_lower == "raise":
if protein_df[intensity_name].isna().any():
raise ValueError(
f"Input data contains NaN values in column '{intensity_name}'. "
"Set nan_policy='omit' to drop them silently or "
"nan_policy='propagate' to exclude affected proteins from results."
)
protein_df_wide = protein_df.pivot(
index=["Protein ID", "id"], columns=grouping, values=intensity_name
)
elif nan_policy_lower == "omit":
# Drop NaN rows before pivoting so that all per-protein statistics
# (including medians used for fold-change) are computed from clean data.
protein_df_wide = protein_df.dropna(subset=[intensity_name]).pivot(
index=["Protein ID", "id"], columns=grouping, values=intensity_name
)
else: # propagate
# Keep NaN values in the wide table. pandas agg skips NaN by default,
# so we explicitly detect affected proteins and null their statistics
# afterwards, causing their p-values to be NaN and be filtered out.
protein_df_wide = protein_df.pivot(
index=["Protein ID", "id"], columns=grouping, values=intensity_name
)

if group1 not in protein_df_wide.columns or group2 not in protein_df_wide.columns:
messages.append(
Expand All @@ -236,6 +270,16 @@ def empty_result(messages):
n="count", mean="mean", var="var", median="median"
)

# For 'propagate': null out statistics for proteins that have any NaN in
# either group so that vectorized_t_test produces NaN p-values for them.
if nan_policy_lower == "propagate":
has_nan_g1 = grouped_dfs[group1].apply(lambda s: s.isna().any())
has_nan_g2 = grouped_dfs[group2].apply(lambda s: s.isna().any())
propagate_mask = has_nan_g1 | has_nan_g2
if propagate_mask.any():
statistics_group1.loc[propagate_mask, :] = np.nan
statistics_group2.loc[propagate_mask, :] = np.nan

valid_mask = (statistics_group1["n"] >= 2) & (statistics_group2["n"] >= 2)
if (~valid_mask).any() and not exists_message(
messages, INVALID_PROTEINGROUP_DATA_MSG
Expand Down
7 changes: 7 additions & 0 deletions backend/protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from backend.protzilla.constants.option_types import (
MultipleTestingCorrectionMethod,
PValueColumnName,
NanPolicy,
)
from backend.protzilla.data_analysis.classification import random_forest, svm
from backend.protzilla.data_analysis.clustering import (
Expand Down Expand Up @@ -407,6 +408,12 @@ def create_form(self):
label="Fold-change Z-score significance",
value=False,
),
DropdownField(
name="nan_policy",
label="NaN policy",
options=NanPolicy,
value=NanPolicy.raise_,
),
FloatField(
name="fc_zscore_alpha",
label="Z-score tail cutoff",
Expand Down
221 changes: 219 additions & 2 deletions backend/tests/protzilla/data_analysis/test_differential_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,16 @@ def test_differential_expression_t_test_with_log_data(show_figures):


def test_differential_expression_t_test_with_silac_ratios():
"""
SILAC ratio data contains NaN values in both groups.
nan_policy='omit' tells scipy to drop NaN before the t-test, yielding a valid p-value.

Group1 ratios: [1.2, NaN, 1.1] → omit NaN for t-test: [1.2, 1.1]
Group2 ratios: [0.8, 0.9, NaN] → omit NaN for t-test: [0.8, 0.9]

NOTE: Fold change uses np.median() (not np.nanmedian()), so NaN in either group
causes the fold change itself to be NaN. Only the p-value is reliable here.
"""
silac_ratio_df = pd.DataFrame(
data=[
["Sample1", "Protein1", "Gene1", 1.2],
Expand Down Expand Up @@ -457,6 +467,7 @@ def test_differential_expression_t_test_with_silac_ratios():
log_base="None",
multiple_testing_correction_method="Benjamini-Hochberg",
alpha=0.05,
nan_policy="omit",
)

assert not out[DataKey.CORRECTED_P_VALUES_DF].empty
Expand All @@ -465,9 +476,214 @@ def test_differential_expression_t_test_with_silac_ratios():
round(out[DataKey.CORRECTED_P_VALUES_DF]["corrected_p_value"].iloc[0], 4)
== 0.0513
)
assert (
round(out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0], 2) == -0.44
# dropna() runs before pivot, so medians are from clean data — fold change is valid.
# Group1 clean=[1.2, 1.1]→median=1.15, Group2 clean=[0.8, 0.9]→median=0.85
fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0]
assert not np.isnan(fc)
assert round(fc, 4) == round(np.log2(0.85 / 1.15), 4)


@pytest.fixture
def nan_intensity_data():
"""Intensity data where one sample per group contains a NaN value."""
protein_df = pd.DataFrame(
data=[
["Sample1", "Protein1", "Gene1", 18.0],
["Sample2", "Protein1", "Gene1", np.nan],
["Sample3", "Protein1", "Gene1", 22.0],
["Sample4", "Protein1", "Gene1", 8.0],
["Sample5", "Protein1", "Gene1", 10.0],
["Sample6", "Protein1", "Gene1", 12.0],
],
columns=["Sample", "Protein ID", "Gene", "Intensity"],
)
metadata_df = pd.DataFrame(
data=[
["Sample1", "Group1"],
["Sample2", "Group1"],
["Sample3", "Group1"],
["Sample4", "Group2"],
["Sample5", "Group2"],
["Sample6", "Group2"],
],
columns=["Sample", "Group"],
)
return protein_df, metadata_df


@pytest.fixture
def nan_intensity_data_insufficient_after_omit():
"""
Intensity data where each group has exactly 2 samples, one of which is NaN.

After nan_policy='omit' drops the NaN, each group is left with only 1 valid
sample — insufficient for a two-sample t-test. scipy returns NaN for both
t-statistic and p-value in this degenerate case.

Group1: [18.0, NaN] → after omit: [18.0] (1 sample, variance undefined)
Group2: [8.0, NaN] → after omit: [8.0] (1 sample, variance undefined)
"""
protein_df = pd.DataFrame(
data=[
["Sample1", "Protein1", "Gene1", 18.0],
["Sample2", "Protein1", "Gene1", np.nan],
["Sample3", "Protein1", "Gene1", 8.0],
["Sample4", "Protein1", "Gene1", np.nan],
],
columns=["Sample", "Protein ID", "Gene", "Intensity"],
)
metadata_df = pd.DataFrame(
data=[
["Sample1", "Group1"],
["Sample2", "Group1"],
["Sample3", "Group2"],
["Sample4", "Group2"],
],
columns=["Sample", "Group"],
)
return protein_df, metadata_df


def test_t_test_nan_policy_omit_raises_nan_when_too_few_samples_remain(
nan_intensity_data_insufficient_after_omit,
):
"""
nan_policy='omit' with only 2 samples per group where 1 is NaN:
after omission only 1 valid sample remains per group, which is insufficient
for a t-test. scipy returns NaN for the p-value, so the protein is excluded
and the function signals the problem via INVALID_PROTEINGROUP_DATA_MSG.

Group1: [18.0, NaN] → after omit: [18.0] → p = NaN → protein excluded
Group2: [8.0, NaN] → after omit: [8.0]
"""
protein_df, metadata_df = nan_intensity_data_insufficient_after_omit

out = t_test(
protein_df=protein_df,
metadata_df=metadata_df,
ttest_type="Welch's t-Test",
grouping="Group",
group1="Group1",
group2="Group2",
log_base="None",
multiple_testing_correction_method="Benjamini-Hochberg",
alpha=0.05,
nan_policy="omit",
)

# No valid p-value could be computed → protein is excluded from all result dataframes.
assert out[DataKey.CORRECTED_P_VALUES_DF].empty
assert out[DataKey.LOG2_FOLD_CHANGE_DF].empty

# The function should report that insufficient data was found.
from backend.protzilla.data_analysis.differential_expression_helper import (
INVALID_PROTEINGROUP_DATA_MSG,
)

assert any(message == INVALID_PROTEINGROUP_DATA_MSG for message in out["messages"])


def test_t_test_nan_policy_omit_skips_nan_samples_and_computes_result(
nan_intensity_data,
):
"""
nan_policy='omit': NaN samples are silently dropped before the t-test runs,
so scipy returns a valid p-value and the protein is kept in the results.

Data layout (nan_intensity_data fixture):
Group1: [18.0, NaN, 22.0] → after omitting NaN: [18.0, 22.0] → valid p-value
Group2: [8.0, 10.0, 12.0] → no NaNs

NOTE: The fold change is computed with np.median() (not np.nanmedian()), so a NaN
in the raw group data causes the fold change to be NaN even under 'omit' policy.
This is a known limitation of the current implementation.
"""
protein_df, metadata_df = nan_intensity_data
common_kwargs = dict(
protein_df=protein_df,
metadata_df=metadata_df,
grouping="Group",
group1="Group1",
group2="Group2",
log_base="None",
multiple_testing_correction_method="Benjamini-Hochberg",
alpha=0.05,
nan_policy="omit",
)

for ttest_type in ["Welch's t-Test", "Student's t-Test"]:
out = t_test(ttest_type=ttest_type, **common_kwargs)
# The protein IS included because scipy found a valid p-value after omitting NaN.
assert not out[
DataKey.CORRECTED_P_VALUES_DF
].empty, f"ttest_type={ttest_type}: Protein1 should be included after NaN samples are omitted"
assert out[DataKey.CORRECTED_P_VALUES_DF]["Protein ID"].tolist() == ["Protein1"]
# dropna() runs before pivot, so medians are from clean data — fold change is valid.
# Group1 clean=[18.0, 22.0]→median=20.0, Group2=[8.0,10.0,12.0]→median=10.0 → log2(10/20)=-1.0
fc = out[DataKey.LOG2_FOLD_CHANGE_DF]["log2_fold_change"].iloc[0]
assert not np.isnan(
fc
), f"ttest_type={ttest_type}: fold change should be valid after NaN rows are omitted"
assert (
round(fc, 1) == -1.0
), f"ttest_type={ttest_type}: expected fold change -1.0, got {fc}"


def test_t_test_nan_policy_propagate_excludes_protein_with_any_nan(nan_intensity_data):
"""
nan_policy='propagate': a NaN anywhere in a group causes the t-test to return NaN
for that protein. The protein is then excluded from the output entirely.

Data layout (nan_intensity_data fixture):
Group1: [18.0, NaN, 22.0] → one NaN → t-test returns p = NaN → Protein1 excluded
Group2: [8.0, 10.0, 12.0] → (no NaNs, but NaN from Group1 already propagated)

Expected: corrected_p_values_df is empty because no protein produced a valid p-value.
"""
protein_df, metadata_df = nan_intensity_data

out = t_test(
protein_df=protein_df,
metadata_df=metadata_df,
ttest_type="Welch's t-Test",
grouping="Group",
group1="Group1",
group2="Group2",
log_base="None",
multiple_testing_correction_method="Benjamini-Hochberg",
alpha=0.05,
nan_policy="propagate",
)

assert out[
DataKey.CORRECTED_P_VALUES_DF
].empty, "Protein1 should be excluded because the NaN in Group1 propagated to the p-value"


def test_t_test_nan_policy_raise_errors_when_nan_is_present(nan_intensity_data):
"""
nan_policy='raise': a ValueError is raised immediately when any NaN value is
detected in the input data. Use this policy to treat NaN as a hard error that
must be resolved before running the analysis.

Data layout (nan_intensity_data fixture):
Group1: [18.0, NaN, 22.0] → NaN is present → ValueError is raised
"""
protein_df, metadata_df = nan_intensity_data

with pytest.raises(ValueError):
t_test(
protein_df=protein_df,
metadata_df=metadata_df,
ttest_type="Welch's t-Test",
grouping="Group",
group1="Group1",
group2="Group2",
log_base="None",
multiple_testing_correction_method="Benjamini-Hochberg",
alpha=0.05,
nan_policy="raise",
)


def test_differential_expression_anova(show_figures):
Expand Down Expand Up @@ -939,6 +1155,7 @@ def test_differential_expression_t_test_empty_p_values():
multiple_testing_correction_method="Benjamini-Hochberg",
alpha=0.05,
log_base="None",
nan_policy="propagate",
)

# Check that all dataframes are empty but with correct columns
Expand Down
Loading
Loading