From 51dff9489d278da95a9a09606ab55d07703dffa2 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Thu, 14 Nov 2024 15:58:03 +0100
Subject: [PATCH 01/13] Add fasta import, primitive peptide - protein sequence
 alignment, preliminary coverage bar plot

---
 protzilla/data_analysis/protein_coverage.py   |  70 ++++++++
 protzilla/importing/fasta_import.py           |  36 ++++
 protzilla/methods/data_analysis.py            | 128 ++++++++++----
 protzilla/methods/importing.py                |  21 ++-
 .../data_analysis/test_protein_coverage.py    |  73 ++++++++
 ui/runs/form_mapping.py                       |   2 +
 ui/runs/forms/data_analysis.py                | 156 +++++++++++++-----
 ui/runs/forms/importing.py                    |   4 +
 8 files changed, 407 insertions(+), 83 deletions(-)
 create mode 100644 protzilla/data_analysis/protein_coverage.py
 create mode 100644 protzilla/importing/fasta_import.py
 create mode 100644 tests/protzilla/data_analysis/test_protein_coverage.py

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
new file mode 100644
index 00000000..afc504ad
--- /dev/null
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -0,0 +1,70 @@
+from typing import List
+
+import pandas as pd
+import plotly.graph_objects as go
+
+
+def align_peptide_to_protein_sequence(
+    peptide_sequence: str, protein_sequence: str
+) -> List[int]:
+    """
+    Aligns a peptide to a protein sequence and returns the indices of the protein sequence where the peptide.
+    NAIVE APPROACH
+    """
+    if (
+        len(peptide_sequence) == 0
+        or len(protein_sequence) == 0
+        or len(peptide_sequence) > len(protein_sequence)
+    ):
+        return []
+    indices = []
+    for i in range(len(protein_sequence) - len(peptide_sequence) + 1):
+        if protein_sequence[i : i + len(peptide_sequence)] == peptide_sequence:
+            indices.append(i)
+    return indices
+
+
+def plot_protein_coverage(
+    fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str
+) -> None:
+    """
+    Plots the coverage of a protein sequence by peptides.
+    """
+    # Retrieve the relevant peptides from the peptide df
+    relevant_peptides = peptide_df[peptide_df["Protein ID"] == protein_id]
+    protein_sequence = fasta_df[fasta_df["Protein ID"] == protein_id][
+        "Protein Sequence"
+    ].values[0]
+    protein_sequence_length = len(protein_sequence)
+    coverage = [0] * protein_sequence_length
+    for peptide_sequence in relevant_peptides["Sequence"].unique():
+        indices = align_peptide_to_protein_sequence(peptide_sequence, protein_sequence)
+        for i in indices:
+            for j in range(len(peptide_sequence)):
+                coverage[i + j] += 1
+
+    # Make sure coverage length matches the sequence length
+    if len(coverage) != protein_sequence_length:
+        raise ValueError("Length of coverage must match protein_sequence_length.")
+
+    # Generate labels for amino acid positions
+    amino_acid_positions = list(range(1, protein_sequence_length + 1))
+    x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in amino_acid_positions]
+
+    # Create the bar chart
+    fig = go.Figure()
+
+    # Add bars for coverage values
+    fig.add_trace(
+        go.Bar(x=x_labels, y=coverage, name="Coverage", marker=dict(color="skyblue"))
+    )
+
+    # Customize layout
+    fig.update_layout(
+        title="Protein Coverage",
+        xaxis_title="Amino Acid",
+        yaxis_title="Coverage Value",
+        xaxis=dict(tickmode="linear"),
+        bargap=0.1,  # Adjust space between bars if needed
+    )
+    return dict(plots=[fig])
diff --git a/protzilla/importing/fasta_import.py b/protzilla/importing/fasta_import.py
new file mode 100644
index 00000000..bbdb9373
--- /dev/null
+++ b/protzilla/importing/fasta_import.py
@@ -0,0 +1,36 @@
+"""
+This module contains the code to parse a fasta file containing protein sequences and their ids.
+"""
+import logging
+
+import pandas as pd
+from Bio import SeqIO
+
+
+def parse_fasta_id(fasta_id: str) -> str:
+    """
+    Parse the fasta id to get the protein name from the fasta id string
+    """
+    metadata = fasta_id.split("|")[1]
+    if len(metadata) < 2:
+        logging.warning(f"Metadata too short: {metadata}")
+        return ""
+    return metadata
+
+
+def fasta_import(file_path: str) -> pd.DataFrame:
+    """
+    Import a fasta file and return a DataFrame with the protein sequences and their protein ids
+    """
+    fasta_iterator = SeqIO.parse(open(file_path), "fasta")
+    protein_ids = []
+    protein_sequences = []
+    for fasta_sequence in fasta_iterator:
+        id, sequence = parse_fasta_id(fasta_sequence.id), str(fasta_sequence.seq)
+        protein_ids.append(id)
+        protein_sequences.append(sequence)
+
+    fasta_sequences = pd.DataFrame(
+        {"Protein ID": protein_ids, "Protein Sequence": protein_sequences}
+    )
+    return {"fasta_df": fasta_sequences}
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 511629f5..ab8bdc27 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -7,15 +7,17 @@
     k_means,
 )
 from protzilla.data_analysis.differential_expression_anova import anova
-from protzilla.data_analysis.differential_expression_kruskal_wallis import kruskal_wallis_test_on_ptm_data, \
-    kruskal_wallis_test_on_intensity_data
+from protzilla.data_analysis.differential_expression_kruskal_wallis import (
+    kruskal_wallis_test_on_intensity_data,
+    kruskal_wallis_test_on_ptm_data,
+)
 from protzilla.data_analysis.differential_expression_linear_model import linear_model
 from protzilla.data_analysis.differential_expression_mann_whitney import (
-    mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data)
+    mann_whitney_test_on_intensity_data,
+    mann_whitney_test_on_ptm_data,
+)
 from protzilla.data_analysis.differential_expression_t_test import t_test
 from protzilla.data_analysis.dimension_reduction import t_sne, umap
-from protzilla.data_analysis.ptm_analysis import ptms_per_sample, \
-    ptms_per_protein_and_sample, select_peptides_of_protein
 from protzilla.data_analysis.model_evaluation import evaluate_classification_model
 from protzilla.data_analysis.plots import (
     clustergram_plot,
@@ -23,11 +25,12 @@
     prot_quant_plot,
     scatter_plot,
 )
+from protzilla.data_analysis.protein_coverage import plot_protein_coverage
 from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
 from protzilla.data_analysis.ptm_analysis import (
-    select_peptides_of_protein,
     ptms_per_protein_and_sample,
     ptms_per_sample,
+    select_peptides_of_protein,
 )
 from protzilla.data_analysis.ptm_quantification import flexiquant_lf
 from protzilla.methods.data_preprocessing import TransformationLog
@@ -164,8 +167,10 @@ def plot(self, inputs):
 class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep):
     display_name = "Mann-Whitney Test"
     operation = "differential_expression"
-    method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "protein_df",
@@ -191,7 +196,9 @@ def method(self, inputs: dict) -> dict:
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         if steps.get_step_output(Step, "protein_df", inputs["protein_df"]) is not None:
-            inputs["protein_df"] = steps.get_step_output(Step, "protein_df", inputs["protein_df"])
+            inputs["protein_df"] = steps.get_step_output(
+                Step, "protein_df", inputs["protein_df"]
+            )
         inputs["metadata_df"] = steps.metadata_df
         inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base")
         return inputs
@@ -200,8 +207,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep):
     display_name = "Mann-Whitney Test"
     operation = "Peptide analysis"
-    method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Mann-Whitney U test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "ptm_df",
@@ -234,8 +243,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep):
     display_name = "Kruskal-Wallis Test"
     operation = "differential_expression"
-    method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "protein_df",
@@ -265,8 +276,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep):
     display_name = "Kruskal-Wallis Test"
     operation = "differential_expression"
-    method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "protein_df",
@@ -288,7 +301,9 @@ def method(self, inputs: dict) -> dict:
         return kruskal_wallis_test_on_intensity_data(**inputs)
 
     def insert_dataframes(self, steps: StepManager, inputs) -> dict:
-        inputs["protein_df"] = steps.get_step_output(Step, "protein_df", inputs["protein_df"])
+        inputs["protein_df"] = steps.get_step_output(
+            Step, "protein_df", inputs["protein_df"]
+        )
         inputs["metadata_df"] = steps.metadata_df
         inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base")
         return inputs
@@ -297,8 +312,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class DifferentialExpressionKruskalWallisOnPTM(DataAnalysisStep):
     display_name = "Kruskal-Wallis Test"
     operation = "Peptide analysis"
-    method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
-                          "The p-values are corrected for multiple testing.")
+    method_description = (
+        "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data."
+        "The p-values are corrected for multiple testing."
+    )
 
     input_keys = [
         "ptm_df",
@@ -327,9 +344,11 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PlotVolcano(PlotStep):
     display_name = "Volcano Plot"
     operation = "plot"
-    method_description = ("Plots the results of a differential expression analysis in a volcano plot. The x-axis shows "
-                          "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user "
-                          "can define a fold change threshold and an alpha level to highlight significant items.")
+    method_description = (
+        "Plots the results of a differential expression analysis in a volcano plot. The x-axis shows "
+        "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user "
+        "can define a fold change threshold and an alpha level to highlight significant items."
+    )
     input_keys = [
         "p_values",
         "fc_threshold",
@@ -762,6 +781,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         return inputs
 
 
+class PlotProteinCoverage(PlotStep):
+    display_name = "Protein Coverage Plot"
+    operation = "plot"
+    method_description = (
+        "Create a protein coverage plot from a protein graph and peptide data"
+    )
+
+    input_keys = [
+        "protein_id",
+        "fasta_df",
+        "peptide_df",
+    ]
+    output_keys = []
+
+    def method(self, inputs: dict) -> dict:
+        return plot_protein_coverage(**inputs)
+
+    def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+        inputs["fasta_df"] = steps.get_step_output(
+            Step, "fasta_df", inputs["fasta_df_instance"]
+        )
+        inputs["peptide_df"] = steps.get_step_output(
+            Step, "peptide_df", inputs["peptide_df_instance"]
+        )
+        return
+
+
 class ProteinGraphVariationGraph(DataAnalysisStep):
     display_name = "Protein Variation Graph"
     operation = "protein_graph"
@@ -841,17 +887,23 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["metadata_df"] = steps.metadata_df
 
         if inputs["auto_select"]:
-            significant_proteins = (
-                steps.get_step_output(DataAnalysisStep, "significant_proteins_df", inputs["protein_list"]))
-            index_of_most_significant_protein = significant_proteins['corrected_p_value'].idxmin()
-            most_significant_protein = significant_proteins.loc[index_of_most_significant_protein]
+            significant_proteins = steps.get_step_output(
+                DataAnalysisStep, "significant_proteins_df", inputs["protein_list"]
+            )
+            index_of_most_significant_protein = significant_proteins[
+                "corrected_p_value"
+            ].idxmin()
+            most_significant_protein = significant_proteins.loc[
+                index_of_most_significant_protein
+            ]
             inputs["protein_id"] = [most_significant_protein["Protein ID"]]
-            self.messages.append({
-                "level": logging.INFO,
-                "msg":
-                    f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, "
-                    f"from {inputs['protein_list']}"
-            })
+            self.messages.append(
+                {
+                    "level": logging.INFO,
+                    "msg": f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, "
+                    f"from {inputs['protein_list']}",
+                }
+            )
 
         return inputs
 
@@ -859,8 +911,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PTMsPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample"
     operation = "Peptide analysis"
-    method_description = ("Analyze the post-translational modifications (PTMs) of a single protein of interest. "
-                          "This function requires a peptide dataframe with PTM information.")
+    method_description = (
+        "Analyze the post-translational modifications (PTMs) of a single protein of interest. "
+        "This function requires a peptide dataframe with PTM information."
+    )
 
     input_keys = [
         "peptide_df",
@@ -882,8 +936,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
 class PTMsProteinAndPerSample(DataAnalysisStep):
     display_name = "PTMs per Sample and Protein"
     operation = "Peptide analysis"
-    method_description = ("Analyze the post-translational modifications (PTMs) of all Proteins. "
-                          "This function requires a peptide dataframe with PTM information.")
+    method_description = (
+        "Analyze the post-translational modifications (PTMs) of all Proteins. "
+        "This function requires a peptide dataframe with PTM information."
+    )
 
     input_keys = [
         "peptide_df",
@@ -899,4 +955,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["peptide_df"] = steps.get_step_output(
             Step, "peptide_df", inputs["peptide_df"]
         )
-        return inputs
\ No newline at end of file
+        return inputs
diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py
index 6a2f6835..effcd5f1 100644
--- a/protzilla/methods/importing.py
+++ b/protzilla/methods/importing.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from protzilla.importing.fasta_import import fasta_import
 from protzilla.importing.metadata_import import (
     metadata_column_assignment,
     metadata_import_method,
@@ -10,7 +11,7 @@
     max_quant_import,
     ms_fragger_import,
 )
-from protzilla.importing.peptide_import import peptide_import, evidence_import
+from protzilla.importing.peptide_import import evidence_import, peptide_import
 from protzilla.steps import Step, StepManager
 
 
@@ -51,7 +52,9 @@ def method(self, inputs):
 class MsFraggerImport(ImportingStep):
     display_name = "MS Fragger Combined Protein Import"
     operation = "Protein Data Import"
-    method_description = "Import the combined_protein.tsv file form output of MS Fragger"
+    method_description = (
+        "Import the combined_protein.tsv file form output of MS Fragger"
+    )
 
     input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"]
     output_keys = ["protein_df"]
@@ -139,4 +142,16 @@ class EvidenceImport(ImportingStep):
     output_keys = ["peptide_df"]
 
     def method(self, inputs):
-        return evidence_import(**inputs)
\ No newline at end of file
+        return evidence_import(**inputs)
+
+
+class FastaImport(ImportingStep):
+    display_name = "Fasta Protein Sequence Import"
+    operation = "fasta_import"
+    method_description = "Import a fasta file containing protein sequences."
+
+    input_keys = ["file_path"]
+    output_keys = ["fasta_df"]
+
+    def method(self, inputs):
+        return fasta_import(**inputs)
diff --git a/tests/protzilla/data_analysis/test_protein_coverage.py b/tests/protzilla/data_analysis/test_protein_coverage.py
new file mode 100644
index 00000000..f2ca2f1e
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_protein_coverage.py
@@ -0,0 +1,73 @@
+from protzilla.data_analysis.protein_coverage import align_peptide_to_protein_sequence
+
+
+def test_aligns_peptide_at_start():
+    peptide = "MKT"
+    protein_sequence = "MKTLLL"
+    expected_indices = [0]
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def test_aligns_peptide_in_middle():
+    peptide = "KTL"
+    protein_sequence = "MKTLLL"
+    expected_indices = [1]
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def test_aligns_peptide_at_end():
+    peptide = "LLL"
+    protein_sequence = "MKTLLL"
+    expected_indices = [3]
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def test_aligns_multiple_occurrences():
+    peptide = "LL"
+    protein_sequence = "MKTLLLL"
+    expected_indices = [3, 4, 5]
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def aligns_no_occurrences():
+    peptide = "XYZ"
+    protein_sequence = "MKTLLL"
+    expected_indices = []
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def test_aligns_empty_peptide():
+    peptide = ""
+    protein_sequence = "MKTLLL"
+    expected_indices = []
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def test_aligns_empty_protein_sequence():
+    peptide = "MKT"
+    protein_sequence = ""
+    expected_indices = []
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
+
+
+def test_aligns_peptide_longer_than_protein_sequence():
+    peptide = "MKTLLL"
+    protein_sequence = "MKT"
+    expected_indices = []
+    assert (
+        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
+    )
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index 7221bba6..4d6fae1c 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -22,6 +22,7 @@
     importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm,
     importing.PeptideImport: importing_forms.PeptideImportForm,
     importing.EvidenceImport: importing_forms.EvidenceImportForm,
+    importing.FastaImport: importing_forms.FastaImportForm,
     data_preprocessing.FilterProteinsBySamplesMissing: data_preprocessing_forms.FilterProteinsBySamplesMissingForm,
     data_preprocessing.FilterByProteinsCount: data_preprocessing_forms.FilterByProteinsCountForm,
     data_preprocessing.FilterSamplesByProteinsMissing: data_preprocessing_forms.FilterSamplesByProteinsMissingForm,
@@ -52,6 +53,7 @@
     data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm,
     data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm,
     data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm,
+    data_analysis.PlotProteinCoverage: data_analysis_forms.PlotProteinCoverageForm,
     data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm,
     data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm,
     data_analysis.ClusteringExpectationMaximisation: data_analysis_forms.ClusteringExpectationMaximizationForm,
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 6e34aa6e..be9ea58f 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1,16 +1,10 @@
-import logging
 from enum import Enum, StrEnum
 
-from protzilla.methods.data_preprocessing import DataPreprocessingStep
 from protzilla.methods.data_analysis import (
     DataAnalysisStep,
-    DifferentialExpressionLinearModel,
-    DifferentialExpressionTTest,
     DimensionReductionUMAP,
-    DataAnalysisStep,
     PTMsPerSample,
     SelectPeptidesForProtein,
-    DifferentialExpressionMannWhitneyOnPTM,
 )
 from protzilla.methods.data_preprocessing import DataPreprocessingStep
 from protzilla.run import Run
@@ -25,7 +19,6 @@
     CustomFloatField,
     CustomMultipleChoiceField,
     CustomNumberField,
-    CustomBooleanField,
 )
 
 
@@ -169,7 +162,11 @@ class DifferentialExpressionANOVAForm(MethodForm):
         initial=MultipleTestingCorrectionMethod.benjamini_hochberg,
     )
     alpha = CustomFloatField(
-        label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.05,
     )
 
     grouping = CustomChoiceField(choices=[], label="Grouping from metadata")
@@ -256,7 +253,11 @@ class DifferentialExpressionLinearModelForm(MethodForm):
         initial=MultipleTestingCorrectionMethod.benjamini_hochberg,
     )
     alpha = CustomFloatField(
-        label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.05,
     )
     grouping = CustomChoiceField(choices=[], label="Grouping from metadata")
     group1 = CustomChoiceField(choices=[], label="Group 1")
@@ -293,25 +294,31 @@ def fill_form(self, run: Run) -> None:
 class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm):
     is_dynamic = True
 
-    protein_df = CustomChoiceField(
-        choices=[], label="Step to use protein data from"
-    )
+    protein_df = CustomChoiceField(choices=[], label="Step to use protein data from")
     multiple_testing_correction_method = CustomChoiceField(
         choices=MultipleTestingCorrectionMethod,
         label="Multiple testing correction",
         initial=MultipleTestingCorrectionMethod.benjamini_hochberg,
     )
     alpha = CustomFloatField(
-        label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.05,
     )
     grouping = CustomChoiceField(choices=[], label="Grouping from metadata")
     group1 = CustomChoiceField(choices=[], label="Group 1")
     group2 = CustomChoiceField(choices=[], label="Group 2")
 
     def fill_form(self, run: Run) -> None:
-        self.fields["protein_df"].choices = fill_helper.get_choices_for_protein_df_steps(run)
+        self.fields[
+            "protein_df"
+        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
 
-        self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields[
+            "grouping"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -346,7 +353,11 @@ class DifferentialExpressionMannWhitneyOnPTMForm(MethodForm):
         initial=MultipleTestingCorrectionMethod.benjamini_hochberg,
     )
     alpha = CustomFloatField(
-        label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.05,
     )
     p_value_calculation_method = CustomChoiceField(
         choices=PValueCalculationMethod,
@@ -362,7 +373,9 @@ def fill_form(self, run: Run) -> None:
             run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df")
         )
 
-        self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields[
+            "grouping"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -390,16 +403,18 @@ def fill_form(self, run: Run) -> None:
 class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm):
     is_dynamic = True
 
-    protein_df = CustomChoiceField(
-        choices=[], label="Step to use protein data from"
-    )
+    protein_df = CustomChoiceField(choices=[], label="Step to use protein data from")
     multiple_testing_correction_method = CustomChoiceField(
         choices=MultipleTestingCorrectionMethod,
         label="Multiple testing correction",
         initial=MultipleTestingCorrectionMethod.benjamini_hochberg,
     )
     alpha = CustomFloatField(
-        label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.05,
     )
 
     grouping = CustomChoiceField(choices=[], label="Grouping from metadata")
@@ -408,9 +423,13 @@ class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields["protein_df"].choices = fill_helper.get_choices_for_protein_df_steps(run)
+        self.fields[
+            "protein_df"
+        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
 
-        self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields[
+            "grouping"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
         self.fields["selected_groups"].choices = fill_helper.to_choices(
             run.steps.metadata_df[grouping].unique()
@@ -420,16 +439,18 @@ def fill_form(self, run: Run) -> None:
 class DifferentialExpressionKruskalWallisOnPTMForm(MethodForm):
     is_dynamic = True
 
-    ptm_df = CustomChoiceField(
-        choices=[], label="Step to use ptm data from"
-    )
+    ptm_df = CustomChoiceField(choices=[], label="Step to use ptm data from")
     multiple_testing_correction_method = CustomChoiceField(
         choices=MultipleTestingCorrectionMethod,
         label="Multiple testing correction",
         initial=MultipleTestingCorrectionMethod.benjamini_hochberg,
     )
     alpha = CustomFloatField(
-        label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05
+        label="Error rate (alpha)",
+        min_value=0,
+        max_value=1,
+        step_size=0.01,
+        initial=0.05,
     )
 
     grouping = CustomChoiceField(choices=[], label="Grouping from metadata")
@@ -441,7 +462,9 @@ def fill_form(self, run: Run) -> None:
         self.fields["ptm_df"].choices = fill_helper.to_choices(
             run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df")
         )
-        self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields[
+            "grouping"
+        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
         self.fields["selected_groups"].choices = fill_helper.to_choices(
             run.steps.metadata_df[grouping].unique()
@@ -466,7 +489,8 @@ class PlotVolcanoForm(MethodForm):
     def fill_form(self, run: Run) -> None:
         self.fields["input_dict"].choices = fill_helper.to_choices(
             run.steps.get_instance_identifiers(
-                Step, ["corrected_p_values_df", "log2_fold_change_df"],
+                Step,
+                ["corrected_p_values_df", "log2_fold_change_df"],
             )
         )
 
@@ -486,7 +510,9 @@ def fill_form(self, run: Run) -> None:
         if step_output is not None:
             items_of_interest = step_output["PTM"].unique()
 
-        self.fields["items_of_interest"].choices = fill_helper.to_choices(items_of_interest)
+        self.fields["items_of_interest"].choices = fill_helper.to_choices(
+            items_of_interest
+        )
 
 
 class PlotScatterPlotForm(MethodForm):
@@ -804,7 +830,7 @@ class ClassificationRandomForestForm(MethodForm):
     # TODO: Workflow_meta line 1763
     train_val_split = CustomNumberField(
         label="Choose the size of the validation data set (you can either enter the absolute number of validation "
-              "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
+        "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
         initial=0.20,
     )
     # TODO: Workflow_meta line 1770
@@ -892,7 +918,7 @@ class ClassificationSVMForm(MethodForm):
     )
     train_val_split = CustomNumberField(
         label="Choose the size of the validation data set (you can either enter the absolute number of validation "
-              "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
+        "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)",
         initial=0.20,
     )
     # TODO: Workflow_meta line 1973
@@ -1009,7 +1035,7 @@ class DimensionReductionUMAPForm(MethodForm):
     )
     n_neighbors = CustomNumberField(
         label="The size of local neighborhood (in terms of number of neighboring sample points) used for manifold "
-              "approximation",
+        "approximation",
         min_value=2,
         max_value=100,
         step_size=1,
@@ -1050,7 +1076,7 @@ class ProteinGraphPeptidesToIsoformForm(MethodForm):
     k = CustomNumberField(label="k-mer length", min_value=1, step_size=1, initial=5)
     allowed_mismatches = CustomNumberField(
         label="Number of allowed mismatched amino acids per peptide. For many allowed mismatches, this can take a "
-              "long time.",
+        "long time.",
         min_value=0,
         step_size=1,
         initial=2,
@@ -1064,6 +1090,44 @@ class ProteinGraphVariationGraphForm(MethodForm):
     # TODO: workflow_meta line 2291 - 2295
 
 
+class PlotProteinCoverageForm(MethodForm):
+    peptide_df_instance = CustomChoiceField(
+        choices=[],
+        label="Step to use peptide data from",
+    )
+    fasta_df_instance = CustomChoiceField(
+        choices=[],
+        label="Step to use fasta protein data from",
+    )
+    protein_id = CustomChoiceField(
+        choices=[],
+        label="Protein ID",
+    )
+
+    def fill_form(self, run: Run) -> None:
+        self.fields["peptide_df_instance"].choices = fill_helper.get_choices(
+            run, "peptide_df"
+        )
+        self.fields["fasta_df_instance"].choices = fill_helper.get_choices(
+            run, "fasta_df", Step
+        )
+        peptide_df_instance_id = self.data.get(
+            "peptide_df_instance", self.fields["peptide_df_instance"].choices[0][0]
+        )
+        fasta_df_instance_id = self.data.get(
+            "fasta_df_instance", self.fields["fasta_df_instance"].choices[0][0]
+        )
+        peptide_df = run.steps.get_step_output(
+            Step, "peptide_df", peptide_df_instance_id
+        )
+        fasta_protein_ids = run.steps.get_step_output(
+            Step, "fasta_df", fasta_df_instance_id
+        )["Protein ID"].unique()
+        peptide_df_protein_ids = peptide_df["Protein ID"].unique()
+        common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids))
+        self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids)
+
+
 class FLEXIQuantLFForm(MethodForm):
     is_dynamic = True
 
@@ -1143,13 +1207,17 @@ def fill_form(self, run: Run) -> None:
 
         selected_auto_select = self.data.get("auto_select")
 
-        choices = fill_helper.to_choices([] if selected_auto_select else ["all proteins"])
-        choices.extend(fill_helper.get_choices(
-            run, "significant_proteins_df", DataAnalysisStep
-        ))
+        choices = fill_helper.to_choices(
+            [] if selected_auto_select else ["all proteins"]
+        )
+        choices.extend(
+            fill_helper.get_choices(run, "significant_proteins_df", DataAnalysisStep)
+        )
         self.fields["protein_list"].choices = choices
 
-        chosen_list = self.data.get("protein_list", self.fields["protein_list"].choices[0][0])
+        chosen_list = self.data.get(
+            "protein_list", self.fields["protein_list"].choices[0][0]
+        )
         if not selected_auto_select:
             self.toggle_visibility("sort_proteins", True)
             self.toggle_visibility("protein_ids", True)
@@ -1163,7 +1231,9 @@ def fill_form(self, run: Run) -> None:
                     self.fields["protein_ids"].choices = fill_helper.to_choices(
                         run.steps.get_step_output(
                             DataAnalysisStep, "significant_proteins_df", chosen_list
-                        ).sort_values(by="corrected_p_value")["Protein ID"].unique()
+                        )
+                        .sort_values(by="corrected_p_value")["Protein ID"]
+                        .unique()
                     )
                 else:
                     self.fields["protein_ids"].choices = fill_helper.to_choices(
@@ -1183,9 +1253,7 @@ class PTMsPerSampleForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields["peptide_df"].choices = fill_helper.get_choices(
-            run, "peptide_df"
-        )
+        self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df")
 
         single_protein_peptides = run.steps.get_instance_identifiers(
             SelectPeptidesForProtein, "peptide_df"
@@ -1207,4 +1275,4 @@ def fill_form(self, run: Run) -> None:
             SelectPeptidesForProtein, "peptide_df"
         )
         if single_protein_peptides:
-            self.fields["peptide_df"].initial = single_protein_peptides[0]
\ No newline at end of file
+            self.fields["peptide_df"].initial = single_protein_peptides[0]
diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py
index 8975e3ba..600ea517 100644
--- a/ui/runs/forms/importing.py
+++ b/ui/runs/forms/importing.py
@@ -192,3 +192,7 @@ def fill_form(self, run: Run) -> None:
         self.fields["map_to_uniprot"].initial = run.steps.get_step_input(
             [MaxQuantImport, MsFraggerImport, DiannImport], "map_to_uniprot"
         )
+
+
+class FastaImportForm(MethodForm):
+    file_path = CustomFileField(label="Fasta file")

From f66d4ed30480b1daaea72085009d981e2b28ba63 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Thu, 21 Nov 2024 13:43:50 +0100
Subject: [PATCH 02/13] Add new pickle disk operator interface, add k-mer
 dictionary generation and storing, annihilate protein coverage tests

---
 protzilla/data_analysis/protein_coverage.py   | 108 ++++++++++++------
 protzilla/disk_operator.py                    |  20 ++++
 .../data_analysis/test_protein_coverage.py    |  73 ------------
 3 files changed, 92 insertions(+), 109 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index afc504ad..dce9c996 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -1,27 +1,67 @@
-from typing import List
-
+from tqdm import tqdm
+from protzilla.constants.paths import EXTERNAL_DATA_PATH
+from protzilla.disk_operator import PickleOperator
 import pandas as pd
 import plotly.graph_objects as go
+def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
+    """
+    Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size
+    k along the protein sequences. The dictionary maps a the k-mer to a list of tuples containing the protein ID and the index of
+    the k-mer in the protein sequence. The dictionary is saved to disk for future use.
+    """
+    # if the dictionary already exists, load it from disk
+    kmer_dict_path = EXTERNAL_DATA_PATH / f"kmer_dict_{k}.pkl"
+    if kmer_dict_path.exists():
+        kmer_dict = PickleOperator.read(kmer_dict_path)
+        return kmer_dict
+    kmer_dict = {}
+    for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"):
+        for i in range(len(protein_sequence) - k + 1):
+            kmer = protein_sequence[i : i + k]
+            if kmer in kmer_dict:
+                kmer_dict[kmer].append((protein_id, i))
+            else:
+                kmer_dict[kmer] = [(protein_id, i)]
+    PickleOperator.write(kmer_dict_path, kmer_dict)
+    return kmer_dict
 
 
-def align_peptide_to_protein_sequence(
-    peptide_sequence: str, protein_sequence: str
-) -> List[int]:
+def match_peptide_to_protein_ids(
+    peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str]
+) -> list[tuple[str, int]]:
     """
-    Aligns a peptide to a protein sequence and returns the indices of the protein sequence where the peptide.
-    NAIVE APPROACH
+    Matches a peptide sequence to a dictionary of kmers in protein sequences.
+    Returns a list of protein ids that could be matches
     """
-    if (
-        len(peptide_sequence) == 0
-        or len(protein_sequence) == 0
-        or len(peptide_sequence) > len(protein_sequence)
-    ):
-        return []
-    indices = []
-    for i in range(len(protein_sequence) - len(peptide_sequence) + 1):
-        if protein_sequence[i : i + len(peptide_sequence)] == peptide_sequence:
-            indices.append(i)
-    return indices
+    # convert the peptide sequence to a list of kmers
+    k = 5
+    kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)]
+    # for now, we only do exact matches, so only the first and last kmer of the peptide are needed
+    first_kmer = kmers[0]
+    last_kmer = kmers[-1]
+    # match the kmers
+    first_kmer_matches = protein_kmer_dictionary.get(first_kmer, [])
+    last_kmer_matches = protein_kmer_dictionary.get(last_kmer, [])
+    # determine the protein ids that are common to both lists, check if the peptide is in the protein sequence at that
+    # location and return the protein ids and the start location
+    hits = []
+    for protein_id, start_location in first_kmer_matches:
+        expected_end_location = start_location + len(peptide_sequence) - k
+        for _, end_location in last_kmer_matches:
+            if protein_id != _:
+                continue
+            if end_location != expected_end_location:
+                continue
+            protein_sequence = protein_dictionary[protein_id]
+            subsequence = protein_sequence[start_location:end_location + k]
+            if subsequence != peptide_sequence:
+                continue
+            assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
+            assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
+            assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}"
+            hits.append((protein_id, start_location))
+    return hits
+
 
 
 def plot_protein_coverage(
@@ -30,28 +70,24 @@ def plot_protein_coverage(
     """
     Plots the coverage of a protein sequence by peptides.
     """
-    # Retrieve the relevant peptides from the peptide df
-    relevant_peptides = peptide_df[peptide_df["Protein ID"] == protein_id]
-    protein_sequence = fasta_df[fasta_df["Protein ID"] == protein_id][
-        "Protein Sequence"
-    ].values[0]
+    # generate the protein kmer dictionary
+    protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
+    kmer_dict = build_kmer_dictionary(protein_dict, k=5)
+
+    protein_sequence = protein_dict[protein_id]
     protein_sequence_length = len(protein_sequence)
-    coverage = [0] * protein_sequence_length
-    for peptide_sequence in relevant_peptides["Sequence"].unique():
-        indices = align_peptide_to_protein_sequence(peptide_sequence, protein_sequence)
-        for i in indices:
-            for j in range(len(peptide_sequence)):
-                coverage[i + j] += 1
+    coverage = []
 
-    # Make sure coverage length matches the sequence length
-    if len(coverage) != protein_sequence_length:
-        raise ValueError("Length of coverage must match protein_sequence_length.")
+    for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"):
+        hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
+        for protein, start_location in hits:
+            # we are only interested in the provided protein id, everything else is ignored
+            if protein != protein_id:
+                continue
+            # add the peptide sequence and location to the coverage list
+            coverage.append((peptide_sequence, start_location))
 
-    # Generate labels for amino acid positions
-    amino_acid_positions = list(range(1, protein_sequence_length + 1))
     x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in amino_acid_positions]
-
-    # Create the bar chart
     fig = go.Figure()
 
     # Add bars for coverage values
diff --git a/protzilla/disk_operator.py b/protzilla/disk_operator.py
index ce7f07ad..dfee538c 100644
--- a/protzilla/disk_operator.py
+++ b/protzilla/disk_operator.py
@@ -6,6 +6,7 @@
 
 import pandas as pd
 import yaml
+import pickle
 from plotly.io import read_json, write_json
 
 import protzilla.utilities as utilities
@@ -36,6 +37,25 @@ def __exit__(self, exc_type, exc_val, exc_tb):
             return False
         return True
 
+class PickleOperator:
+    @staticmethod
+    def read(file_path: Path):
+        with ErrorHandler():
+            with open(file_path, "rb") as file:
+                logger.info(f"Reading pickle from {file_path}")
+                return pickle.load(file)
+
+    @staticmethod
+    def write(file_path: Path, data):
+        with ErrorHandler():
+            if not file_path.exists():
+                if not file_path.parent.exists():
+                    logger.info(
+                        f"Parent directory {file_path.parent} did not exist and was created"
+                    )
+                    file_path.parent.mkdir(parents=True)
+            with open(file_path, "wb") as file:
+                pickle.dump(data, file)
 
 class YamlOperator:
     @staticmethod
diff --git a/tests/protzilla/data_analysis/test_protein_coverage.py b/tests/protzilla/data_analysis/test_protein_coverage.py
index f2ca2f1e..e69de29b 100644
--- a/tests/protzilla/data_analysis/test_protein_coverage.py
+++ b/tests/protzilla/data_analysis/test_protein_coverage.py
@@ -1,73 +0,0 @@
-from protzilla.data_analysis.protein_coverage import align_peptide_to_protein_sequence
-
-
-def test_aligns_peptide_at_start():
-    peptide = "MKT"
-    protein_sequence = "MKTLLL"
-    expected_indices = [0]
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def test_aligns_peptide_in_middle():
-    peptide = "KTL"
-    protein_sequence = "MKTLLL"
-    expected_indices = [1]
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def test_aligns_peptide_at_end():
-    peptide = "LLL"
-    protein_sequence = "MKTLLL"
-    expected_indices = [3]
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def test_aligns_multiple_occurrences():
-    peptide = "LL"
-    protein_sequence = "MKTLLLL"
-    expected_indices = [3, 4, 5]
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def aligns_no_occurrences():
-    peptide = "XYZ"
-    protein_sequence = "MKTLLL"
-    expected_indices = []
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def test_aligns_empty_peptide():
-    peptide = ""
-    protein_sequence = "MKTLLL"
-    expected_indices = []
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def test_aligns_empty_protein_sequence():
-    peptide = "MKT"
-    protein_sequence = ""
-    expected_indices = []
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )
-
-
-def test_aligns_peptide_longer_than_protein_sequence():
-    peptide = "MKTLLL"
-    protein_sequence = "MKT"
-    expected_indices = []
-    assert (
-        align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices
-    )

From 4e981c2790e7069d570015e73b4d8a393ee64107 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Tue, 26 Nov 2024 12:27:56 +0100
Subject: [PATCH 03/13] Add peptide sequences as hoverable rectangles in the
 protein coverage plot, space optimization

---
 protzilla/data_analysis/protein_coverage.py | 95 +++++++++++++++------
 1 file changed, 68 insertions(+), 27 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index dce9c996..35bc0516 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -1,3 +1,4 @@
+from docutils.nodes import title
 from tqdm import tqdm
 from protzilla.constants.paths import EXTERNAL_DATA_PATH
 from protzilla.disk_operator import PickleOperator
@@ -28,10 +29,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
 
 def match_peptide_to_protein_ids(
     peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str]
-) -> list[tuple[str, int]]:
+) -> list[tuple[str, int, int]]:
     """
     Matches a peptide sequence to a dictionary of kmers in protein sequences.
-    Returns a list of protein ids that could be matches
+    Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence.
     """
     # convert the peptide sequence to a list of kmers
     k = 5
@@ -45,21 +46,22 @@ def match_peptide_to_protein_ids(
     # determine the protein ids that are common to both lists, check if the peptide is in the protein sequence at that
     # location and return the protein ids and the start location
     hits = []
-    for protein_id, start_location in first_kmer_matches:
-        expected_end_location = start_location + len(peptide_sequence) - k
-        for _, end_location in last_kmer_matches:
+    for protein_id, start_first_kmer in first_kmer_matches:
+        expected_start_last_kmer = start_first_kmer + len(peptide_sequence) - k
+        for _, start_last_kmer in last_kmer_matches:
             if protein_id != _:
                 continue
-            if end_location != expected_end_location:
+            if start_last_kmer != expected_start_last_kmer:
                 continue
             protein_sequence = protein_dictionary[protein_id]
-            subsequence = protein_sequence[start_location:end_location + k]
+            end_last_kmer = start_first_kmer + len(peptide_sequence)
+            subsequence = protein_sequence[start_first_kmer:end_last_kmer]
             if subsequence != peptide_sequence:
                 continue
             assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
             assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
             assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}"
-            hits.append((protein_id, start_location))
+            hits.append((protein_id, start_first_kmer, end_last_kmer))
     return hits
 
 
@@ -76,31 +78,70 @@ def plot_protein_coverage(
 
     protein_sequence = protein_dict[protein_id]
     protein_sequence_length = len(protein_sequence)
-    coverage = []
+    peptide_matches = []
 
     for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"):
-        hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
-        for protein, start_location in hits:
-            # we are only interested in the provided protein id, everything else is ignored
-            if protein != protein_id:
+        protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
+        for prot_id, start_location_on_protein, end_location_on_protein in protein_hits:
+            # we are only interested in the provided protein id, everything else is discarded
+            if prot_id != protein_id:
                 continue
-            # add the peptide sequence and location to the coverage list
-            coverage.append((peptide_sequence, start_location))
+            # add the peptide sequence and location to the peptide matches pertaining to the protein id
+            peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein))
 
-    x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in amino_acid_positions]
-    fig = go.Figure()
+    # now that all the matches have been determined with their start and end on the protein sequence, we need to find
+    # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
+    # required number of rows (vertical space)
+    rows_of_peptide_matches = distribute_to_rows(peptide_matches)
 
-    # Add bars for coverage values
-    fig.add_trace(
-        go.Bar(x=x_labels, y=coverage, name="Coverage", marker=dict(color="skyblue"))
-    )
+    x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)]
+    fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length,
+                                name="Protein Sequence", marker=dict(color="lightgray")), )
+    for row_index, row in enumerate(rows_of_peptide_matches):
+        for start_location_on_protein, end_location_on_protein, peptide_sequence in row:
+            print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}")
+            # add a rectangle for each peptide
+            fig.add_shape(
+                type="rect",
+                x0=start_location_on_protein,
+                y0=row_index+1 + 0.1,
+                x1=end_location_on_protein,
+                y1=row_index+ 2 - 0.1,
+                fillcolor="blue",
+                opacity=0.5,
+                layer="above",
+            )
+            # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
+            fig.add_trace(go.Scatter(
+                x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]],
+                y=[row_index + 1.5],
+                text=[f"{peptide_sequence}<br>({start_location_on_protein}-{end_location_on_protein})"],
+                mode="markers",
+                marker=dict(opacity=0, size=20),  # make marker invisible
+                hoverinfo="text",
+                hovertemplate="%{text}<extra></extra>",
+                showlegend=False,
+            ))
 
-    # Customize layout
     fig.update_layout(
-        title="Protein Coverage",
-        xaxis_title="Amino Acid",
-        yaxis_title="Coverage Value",
-        xaxis=dict(tickmode="linear"),
-        bargap=0.1,  # Adjust space between bars if needed
+        title=f"Protein Coverage of {protein_id}",
+        xaxis_title="Protein Sequence",
+        yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""),
+        showlegend=False,
     )
+
     return dict(plots=[fig])
+
+
+def distribute_to_rows(coverage):
+    coverage.sort(key=lambda x: x[1])
+    rows = []
+    for peptide_sequence, start_location, end_location in coverage:
+        # find the first row that does not overlap with the current peptide
+        for row in rows:
+            if row[-1][1] <= start_location:
+                row.append((start_location, end_location, peptide_sequence))
+                break
+        else:
+            rows.append([(start_location, end_location, peptide_sequence)])
+    return rows

From 1006c0caafbb462d0e6b435ecbb255366f609058 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Wed, 27 Nov 2024 09:17:52 +0100
Subject: [PATCH 04/13] Add sample selection, coloring based on intensities to
 the protein coverage plot

---
 protzilla/data_analysis/protein_coverage.py | 191 +++++++++++++++-----
 protzilla/methods/data_analysis.py          |   1 +
 ui/runs/forms/data_analysis.py              |  10 +-
 3 files changed, 154 insertions(+), 48 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 35bc0516..6e4fb04c 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -1,9 +1,24 @@
+from colour import color_scale
 from docutils.nodes import title
 from tqdm import tqdm
 from protzilla.constants.paths import EXTERNAL_DATA_PATH
 from protzilla.disk_operator import PickleOperator
+from dataclasses import dataclass
 import pandas as pd
+from numpy import log2
 import plotly.graph_objects as go
+
+@dataclass(unsafe_hash=True)
+class PeptideMatch:
+
+    peptide_sequence: str
+    start_location_on_protein: int
+    end_location_on_protein: int
+    intensity: float = 0.0
+    sample: str = ""
+
+
+
 def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
     """
     Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size
@@ -16,9 +31,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
         kmer_dict = PickleOperator.read(kmer_dict_path)
         return kmer_dict
     kmer_dict = {}
-    for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"):
+    for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary",
+                                             unit_scale=True, unit="protein"):
         for i in range(len(protein_sequence) - k + 1):
-            kmer = protein_sequence[i : i + k]
+            kmer = protein_sequence[i: i + k]
             if kmer in kmer_dict:
                 kmer_dict[kmer].append((protein_id, i))
             else:
@@ -28,7 +44,8 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
 
 
 def match_peptide_to_protein_ids(
-    peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str]
+        peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]],
+        protein_dictionary: dict[str, str]
 ) -> list[tuple[str, int, int]]:
     """
     Matches a peptide sequence to a dictionary of kmers in protein sequences.
@@ -36,7 +53,7 @@ def match_peptide_to_protein_ids(
     """
     # convert the peptide sequence to a list of kmers
     k = 5
-    kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)]
+    kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)]
     # for now, we only do exact matches, so only the first and last kmer of the peptide are needed
     first_kmer = kmers[0]
     last_kmer = kmers[-1]
@@ -58,90 +75,170 @@ def match_peptide_to_protein_ids(
             subsequence = protein_sequence[start_first_kmer:end_last_kmer]
             if subsequence != peptide_sequence:
                 continue
-            assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
+            assert len(subsequence) == len(
+                peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
             assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
             assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}"
             hits.append((protein_id, start_first_kmer, end_last_kmer))
     return hits
 
 
-
 def plot_protein_coverage(
-    fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str
+        fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None
 ) -> None:
     """
     Plots the coverage of a protein sequence by peptides.
     """
     # generate the protein kmer dictionary
+    if samples is None:
+        samples = []
+    reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0]
+
     protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
     kmer_dict = build_kmer_dictionary(protein_dict, k=5)
 
     protein_sequence = protein_dict[protein_id]
     protein_sequence_length = len(protein_sequence)
     peptide_matches = []
+    coverage = [0] * protein_sequence_length
 
-    for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"):
-        protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
+    for sample, peptide_sequence, intensity in tqdm(
+            reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False),
+            desc="Matching peptides to protein", unit_scale=True,
+            unit="peptide"):
+        protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence,
+                                                    protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
         for prot_id, start_location_on_protein, end_location_on_protein in protein_hits:
             # we are only interested in the provided protein id, everything else is discarded
             if prot_id != protein_id:
                 continue
             # add the peptide sequence and location to the peptide matches pertaining to the protein id
-            peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein))
+            peptide_matches.append(
+                PeptideMatch(peptide_sequence=peptide_sequence,
+                             start_location_on_protein=start_location_on_protein,
+                             end_location_on_protein=end_location_on_protein,
+                             intensity=log2(intensity),
+                             sample=sample)
+            )
+            # update the coverage of the protein sequence
+            coverage[start_location_on_protein:end_location_on_protein] = \
+                [coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]]
 
+    if len(peptide_matches) == 0:
+        raise ValueError(f"No peptides matched for protein {protein_id}")
     # now that all the matches have been determined with their start and end on the protein sequence, we need to find
     # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
     # required number of rows (vertical space)
-    rows_of_peptide_matches = distribute_to_rows(peptide_matches)
-
-    x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)]
-    fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length,
-                                name="Protein Sequence", marker=dict(color="lightgray")), )
-    for row_index, row in enumerate(rows_of_peptide_matches):
-        for start_location_on_protein, end_location_on_protein, peptide_sequence in row:
-            print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}")
-            # add a rectangle for each peptide
+    rows = distribute_to_rows(peptide_matches)
+
+    x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)]
+    max_coverage_in_sequence = max(coverage)
+    max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches])
+    normalized_coverage = [value / max_coverage_in_sequence for value in coverage]
+    hover_text = [f"Position: {i}<br>Amino acid: {amino_acid}<br>Coverage: {coverage}"
+                  for i, (coverage, amino_acid) in
+                  enumerate(zip(normalized_coverage, protein_sequence))]
+
+    fig = go.Figure(data=go.Bar(x=x_labels,
+                                y=normalized_coverage,
+                                marker=dict(
+                                    color=normalized_coverage,
+                                    colorscale='Bluered',
+                                    colorbar=dict(title="Coverage<br>(normalized)"),
+                                    line=dict(width=0)
+                                ),
+                                hovertext=hover_text,
+                                hoverinfo="text",
+                                showlegend=False,
+                                name="Coverage (normalized to max in Protein Sequence)"))
+
+    # Keep track of the current row position across all samples
+    current_row = 1
+
+    # Iterate through samples and their rows in order
+    for sample_idx, (sample, sample_rows) in enumerate(rows.items()):
+        # Add a sample label
+        fig.add_annotation(
+            x=-0.1,  # slightly to the left of the plot
+            y=current_row + len(sample_rows) / 2,
+            text=sample,
+            showarrow=False,
+            textangle=-90,
+            xref='paper',
+            yref='y'
+        )
+
+        for row_index, row in enumerate(sample_rows):
+            for peptide_match in row:
+                # add a rectangle for each peptide
+                x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
+                y0, y1 = current_row, current_row + 1
+                normalized_intensity = (peptide_match.intensity) / max_intensity
+                fig.add_shape(
+                    type="rect",
+                    # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars
+                    x0=x0 - 0.5, y0=y0,
+                    x1=x1 - 0.5, y1=y1,
+                    fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)",
+                    opacity=0.5,
+                    layer="above",
+                )
+                # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
+                fig.add_trace(go.Scatter(
+                    x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
+                    y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
+                    text=[f"Sample: {sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
+                          f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
+                          f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
+                    mode="markers",
+                    marker=dict(opacity=0, size=30),  # make marker invisible
+                    hoverinfo="text",
+                    hovertemplate="%{text}<extra></extra>",
+                    showlegend=False,
+                ))
+
+            # Move to next row
+            current_row += 1
+
+        # Add a horizontal line separator between samples (except after the last sample)
+        if sample_idx < len(rows) - 1:
             fig.add_shape(
-                type="rect",
-                x0=start_location_on_protein,
-                y0=row_index+1 + 0.1,
-                x1=end_location_on_protein,
-                y1=row_index+ 2 - 0.1,
-                fillcolor="blue",
-                opacity=0.5,
-                layer="above",
+                type="line",
+                x0=-0.5,
+                y0=current_row,
+                x1=len(protein_sequence) - 0.5,
+                y1=current_row,
+                line=dict(
+                    color="Gray",
+                    width=2,
+                    dash="solid",
+                )
             )
-            # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
-            fig.add_trace(go.Scatter(
-                x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]],
-                y=[row_index + 1.5],
-                text=[f"{peptide_sequence}<br>({start_location_on_protein}-{end_location_on_protein})"],
-                mode="markers",
-                marker=dict(opacity=0, size=20),  # make marker invisible
-                hoverinfo="text",
-                hovertemplate="%{text}<extra></extra>",
-                showlegend=False,
-            ))
 
     fig.update_layout(
         title=f"Protein Coverage of {protein_id}",
         xaxis_title="Protein Sequence",
-        yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""),
+        yaxis=dict(visible=False, range=[0, 1 + current_row], title=""),
         showlegend=False,
+        bargap=0
     )
 
     return dict(plots=[fig])
 
 
-def distribute_to_rows(coverage):
-    coverage.sort(key=lambda x: x[1])
-    rows = []
-    for peptide_sequence, start_location, end_location in coverage:
+def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]:
+    """
+    Distributes peptide matches to rows such that no two peptides overlap in the same row.
+    Greedy algorithm (see Interval Scheduling Problem).
+    """
+    peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein)
+    rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])}
+    for peptide_match in peptide_matches:
         # find the first row that does not overlap with the current peptide
-        for row in rows:
-            if row[-1][1] <= start_location:
-                row.append((start_location, end_location, peptide_sequence))
+        for row in rows[peptide_match.sample]:
+            if row[-1].end_location_on_protein < peptide_match.start_location_on_protein:
+                row.append((peptide_match))
                 break
         else:
-            rows.append([(start_location, end_location, peptide_sequence)])
+            rows[peptide_match.sample].append([(peptide_match)])
     return rows
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index ab8bdc27..765b3047 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -792,6 +792,7 @@ class PlotProteinCoverage(PlotStep):
         "protein_id",
         "fasta_df",
         "peptide_df",
+        "samples"
     ]
     output_keys = []
 
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index be9ea58f..38c8668e 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -1103,6 +1103,11 @@ class PlotProteinCoverageForm(MethodForm):
         choices=[],
         label="Protein ID",
     )
+    samples = CustomMultipleChoiceField(
+        choices=[],
+        label="Samples",
+    )
+
 
     def fill_form(self, run: Run) -> None:
         self.fields["peptide_df_instance"].choices = fill_helper.get_choices(
@@ -1124,9 +1129,12 @@ def fill_form(self, run: Run) -> None:
             Step, "fasta_df", fasta_df_instance_id
         )["Protein ID"].unique()
         peptide_df_protein_ids = peptide_df["Protein ID"].unique()
-        common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids))
+        common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids)))
         self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids)
 
+        peptide_df_samples = peptide_df["Sample"].unique()
+        self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples)
+
 
 class FLEXIQuantLFForm(MethodForm):
     is_dynamic = True

From cb227f4622bbe87d817aa64b2d600a0d5466c39d Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Thu, 28 Nov 2024 15:32:47 +0100
Subject: [PATCH 05/13] Refactoring and cleanup

---
 protzilla/data_analysis/protein_coverage.py | 79 +++++++++++----------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 6e4fb04c..6ffe135c 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -18,6 +18,12 @@ class PeptideMatch:
     sample: str = ""
 
 
+@dataclass(unsafe_hash=True)
+class ProteinHit:
+    protein_id: str
+    start_location_on_protein: int
+    end_location_on_protein: int
+
 
 def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
     """
@@ -46,40 +52,34 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
 def match_peptide_to_protein_ids(
         peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]],
         protein_dictionary: dict[str, str]
-) -> list[tuple[str, int, int]]:
+) -> list[ProteinHit]:
     """
     Matches a peptide sequence to a dictionary of kmers in protein sequences.
     Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence.
     """
-    # convert the peptide sequence to a list of kmers
     k = 5
-    kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)]
-    # for now, we only do exact matches, so only the first and last kmer of the peptide are needed
-    first_kmer = kmers[0]
-    last_kmer = kmers[-1]
+    peptide_kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)]
+    first_kmer, last_kmer = peptide_kmers[0], peptide_kmers[-1]
     # match the kmers
     first_kmer_matches = protein_kmer_dictionary.get(first_kmer, [])
     last_kmer_matches = protein_kmer_dictionary.get(last_kmer, [])
-    # determine the protein ids that are common to both lists, check if the peptide is in the protein sequence at that
-    # location and return the protein ids and the start location
-    hits = []
-    for protein_id, start_first_kmer in first_kmer_matches:
-        expected_start_last_kmer = start_first_kmer + len(peptide_sequence) - k
-        for _, start_last_kmer in last_kmer_matches:
-            if protein_id != _:
-                continue
-            if start_last_kmer != expected_start_last_kmer:
-                continue
-            protein_sequence = protein_dictionary[protein_id]
-            end_last_kmer = start_first_kmer + len(peptide_sequence)
-            subsequence = protein_sequence[start_first_kmer:end_last_kmer]
-            if subsequence != peptide_sequence:
-                continue
-            assert len(subsequence) == len(
-                peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
-            assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
-            assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}"
-            hits.append((protein_id, start_first_kmer, end_last_kmer))
+    hits = [
+        ProteinHit(protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence))
+        for protein_id, start_first_kmer in first_kmer_matches
+        for _, start_last_kmer in last_kmer_matches
+        if protein_id == _ and start_last_kmer == start_first_kmer + len(peptide_sequence) - k
+           and protein_dictionary[protein_id][
+               start_first_kmer:start_first_kmer + len(peptide_sequence)] == peptide_sequence
+    ]
+    hits = list(set(hits)) # remove duplicates
+    # Just sanity checks
+    for hit in hits:
+        subsequence = protein_dictionary[hit.protein_id][hit.start_location_on_protein:hit.end_location_on_protein]
+        assert len(subsequence) == len(
+            peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
+        assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
+        assert peptide_sequence in protein_dictionary[
+            hit.protein_id], f"Peptide not in protein sequence: {peptide_sequence} not in {protein_dictionary[protein_id]}"
     return hits
 
 
@@ -90,8 +90,9 @@ def plot_protein_coverage(
     Plots the coverage of a protein sequence by peptides.
     """
     # generate the protein kmer dictionary
-    if samples is None:
-        samples = []
+    if samples is None or samples == []:
+        raise ValueError("No samples provided.")
+
     reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0]
 
     protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
@@ -107,22 +108,22 @@ def plot_protein_coverage(
             desc="Matching peptides to protein", unit_scale=True,
             unit="peptide"):
         protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence,
-                                                    protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict)
-        for prot_id, start_location_on_protein, end_location_on_protein in protein_hits:
-            # we are only interested in the provided protein id, everything else is discarded
-            if prot_id != protein_id:
-                continue
+                                                    protein_kmer_dictionary=kmer_dict,
+                                                    protein_dictionary=protein_dict)
+        # we only care about the hits pertaining to the argument-supplied protein id
+        protein_hits = filter(lambda x: x.protein_id == protein_id, protein_hits)
+        for protein_hit in protein_hits:
             # add the peptide sequence and location to the peptide matches pertaining to the protein id
             peptide_matches.append(
                 PeptideMatch(peptide_sequence=peptide_sequence,
-                             start_location_on_protein=start_location_on_protein,
-                             end_location_on_protein=end_location_on_protein,
+                             start_location_on_protein=protein_hit.start_location_on_protein,
+                             end_location_on_protein=protein_hit.end_location_on_protein,
                              intensity=log2(intensity),
                              sample=sample)
             )
             # update the coverage of the protein sequence
-            coverage[start_location_on_protein:end_location_on_protein] = \
-                [coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]]
+            coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \
+                [coverage + 1 for coverage in coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]]
 
     if len(peptide_matches) == 0:
         raise ValueError(f"No peptides matched for protein {protein_id}")
@@ -131,7 +132,7 @@ def plot_protein_coverage(
     # required number of rows (vertical space)
     rows = distribute_to_rows(peptide_matches)
 
-    x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)]
+    x_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)]
     max_coverage_in_sequence = max(coverage)
     max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches])
     normalized_coverage = [value / max_coverage_in_sequence for value in coverage]
@@ -179,7 +180,7 @@ def plot_protein_coverage(
                     # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars
                     x0=x0 - 0.5, y0=y0,
                     x1=x1 - 0.5, y1=y1,
-                    fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)",
+                    fillcolor=f"rgba({int(255 * (normalized_intensity))}, 0, 0, 1)",
                     opacity=0.5,
                     layer="above",
                 )

From 265db599bfad473cd28eabeebd2034d2ce1e2bae Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Thu, 28 Nov 2024 15:57:10 +0100
Subject: [PATCH 06/13] Refactor plot_protein_coverage by extracting
 _build_coverage_plot function

---
 protzilla/data_analysis/protein_coverage.py | 44 +++++++++------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 6ffe135c..38584058 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -8,9 +8,9 @@
 from numpy import log2
 import plotly.graph_objects as go
 
+
 @dataclass(unsafe_hash=True)
 class PeptideMatch:
-
     peptide_sequence: str
     start_location_on_protein: int
     end_location_on_protein: int
@@ -71,7 +71,7 @@ def match_peptide_to_protein_ids(
            and protein_dictionary[protein_id][
                start_first_kmer:start_first_kmer + len(peptide_sequence)] == peptide_sequence
     ]
-    hits = list(set(hits)) # remove duplicates
+    hits = list(set(hits))  # remove duplicates
     # Just sanity checks
     for hit in hits:
         subsequence = protein_dictionary[hit.protein_id][hit.start_location_on_protein:hit.end_location_on_protein]
@@ -99,9 +99,8 @@ def plot_protein_coverage(
     kmer_dict = build_kmer_dictionary(protein_dict, k=5)
 
     protein_sequence = protein_dict[protein_id]
-    protein_sequence_length = len(protein_sequence)
     peptide_matches = []
-    coverage = [0] * protein_sequence_length
+    coverage = [0] * len(protein_sequence)
 
     for sample, peptide_sequence, intensity in tqdm(
             reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False),
@@ -122,16 +121,23 @@ def plot_protein_coverage(
                              sample=sample)
             )
             # update the coverage of the protein sequence
-            coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \
-                [coverage + 1 for coverage in coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]]
+            increment_coverage(coverage, protein_hit)
 
     if len(peptide_matches) == 0:
         raise ValueError(f"No peptides matched for protein {protein_id}")
+
     # now that all the matches have been determined with their start and end on the protein sequence, we need to find
     # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
     # required number of rows (vertical space)
     rows = distribute_to_rows(peptide_matches)
 
+    fig = _build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows)
+
+    return dict(plots=[fig])
+
+
+def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str,
+                         protein_sequence: str, rows: dict[str, list[list[PeptideMatch]]]) -> go.Figure:
     x_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)]
     max_coverage_in_sequence = max(coverage)
     max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches])
@@ -139,7 +145,6 @@ def plot_protein_coverage(
     hover_text = [f"Position: {i}<br>Amino acid: {amino_acid}<br>Coverage: {coverage}"
                   for i, (coverage, amino_acid) in
                   enumerate(zip(normalized_coverage, protein_sequence))]
-
     fig = go.Figure(data=go.Bar(x=x_labels,
                                 y=normalized_coverage,
                                 marker=dict(
@@ -152,23 +157,8 @@ def plot_protein_coverage(
                                 hoverinfo="text",
                                 showlegend=False,
                                 name="Coverage (normalized to max in Protein Sequence)"))
-
-    # Keep track of the current row position across all samples
     current_row = 1
-
-    # Iterate through samples and their rows in order
     for sample_idx, (sample, sample_rows) in enumerate(rows.items()):
-        # Add a sample label
-        fig.add_annotation(
-            x=-0.1,  # slightly to the left of the plot
-            y=current_row + len(sample_rows) / 2,
-            text=sample,
-            showarrow=False,
-            textangle=-90,
-            xref='paper',
-            yref='y'
-        )
-
         for row_index, row in enumerate(sample_rows):
             for peptide_match in row:
                 # add a rectangle for each peptide
@@ -215,7 +205,6 @@ def plot_protein_coverage(
                     dash="solid",
                 )
             )
-
     fig.update_layout(
         title=f"Protein Coverage of {protein_id}",
         xaxis_title="Protein Sequence",
@@ -223,8 +212,13 @@ def plot_protein_coverage(
         showlegend=False,
         bargap=0
     )
+    return fig
 
-    return dict(plots=[fig])
+
+def increment_coverage(coverage: list[int], protein_hit: ProteinHit) -> None:
+    coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \
+        [coverage + 1 for coverage in
+         coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]]
 
 
 def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]:
@@ -233,7 +227,7 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe
     Greedy algorithm (see Interval Scheduling Problem).
     """
     peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein)
-    rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])}
+    rows = {sample: [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])}
     for peptide_match in peptide_matches:
         # find the first row that does not overlap with the current peptide
         for row in rows[peptide_match.sample]:

From f5ec509379a4cc7843ef3d71349d708ee1321e58 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Mon, 9 Dec 2024 14:25:49 +0100
Subject: [PATCH 07/13] New build_plot function for protein coverage plot:
 using subplots, color min-max-normalization on sample-level

---
 protzilla/data_analysis/protein_coverage.py | 71 +++++++++++++++++++--
 1 file changed, 67 insertions(+), 4 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 38584058..1a503ce9 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -2,11 +2,15 @@
 from docutils.nodes import title
 from tqdm import tqdm
 from protzilla.constants.paths import EXTERNAL_DATA_PATH
+from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data
 from protzilla.disk_operator import PickleOperator
 from dataclasses import dataclass
 import pandas as pd
 from numpy import log2
 import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+# import StrEnum
+from enum import StrEnum
 
 
 @dataclass(unsafe_hash=True)
@@ -24,6 +28,11 @@ class ProteinHit:
     start_location_on_protein: int
     end_location_on_protein: int
 
+class IntensityNormalization(StrEnum):
+    min_max_scaling = "min_max_scaling"
+    none = "none"
+
+
 
 def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
     """
@@ -131,10 +140,65 @@ def plot_protein_coverage(
     # required number of rows (vertical space)
     rows = distribute_to_rows(peptide_matches)
 
-    fig = _build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows)
+    fig = _new_build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows)
 
     return dict(plots=[fig])
 
+def _new_build_coverage_plot(
+        coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str,
+        rows: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling
+) -> go.Figure:
+    # one row for each sample + two row for the protein sequence (on top and on the bottom)
+    number_of_subplots = len(rows) + 1
+    protein_sequence_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)]
+    fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.2, subplot_titles=list(rows.keys()) + [f"Sequencing depth of {protein_id}"])
+    # Sequencing depth bar chart
+    fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False), row=number_of_subplots, col=1)
+    # Peptides
+    for subplot_row_index, (sample, rows) in enumerate(rows.items()):
+        # Intensity scaling for the current sample
+        current_row_intensities = [peptide_match.intensity for row in rows for peptide_match in row]
+        max_intensity, min_intensity = max(current_row_intensities), min(current_row_intensities)
+        def scale_intensity(intensity: float) -> float:
+            if intensity_normalization == IntensityNormalization.min_max_scaling:
+                return (intensity - min_intensity) / (max_intensity - min_intensity)
+            else:
+                return intensity
+
+        for row_index, row in enumerate(rows):
+            for peptide_match in row:
+                x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
+                y0, y1 = row_index, row_index + 1
+                # add the rectangles
+                fig.add_shape(
+                    type="rect",
+                    x0=x0 - 0.5, y0=y0,
+                    x1=x1 - 0.5, y1=y1,
+                    fillcolor=f"rgba({int(255 * scale_intensity(peptide_match.intensity))}, 0, 0, 1)",
+                    opacity=0.5,
+                    layer="above",
+                    row=subplot_row_index + 1, col=1
+                )
+                # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
+                fig.add_trace(go.Scatter(
+                    x=protein_sequence_labels[
+                      peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
+                    y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
+                    text=[f"Sample: {sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
+                          f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
+                          f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
+                    mode="markers",
+                    marker=dict(opacity=0, size=15),  # make marker invisible
+                    hoverinfo="text",
+                    hovertemplate="%{text}<extra></extra>",
+                    showlegend=False,
+                ), row=subplot_row_index + 1, col=1)
+
+        fig.update_yaxes(range=[0, len(rows)], showticklabels=False, row=subplot_row_index + 1, col=1)
+        fig.update_xaxes(showticklabels=False, row=subplot_row_index + 1, col=1)
+
+    return fig
+
 
 def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str,
                          protein_sequence: str, rows: dict[str, list[list[PeptideMatch]]]) -> go.Figure:
@@ -148,8 +212,6 @@ def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch
     fig = go.Figure(data=go.Bar(x=x_labels,
                                 y=normalized_coverage,
                                 marker=dict(
-                                    color=normalized_coverage,
-                                    colorscale='Bluered',
                                     colorbar=dict(title="Coverage<br>(normalized)"),
                                     line=dict(width=0)
                                 ),
@@ -210,7 +272,7 @@ def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch
         xaxis_title="Protein Sequence",
         yaxis=dict(visible=False, range=[0, 1 + current_row], title=""),
         showlegend=False,
-        bargap=0
+        bargap=0.2
     )
     return fig
 
@@ -237,3 +299,4 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe
         else:
             rows[peptide_match.sample].append([(peptide_match)])
     return rows
+

From d04024863bc276f607267e07259de32a13f550ea Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Tue, 10 Dec 2024 12:17:32 +0100
Subject: [PATCH 08/13] Separate the sequencing depth bar charts for each
 sample

---
 protzilla/data_analysis/protein_coverage.py | 72 +++++++++++++--------
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 1a503ce9..0a5f514b 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -4,6 +4,7 @@
 from protzilla.constants.paths import EXTERNAL_DATA_PATH
 from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data
 from protzilla.disk_operator import PickleOperator
+from protzilla.constants.colors import PLOT_PRIMARY_COLOR
 from dataclasses import dataclass
 import pandas as pd
 from numpy import log2
@@ -104,33 +105,33 @@ def plot_protein_coverage(
 
     reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0]
 
-    protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
-    kmer_dict = build_kmer_dictionary(protein_dict, k=5)
+    protein_sequence_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
+    sequence_kmer_dict = build_kmer_dictionary(protein_sequence_dict, k=5)
 
-    protein_sequence = protein_dict[protein_id]
+    protein_sequence = protein_sequence_dict[protein_id]
     peptide_matches = []
-    coverage = [0] * len(protein_sequence)
+    coverage_by_sample = {sample : [0] * len(protein_sequence) for sample in samples}
 
-    for sample, peptide_sequence, intensity in tqdm(
+    for sample_name, peptide_sequence, intensity in tqdm(
             reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False),
             desc="Matching peptides to protein", unit_scale=True,
             unit="peptide"):
         protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence,
-                                                    protein_kmer_dictionary=kmer_dict,
-                                                    protein_dictionary=protein_dict)
+                                                    protein_kmer_dictionary=sequence_kmer_dict,
+                                                    protein_dictionary=protein_sequence_dict)
         # we only care about the hits pertaining to the argument-supplied protein id
-        protein_hits = filter(lambda x: x.protein_id == protein_id, protein_hits)
-        for protein_hit in protein_hits:
+        filtered_protein_hits = filter(lambda x: x.protein_id == protein_id,protein_hits)
+        for protein_hit in filtered_protein_hits:
             # add the peptide sequence and location to the peptide matches pertaining to the protein id
             peptide_matches.append(
                 PeptideMatch(peptide_sequence=peptide_sequence,
                              start_location_on_protein=protein_hit.start_location_on_protein,
                              end_location_on_protein=protein_hit.end_location_on_protein,
                              intensity=log2(intensity),
-                             sample=sample)
+                             sample=sample_name)
             )
             # update the coverage of the protein sequence
-            increment_coverage(coverage, protein_hit)
+            increment_coverage(coverage_by_sample[sample_name], protein_hit)
 
     if len(peptide_matches) == 0:
         raise ValueError(f"No peptides matched for protein {protein_id}")
@@ -138,26 +139,40 @@ def plot_protein_coverage(
     # now that all the matches have been determined with their start and end on the protein sequence, we need to find
     # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
     # required number of rows (vertical space)
-    rows = distribute_to_rows(peptide_matches)
+    rows_by_sample = distribute_to_rows(peptide_matches)
 
-    fig = _new_build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows)
+    fig = _new_build_coverage_plot(coverage_by_sample, peptide_matches, protein_id, protein_sequence, rows_by_sample)
 
     return dict(plots=[fig])
 
 def _new_build_coverage_plot(
-        coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str,
-        rows: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling
+        coverages_by_sample: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str,
+        rows_by_sample: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling
 ) -> go.Figure:
     # one row for each sample + two row for the protein sequence (on top and on the bottom)
-    number_of_subplots = len(rows) + 1
+    number_of_subplots = len(rows_by_sample.keys()) + len(coverages_by_sample.keys())
     protein_sequence_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)]
-    fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.2, subplot_titles=list(rows.keys()) + [f"Sequencing depth of {protein_id}"])
-    # Sequencing depth bar chart
-    fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False), row=number_of_subplots, col=1)
+    subplot_titles = zip(
+        [f"Peptides of {sample}" for sample in rows_by_sample.keys()], # str_a
+        [f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys()] # str_b
+    ) # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it
+    subplot_titles = [title for titles in subplot_titles for title in titles]
+    max_coverage_value = get_max_coverage(coverages_by_sample)
+
+    fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.1, subplot_titles=subplot_titles)
     # Peptides
-    for subplot_row_index, (sample, rows) in enumerate(rows.items()):
+    for sample_index, sample_name in enumerate(rows_by_sample.keys()):
+        peptide_subplot_rows = rows_by_sample[sample_name]
+        peptides_subplot_index = (2*sample_index) + 1
+        coverage =  coverages_by_sample[sample_name]
+        coverage_subplot_index = (2*sample_index) + 2
+
+        # Sequencing depth bar chart
+        fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False, marker=dict(color=PLOT_PRIMARY_COLOR)),
+                      row=coverage_subplot_index, col=1)
+        fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1)
         # Intensity scaling for the current sample
-        current_row_intensities = [peptide_match.intensity for row in rows for peptide_match in row]
+        current_row_intensities = [peptide_match.intensity for row in peptide_subplot_rows for peptide_match in row]
         max_intensity, min_intensity = max(current_row_intensities), min(current_row_intensities)
         def scale_intensity(intensity: float) -> float:
             if intensity_normalization == IntensityNormalization.min_max_scaling:
@@ -165,7 +180,7 @@ def scale_intensity(intensity: float) -> float:
             else:
                 return intensity
 
-        for row_index, row in enumerate(rows):
+        for row_index, row in enumerate(peptide_subplot_rows):
             for peptide_match in row:
                 x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
                 y0, y1 = row_index, row_index + 1
@@ -177,14 +192,14 @@ def scale_intensity(intensity: float) -> float:
                     fillcolor=f"rgba({int(255 * scale_intensity(peptide_match.intensity))}, 0, 0, 1)",
                     opacity=0.5,
                     layer="above",
-                    row=subplot_row_index + 1, col=1
+                    row=peptides_subplot_index, col=1
                 )
                 # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
                 fig.add_trace(go.Scatter(
                     x=protein_sequence_labels[
                       peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
                     y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
-                    text=[f"Sample: {sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
+                    text=[f"Sample: {sample_name}<br>Peptide: {peptide_match.peptide_sequence}<br>"
                           f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
                           f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
                     mode="markers",
@@ -192,10 +207,10 @@ def scale_intensity(intensity: float) -> float:
                     hoverinfo="text",
                     hovertemplate="%{text}<extra></extra>",
                     showlegend=False,
-                ), row=subplot_row_index + 1, col=1)
+                ), row=peptides_subplot_index, col=1)
 
-        fig.update_yaxes(range=[0, len(rows)], showticklabels=False, row=subplot_row_index + 1, col=1)
-        fig.update_xaxes(showticklabels=False, row=subplot_row_index + 1, col=1)
+        fig.update_yaxes(range=[0, len(peptide_subplot_rows)], showticklabels=False, row=peptides_subplot_index, col=1)
+        fig.update_xaxes(showticklabels=False, row=peptides_subplot_index, col=1)
 
     return fig
 
@@ -300,3 +315,6 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe
             rows[peptide_match.sample].append([(peptide_match)])
     return rows
 
+
+def get_max_coverage(coverage: dict[list[int]]) -> int:
+    return max([max(coverage) for coverage in coverage.values()])
\ No newline at end of file

From 19a0c257f6d60524ae73d984a69837202ca89a21 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Thu, 12 Dec 2024 16:17:35 +0100
Subject: [PATCH 09/13] Reformatting and refactoring

---
 protzilla/data_analysis/protein_coverage.py | 407 ++++++++++++--------
 1 file changed, 238 insertions(+), 169 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 0a5f514b..b292d0bd 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -2,7 +2,9 @@
 from docutils.nodes import title
 from tqdm import tqdm
 from protzilla.constants.paths import EXTERNAL_DATA_PATH
-from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data
+from protzilla.data_analysis.differential_expression_mann_whitney import (
+    mann_whitney_test_on_intensity_data,
+)
 from protzilla.disk_operator import PickleOperator
 from protzilla.constants.colors import PLOT_PRIMARY_COLOR
 from dataclasses import dataclass
@@ -10,6 +12,7 @@
 from numpy import log2
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
+
 # import StrEnum
 from enum import StrEnum
 
@@ -29,16 +32,19 @@ class ProteinHit:
     start_location_on_protein: int
     end_location_on_protein: int
 
+
 class IntensityNormalization(StrEnum):
     min_max_scaling = "min_max_scaling"
     none = "none"
 
 
-
-def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]:
+def build_kmer_dictionary(
+    protein_dictionary: dict[str, str], k: int = 5
+) -> dict[str, list[tuple[str, int]]]:
     """
     Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size
-    k along the protein sequences. The dictionary maps a the k-mer to a list of tuples containing the protein ID and the index of
+    k along the protein sequences. The dictionary maps a the k-mer to a list of tuples containing the protein ID and
+    the index of
     the k-mer in the protein sequence. The dictionary is saved to disk for future use.
     """
     # if the dictionary already exists, load it from disk
@@ -47,10 +53,14 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
         kmer_dict = PickleOperator.read(kmer_dict_path)
         return kmer_dict
     kmer_dict = {}
-    for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary",
-                                             unit_scale=True, unit="protein"):
+    for protein_id, protein_sequence in tqdm(
+        protein_dictionary.items(),
+        desc="Building kmer dictionary",
+        unit_scale=True,
+        unit="protein",
+    ):
         for i in range(len(protein_sequence) - k + 1):
-            kmer = protein_sequence[i: i + k]
+            kmer = protein_sequence[i : i + k]
             if kmer in kmer_dict:
                 kmer_dict[kmer].append((protein_id, i))
             else:
@@ -60,41 +70,61 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic
 
 
 def match_peptide_to_protein_ids(
-        peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]],
-        protein_dictionary: dict[str, str]
+    peptide_sequence: str,
+    protein_kmer_dictionary: dict[str, list[tuple[str, int]]],
+    protein_dictionary: dict[str, str],
 ) -> list[ProteinHit]:
     """
     Matches a peptide sequence to a dictionary of kmers in protein sequences.
-    Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence.
+    Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein
+    sequence.
     """
     k = 5
-    peptide_kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)]
+    peptide_kmers = [
+        peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)
+    ]
     first_kmer, last_kmer = peptide_kmers[0], peptide_kmers[-1]
     # match the kmers
     first_kmer_matches = protein_kmer_dictionary.get(first_kmer, [])
     last_kmer_matches = protein_kmer_dictionary.get(last_kmer, [])
     hits = [
-        ProteinHit(protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence))
+        ProteinHit(
+            protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence)
+        )
         for protein_id, start_first_kmer in first_kmer_matches
         for _, start_last_kmer in last_kmer_matches
-        if protein_id == _ and start_last_kmer == start_first_kmer + len(peptide_sequence) - k
-           and protein_dictionary[protein_id][
-               start_first_kmer:start_first_kmer + len(peptide_sequence)] == peptide_sequence
+        if protein_id == _
+        and start_last_kmer == start_first_kmer + len(peptide_sequence) - k
+        and protein_dictionary[protein_id][
+            start_first_kmer : start_first_kmer + len(peptide_sequence)
+        ]
+        == peptide_sequence
     ]
     hits = list(set(hits))  # remove duplicates
     # Just sanity checks
     for hit in hits:
-        subsequence = protein_dictionary[hit.protein_id][hit.start_location_on_protein:hit.end_location_on_protein]
+        subsequence = protein_dictionary[hit.protein_id][
+            hit.start_location_on_protein : hit.end_location_on_protein
+        ]
         assert len(subsequence) == len(
-            peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
-        assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}"
-        assert peptide_sequence in protein_dictionary[
-            hit.protein_id], f"Peptide not in protein sequence: {peptide_sequence} not in {protein_dictionary[protein_id]}"
+            peptide_sequence
+        ), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}"
+        assert subsequence in peptide_sequence, (
+            f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: "
+            f"{peptide_sequence}"
+        )
+        assert peptide_sequence in protein_dictionary[hit.protein_id], (
+            f"Peptide not in protein sequence: {peptide_sequence} not in "
+            f"{protein_dictionary[protein_id]}"
+        )
     return hits
 
 
 def plot_protein_coverage(
-        fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None
+    fasta_df: pd.DataFrame,
+    peptide_df: pd.DataFrame,
+    protein_id: str,
+    samples: list[str] = None,
 ) -> None:
     """
     Plots the coverage of a protein sequence by peptides.
@@ -103,32 +133,46 @@ def plot_protein_coverage(
     if samples is None or samples == []:
         raise ValueError("No samples provided.")
 
-    reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0]
+    reduced_peptide_df = peptide_df[
+        peptide_df["Sample"].isin(samples) & peptide_df["Intensity"] > 0
+    ]
 
-    protein_sequence_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]))
+    protein_sequence_dict = dict(
+        zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])
+    )
     sequence_kmer_dict = build_kmer_dictionary(protein_sequence_dict, k=5)
 
     protein_sequence = protein_sequence_dict[protein_id]
     peptide_matches = []
-    coverage_by_sample = {sample : [0] * len(protein_sequence) for sample in samples}
+    coverage_by_sample = {sample: [0] * len(protein_sequence) for sample in samples}
 
     for sample_name, peptide_sequence, intensity in tqdm(
-            reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False),
-            desc="Matching peptides to protein", unit_scale=True,
-            unit="peptide"):
-        protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence,
-                                                    protein_kmer_dictionary=sequence_kmer_dict,
-                                                    protein_dictionary=protein_sequence_dict)
+        reduced_peptide_df[["Sample", "Sequence", "Intensity"]]
+        .drop_duplicates()
+        .itertuples(index=False),
+        desc="Matching peptides to protein",
+        unit_scale=True,
+        unit="peptide",
+    ):
+        protein_hits = match_peptide_to_protein_ids(
+            peptide_sequence=peptide_sequence,
+            protein_kmer_dictionary=sequence_kmer_dict,
+            protein_dictionary=protein_sequence_dict,
+        )
         # we only care about the hits pertaining to the argument-supplied protein id
-        filtered_protein_hits = filter(lambda x: x.protein_id == protein_id,protein_hits)
+        filtered_protein_hits = filter(
+            lambda x: x.protein_id == protein_id, protein_hits
+        )
         for protein_hit in filtered_protein_hits:
             # add the peptide sequence and location to the peptide matches pertaining to the protein id
             peptide_matches.append(
-                PeptideMatch(peptide_sequence=peptide_sequence,
-                             start_location_on_protein=protein_hit.start_location_on_protein,
-                             end_location_on_protein=protein_hit.end_location_on_protein,
-                             intensity=log2(intensity),
-                             sample=sample_name)
+                PeptideMatch(
+                    peptide_sequence=peptide_sequence,
+                    start_location_on_protein=protein_hit.start_location_on_protein,
+                    end_location_on_protein=protein_hit.end_location_on_protein,
+                    intensity=log2(intensity),
+                    sample=sample_name,
+                )
             )
             # update the coverage of the protein sequence
             increment_coverage(coverage_by_sample[sample_name], protein_hit)
@@ -141,174 +185,199 @@ def plot_protein_coverage(
     # required number of rows (vertical space)
     rows_by_sample = distribute_to_rows(peptide_matches)
 
-    fig = _new_build_coverage_plot(coverage_by_sample, peptide_matches, protein_id, protein_sequence, rows_by_sample)
+    fig = _new_build_coverage_plot(
+        coverage_by_sample,
+        peptide_matches,
+        protein_id,
+        protein_sequence,
+        rows_by_sample,
+    )
 
     return dict(plots=[fig])
 
+
 def _new_build_coverage_plot(
-        coverages_by_sample: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str,
-        rows_by_sample: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling
+    coverages_by_sample: list[int],
+    peptide_matches: list[PeptideMatch],
+    protein_id: str,
+    protein_sequence: str,
+    rows_by_sample: dict[str, list[list[PeptideMatch]]],
+    intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling,
 ) -> go.Figure:
     # one row for each sample + two row for the protein sequence (on top and on the bottom)
     number_of_subplots = len(rows_by_sample.keys()) + len(coverages_by_sample.keys())
-    protein_sequence_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)]
+    protein_sequence_labels = [
+        f"{amino_acid} - {amino_acid_index} "
+        for amino_acid_index, amino_acid in enumerate(protein_sequence)
+    ]
     subplot_titles = zip(
-        [f"Peptides of {sample}" for sample in rows_by_sample.keys()], # str_a
-        [f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys()] # str_b
-    ) # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it
+        [f"Peptides of {sample}" for sample in rows_by_sample.keys()],  # str_a
+        [
+            f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys()
+        ],  # str_b
+    )  # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it
     subplot_titles = [title for titles in subplot_titles for title in titles]
     max_coverage_value = get_max_coverage(coverages_by_sample)
 
-    fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.1, subplot_titles=subplot_titles)
+    fig = make_subplots(
+        rows=number_of_subplots,
+        cols=1,
+        shared_xaxes=True,
+        vertical_spacing=0.1,
+        subplot_titles=subplot_titles,
+    )
     # Peptides
     for sample_index, sample_name in enumerate(rows_by_sample.keys()):
         peptide_subplot_rows = rows_by_sample[sample_name]
-        peptides_subplot_index = (2*sample_index) + 1
-        coverage =  coverages_by_sample[sample_name]
-        coverage_subplot_index = (2*sample_index) + 2
-
-        # Sequencing depth bar chart
-        fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False, marker=dict(color=PLOT_PRIMARY_COLOR)),
-                      row=coverage_subplot_index, col=1)
-        fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1)
+        peptides_subplot_index = (2 * sample_index) + 1
+        coverage = coverages_by_sample[sample_name]
+        coverage_subplot_index = (2 * sample_index) + 2
+
+        add_sequence_depth_to_plot(
+            fig,
+            coverage,
+            coverage_subplot_index,
+            max_coverage_value,
+            protein_sequence_labels,
+        )
+
+        # Peptide plot
         # Intensity scaling for the current sample
-        current_row_intensities = [peptide_match.intensity for row in peptide_subplot_rows for peptide_match in row]
-        max_intensity, min_intensity = max(current_row_intensities), min(current_row_intensities)
-        def scale_intensity(intensity: float) -> float:
-            if intensity_normalization == IntensityNormalization.min_max_scaling:
-                return (intensity - min_intensity) / (max_intensity - min_intensity)
-            else:
-                return intensity
-
+        current_row_intensities = [
+            peptide_match.intensity
+            for row in peptide_subplot_rows
+            for peptide_match in row
+        ]
+        max_intensity, min_intensity = max(current_row_intensities), min(
+            current_row_intensities
+        )
+        scale_intensity = lambda intensity: (intensity - min_intensity) / (
+            max_intensity - min_intensity
+        )
         for row_index, row in enumerate(peptide_subplot_rows):
             for peptide_match in row:
-                x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
-                y0, y1 = row_index, row_index + 1
-                # add the rectangles
-                fig.add_shape(
-                    type="rect",
-                    x0=x0 - 0.5, y0=y0,
-                    x1=x1 - 0.5, y1=y1,
-                    fillcolor=f"rgba({int(255 * scale_intensity(peptide_match.intensity))}, 0, 0, 1)",
-                    opacity=0.5,
-                    layer="above",
-                    row=peptides_subplot_index, col=1
+                normalized_intensity = scale_intensity(peptide_match.intensity)
+                add_peptide_to_plot(
+                    fig,
+                    normalized_intensity,
+                    peptide_match,
+                    peptides_subplot_index,
+                    protein_sequence_labels,
+                    row_index,
                 )
-                # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
-                fig.add_trace(go.Scatter(
-                    x=protein_sequence_labels[
-                      peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
-                    y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
-                    text=[f"Sample: {sample_name}<br>Peptide: {peptide_match.peptide_sequence}<br>"
-                          f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
-                          f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
-                    mode="markers",
-                    marker=dict(opacity=0, size=15),  # make marker invisible
-                    hoverinfo="text",
-                    hovertemplate="%{text}<extra></extra>",
-                    showlegend=False,
-                ), row=peptides_subplot_index, col=1)
-
-        fig.update_yaxes(range=[0, len(peptide_subplot_rows)], showticklabels=False, row=peptides_subplot_index, col=1)
-        fig.update_xaxes(showticklabels=False, row=peptides_subplot_index, col=1)
+
+        fig.update_yaxes(
+            range=[0, len(peptide_subplot_rows)],
+            showticklabels=False,
+            showgrid=False,
+            row=peptides_subplot_index,
+            col=1,
+        )
+        fig.update_xaxes(
+            showticklabels=False, showgrid=False, row=peptides_subplot_index, col=1
+        )
 
     return fig
 
 
-def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str,
-                         protein_sequence: str, rows: dict[str, list[list[PeptideMatch]]]) -> go.Figure:
-    x_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)]
-    max_coverage_in_sequence = max(coverage)
-    max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches])
-    normalized_coverage = [value / max_coverage_in_sequence for value in coverage]
-    hover_text = [f"Position: {i}<br>Amino acid: {amino_acid}<br>Coverage: {coverage}"
-                  for i, (coverage, amino_acid) in
-                  enumerate(zip(normalized_coverage, protein_sequence))]
-    fig = go.Figure(data=go.Bar(x=x_labels,
-                                y=normalized_coverage,
-                                marker=dict(
-                                    colorbar=dict(title="Coverage<br>(normalized)"),
-                                    line=dict(width=0)
-                                ),
-                                hovertext=hover_text,
-                                hoverinfo="text",
-                                showlegend=False,
-                                name="Coverage (normalized to max in Protein Sequence)"))
-    current_row = 1
-    for sample_idx, (sample, sample_rows) in enumerate(rows.items()):
-        for row_index, row in enumerate(sample_rows):
-            for peptide_match in row:
-                # add a rectangle for each peptide
-                x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein
-                y0, y1 = current_row, current_row + 1
-                normalized_intensity = (peptide_match.intensity) / max_intensity
-                fig.add_shape(
-                    type="rect",
-                    # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars
-                    x0=x0 - 0.5, y0=y0,
-                    x1=x1 - 0.5, y1=y1,
-                    fillcolor=f"rgba({int(255 * (normalized_intensity))}, 0, 0, 1)",
-                    opacity=0.5,
-                    layer="above",
-                )
-                # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
-                fig.add_trace(go.Scatter(
-                    x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1],
-                    y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
-                    text=[f"Sample: {sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
-                          f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
-                          f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence),
-                    mode="markers",
-                    marker=dict(opacity=0, size=30),  # make marker invisible
-                    hoverinfo="text",
-                    hovertemplate="%{text}<extra></extra>",
-                    showlegend=False,
-                ))
-
-            # Move to next row
-            current_row += 1
-
-        # Add a horizontal line separator between samples (except after the last sample)
-        if sample_idx < len(rows) - 1:
-            fig.add_shape(
-                type="line",
-                x0=-0.5,
-                y0=current_row,
-                x1=len(protein_sequence) - 0.5,
-                y1=current_row,
-                line=dict(
-                    color="Gray",
-                    width=2,
-                    dash="solid",
-                )
-            )
-    fig.update_layout(
-        title=f"Protein Coverage of {protein_id}",
-        xaxis_title="Protein Sequence",
-        yaxis=dict(visible=False, range=[0, 1 + current_row], title=""),
+def add_sequence_depth_to_plot(
+    fig: go.Figure,
+    coverage: list[int],
+    coverage_subplot_index: int,
+    max_coverage_value: int,
+    protein_sequence_labels: list[str],
+) -> None:
+    SEQUENCE_DEPTH_BAR_CHART = dict(
+        x=protein_sequence_labels,
+        y=coverage,
         showlegend=False,
-        bargap=0.2
+        marker=dict(color=PLOT_PRIMARY_COLOR),
+    )
+    fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=coverage_subplot_index, col=1)
+    fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1)
+
+
+def add_peptide_to_plot(
+    fig: go.Figure,
+    normalized_intensity: float,
+    peptide_match: PeptideMatch,
+    peptides_subplot_index: int,
+    protein_sequence_labels: list[str],
+    row_index: int,
+) -> None:
+    x0, x1 = (
+        peptide_match.start_location_on_protein,
+        peptide_match.end_location_on_protein,
+    )
+    y0, y1 = row_index, row_index + 1
+    PEPTIDE_SHAPE = dict(
+        type="rect",
+        x0=x0 - 0.5,
+        y0=y0,
+        x1=x1 - 0.5,
+        y1=y1,
+        fillcolor=f"rgba({int(255 * normalized_intensity)}, 0, 0, 1)",
+        opacity=0.5,
+        layer="above",
+    )
+    fig.add_shape(**PEPTIDE_SHAPE, row=peptides_subplot_index, col=1)
+    # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
+    fig.add_trace(
+        go.Scatter(
+            x=protein_sequence_labels[
+                peptide_match.start_location_on_protein : peptide_match.end_location_on_protein
+                + 1
+            ],
+            y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
+            text=[
+                f"Sample: {peptide_match.sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
+                f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
+                f"Intensity: {peptide_match.intensity}"
+            ]
+            * len(peptide_match.peptide_sequence),
+            mode="markers",
+            marker=dict(opacity=0, size=15),  # make marker invisible
+            hoverinfo="text",
+            hovertemplate="%{text}<extra></extra>",
+            showlegend=False,
+        ),
+        row=peptides_subplot_index,
+        col=1,
     )
-    return fig
 
 
 def increment_coverage(coverage: list[int], protein_hit: ProteinHit) -> None:
-    coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \
-        [coverage + 1 for coverage in
-         coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]]
+    coverage[
+        protein_hit.start_location_on_protein : protein_hit.end_location_on_protein
+    ] = [
+        coverage + 1
+        for coverage in coverage[
+            protein_hit.start_location_on_protein : protein_hit.end_location_on_protein
+        ]
+    ]
 
 
-def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]:
+def distribute_to_rows(
+    peptide_matches: list[PeptideMatch],
+) -> dict[list[list[PeptideMatch]]]:
     """
     Distributes peptide matches to rows such that no two peptides overlap in the same row.
     Greedy algorithm (see Interval Scheduling Problem).
     """
-    peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein)
-    rows = {sample: [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])}
+    peptide_matches.sort(
+        key=lambda peptide_match: peptide_match.start_location_on_protein
+    )
+    rows = {
+        sample: []
+        for sample in set([peptide_match.sample for peptide_match in peptide_matches])
+    }
     for peptide_match in peptide_matches:
         # find the first row that does not overlap with the current peptide
         for row in rows[peptide_match.sample]:
-            if row[-1].end_location_on_protein < peptide_match.start_location_on_protein:
+            if (
+                row[-1].end_location_on_protein
+                < peptide_match.start_location_on_protein
+            ):
                 row.append((peptide_match))
                 break
         else:
@@ -317,4 +386,4 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe
 
 
 def get_max_coverage(coverage: dict[list[int]]) -> int:
-    return max([max(coverage) for coverage in coverage.values()])
\ No newline at end of file
+    return max([max(coverage) for coverage in coverage.values()])

From a7bd9f0f2187b3836d23cdec791bbae25a969212 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Mon, 16 Dec 2024 15:00:43 +0100
Subject: [PATCH 10/13] Add color interpolation, combine peptides and sequence
 depth subplots

---
 protzilla/constants/colors.py               |  35 ++++++-
 protzilla/data_analysis/protein_coverage.py | 102 +++++++++++++-------
 2 files changed, 100 insertions(+), 37 deletions(-)

diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py
index 0c6600f7..9a92e93c 100644
--- a/protzilla/constants/colors.py
+++ b/protzilla/constants/colors.py
@@ -11,4 +11,37 @@
 """First color in list. Conventionally used for visualizing outliers."""
 
 PLOT_SECONDARY_COLOR = PLOT_COLOR_SEQUENCE[1]
-"""Second color in list."""
\ No newline at end of file
+"""Second color in list."""
+
+
+def interpolate_color(color_a, color_b, t):
+    """
+    Interpolate between two RGB color strings based on a float t between 0 and 1.
+
+    Args:
+        color_a (str): RGB color string in the format "#RRGGBB".
+        color_b (str): RGB color string in the format "#RRGGBB".
+        t (float): A float between 0 and 1 representing the interpolation factor.
+
+    Returns:
+        str: Interpolated color as an RGB string in the format "#RRGGBB".
+    """
+    if not (0 <= t <= 1):
+        raise ValueError("Interpolation factor t must be between 0 and 1")
+
+    # Convert hex color strings to RGB tuples
+    def hex_to_rgb(hex_color):
+        hex_color = hex_color.lstrip("#")
+        return tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4))
+
+    # Convert RGB tuples back to hex color strings
+    def rgb_to_hex(rgb):
+        return f"#{''.join(f'{c:02x}' for c in rgb)}"
+
+    rgb_a = hex_to_rgb(color_a)
+    rgb_b = hex_to_rgb(color_b)
+
+    # Interpolate each color channel
+    interpolated_rgb = tuple(int(a + (b - a) * t) for a, b in zip(rgb_a, rgb_b))
+
+    return rgb_to_hex(interpolated_rgb)
diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index b292d0bd..89133bfb 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -6,7 +6,11 @@
     mann_whitney_test_on_intensity_data,
 )
 from protzilla.disk_operator import PickleOperator
-from protzilla.constants.colors import PLOT_PRIMARY_COLOR
+from protzilla.constants.colors import (
+    PLOT_PRIMARY_COLOR,
+    PLOT_COLOR_SEQUENCE,
+    interpolate_color,
+)
 from dataclasses import dataclass
 import pandas as pd
 from numpy import log2
@@ -16,6 +20,8 @@
 # import StrEnum
 from enum import StrEnum
 
+INTENSITY_COLORS = ["#FFFFFF", PLOT_COLOR_SEQUENCE[3]]
+
 
 @dataclass(unsafe_hash=True)
 class PeptideMatch:
@@ -205,18 +211,12 @@ def _new_build_coverage_plot(
     intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling,
 ) -> go.Figure:
     # one row for each sample + two row for the protein sequence (on top and on the bottom)
-    number_of_subplots = len(rows_by_sample.keys()) + len(coverages_by_sample.keys())
+    number_of_subplots = len(rows_by_sample.keys())
     protein_sequence_labels = [
         f"{amino_acid} - {amino_acid_index} "
         for amino_acid_index, amino_acid in enumerate(protein_sequence)
     ]
-    subplot_titles = zip(
-        [f"Peptides of {sample}" for sample in rows_by_sample.keys()],  # str_a
-        [
-            f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys()
-        ],  # str_b
-    )  # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it
-    subplot_titles = [title for titles in subplot_titles for title in titles]
+    subplot_titles = [f"Peptides of {sample}" for sample in rows_by_sample.keys()]
     max_coverage_value = get_max_coverage(coverages_by_sample)
 
     fig = make_subplots(
@@ -225,57 +225,80 @@ def _new_build_coverage_plot(
         shared_xaxes=True,
         vertical_spacing=0.1,
         subplot_titles=subplot_titles,
+        specs=[[{"secondary_y": True}] for _ in range(number_of_subplots)],
     )
     # Peptides
-    for sample_index, sample_name in enumerate(rows_by_sample.keys()):
-        peptide_subplot_rows = rows_by_sample[sample_name]
-        peptides_subplot_index = (2 * sample_index) + 1
+    for sample_index, sample_name in enumerate(rows_by_sample.keys(), start=1):
+        peptide_rows = rows_by_sample[sample_name]
+        sample_subplot_index = sample_index
         coverage = coverages_by_sample[sample_name]
-        coverage_subplot_index = (2 * sample_index) + 2
 
         add_sequence_depth_to_plot(
             fig,
             coverage,
-            coverage_subplot_index,
+            sample_subplot_index,
             max_coverage_value,
             protein_sequence_labels,
         )
 
         # Peptide plot
-        # Intensity scaling for the current sample
-        current_row_intensities = [
-            peptide_match.intensity
-            for row in peptide_subplot_rows
-            for peptide_match in row
+        current_sample_intensities = [
+            peptide_match.intensity for row in peptide_rows for peptide_match in row
         ]
-        max_intensity, min_intensity = max(current_row_intensities), min(
-            current_row_intensities
+        max_intensity, min_intensity = max(current_sample_intensities), min(
+            current_sample_intensities
         )
         scale_intensity = lambda intensity: (intensity - min_intensity) / (
             max_intensity - min_intensity
         )
-        for row_index, row in enumerate(peptide_subplot_rows):
+        for row_index, row in enumerate(peptide_rows):
             for peptide_match in row:
-                normalized_intensity = scale_intensity(peptide_match.intensity)
                 add_peptide_to_plot(
-                    fig,
-                    normalized_intensity,
-                    peptide_match,
-                    peptides_subplot_index,
-                    protein_sequence_labels,
-                    row_index,
+                    fig=fig,
+                    normalized_intensity=scale_intensity(peptide_match.intensity),
+                    peptide_match=peptide_match,
+                    peptides_subplot_index=sample_subplot_index,
+                    protein_sequence_labels=protein_sequence_labels,
+                    row_index=row_index,
+                    offset=max_coverage_value,
+                    color_a=INTENSITY_COLORS[0],
+                    color_b=INTENSITY_COLORS[1],
                 )
 
         fig.update_yaxes(
-            range=[0, len(peptide_subplot_rows)],
+            range=[0, len(peptide_rows) + max_coverage_value],
             showticklabels=False,
             showgrid=False,
-            row=peptides_subplot_index,
+            row=sample_subplot_index,
             col=1,
         )
         fig.update_xaxes(
-            showticklabels=False, showgrid=False, row=peptides_subplot_index, col=1
+            showticklabels=False, showgrid=False, row=sample_subplot_index, col=1
         )
+    # Color legend trace
+    color_scale = [[0, INTENSITY_COLORS[0]], [1, INTENSITY_COLORS[1]]]
+    color_legend_trace = go.Scatter(
+        x=[None],
+        y=[None],
+        mode="markers",
+        marker=dict(
+            colorscale=color_scale,
+            showscale=True,
+            cmin=0,
+            cmax=1,
+            colorbar=dict(
+                title="Intensity of peptide",
+                x=1.0,  # Move further outside the plot
+                len=0.9,  # Increase length of colorbar (70% of subplot height)
+                thickness=30,  # Increase thickness of colorbar
+                titleside="right",  # Move title to the right side of the bar
+            ),
+            size=10,
+        ),
+        showlegend=False,
+    )
+
+    fig.add_trace(color_legend_trace, row=number_of_subplots, col=1)
 
     return fig
 
@@ -292,9 +315,11 @@ def add_sequence_depth_to_plot(
         y=coverage,
         showlegend=False,
         marker=dict(color=PLOT_PRIMARY_COLOR),
+        # hovering should give the amino acid, the position and the coverage
+        hoverinfo="text",
+        hovertemplate="%{x}: %{y}<extra></extra>",
     )
     fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=coverage_subplot_index, col=1)
-    fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1)
 
 
 def add_peptide_to_plot(
@@ -304,20 +329,25 @@ def add_peptide_to_plot(
     peptides_subplot_index: int,
     protein_sequence_labels: list[str],
     row_index: int,
+    offset: int,
+    color_a: str = "#FFFFFF",
+    color_b: str = PLOT_COLOR_SEQUENCE[3],
 ) -> None:
     x0, x1 = (
         peptide_match.start_location_on_protein,
         peptide_match.end_location_on_protein,
     )
-    y0, y1 = row_index, row_index + 1
+    y0, y1 = row_index + offset, row_index + offset + 1
+    # interpolate between the two colors
+    color = interpolate_color(color_a, color_b, normalized_intensity)
     PEPTIDE_SHAPE = dict(
         type="rect",
         x0=x0 - 0.5,
         y0=y0,
         x1=x1 - 0.5,
         y1=y1,
-        fillcolor=f"rgba({int(255 * normalized_intensity)}, 0, 0, 1)",
-        opacity=0.5,
+        fillcolor=color,
+        line_color=color,
         layer="above",
     )
     fig.add_shape(**PEPTIDE_SHAPE, row=peptides_subplot_index, col=1)

From 8a3ba5a299fa34ab9b06a7e656e7e686092e4cff Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Tue, 17 Dec 2024 15:41:46 +0100
Subject: [PATCH 11/13] Add grouping via metadata and aggregation via median,
 improve logging

---
 protzilla/data_analysis/protein_coverage.py | 184 +++++++++++++-------
 protzilla/methods/data_analysis.py          |   6 +-
 protzilla/steps.py                          |   5 +-
 ui/runs/forms/data_analysis.py              | 106 ++++++-----
 ui/runs/forms/fill_helper.py                |   4 +
 5 files changed, 201 insertions(+), 104 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 89133bfb..6b0e0bb9 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -16,11 +16,23 @@
 from numpy import log2
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
+from protzilla.constants.protzilla_logging import logger
 
 # import StrEnum
 from enum import StrEnum
 
 INTENSITY_COLORS = ["#FFFFFF", PLOT_COLOR_SEQUENCE[3]]
+SEQUENCE_DEPTH_PEPTIDE_SPACING = 1
+
+
+class IntensityNormalization(StrEnum):
+    min_max_scaling = "min_max_scaling"
+    none = "none"
+
+
+class AggregationMethod(StrEnum):
+    median = "median"
+    mean = "mean"
 
 
 @dataclass(unsafe_hash=True)
@@ -29,7 +41,7 @@ class PeptideMatch:
     start_location_on_protein: int
     end_location_on_protein: int
     intensity: float = 0.0
-    sample: str = ""
+    metadata_group: str = ""
 
 
 @dataclass(unsafe_hash=True)
@@ -85,6 +97,8 @@ def match_peptide_to_protein_ids(
     Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein
     sequence.
     """
+    if len(peptide_sequence) == 0:
+        raise ValueError("Peptide sequence is empty.")
     k = 5
     peptide_kmers = [
         peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)
@@ -107,6 +121,7 @@ def match_peptide_to_protein_ids(
         == peptide_sequence
     ]
     hits = list(set(hits))  # remove duplicates
+
     # Just sanity checks
     for hit in hits:
         subsequence = protein_dictionary[hit.protein_id][
@@ -129,19 +144,27 @@ def match_peptide_to_protein_ids(
 def plot_protein_coverage(
     fasta_df: pd.DataFrame,
     peptide_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
     protein_id: str,
-    samples: list[str] = None,
+    grouping: str,
+    selected_groups: list[str] = None,
+    aggregation_method: AggregationMethod = AggregationMethod.median,
 ) -> None:
     """
     Plots the coverage of a protein sequence by peptides.
     """
     # generate the protein kmer dictionary
-    if samples is None or samples == []:
+    if selected_groups is None or selected_groups == []:
         raise ValueError("No samples provided.")
 
+    # add group information to the peptide dataframe
+    peptide_df = peptide_df.merge(metadata_df, on="Sample")
+
     reduced_peptide_df = peptide_df[
-        peptide_df["Sample"].isin(samples) & peptide_df["Intensity"] > 0
+        peptide_df[grouping].isin(selected_groups) & peptide_df["Intensity"] > 0
     ]
+    if len(reduced_peptide_df) == 0:
+        raise ValueError("No peptides found for the samples provided.")
 
     protein_sequence_dict = dict(
         zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])
@@ -150,16 +173,22 @@ def plot_protein_coverage(
 
     protein_sequence = protein_sequence_dict[protein_id]
     peptide_matches = []
-    coverage_by_sample = {sample: [0] * len(protein_sequence) for sample in samples}
+    coverage_by_group_name = {
+        sample: [0] * len(protein_sequence) for sample in selected_groups
+    }
 
-    for sample_name, peptide_sequence, intensity in tqdm(
-        reduced_peptide_df[["Sample", "Sequence", "Intensity"]]
+    for (group_name, peptide_sequence), row in tqdm(
+        reduced_peptide_df[[grouping, "Sequence", "Intensity"]]
         .drop_duplicates()
-        .itertuples(index=False),
+        .groupby([grouping, "Sequence"]),
         desc="Matching peptides to protein",
         unit_scale=True,
         unit="peptide",
     ):
+        sample = extract_peptide_from_slice(row, aggregation_method)
+        if sample.empty:
+            continue
+        intensity = sample["Intensity"].values[0]
         protein_hits = match_peptide_to_protein_ids(
             peptide_sequence=peptide_sequence,
             protein_kmer_dictionary=sequence_kmer_dict,
@@ -177,11 +206,11 @@ def plot_protein_coverage(
                     start_location_on_protein=protein_hit.start_location_on_protein,
                     end_location_on_protein=protein_hit.end_location_on_protein,
                     intensity=log2(intensity),
-                    sample=sample_name,
+                    metadata_group=group_name,
                 )
             )
             # update the coverage of the protein sequence
-            increment_coverage(coverage_by_sample[sample_name], protein_hit)
+            increment_coverage(coverage_by_group_name[group_name], protein_hit)
 
     if len(peptide_matches) == 0:
         raise ValueError(f"No peptides matched for protein {protein_id}")
@@ -189,35 +218,38 @@ def plot_protein_coverage(
     # now that all the matches have been determined with their start and end on the protein sequence, we need to find
     # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the
     # required number of rows (vertical space)
-    rows_by_sample = distribute_to_rows(peptide_matches)
-
-    fig = _new_build_coverage_plot(
-        coverage_by_sample,
-        peptide_matches,
-        protein_id,
-        protein_sequence,
-        rows_by_sample,
+    rows_by_group_name = distribute_to_rows(peptide_matches)
+
+    fig = build_coverage_plot(
+        coverages_by_group_name=coverage_by_group_name,
+        peptide_matches=peptide_matches,
+        protein_id=protein_id,
+        protein_sequence=protein_sequence,
+        rows_by_group_name=rows_by_group_name,
+        grouping=grouping,
     )
 
     return dict(plots=[fig])
 
 
-def _new_build_coverage_plot(
-    coverages_by_sample: list[int],
+def build_coverage_plot(
+    coverages_by_group_name: list[int],
     peptide_matches: list[PeptideMatch],
     protein_id: str,
     protein_sequence: str,
-    rows_by_sample: dict[str, list[list[PeptideMatch]]],
+    rows_by_group_name: dict[str, list[list[PeptideMatch]]],
     intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling,
+    grouping: str = "",
 ) -> go.Figure:
-    # one row for each sample + two row for the protein sequence (on top and on the bottom)
-    number_of_subplots = len(rows_by_sample.keys())
+    number_of_subplots = len(rows_by_group_name.keys())
     protein_sequence_labels = [
         f"{amino_acid} - {amino_acid_index} "
         for amino_acid_index, amino_acid in enumerate(protein_sequence)
     ]
-    subplot_titles = [f"Peptides of {sample}" for sample in rows_by_sample.keys()]
-    max_coverage_value = get_max_coverage(coverages_by_sample)
+    subplot_titles = [
+        f"Peptides of {group_name}" for group_name in rows_by_group_name.keys()
+    ]
+    max_coverage_value = get_max_coverage(coverages_by_group_name)
 
     fig = make_subplots(
         rows=number_of_subplots,
@@ -228,25 +260,23 @@ def _new_build_coverage_plot(
         specs=[[{"secondary_y": True}] for _ in range(number_of_subplots)],
     )
     # Peptides
-    for sample_index, sample_name in enumerate(rows_by_sample.keys(), start=1):
-        peptide_rows = rows_by_sample[sample_name]
-        sample_subplot_index = sample_index
-        coverage = coverages_by_sample[sample_name]
+    for group_index, group_name in enumerate(rows_by_group_name.keys(), start=1):
+        peptide_rows = rows_by_group_name[group_name]
+        coverage = coverages_by_group_name[group_name]
 
         add_sequence_depth_to_plot(
-            fig,
-            coverage,
-            sample_subplot_index,
-            max_coverage_value,
-            protein_sequence_labels,
+            fig=fig,
+            coverage=coverage,
+            group_subplot_index=group_index,
+            protein_sequence_labels=protein_sequence_labels,
         )
 
         # Peptide plot
-        current_sample_intensities = [
+        current_group_intensities = [
             peptide_match.intensity for row in peptide_rows for peptide_match in row
         ]
-        max_intensity, min_intensity = max(current_sample_intensities), min(
-            current_sample_intensities
+        max_intensity, min_intensity = max(current_group_intensities), min(
+            current_group_intensities
         )
         scale_intensity = lambda intensity: (intensity - min_intensity) / (
             max_intensity - min_intensity
@@ -257,25 +287,35 @@ def _new_build_coverage_plot(
                     fig=fig,
                     normalized_intensity=scale_intensity(peptide_match.intensity),
                     peptide_match=peptide_match,
-                    peptides_subplot_index=sample_subplot_index,
+                    group_subplot_index=group_index,
                     protein_sequence_labels=protein_sequence_labels,
                     row_index=row_index,
-                    offset=max_coverage_value,
+                    offset=max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING,
                     color_a=INTENSITY_COLORS[0],
                     color_b=INTENSITY_COLORS[1],
+                    grouping=grouping,
                 )
 
+        fig.update_layout(hovermode="closest")
         fig.update_yaxes(
-            range=[0, len(peptide_rows) + max_coverage_value],
+            range=[
+                0,
+                len(peptide_rows) + max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING,
+            ],
             showticklabels=False,
             showgrid=False,
-            row=sample_subplot_index,
+            row=group_index,
             col=1,
         )
-        fig.update_xaxes(
-            showticklabels=False, showgrid=False, row=sample_subplot_index, col=1
-        )
-    # Color legend trace
+        fig.update_xaxes(showticklabels=False, showgrid=False, row=group_index, col=1)
+    fig.update_xaxes(showticklabels=True, row=number_of_subplots, col=1)
+
+    add_intensity_legend(fig)
+
+    return fig
+
+
+def add_intensity_legend(fig):
     color_scale = [[0, INTENSITY_COLORS[0]], [1, INTENSITY_COLORS[1]]]
     color_legend_trace = go.Scatter(
         x=[None],
@@ -297,17 +337,13 @@ def _new_build_coverage_plot(
         ),
         showlegend=False,
     )
-
-    fig.add_trace(color_legend_trace, row=number_of_subplots, col=1)
-
-    return fig
+    fig.add_trace(color_legend_trace, row=1, col=1)
 
 
 def add_sequence_depth_to_plot(
     fig: go.Figure,
     coverage: list[int],
-    coverage_subplot_index: int,
-    max_coverage_value: int,
+    group_subplot_index: int,
     protein_sequence_labels: list[str],
 ) -> None:
     SEQUENCE_DEPTH_BAR_CHART = dict(
@@ -315,23 +351,24 @@ def add_sequence_depth_to_plot(
         y=coverage,
         showlegend=False,
         marker=dict(color=PLOT_PRIMARY_COLOR),
-        # hovering should give the amino acid, the position and the coverage
-        hoverinfo="text",
-        hovertemplate="%{x}: %{y}<extra></extra>",
+        # add hover text
+        hoverinfo="y",
+        hovertemplate="Sequencing depth at %{x}: %{y}",
     )
-    fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=coverage_subplot_index, col=1)
+    fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=group_subplot_index, col=1)
 
 
 def add_peptide_to_plot(
     fig: go.Figure,
     normalized_intensity: float,
     peptide_match: PeptideMatch,
-    peptides_subplot_index: int,
+    group_subplot_index: int,
     protein_sequence_labels: list[str],
     row_index: int,
     offset: int,
     color_a: str = "#FFFFFF",
     color_b: str = PLOT_COLOR_SEQUENCE[3],
+    grouping: str = "",
 ) -> None:
     x0, x1 = (
         peptide_match.start_location_on_protein,
@@ -350,7 +387,7 @@ def add_peptide_to_plot(
         line_color=color,
         layer="above",
     )
-    fig.add_shape(**PEPTIDE_SHAPE, row=peptides_subplot_index, col=1)
+    fig.add_shape(**PEPTIDE_SHAPE, row=group_subplot_index, col=1)
     # add invisible plotly object to the rectangle to show the peptide sequence when hovered over
     fig.add_trace(
         go.Scatter(
@@ -360,7 +397,7 @@ def add_peptide_to_plot(
             ],
             y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence),
             text=[
-                f"Sample: {peptide_match.sample}<br>Peptide: {peptide_match.peptide_sequence}<br>"
+                f"{grouping}: {peptide_match.metadata_group}<br>Peptide: {peptide_match.peptide_sequence}<br>"
                 f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})<br>"
                 f"Intensity: {peptide_match.intensity}"
             ]
@@ -371,7 +408,7 @@ def add_peptide_to_plot(
             hovertemplate="%{text}<extra></extra>",
             showlegend=False,
         ),
-        row=peptides_subplot_index,
+        row=group_subplot_index,
         col=1,
     )
 
@@ -387,6 +424,25 @@ def increment_coverage(coverage: list[int], protein_hit: ProteinHit) -> None:
     ]
 
 
+def extract_peptide_from_slice(
+    slice_df: pd.DataFrame,
+    aggregation_strategy: AggregationMethod = AggregationMethod.median,
+) -> pd.DataFrame:
+    """
+    Given multiple peptide datapoints, return a single peptide datapoint with the intensity value extracted from the
+    slice of the peptide data.
+    """
+    if len(slice_df) == 1:
+        return slice_df
+    if aggregation_strategy == AggregationMethod.median:
+        slice_df = slice_df.groupby("Sequence").median()
+    elif aggregation_strategy == AggregationMethod.mean:
+        slice_df = slice_df.groupby("Sequence").mean()
+    else:
+        raise ValueError(f"Unknown strategy: {aggregation_strategy}")
+    return slice_df
+
+
 def distribute_to_rows(
     peptide_matches: list[PeptideMatch],
 ) -> dict[list[list[PeptideMatch]]]:
@@ -398,12 +454,14 @@ def distribute_to_rows(
         key=lambda peptide_match: peptide_match.start_location_on_protein
     )
     rows = {
-        sample: []
-        for sample in set([peptide_match.sample for peptide_match in peptide_matches])
+        group_name: []
+        for group_name in set(
+            [peptide_match.metadata_group for peptide_match in peptide_matches]
+        )
     }
     for peptide_match in peptide_matches:
         # find the first row that does not overlap with the current peptide
-        for row in rows[peptide_match.sample]:
+        for row in rows[peptide_match.metadata_group]:
             if (
                 row[-1].end_location_on_protein
                 < peptide_match.start_location_on_protein
@@ -411,7 +469,7 @@ def distribute_to_rows(
                 row.append((peptide_match))
                 break
         else:
-            rows[peptide_match.sample].append([(peptide_match)])
+            rows[peptide_match.metadata_group].append([(peptide_match)])
     return rows
 
 
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 765b3047..05a0d0fc 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -791,8 +791,11 @@ class PlotProteinCoverage(PlotStep):
     input_keys = [
         "protein_id",
         "fasta_df",
+        "metadata_df",
         "peptide_df",
-        "samples"
+        "grouping",
+        "selected_groups",
+        "aggregation_method",
     ]
     output_keys = []
 
@@ -806,6 +809,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
         inputs["peptide_df"] = steps.get_step_output(
             Step, "peptide_df", inputs["peptide_df_instance"]
         )
+        inputs["metadata_df"] = steps.metadata_df
         return
 
 
diff --git a/protzilla/steps.py b/protzilla/steps.py
index eecf0d04..eac35b65 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -7,6 +7,7 @@
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
+from protzilla.constants.protzilla_logging import logger
 
 import pandas as pd
 import plotly
@@ -455,6 +456,9 @@ def check_instance_identifier(step):
                 and input_key in step.inputs
             ):
                 return step.inputs[input_key]
+        logging.warning(
+            f"No input {input_key} found for step type {step_type} and instance identifier {instance_identifier}"
+        )
         return None
 
     def all_steps_in_section(self, section: str) -> list[Step]:
@@ -506,7 +510,6 @@ def metadata_df(self) -> pd.DataFrame | None:
         from protzilla.methods.importing import ImportingStep
 
         return self.get_step_output(ImportingStep, "metadata_df")
-        logging.warning("No metadata_df found in steps")
 
     @property
     def preprocessed_output(self) -> Output:
diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py
index 5195f7d5..1bee889b 100644
--- a/ui/runs/forms/data_analysis.py
+++ b/ui/runs/forms/data_analysis.py
@@ -175,12 +175,12 @@ class DifferentialExpressionANOVAForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields[
-            "protein_df"
-        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["protein_df"].choices = (
+            fill_helper.get_choices_for_protein_df_steps(run)
+        )
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
         self.fields["selected_groups"].choices = fill_helper.to_choices(
             run.steps.metadata_df[grouping].unique()
@@ -216,12 +216,12 @@ class DifferentialExpressionTTestForm(MethodForm):
     group2 = CustomChoiceField(choices=[], label="Group 2")
 
     def fill_form(self, run: Run) -> None:
-        self.fields[
-            "protein_df"
-        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["protein_df"].choices = (
+            fill_helper.get_choices_for_protein_df_steps(run)
+        )
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -264,9 +264,9 @@ class DifferentialExpressionLinearModelForm(MethodForm):
     group2 = CustomChoiceField(choices=[], label="Group 2")
 
     def fill_form(self, run: Run) -> None:
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -312,13 +312,13 @@ class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm):
     group2 = CustomChoiceField(choices=[], label="Group 2")
 
     def fill_form(self, run: Run) -> None:
-        self.fields[
-            "protein_df"
-        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
+        self.fields["protein_df"].choices = (
+            fill_helper.get_choices_for_protein_df_steps(run)
+        )
 
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -373,9 +373,9 @@ def fill_form(self, run: Run) -> None:
             run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df")
         )
 
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
 
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
 
@@ -423,13 +423,13 @@ class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm):
     )
 
     def fill_form(self, run: Run) -> None:
-        self.fields[
-            "protein_df"
-        ].choices = fill_helper.get_choices_for_protein_df_steps(run)
+        self.fields["protein_df"].choices = (
+            fill_helper.get_choices_for_protein_df_steps(run)
+        )
 
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
         self.fields["selected_groups"].choices = fill_helper.to_choices(
             run.steps.metadata_df[grouping].unique()
@@ -462,9 +462,9 @@ def fill_form(self, run: Run) -> None:
         self.fields["ptm_df"].choices = fill_helper.to_choices(
             run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df")
         )
-        self.fields[
-            "grouping"
-        ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+        )
         grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
         self.fields["selected_groups"].choices = fill_helper.to_choices(
             run.steps.metadata_df[grouping].unique()
@@ -1091,6 +1091,9 @@ class ProteinGraphVariationGraphForm(MethodForm):
 
 
 class PlotProteinCoverageForm(MethodForm):
+    from protzilla.data_analysis.protein_coverage import AggregationMethod
+
+    is_dynamic = True
     peptide_df_instance = CustomChoiceField(
         choices=[],
         label="Step to use peptide data from",
@@ -1103,11 +1106,19 @@ class PlotProteinCoverageForm(MethodForm):
         choices=[],
         label="Protein ID",
     )
-    samples = CustomMultipleChoiceField(
+    grouping = CustomChoiceField(
         choices=[],
-        label="Samples",
+        label="Grouping from metadata",
+    )
+    selected_groups = CustomMultipleChoiceField(
+        choices=[],
+        label="Select groups / samples to plot",
+    )
+    aggregation_method = CustomChoiceField(
+        choices=fill_helper.to_choices(AggregationMethod),
+        label="Aggregation method",
+        initial=AggregationMethod.median,
     )
-
 
     def fill_form(self, run: Run) -> None:
         self.fields["peptide_df_instance"].choices = fill_helper.get_choices(
@@ -1129,11 +1140,28 @@ def fill_form(self, run: Run) -> None:
             Step, "fasta_df", fasta_df_instance_id
         )["Protein ID"].unique()
         peptide_df_protein_ids = peptide_df["Protein ID"].unique()
-        common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids)))
+        common_protein_ids = sorted(
+            list(set(fasta_protein_ids) & set(peptide_df_protein_ids))
+        )
         self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids)
 
-        peptide_df_samples = peptide_df["Sample"].unique()
-        self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples)
+        # Grouping
+        self.fields["grouping"].choices = (
+            fill_helper.get_choices_for_metadata_non_sample_columns(run)
+            + [("Sample", "Sample")]
+        )
+
+        grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0])
+        if grouping == "Sample":
+            self.fields["selected_groups"].choices = fill_helper.to_choices(
+                peptide_df["Sample"].unique()
+            )
+        else:
+            self.fields["selected_groups"].choices = fill_helper.to_choices(
+                run.steps.metadata_df[grouping].unique()
+            )
+        # if grouping is not Sample, show the aggregation method option
+        self.toggle_visibility("aggregation_method", grouping != "Sample")
 
 
 class FLEXIQuantLFForm(MethodForm):
diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py
index eef83be6..d4f86960 100644
--- a/ui/runs/forms/fill_helper.py
+++ b/ui/runs/forms/fill_helper.py
@@ -1,5 +1,6 @@
 from protzilla.run import Run
 from protzilla.steps import Step
+from protzilla.constants.protzilla_logging import logger
 
 
 def to_choices(choices: list[str], required: bool = True) -> list[tuple[str, str]]:
@@ -29,6 +30,9 @@ def get_choices(
 
 
 def get_choices_for_metadata_non_sample_columns(run: Run) -> list[tuple[str, str]]:
+    if run.steps.metadata_df is None:
+        logger.warning("No metadata_df found in run")
+        return []
     return to_choices(
         run.steps.metadata_df.columns[
             run.steps.metadata_df.columns != "Sample"

From e7fc5f5b1e05e7bcaeffe28a5b46de1478aa6aac Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Mon, 13 Jan 2025 14:42:07 +0100
Subject: [PATCH 12/13] Error handling and dynamic scaling of peptide boxes to
 preserve 1:2 ratio with bar plot

---
 protzilla/data_analysis/protein_coverage.py | 61 +++++++++++++++++----
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py
index 6b0e0bb9..bd77d783 100644
--- a/protzilla/data_analysis/protein_coverage.py
+++ b/protzilla/data_analysis/protein_coverage.py
@@ -94,8 +94,7 @@ def match_peptide_to_protein_ids(
 ) -> list[ProteinHit]:
     """
     Matches a peptide sequence to a dictionary of kmers in protein sequences.
-    Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein
-    sequence.
+    Returns a list of ProteinHit objects containing the protein ID and the start location of the peptide on the protein.
     """
     if len(peptide_sequence) == 0:
         raise ValueError("Peptide sequence is empty.")
@@ -107,6 +106,9 @@ def match_peptide_to_protein_ids(
     # match the kmers
     first_kmer_matches = protein_kmer_dictionary.get(first_kmer, [])
     last_kmer_matches = protein_kmer_dictionary.get(last_kmer, [])
+    if first_kmer_matches == [] or last_kmer_matches == []:
+        return []
+
     hits = [
         ProteinHit(
             protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence)
@@ -171,6 +173,9 @@ def plot_protein_coverage(
     )
     sequence_kmer_dict = build_kmer_dictionary(protein_sequence_dict, k=5)
 
+    if protein_id not in protein_sequence_dict:
+        raise ValueError(f"Protein ID {protein_id} not found in protein dictionary.")
+
     protein_sequence = protein_sequence_dict[protein_id]
     peptide_matches = []
     coverage_by_group_name = {
@@ -227,6 +232,7 @@ def plot_protein_coverage(
         protein_sequence=protein_sequence,
         rows_by_group_name=rows_by_group_name,
         grouping=grouping,
+        aggregation_method=aggregation_method,
     )
 
     return dict(plots=[fig])
@@ -240,6 +246,7 @@ def build_coverage_plot(
     rows_by_group_name: dict[str, list[list[PeptideMatch]]],
     intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling,
     grouping: str = "",
+    aggregation_method: AggregationMethod = AggregationMethod.median,
 ) -> go.Figure:
     number_of_subplots = len(rows_by_group_name.keys())
     protein_sequence_labels = [
@@ -264,6 +271,21 @@ def build_coverage_plot(
         peptide_rows = rows_by_group_name[group_name]
         coverage = coverages_by_group_name[group_name]
 
+        # Calculate the desired heights
+        max_bar_height = max_coverage_value
+        desired_peptide_height = max_bar_height * 2  # 1:2 ratio
+
+        # Calculate the height scaling factor for peptides
+        num_peptide_rows = len(peptide_rows)
+        if num_peptide_rows > 0:
+            # Include spacing in the calculation
+            total_spacing = SEQUENCE_DEPTH_PEPTIDE_SPACING
+            peptide_box_height = (
+                desired_peptide_height - total_spacing
+            ) / num_peptide_rows
+        else:
+            peptide_box_height = 1.0
+
         add_sequence_depth_to_plot(
             fig=fig,
             coverage=coverage,
@@ -278,8 +300,10 @@ def build_coverage_plot(
         max_intensity, min_intensity = max(current_group_intensities), min(
             current_group_intensities
         )
-        scale_intensity = lambda intensity: (intensity - min_intensity) / (
-            max_intensity - min_intensity
+        scale_intensity = lambda intensity: (
+            (intensity - min_intensity) / (max_intensity - min_intensity)
+            if max_intensity != min_intensity
+            else 0
         )
         for row_index, row in enumerate(peptide_rows):
             for peptide_match in row:
@@ -291,17 +315,19 @@ def build_coverage_plot(
                     protein_sequence_labels=protein_sequence_labels,
                     row_index=row_index,
                     offset=max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING,
+                    box_height=peptide_box_height,
                     color_a=INTENSITY_COLORS[0],
                     color_b=INTENSITY_COLORS[1],
                     grouping=grouping,
                 )
 
         fig.update_layout(hovermode="closest")
+        # Update y-axis range to accommodate both the bar plot and peptide sections
+        total_height = (
+            max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING + desired_peptide_height
+        )
         fig.update_yaxes(
-            range=[
-                0,
-                len(peptide_rows) + max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING,
-            ],
+            range=[0, total_height],
             showticklabels=False,
             showgrid=False,
             row=group_index,
@@ -310,12 +336,14 @@ def build_coverage_plot(
         fig.update_xaxes(showticklabels=False, showgrid=False, row=group_index, col=1)
     fig.update_xaxes(showticklabels=True, row=number_of_subplots, col=1)
 
-    add_intensity_legend(fig)
+    add_intensity_legend(fig, aggregation_method)
 
     return fig
 
 
-def add_intensity_legend(fig):
+def add_intensity_legend(
+    fig, aggregation_method: AggregationMethod = AggregationMethod.median
+):
     color_scale = [[0, INTENSITY_COLORS[0]], [1, INTENSITY_COLORS[1]]]
     color_legend_trace = go.Scatter(
         x=[None],
@@ -327,7 +355,7 @@ def add_intensity_legend(fig):
             cmin=0,
             cmax=1,
             colorbar=dict(
-                title="Intensity of peptide",
+                title=f"Intensity of peptide<br>(aggregated via {aggregation_method})",
                 x=1.0,  # Move further outside the plot
                 len=0.9,  # Increase length of colorbar (70% of subplot height)
                 thickness=30,  # Increase thickness of colorbar
@@ -366,6 +394,7 @@ def add_peptide_to_plot(
     protein_sequence_labels: list[str],
     row_index: int,
     offset: int,
+    box_height: float,
     color_a: str = "#FFFFFF",
     color_b: str = PLOT_COLOR_SEQUENCE[3],
     grouping: str = "",
@@ -374,7 +403,7 @@ def add_peptide_to_plot(
         peptide_match.start_location_on_protein,
         peptide_match.end_location_on_protein,
     )
-    y0, y1 = row_index + offset, row_index + offset + 1
+    y0, y1 = row_index * box_height + offset, (row_index + 1) * box_height + offset
     # interpolate between the two colors
     color = interpolate_color(color_a, color_b, normalized_intensity)
     PEPTIDE_SHAPE = dict(
@@ -450,6 +479,10 @@ def distribute_to_rows(
     Distributes peptide matches to rows such that no two peptides overlap in the same row.
     Greedy algorithm (see Interval Scheduling Problem).
     """
+    if len(peptide_matches) == 0:
+        raise ValueError(
+            "Attempted to distribute empty list of peptide matches to rows in plot."
+        )
     peptide_matches.sort(
         key=lambda peptide_match: peptide_match.start_location_on_protein
     )
@@ -474,4 +507,8 @@ def distribute_to_rows(
 
 
 def get_max_coverage(coverage: dict[list[int]]) -> int:
+    if len(coverage) == 0:
+        raise ValueError(
+            "Cannot calculate maximum coverage value: No coverage data provided."
+        )
     return max([max(coverage) for coverage in coverage.values()])

From a762b20cd5bbc23c06d6319ffaafa98776abf9f5 Mon Sep 17 00:00:00 2001
From: henning <henning.gaertner@student.hpi.de>
Date: Mon, 13 Jan 2025 15:08:49 +0100
Subject: [PATCH 13/13] Tests for protein_coverage

---
 .../data_analysis/test_protein_coverage.py    | 285 ++++++++++++++++++
 1 file changed, 285 insertions(+)

diff --git a/tests/protzilla/data_analysis/test_protein_coverage.py b/tests/protzilla/data_analysis/test_protein_coverage.py
index e69de29b..2339d840 100644
--- a/tests/protzilla/data_analysis/test_protein_coverage.py
+++ b/tests/protzilla/data_analysis/test_protein_coverage.py
@@ -0,0 +1,285 @@
+import pytest
+from protzilla.data_analysis.protein_coverage import (
+    distribute_to_rows,
+    get_max_coverage,
+    PeptideMatch,
+)
+
+import pytest
+import pandas as pd
+from protzilla.data_analysis.protein_coverage import (
+    extract_peptide_from_slice,
+    AggregationMethod,
+)
+
+import pytest
+from protzilla.data_analysis.protein_coverage import increment_coverage, ProteinHit
+
+import pytest
+from protzilla.data_analysis.protein_coverage import (
+    match_peptide_to_protein_ids,
+    ProteinHit,
+)
+
+
+def test_match_peptide_to_protein_ids_empty_peptide():
+    with pytest.raises(ValueError, match="Peptide sequence is empty."):
+        match_peptide_to_protein_ids("", {}, {})
+
+
+def test_match_peptide_to_protein_ids_no_matches():
+    peptide_sequence = "ABCDE"
+    protein_kmer_dictionary = {"ABCDE": [("protein1", 0)]}
+    protein_dictionary = {"protein1": "XYZ"}
+    result = match_peptide_to_protein_ids(
+        peptide_sequence, protein_kmer_dictionary, protein_dictionary
+    )
+    assert result == []
+
+
+def test_match_peptide_to_protein_ids_single_match():
+    peptide_sequence = "ABCDE"
+    protein_kmer_dictionary = {"ABCDE": [("protein1", 0)]}
+    protein_dictionary = {"protein1": "ABCDE"}
+    result = match_peptide_to_protein_ids(
+        peptide_sequence, protein_kmer_dictionary, protein_dictionary
+    )
+    expected = [
+        ProteinHit(
+            protein_id="protein1",
+            start_location_on_protein=0,
+            end_location_on_protein=5,
+        )
+    ]
+    assert result == expected
+
+
+def test_match_peptide_to_protein_ids_multiple_matches():
+    peptide_sequence = "ABCDE"
+    protein_kmer_dictionary = {"ABCDE": [("protein1", 0), ("protein2", 0)]}
+    protein_dictionary = {"protein1": "ABCDE", "protein2": "ABCDE"}
+    result = match_peptide_to_protein_ids(
+        peptide_sequence, protein_kmer_dictionary, protein_dictionary
+    )
+    expected = list(
+        set(
+            [
+                ProteinHit(
+                    protein_id="protein1",
+                    start_location_on_protein=0,
+                    end_location_on_protein=5,
+                ),
+                ProteinHit(
+                    protein_id="protein2",
+                    start_location_on_protein=0,
+                    end_location_on_protein=5,
+                ),
+            ]
+        )
+    )
+    assert result == expected
+
+
+def test_match_peptide_to_protein_ids_partial_match():
+    peptide_sequence = "ABCDE"
+    protein_kmer_dictionary = {"ABCDE": [("protein1", 0)]}
+    protein_dictionary = {"protein1": "ABCDEXYZ"}
+    result = match_peptide_to_protein_ids(
+        peptide_sequence, protein_kmer_dictionary, protein_dictionary
+    )
+    expected = [
+        ProteinHit(
+            protein_id="protein1",
+            start_location_on_protein=0,
+            end_location_on_protein=5,
+        )
+    ]
+    assert result == expected
+
+
+def test_increment_coverage():
+    coverage = [0, 0, 0, 0, 0]
+    protein_hit = ProteinHit(
+        protein_id="P12345", start_location_on_protein=1, end_location_on_protein=4
+    )
+    increment_coverage(coverage, protein_hit)
+    assert coverage == [0, 1, 1, 1, 0]
+
+
+def test_increment_coverage_multiple_hits():
+    coverage = [0, 0, 0, 0, 0]
+    protein_hit1 = ProteinHit(
+        protein_id="P12345", start_location_on_protein=1, end_location_on_protein=3
+    )
+    protein_hit2 = ProteinHit(
+        protein_id="P12345", start_location_on_protein=2, end_location_on_protein=4
+    )
+    increment_coverage(coverage, protein_hit1)
+    increment_coverage(coverage, protein_hit2)
+    assert coverage == [0, 1, 2, 1, 0]
+
+
+def test_increment_coverage_no_overlap():
+    coverage = [0, 0, 0, 0, 0]
+    protein_hit1 = ProteinHit(
+        protein_id="P12345", start_location_on_protein=0, end_location_on_protein=2
+    )
+    protein_hit2 = ProteinHit(
+        protein_id="P12345", start_location_on_protein=3, end_location_on_protein=5
+    )
+    increment_coverage(coverage, protein_hit1)
+    increment_coverage(coverage, protein_hit2)
+    assert coverage == [1, 1, 0, 1, 1]
+
+
+def test_extract_peptide_from_slice_single_entry():
+    data = {"Sequence": ["PEPTIDE"], "Intensity": [100]}
+    df = pd.DataFrame(data)
+    result = extract_peptide_from_slice(df)
+    assert result.equals(df)
+
+
+def test_extract_peptide_from_slice_multiple_entries():
+    data = {"Sequence": ["PEPTIDE", "PEPTIDE", "PEPTIDE"], "Intensity": [100, 130, 200]}
+    df = pd.DataFrame(data)
+    result = extract_peptide_from_slice(df)
+    expected = pd.DataFrame({"Sequence": "PEPTIDE", "Intensity": [130.0]})
+    expected.set_index("Sequence", inplace=True)
+    assert result.equals(expected)
+
+
+def test_extract_peptide_from_slice_median():
+    data = {"Sequence": ["PEPTIDE", "PEPTIDE"], "Intensity": [100, 200]}
+    df = pd.DataFrame(data)
+    result = extract_peptide_from_slice(df, AggregationMethod.median)
+    expected = pd.DataFrame({"Sequence": "PEPTIDE", "Intensity": [150.0]})
+    expected.set_index("Sequence", inplace=True)
+    assert result.equals(expected)
+
+
+def test_extract_peptide_from_slice_mean():
+    data = {"Sequence": ["PEPTIDE", "PEPTIDE"], "Intensity": [100, 200]}
+    df = pd.DataFrame(data)
+    result = extract_peptide_from_slice(df, AggregationMethod.mean)
+    expected = pd.DataFrame({"Sequence": "PEPTIDE", "Intensity": [150.0]})
+    expected.set_index("Sequence", inplace=True)
+    assert result.equals(expected)
+
+
+def test_extract_peptide_from_slice_unknown_strategy():
+    data = {"Sequence": ["PEPTIDE", "PEPTIDE"], "Intensity": [100, 200]}
+    df = pd.DataFrame(data)
+    with pytest.raises(ValueError, match="Unknown strategy: unknown"):
+        extract_peptide_from_slice(df, "unknown")
+
+
+import pytest
+from protzilla.data_analysis.protein_coverage import distribute_to_rows, PeptideMatch
+
+
+def test_distribute_to_rows_empty():
+    with pytest.raises(
+        ValueError,
+        match="Attempted to distribute empty list of peptide matches to rows in plot.",
+    ):
+        distribute_to_rows([])
+
+
+def test_distribute_to_rows_single_peptide():
+    peptide_matches = [
+        PeptideMatch(
+            peptide_sequence="AAA",
+            start_location_on_protein=0,
+            end_location_on_protein=3,
+            metadata_group="group1",
+        )
+    ]
+    expected = {"group1": [[peptide_matches[0]]]}
+    assert distribute_to_rows(peptide_matches) == expected
+
+
+def test_distribute_to_rows_non_overlapping():
+    peptide_matches = [
+        PeptideMatch(
+            peptide_sequence="AAA",
+            start_location_on_protein=0,
+            end_location_on_protein=3,
+            metadata_group="group1",
+        ),
+        PeptideMatch(
+            peptide_sequence="BBB",
+            start_location_on_protein=4,
+            end_location_on_protein=7,
+            metadata_group="group1",
+        ),
+    ]
+    expected = {"group1": [[peptide_matches[0], peptide_matches[1]]]}
+    assert distribute_to_rows(peptide_matches) == expected
+
+
+def test_distribute_to_rows_overlapping():
+    peptide_matches = [
+        PeptideMatch(
+            peptide_sequence="AAA",
+            start_location_on_protein=0,
+            end_location_on_protein=3,
+            metadata_group="group1",
+        ),
+        PeptideMatch(
+            peptide_sequence="BBB",
+            start_location_on_protein=2,
+            end_location_on_protein=5,
+            metadata_group="group1",
+        ),
+    ]
+    expected = {"group1": [[peptide_matches[0]], [peptide_matches[1]]]}
+    assert distribute_to_rows(peptide_matches) == expected
+
+
+def test_distribute_to_rows_multiple_groups():
+    peptide_matches = [
+        PeptideMatch(
+            peptide_sequence="AAA",
+            start_location_on_protein=0,
+            end_location_on_protein=3,
+            metadata_group="group1",
+        ),
+        PeptideMatch(
+            peptide_sequence="BBB",
+            start_location_on_protein=4,
+            end_location_on_protein=7,
+            metadata_group="group2",
+        ),
+    ]
+    expected = {"group1": [[peptide_matches[0]]], "group2": [[peptide_matches[1]]]}
+    assert distribute_to_rows(peptide_matches) == expected
+
+
+def test_distribute_to_rows():
+    peptide_matches = [
+        PeptideMatch("PEPTIDE1", 0, 7, 1.0, "group1"),
+        PeptideMatch("PEPTIDE2", 8, 15, 1.0, "group1"),
+        PeptideMatch("PEPTIDE3", 16, 23, 1.0, "group1"),
+        PeptideMatch("PEPTIDE4", 0, 7, 1.0, "group2"),
+        PeptideMatch("PEPTIDE5", 8, 15, 1.0, "group2"),
+    ]
+    rows = distribute_to_rows(peptide_matches)
+    assert len(rows["group1"]) == 1
+    assert len(rows["group2"]) == 1
+
+
+def test_get_max_coverage():
+    coverage = {
+        "group1": [1, 2, 3, 4, 5],
+        "group2": [2, 3, 4, 5, 6],
+    }
+    max_coverage = get_max_coverage(coverage)
+    assert max_coverage == 6
+
+
+def test_get_max_coverage_empty():
+    with pytest.raises(
+        ValueError,
+        match="Cannot calculate maximum coverage value: No coverage data provided.",
+    ):
+        get_max_coverage({})