From 51dff9489d278da95a9a09606ab55d07703dffa2 Mon Sep 17 00:00:00 2001 From: henning Date: Thu, 14 Nov 2024 15:58:03 +0100 Subject: [PATCH 01/13] Add fasta import, primitive peptide - protein sequence alignment, preliminary coverage bar plot --- protzilla/data_analysis/protein_coverage.py | 70 ++++++++ protzilla/importing/fasta_import.py | 36 ++++ protzilla/methods/data_analysis.py | 128 ++++++++++---- protzilla/methods/importing.py | 21 ++- .../data_analysis/test_protein_coverage.py | 73 ++++++++ ui/runs/form_mapping.py | 2 + ui/runs/forms/data_analysis.py | 156 +++++++++++++----- ui/runs/forms/importing.py | 4 + 8 files changed, 407 insertions(+), 83 deletions(-) create mode 100644 protzilla/data_analysis/protein_coverage.py create mode 100644 protzilla/importing/fasta_import.py create mode 100644 tests/protzilla/data_analysis/test_protein_coverage.py diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py new file mode 100644 index 00000000..afc504ad --- /dev/null +++ b/protzilla/data_analysis/protein_coverage.py @@ -0,0 +1,70 @@ +from typing import List + +import pandas as pd +import plotly.graph_objects as go + + +def align_peptide_to_protein_sequence( + peptide_sequence: str, protein_sequence: str +) -> List[int]: + """ + Aligns a peptide to a protein sequence and returns the indices of the protein sequence where the peptide. + NAIVE APPROACH + """ + if ( + len(peptide_sequence) == 0 + or len(protein_sequence) == 0 + or len(peptide_sequence) > len(protein_sequence) + ): + return [] + indices = [] + for i in range(len(protein_sequence) - len(peptide_sequence) + 1): + if protein_sequence[i : i + len(peptide_sequence)] == peptide_sequence: + indices.append(i) + return indices + + +def plot_protein_coverage( + fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str +) -> None: + """ + Plots the coverage of a protein sequence by peptides. + """ + # Retrieve the relevant peptides from the peptide df + relevant_peptides = peptide_df[peptide_df["Protein ID"] == protein_id] + protein_sequence = fasta_df[fasta_df["Protein ID"] == protein_id][ + "Protein Sequence" + ].values[0] + protein_sequence_length = len(protein_sequence) + coverage = [0] * protein_sequence_length + for peptide_sequence in relevant_peptides["Sequence"].unique(): + indices = align_peptide_to_protein_sequence(peptide_sequence, protein_sequence) + for i in indices: + for j in range(len(peptide_sequence)): + coverage[i + j] += 1 + + # Make sure coverage length matches the sequence length + if len(coverage) != protein_sequence_length: + raise ValueError("Length of coverage must match protein_sequence_length.") + + # Generate labels for amino acid positions + amino_acid_positions = list(range(1, protein_sequence_length + 1)) + x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in amino_acid_positions] + + # Create the bar chart + fig = go.Figure() + + # Add bars for coverage values + fig.add_trace( + go.Bar(x=x_labels, y=coverage, name="Coverage", marker=dict(color="skyblue")) + ) + + # Customize layout + fig.update_layout( + title="Protein Coverage", + xaxis_title="Amino Acid", + yaxis_title="Coverage Value", + xaxis=dict(tickmode="linear"), + bargap=0.1, # Adjust space between bars if needed + ) + return dict(plots=[fig]) diff --git a/protzilla/importing/fasta_import.py b/protzilla/importing/fasta_import.py new file mode 100644 index 00000000..bbdb9373 --- /dev/null +++ b/protzilla/importing/fasta_import.py @@ -0,0 +1,36 @@ +""" +This module contains the code to parse a fasta file containing protein sequences and their ids. +""" +import logging + +import pandas as pd +from Bio import SeqIO + + +def parse_fasta_id(fasta_id: str) -> str: + """ + Parse the fasta id to get the protein name from the fasta id string + """ + metadata = fasta_id.split("|")[1] + if len(metadata) < 2: + logging.warning(f"Metadata too short: {metadata}") + return "" + return metadata + + +def fasta_import(file_path: str) -> pd.DataFrame: + """ + Import a fasta file and return a DataFrame with the protein sequences and their protein ids + """ + fasta_iterator = SeqIO.parse(open(file_path), "fasta") + protein_ids = [] + protein_sequences = [] + for fasta_sequence in fasta_iterator: + id, sequence = parse_fasta_id(fasta_sequence.id), str(fasta_sequence.seq) + protein_ids.append(id) + protein_sequences.append(sequence) + + fasta_sequences = pd.DataFrame( + {"Protein ID": protein_ids, "Protein Sequence": protein_sequences} + ) + return {"fasta_df": fasta_sequences} diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 511629f5..ab8bdc27 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -7,15 +7,17 @@ k_means, ) from protzilla.data_analysis.differential_expression_anova import anova -from protzilla.data_analysis.differential_expression_kruskal_wallis import kruskal_wallis_test_on_ptm_data, \ - kruskal_wallis_test_on_intensity_data +from protzilla.data_analysis.differential_expression_kruskal_wallis import ( + kruskal_wallis_test_on_intensity_data, + kruskal_wallis_test_on_ptm_data, +) from protzilla.data_analysis.differential_expression_linear_model import linear_model from protzilla.data_analysis.differential_expression_mann_whitney import ( - mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data) + mann_whitney_test_on_intensity_data, + mann_whitney_test_on_ptm_data, +) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap -from protzilla.data_analysis.ptm_analysis import ptms_per_sample, \ - ptms_per_protein_and_sample, select_peptides_of_protein from protzilla.data_analysis.model_evaluation import evaluate_classification_model from protzilla.data_analysis.plots import ( clustergram_plot, @@ -23,11 +25,12 @@ prot_quant_plot, scatter_plot, ) +from protzilla.data_analysis.protein_coverage import plot_protein_coverage from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.data_analysis.ptm_analysis import ( - select_peptides_of_protein, ptms_per_protein_and_sample, ptms_per_sample, + select_peptides_of_protein, ) from protzilla.data_analysis.ptm_quantification import flexiquant_lf from protzilla.methods.data_preprocessing import TransformationLog @@ -164,8 +167,10 @@ def plot(self, inputs): class DifferentialExpressionMannWhitneyOnIntensity(DataAnalysisStep): display_name = "Mann-Whitney Test" operation = "differential_expression" - method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Mann-Whitney U test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "protein_df", @@ -191,7 +196,9 @@ def method(self, inputs: dict) -> dict: def insert_dataframes(self, steps: StepManager, inputs) -> dict: if steps.get_step_output(Step, "protein_df", inputs["protein_df"]) is not None: - inputs["protein_df"] = steps.get_step_output(Step, "protein_df", inputs["protein_df"]) + inputs["protein_df"] = steps.get_step_output( + Step, "protein_df", inputs["protein_df"] + ) inputs["metadata_df"] = steps.metadata_df inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base") return inputs @@ -200,8 +207,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionMannWhitneyOnPTM(DataAnalysisStep): display_name = "Mann-Whitney Test" operation = "Peptide analysis" - method_description = ("A function to conduct a Mann-Whitney U test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Mann-Whitney U test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "ptm_df", @@ -234,8 +243,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "differential_expression" - method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "protein_df", @@ -265,8 +276,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionKruskalWallisOnIntensity(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "differential_expression" - method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "protein_df", @@ -288,7 +301,9 @@ def method(self, inputs: dict) -> dict: return kruskal_wallis_test_on_intensity_data(**inputs) def insert_dataframes(self, steps: StepManager, inputs) -> dict: - inputs["protein_df"] = steps.get_step_output(Step, "protein_df", inputs["protein_df"]) + inputs["protein_df"] = steps.get_step_output( + Step, "protein_df", inputs["protein_df"] + ) inputs["metadata_df"] = steps.metadata_df inputs["log_base"] = steps.get_step_input(TransformationLog, "log_base") return inputs @@ -297,8 +312,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class DifferentialExpressionKruskalWallisOnPTM(DataAnalysisStep): display_name = "Kruskal-Wallis Test" operation = "Peptide analysis" - method_description = ("A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." - "The p-values are corrected for multiple testing.") + method_description = ( + "A function to conduct a Kruskal-Wallis test between groups defined in the clinical data." + "The p-values are corrected for multiple testing." + ) input_keys = [ "ptm_df", @@ -327,9 +344,11 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PlotVolcano(PlotStep): display_name = "Volcano Plot" operation = "plot" - method_description = ("Plots the results of a differential expression analysis in a volcano plot. The x-axis shows " - "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user " - "can define a fold change threshold and an alpha level to highlight significant items.") + method_description = ( + "Plots the results of a differential expression analysis in a volcano plot. The x-axis shows " + "the log2 fold change and the y-axis shows the -log10 of the corrected p-values. The user " + "can define a fold change threshold and an alpha level to highlight significant items." + ) input_keys = [ "p_values", "fc_threshold", @@ -762,6 +781,33 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class PlotProteinCoverage(PlotStep): + display_name = "Protein Coverage Plot" + operation = "plot" + method_description = ( + "Create a protein coverage plot from a protein graph and peptide data" + ) + + input_keys = [ + "protein_id", + "fasta_df", + "peptide_df", + ] + output_keys = [] + + def method(self, inputs: dict) -> dict: + return plot_protein_coverage(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["fasta_df"] = steps.get_step_output( + Step, "fasta_df", inputs["fasta_df_instance"] + ) + inputs["peptide_df"] = steps.get_step_output( + Step, "peptide_df", inputs["peptide_df_instance"] + ) + return + + class ProteinGraphVariationGraph(DataAnalysisStep): display_name = "Protein Variation Graph" operation = "protein_graph" @@ -841,17 +887,23 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["metadata_df"] = steps.metadata_df if inputs["auto_select"]: - significant_proteins = ( - steps.get_step_output(DataAnalysisStep, "significant_proteins_df", inputs["protein_list"])) - index_of_most_significant_protein = significant_proteins['corrected_p_value'].idxmin() - most_significant_protein = significant_proteins.loc[index_of_most_significant_protein] + significant_proteins = steps.get_step_output( + DataAnalysisStep, "significant_proteins_df", inputs["protein_list"] + ) + index_of_most_significant_protein = significant_proteins[ + "corrected_p_value" + ].idxmin() + most_significant_protein = significant_proteins.loc[ + index_of_most_significant_protein + ] inputs["protein_id"] = [most_significant_protein["Protein ID"]] - self.messages.append({ - "level": logging.INFO, - "msg": - f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, " - f"from {inputs['protein_list']}" - }) + self.messages.append( + { + "level": logging.INFO, + "msg": f"Selected the most significant Protein: {most_significant_protein['Protein ID']}, " + f"from {inputs['protein_list']}", + } + ) return inputs @@ -859,8 +911,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" - method_description = ("Analyze the post-translational modifications (PTMs) of a single protein of interest. " - "This function requires a peptide dataframe with PTM information.") + method_description = ( + "Analyze the post-translational modifications (PTMs) of a single protein of interest. " + "This function requires a peptide dataframe with PTM information." + ) input_keys = [ "peptide_df", @@ -882,8 +936,10 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: class PTMsProteinAndPerSample(DataAnalysisStep): display_name = "PTMs per Sample and Protein" operation = "Peptide analysis" - method_description = ("Analyze the post-translational modifications (PTMs) of all Proteins. " - "This function requires a peptide dataframe with PTM information.") + method_description = ( + "Analyze the post-translational modifications (PTMs) of all Proteins. " + "This function requires a peptide dataframe with PTM information." + ) input_keys = [ "peptide_df", @@ -899,4 +955,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["peptide_df"] = steps.get_step_output( Step, "peptide_df", inputs["peptide_df"] ) - return inputs \ No newline at end of file + return inputs diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 6a2f6835..effcd5f1 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -1,5 +1,6 @@ from __future__ import annotations +from protzilla.importing.fasta_import import fasta_import from protzilla.importing.metadata_import import ( metadata_column_assignment, metadata_import_method, @@ -10,7 +11,7 @@ max_quant_import, ms_fragger_import, ) -from protzilla.importing.peptide_import import peptide_import, evidence_import +from protzilla.importing.peptide_import import evidence_import, peptide_import from protzilla.steps import Step, StepManager @@ -51,7 +52,9 @@ def method(self, inputs): class MsFraggerImport(ImportingStep): display_name = "MS Fragger Combined Protein Import" operation = "Protein Data Import" - method_description = "Import the combined_protein.tsv file form output of MS Fragger" + method_description = ( + "Import the combined_protein.tsv file form output of MS Fragger" + ) input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"] output_keys = ["protein_df"] @@ -139,4 +142,16 @@ class EvidenceImport(ImportingStep): output_keys = ["peptide_df"] def method(self, inputs): - return evidence_import(**inputs) \ No newline at end of file + return evidence_import(**inputs) + + +class FastaImport(ImportingStep): + display_name = "Fasta Protein Sequence Import" + operation = "fasta_import" + method_description = "Import a fasta file containing protein sequences." + + input_keys = ["file_path"] + output_keys = ["fasta_df"] + + def method(self, inputs): + return fasta_import(**inputs) diff --git a/tests/protzilla/data_analysis/test_protein_coverage.py b/tests/protzilla/data_analysis/test_protein_coverage.py new file mode 100644 index 00000000..f2ca2f1e --- /dev/null +++ b/tests/protzilla/data_analysis/test_protein_coverage.py @@ -0,0 +1,73 @@ +from protzilla.data_analysis.protein_coverage import align_peptide_to_protein_sequence + + +def test_aligns_peptide_at_start(): + peptide = "MKT" + protein_sequence = "MKTLLL" + expected_indices = [0] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def test_aligns_peptide_in_middle(): + peptide = "KTL" + protein_sequence = "MKTLLL" + expected_indices = [1] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def test_aligns_peptide_at_end(): + peptide = "LLL" + protein_sequence = "MKTLLL" + expected_indices = [3] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def test_aligns_multiple_occurrences(): + peptide = "LL" + protein_sequence = "MKTLLLL" + expected_indices = [3, 4, 5] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def aligns_no_occurrences(): + peptide = "XYZ" + protein_sequence = "MKTLLL" + expected_indices = [] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def test_aligns_empty_peptide(): + peptide = "" + protein_sequence = "MKTLLL" + expected_indices = [] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def test_aligns_empty_protein_sequence(): + peptide = "MKT" + protein_sequence = "" + expected_indices = [] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) + + +def test_aligns_peptide_longer_than_protein_sequence(): + peptide = "MKTLLL" + protein_sequence = "MKT" + expected_indices = [] + assert ( + align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices + ) diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 7221bba6..4d6fae1c 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -22,6 +22,7 @@ importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm, importing.PeptideImport: importing_forms.PeptideImportForm, importing.EvidenceImport: importing_forms.EvidenceImportForm, + importing.FastaImport: importing_forms.FastaImportForm, data_preprocessing.FilterProteinsBySamplesMissing: data_preprocessing_forms.FilterProteinsBySamplesMissingForm, data_preprocessing.FilterByProteinsCount: data_preprocessing_forms.FilterByProteinsCountForm, data_preprocessing.FilterSamplesByProteinsMissing: data_preprocessing_forms.FilterSamplesByProteinsMissingForm, @@ -52,6 +53,7 @@ data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm, data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm, data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm, + data_analysis.PlotProteinCoverage: data_analysis_forms.PlotProteinCoverageForm, data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm, data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm, data_analysis.ClusteringExpectationMaximisation: data_analysis_forms.ClusteringExpectationMaximizationForm, diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 6e34aa6e..be9ea58f 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1,16 +1,10 @@ -import logging from enum import Enum, StrEnum -from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.methods.data_analysis import ( DataAnalysisStep, - DifferentialExpressionLinearModel, - DifferentialExpressionTTest, DimensionReductionUMAP, - DataAnalysisStep, PTMsPerSample, SelectPeptidesForProtein, - DifferentialExpressionMannWhitneyOnPTM, ) from protzilla.methods.data_preprocessing import DataPreprocessingStep from protzilla.run import Run @@ -25,7 +19,6 @@ CustomFloatField, CustomMultipleChoiceField, CustomNumberField, - CustomBooleanField, ) @@ -169,7 +162,11 @@ class DifferentialExpressionANOVAForm(MethodForm): initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") @@ -256,7 +253,11 @@ class DifferentialExpressionLinearModelForm(MethodForm): initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") group1 = CustomChoiceField(choices=[], label="Group 1") @@ -293,25 +294,31 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm): is_dynamic = True - protein_df = CustomChoiceField( - choices=[], label="Step to use protein data from" - ) + protein_df = CustomChoiceField(choices=[], label="Step to use protein data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") group1 = CustomChoiceField(choices=[], label="Group 1") group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - self.fields["protein_df"].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields[ + "protein_df" + ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -346,7 +353,11 @@ class DifferentialExpressionMannWhitneyOnPTMForm(MethodForm): initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) p_value_calculation_method = CustomChoiceField( choices=PValueCalculationMethod, @@ -362,7 +373,9 @@ def fill_form(self, run: Run) -> None: run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -390,16 +403,18 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm): is_dynamic = True - protein_df = CustomChoiceField( - choices=[], label="Step to use protein data from" - ) + protein_df = CustomChoiceField(choices=[], label="Step to use protein data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") @@ -408,9 +423,13 @@ class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["protein_df"].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields[ + "protein_df" + ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -420,16 +439,18 @@ def fill_form(self, run: Run) -> None: class DifferentialExpressionKruskalWallisOnPTMForm(MethodForm): is_dynamic = True - ptm_df = CustomChoiceField( - choices=[], label="Step to use ptm data from" - ) + ptm_df = CustomChoiceField(choices=[], label="Step to use ptm data from") multiple_testing_correction_method = CustomChoiceField( choices=MultipleTestingCorrectionMethod, label="Multiple testing correction", initial=MultipleTestingCorrectionMethod.benjamini_hochberg, ) alpha = CustomFloatField( - label="Error rate (alpha)", min_value=0, max_value=1, step_size=0.01, initial=0.05 + label="Error rate (alpha)", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.05, ) grouping = CustomChoiceField(choices=[], label="Grouping from metadata") @@ -441,7 +462,9 @@ def fill_form(self, run: Run) -> None: self.fields["ptm_df"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields["grouping"].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields[ + "grouping" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -466,7 +489,8 @@ class PlotVolcanoForm(MethodForm): def fill_form(self, run: Run) -> None: self.fields["input_dict"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers( - Step, ["corrected_p_values_df", "log2_fold_change_df"], + Step, + ["corrected_p_values_df", "log2_fold_change_df"], ) ) @@ -486,7 +510,9 @@ def fill_form(self, run: Run) -> None: if step_output is not None: items_of_interest = step_output["PTM"].unique() - self.fields["items_of_interest"].choices = fill_helper.to_choices(items_of_interest) + self.fields["items_of_interest"].choices = fill_helper.to_choices( + items_of_interest + ) class PlotScatterPlotForm(MethodForm): @@ -804,7 +830,7 @@ class ClassificationRandomForestForm(MethodForm): # TODO: Workflow_meta line 1763 train_val_split = CustomNumberField( label="Choose the size of the validation data set (you can either enter the absolute number of validation " - "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", + "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", initial=0.20, ) # TODO: Workflow_meta line 1770 @@ -892,7 +918,7 @@ class ClassificationSVMForm(MethodForm): ) train_val_split = CustomNumberField( label="Choose the size of the validation data set (you can either enter the absolute number of validation " - "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", + "samples or a number between 0.0 and 1.0 to represent the percentage of validation samples)", initial=0.20, ) # TODO: Workflow_meta line 1973 @@ -1009,7 +1035,7 @@ class DimensionReductionUMAPForm(MethodForm): ) n_neighbors = CustomNumberField( label="The size of local neighborhood (in terms of number of neighboring sample points) used for manifold " - "approximation", + "approximation", min_value=2, max_value=100, step_size=1, @@ -1050,7 +1076,7 @@ class ProteinGraphPeptidesToIsoformForm(MethodForm): k = CustomNumberField(label="k-mer length", min_value=1, step_size=1, initial=5) allowed_mismatches = CustomNumberField( label="Number of allowed mismatched amino acids per peptide. For many allowed mismatches, this can take a " - "long time.", + "long time.", min_value=0, step_size=1, initial=2, @@ -1064,6 +1090,44 @@ class ProteinGraphVariationGraphForm(MethodForm): # TODO: workflow_meta line 2291 - 2295 +class PlotProteinCoverageForm(MethodForm): + peptide_df_instance = CustomChoiceField( + choices=[], + label="Step to use peptide data from", + ) + fasta_df_instance = CustomChoiceField( + choices=[], + label="Step to use fasta protein data from", + ) + protein_id = CustomChoiceField( + choices=[], + label="Protein ID", + ) + + def fill_form(self, run: Run) -> None: + self.fields["peptide_df_instance"].choices = fill_helper.get_choices( + run, "peptide_df" + ) + self.fields["fasta_df_instance"].choices = fill_helper.get_choices( + run, "fasta_df", Step + ) + peptide_df_instance_id = self.data.get( + "peptide_df_instance", self.fields["peptide_df_instance"].choices[0][0] + ) + fasta_df_instance_id = self.data.get( + "fasta_df_instance", self.fields["fasta_df_instance"].choices[0][0] + ) + peptide_df = run.steps.get_step_output( + Step, "peptide_df", peptide_df_instance_id + ) + fasta_protein_ids = run.steps.get_step_output( + Step, "fasta_df", fasta_df_instance_id + )["Protein ID"].unique() + peptide_df_protein_ids = peptide_df["Protein ID"].unique() + common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids)) + self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids) + + class FLEXIQuantLFForm(MethodForm): is_dynamic = True @@ -1143,13 +1207,17 @@ def fill_form(self, run: Run) -> None: selected_auto_select = self.data.get("auto_select") - choices = fill_helper.to_choices([] if selected_auto_select else ["all proteins"]) - choices.extend(fill_helper.get_choices( - run, "significant_proteins_df", DataAnalysisStep - )) + choices = fill_helper.to_choices( + [] if selected_auto_select else ["all proteins"] + ) + choices.extend( + fill_helper.get_choices(run, "significant_proteins_df", DataAnalysisStep) + ) self.fields["protein_list"].choices = choices - chosen_list = self.data.get("protein_list", self.fields["protein_list"].choices[0][0]) + chosen_list = self.data.get( + "protein_list", self.fields["protein_list"].choices[0][0] + ) if not selected_auto_select: self.toggle_visibility("sort_proteins", True) self.toggle_visibility("protein_ids", True) @@ -1163,7 +1231,9 @@ def fill_form(self, run: Run) -> None: self.fields["protein_ids"].choices = fill_helper.to_choices( run.steps.get_step_output( DataAnalysisStep, "significant_proteins_df", chosen_list - ).sort_values(by="corrected_p_value")["Protein ID"].unique() + ) + .sort_values(by="corrected_p_value")["Protein ID"] + .unique() ) else: self.fields["protein_ids"].choices = fill_helper.to_choices( @@ -1183,9 +1253,7 @@ class PTMsPerSampleForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields["peptide_df"].choices = fill_helper.get_choices( - run, "peptide_df" - ) + self.fields["peptide_df"].choices = fill_helper.get_choices(run, "peptide_df") single_protein_peptides = run.steps.get_instance_identifiers( SelectPeptidesForProtein, "peptide_df" @@ -1207,4 +1275,4 @@ def fill_form(self, run: Run) -> None: SelectPeptidesForProtein, "peptide_df" ) if single_protein_peptides: - self.fields["peptide_df"].initial = single_protein_peptides[0] \ No newline at end of file + self.fields["peptide_df"].initial = single_protein_peptides[0] diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py index 8975e3ba..600ea517 100644 --- a/ui/runs/forms/importing.py +++ b/ui/runs/forms/importing.py @@ -192,3 +192,7 @@ def fill_form(self, run: Run) -> None: self.fields["map_to_uniprot"].initial = run.steps.get_step_input( [MaxQuantImport, MsFraggerImport, DiannImport], "map_to_uniprot" ) + + +class FastaImportForm(MethodForm): + file_path = CustomFileField(label="Fasta file") From f66d4ed30480b1daaea72085009d981e2b28ba63 Mon Sep 17 00:00:00 2001 From: henning Date: Thu, 21 Nov 2024 13:43:50 +0100 Subject: [PATCH 02/13] Add new pickle disk operator interface, add k-mer dictionary generation and storing, annihilate protein coverage tests --- protzilla/data_analysis/protein_coverage.py | 108 ++++++++++++------ protzilla/disk_operator.py | 20 ++++ .../data_analysis/test_protein_coverage.py | 73 ------------ 3 files changed, 92 insertions(+), 109 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index afc504ad..dce9c996 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -1,27 +1,67 @@ -from typing import List - +from tqdm import tqdm +from protzilla.constants.paths import EXTERNAL_DATA_PATH +from protzilla.disk_operator import PickleOperator import pandas as pd import plotly.graph_objects as go +def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]: + """ + Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size + k along the protein sequences. The dictionary maps a the k-mer to a list of tuples containing the protein ID and the index of + the k-mer in the protein sequence. The dictionary is saved to disk for future use. + """ + # if the dictionary already exists, load it from disk + kmer_dict_path = EXTERNAL_DATA_PATH / f"kmer_dict_{k}.pkl" + if kmer_dict_path.exists(): + kmer_dict = PickleOperator.read(kmer_dict_path) + return kmer_dict + kmer_dict = {} + for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"): + for i in range(len(protein_sequence) - k + 1): + kmer = protein_sequence[i : i + k] + if kmer in kmer_dict: + kmer_dict[kmer].append((protein_id, i)) + else: + kmer_dict[kmer] = [(protein_id, i)] + PickleOperator.write(kmer_dict_path, kmer_dict) + return kmer_dict -def align_peptide_to_protein_sequence( - peptide_sequence: str, protein_sequence: str -) -> List[int]: +def match_peptide_to_protein_ids( + peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str] +) -> list[tuple[str, int]]: """ - Aligns a peptide to a protein sequence and returns the indices of the protein sequence where the peptide. - NAIVE APPROACH + Matches a peptide sequence to a dictionary of kmers in protein sequences. + Returns a list of protein ids that could be matches """ - if ( - len(peptide_sequence) == 0 - or len(protein_sequence) == 0 - or len(peptide_sequence) > len(protein_sequence) - ): - return [] - indices = [] - for i in range(len(protein_sequence) - len(peptide_sequence) + 1): - if protein_sequence[i : i + len(peptide_sequence)] == peptide_sequence: - indices.append(i) - return indices + # convert the peptide sequence to a list of kmers + k = 5 + kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)] + # for now, we only do exact matches, so only the first and last kmer of the peptide are needed + first_kmer = kmers[0] + last_kmer = kmers[-1] + # match the kmers + first_kmer_matches = protein_kmer_dictionary.get(first_kmer, []) + last_kmer_matches = protein_kmer_dictionary.get(last_kmer, []) + # determine the protein ids that are common to both lists, check if the peptide is in the protein sequence at that + # location and return the protein ids and the start location + hits = [] + for protein_id, start_location in first_kmer_matches: + expected_end_location = start_location + len(peptide_sequence) - k + for _, end_location in last_kmer_matches: + if protein_id != _: + continue + if end_location != expected_end_location: + continue + protein_sequence = protein_dictionary[protein_id] + subsequence = protein_sequence[start_location:end_location + k] + if subsequence != peptide_sequence: + continue + assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" + assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" + assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}" + hits.append((protein_id, start_location)) + return hits + def plot_protein_coverage( @@ -30,28 +70,24 @@ def plot_protein_coverage( """ Plots the coverage of a protein sequence by peptides. """ - # Retrieve the relevant peptides from the peptide df - relevant_peptides = peptide_df[peptide_df["Protein ID"] == protein_id] - protein_sequence = fasta_df[fasta_df["Protein ID"] == protein_id][ - "Protein Sequence" - ].values[0] + # generate the protein kmer dictionary + protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) + kmer_dict = build_kmer_dictionary(protein_dict, k=5) + + protein_sequence = protein_dict[protein_id] protein_sequence_length = len(protein_sequence) - coverage = [0] * protein_sequence_length - for peptide_sequence in relevant_peptides["Sequence"].unique(): - indices = align_peptide_to_protein_sequence(peptide_sequence, protein_sequence) - for i in indices: - for j in range(len(peptide_sequence)): - coverage[i + j] += 1 + coverage = [] - # Make sure coverage length matches the sequence length - if len(coverage) != protein_sequence_length: - raise ValueError("Length of coverage must match protein_sequence_length.") + for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"): + hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) + for protein, start_location in hits: + # we are only interested in the provided protein id, everything else is ignored + if protein != protein_id: + continue + # add the peptide sequence and location to the coverage list + coverage.append((peptide_sequence, start_location)) - # Generate labels for amino acid positions - amino_acid_positions = list(range(1, protein_sequence_length + 1)) x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in amino_acid_positions] - - # Create the bar chart fig = go.Figure() # Add bars for coverage values diff --git a/protzilla/disk_operator.py b/protzilla/disk_operator.py index ce7f07ad..dfee538c 100644 --- a/protzilla/disk_operator.py +++ b/protzilla/disk_operator.py @@ -6,6 +6,7 @@ import pandas as pd import yaml +import pickle from plotly.io import read_json, write_json import protzilla.utilities as utilities @@ -36,6 +37,25 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False return True +class PickleOperator: + @staticmethod + def read(file_path: Path): + with ErrorHandler(): + with open(file_path, "rb") as file: + logger.info(f"Reading pickle from {file_path}") + return pickle.load(file) + + @staticmethod + def write(file_path: Path, data): + with ErrorHandler(): + if not file_path.exists(): + if not file_path.parent.exists(): + logger.info( + f"Parent directory {file_path.parent} did not exist and was created" + ) + file_path.parent.mkdir(parents=True) + with open(file_path, "wb") as file: + pickle.dump(data, file) class YamlOperator: @staticmethod diff --git a/tests/protzilla/data_analysis/test_protein_coverage.py b/tests/protzilla/data_analysis/test_protein_coverage.py index f2ca2f1e..e69de29b 100644 --- a/tests/protzilla/data_analysis/test_protein_coverage.py +++ b/tests/protzilla/data_analysis/test_protein_coverage.py @@ -1,73 +0,0 @@ -from protzilla.data_analysis.protein_coverage import align_peptide_to_protein_sequence - - -def test_aligns_peptide_at_start(): - peptide = "MKT" - protein_sequence = "MKTLLL" - expected_indices = [0] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def test_aligns_peptide_in_middle(): - peptide = "KTL" - protein_sequence = "MKTLLL" - expected_indices = [1] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def test_aligns_peptide_at_end(): - peptide = "LLL" - protein_sequence = "MKTLLL" - expected_indices = [3] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def test_aligns_multiple_occurrences(): - peptide = "LL" - protein_sequence = "MKTLLLL" - expected_indices = [3, 4, 5] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def aligns_no_occurrences(): - peptide = "XYZ" - protein_sequence = "MKTLLL" - expected_indices = [] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def test_aligns_empty_peptide(): - peptide = "" - protein_sequence = "MKTLLL" - expected_indices = [] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def test_aligns_empty_protein_sequence(): - peptide = "MKT" - protein_sequence = "" - expected_indices = [] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) - - -def test_aligns_peptide_longer_than_protein_sequence(): - peptide = "MKTLLL" - protein_sequence = "MKT" - expected_indices = [] - assert ( - align_peptide_to_protein_sequence(peptide, protein_sequence) == expected_indices - ) From 4e981c2790e7069d570015e73b4d8a393ee64107 Mon Sep 17 00:00:00 2001 From: henning Date: Tue, 26 Nov 2024 12:27:56 +0100 Subject: [PATCH 03/13] Add peptide sequences as hoverable rectangles in the protein coverage plot, space optimization --- protzilla/data_analysis/protein_coverage.py | 95 +++++++++++++++------ 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index dce9c996..35bc0516 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -1,3 +1,4 @@ +from docutils.nodes import title from tqdm import tqdm from protzilla.constants.paths import EXTERNAL_DATA_PATH from protzilla.disk_operator import PickleOperator @@ -28,10 +29,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic def match_peptide_to_protein_ids( peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str] -) -> list[tuple[str, int]]: +) -> list[tuple[str, int, int]]: """ Matches a peptide sequence to a dictionary of kmers in protein sequences. - Returns a list of protein ids that could be matches + Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence. """ # convert the peptide sequence to a list of kmers k = 5 @@ -45,21 +46,22 @@ def match_peptide_to_protein_ids( # determine the protein ids that are common to both lists, check if the peptide is in the protein sequence at that # location and return the protein ids and the start location hits = [] - for protein_id, start_location in first_kmer_matches: - expected_end_location = start_location + len(peptide_sequence) - k - for _, end_location in last_kmer_matches: + for protein_id, start_first_kmer in first_kmer_matches: + expected_start_last_kmer = start_first_kmer + len(peptide_sequence) - k + for _, start_last_kmer in last_kmer_matches: if protein_id != _: continue - if end_location != expected_end_location: + if start_last_kmer != expected_start_last_kmer: continue protein_sequence = protein_dictionary[protein_id] - subsequence = protein_sequence[start_location:end_location + k] + end_last_kmer = start_first_kmer + len(peptide_sequence) + subsequence = protein_sequence[start_first_kmer:end_last_kmer] if subsequence != peptide_sequence: continue assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}" - hits.append((protein_id, start_location)) + hits.append((protein_id, start_first_kmer, end_last_kmer)) return hits @@ -76,31 +78,70 @@ def plot_protein_coverage( protein_sequence = protein_dict[protein_id] protein_sequence_length = len(protein_sequence) - coverage = [] + peptide_matches = [] for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"): - hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) - for protein, start_location in hits: - # we are only interested in the provided protein id, everything else is ignored - if protein != protein_id: + protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) + for prot_id, start_location_on_protein, end_location_on_protein in protein_hits: + # we are only interested in the provided protein id, everything else is discarded + if prot_id != protein_id: continue - # add the peptide sequence and location to the coverage list - coverage.append((peptide_sequence, start_location)) + # add the peptide sequence and location to the peptide matches pertaining to the protein id + peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein)) - x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in amino_acid_positions] - fig = go.Figure() + # now that all the matches have been determined with their start and end on the protein sequence, we need to find + # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the + # required number of rows (vertical space) + rows_of_peptide_matches = distribute_to_rows(peptide_matches) - # Add bars for coverage values - fig.add_trace( - go.Bar(x=x_labels, y=coverage, name="Coverage", marker=dict(color="skyblue")) - ) + x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)] + fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length, + name="Protein Sequence", marker=dict(color="lightgray")), ) + for row_index, row in enumerate(rows_of_peptide_matches): + for start_location_on_protein, end_location_on_protein, peptide_sequence in row: + print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}") + # add a rectangle for each peptide + fig.add_shape( + type="rect", + x0=start_location_on_protein, + y0=row_index+1 + 0.1, + x1=end_location_on_protein, + y1=row_index+ 2 - 0.1, + fillcolor="blue", + opacity=0.5, + layer="above", + ) + # add invisible plotly object to the rectangle to show the peptide sequence when hovered over + fig.add_trace(go.Scatter( + x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]], + y=[row_index + 1.5], + text=[f"{peptide_sequence}
({start_location_on_protein}-{end_location_on_protein})"], + mode="markers", + marker=dict(opacity=0, size=20), # make marker invisible + hoverinfo="text", + hovertemplate="%{text}", + showlegend=False, + )) - # Customize layout fig.update_layout( - title="Protein Coverage", - xaxis_title="Amino Acid", - yaxis_title="Coverage Value", - xaxis=dict(tickmode="linear"), - bargap=0.1, # Adjust space between bars if needed + title=f"Protein Coverage of {protein_id}", + xaxis_title="Protein Sequence", + yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""), + showlegend=False, ) + return dict(plots=[fig]) + + +def distribute_to_rows(coverage): + coverage.sort(key=lambda x: x[1]) + rows = [] + for peptide_sequence, start_location, end_location in coverage: + # find the first row that does not overlap with the current peptide + for row in rows: + if row[-1][1] <= start_location: + row.append((start_location, end_location, peptide_sequence)) + break + else: + rows.append([(start_location, end_location, peptide_sequence)]) + return rows From 1006c0caafbb462d0e6b435ecbb255366f609058 Mon Sep 17 00:00:00 2001 From: henning Date: Wed, 27 Nov 2024 09:17:52 +0100 Subject: [PATCH 04/13] Add sample selection, coloring based on intensities to the protein coverage plot --- protzilla/data_analysis/protein_coverage.py | 191 +++++++++++++++----- protzilla/methods/data_analysis.py | 1 + ui/runs/forms/data_analysis.py | 10 +- 3 files changed, 154 insertions(+), 48 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 35bc0516..6e4fb04c 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -1,9 +1,24 @@ +from colour import color_scale from docutils.nodes import title from tqdm import tqdm from protzilla.constants.paths import EXTERNAL_DATA_PATH from protzilla.disk_operator import PickleOperator +from dataclasses import dataclass import pandas as pd +from numpy import log2 import plotly.graph_objects as go + +@dataclass(unsafe_hash=True) +class PeptideMatch: + + peptide_sequence: str + start_location_on_protein: int + end_location_on_protein: int + intensity: float = 0.0 + sample: str = "" + + + def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]: """ Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size @@ -16,9 +31,10 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic kmer_dict = PickleOperator.read(kmer_dict_path) return kmer_dict kmer_dict = {} - for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", unit_scale=True, unit="protein"): + for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", + unit_scale=True, unit="protein"): for i in range(len(protein_sequence) - k + 1): - kmer = protein_sequence[i : i + k] + kmer = protein_sequence[i: i + k] if kmer in kmer_dict: kmer_dict[kmer].append((protein_id, i)) else: @@ -28,7 +44,8 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic def match_peptide_to_protein_ids( - peptide_sequence: str, protein_kmer_dictionary : dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str] + peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]], + protein_dictionary: dict[str, str] ) -> list[tuple[str, int, int]]: """ Matches a peptide sequence to a dictionary of kmers in protein sequences. @@ -36,7 +53,7 @@ def match_peptide_to_protein_ids( """ # convert the peptide sequence to a list of kmers k = 5 - kmers = [peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1)] + kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)] # for now, we only do exact matches, so only the first and last kmer of the peptide are needed first_kmer = kmers[0] last_kmer = kmers[-1] @@ -58,90 +75,170 @@ def match_peptide_to_protein_ids( subsequence = protein_sequence[start_first_kmer:end_last_kmer] if subsequence != peptide_sequence: continue - assert len(subsequence) == len(peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" + assert len(subsequence) == len( + peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}" hits.append((protein_id, start_first_kmer, end_last_kmer)) return hits - def plot_protein_coverage( - fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str + fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None ) -> None: """ Plots the coverage of a protein sequence by peptides. """ # generate the protein kmer dictionary + if samples is None: + samples = [] + reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0] + protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) kmer_dict = build_kmer_dictionary(protein_dict, k=5) protein_sequence = protein_dict[protein_id] protein_sequence_length = len(protein_sequence) peptide_matches = [] + coverage = [0] * protein_sequence_length - for peptide_sequence in tqdm(peptide_df['Sequence'].unique(), desc="Matching peptides to protein", unit_scale=True, unit="peptide"): - protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) + for sample, peptide_sequence, intensity in tqdm( + reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False), + desc="Matching peptides to protein", unit_scale=True, + unit="peptide"): + protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, + protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) for prot_id, start_location_on_protein, end_location_on_protein in protein_hits: # we are only interested in the provided protein id, everything else is discarded if prot_id != protein_id: continue # add the peptide sequence and location to the peptide matches pertaining to the protein id - peptide_matches.append((peptide_sequence, start_location_on_protein, end_location_on_protein)) + peptide_matches.append( + PeptideMatch(peptide_sequence=peptide_sequence, + start_location_on_protein=start_location_on_protein, + end_location_on_protein=end_location_on_protein, + intensity=log2(intensity), + sample=sample) + ) + # update the coverage of the protein sequence + coverage[start_location_on_protein:end_location_on_protein] = \ + [coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]] + if len(peptide_matches) == 0: + raise ValueError(f"No peptides matched for protein {protein_id}") # now that all the matches have been determined with their start and end on the protein sequence, we need to find # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the # required number of rows (vertical space) - rows_of_peptide_matches = distribute_to_rows(peptide_matches) - - x_labels = [f"{i} ({protein_sequence[i - 1]})" for i in range(1, protein_sequence_length + 1)] - fig = go.Figure(data=go.Bar(x=x_labels, y=[1] * protein_sequence_length, - name="Protein Sequence", marker=dict(color="lightgray")), ) - for row_index, row in enumerate(rows_of_peptide_matches): - for start_location_on_protein, end_location_on_protein, peptide_sequence in row: - print(f"Peptide: {peptide_sequence}, start: {start_location_on_protein}, end: {end_location_on_protein}") - # add a rectangle for each peptide + rows = distribute_to_rows(peptide_matches) + + x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)] + max_coverage_in_sequence = max(coverage) + max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches]) + normalized_coverage = [value / max_coverage_in_sequence for value in coverage] + hover_text = [f"Position: {i}
Amino acid: {amino_acid}
Coverage: {coverage}" + for i, (coverage, amino_acid) in + enumerate(zip(normalized_coverage, protein_sequence))] + + fig = go.Figure(data=go.Bar(x=x_labels, + y=normalized_coverage, + marker=dict( + color=normalized_coverage, + colorscale='Bluered', + colorbar=dict(title="Coverage
(normalized)"), + line=dict(width=0) + ), + hovertext=hover_text, + hoverinfo="text", + showlegend=False, + name="Coverage (normalized to max in Protein Sequence)")) + + # Keep track of the current row position across all samples + current_row = 1 + + # Iterate through samples and their rows in order + for sample_idx, (sample, sample_rows) in enumerate(rows.items()): + # Add a sample label + fig.add_annotation( + x=-0.1, # slightly to the left of the plot + y=current_row + len(sample_rows) / 2, + text=sample, + showarrow=False, + textangle=-90, + xref='paper', + yref='y' + ) + + for row_index, row in enumerate(sample_rows): + for peptide_match in row: + # add a rectangle for each peptide + x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein + y0, y1 = current_row, current_row + 1 + normalized_intensity = (peptide_match.intensity) / max_intensity + fig.add_shape( + type="rect", + # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars + x0=x0 - 0.5, y0=y0, + x1=x1 - 0.5, y1=y1, + fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)", + opacity=0.5, + layer="above", + ) + # add invisible plotly object to the rectangle to show the peptide sequence when hovered over + fig.add_trace(go.Scatter( + x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1], + y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), + text=[f"Sample: {sample}
Peptide: {peptide_match.peptide_sequence}
" + f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" + f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence), + mode="markers", + marker=dict(opacity=0, size=30), # make marker invisible + hoverinfo="text", + hovertemplate="%{text}", + showlegend=False, + )) + + # Move to next row + current_row += 1 + + # Add a horizontal line separator between samples (except after the last sample) + if sample_idx < len(rows) - 1: fig.add_shape( - type="rect", - x0=start_location_on_protein, - y0=row_index+1 + 0.1, - x1=end_location_on_protein, - y1=row_index+ 2 - 0.1, - fillcolor="blue", - opacity=0.5, - layer="above", + type="line", + x0=-0.5, + y0=current_row, + x1=len(protein_sequence) - 0.5, + y1=current_row, + line=dict( + color="Gray", + width=2, + dash="solid", + ) ) - # add invisible plotly object to the rectangle to show the peptide sequence when hovered over - fig.add_trace(go.Scatter( - x=[x_labels[(start_location_on_protein + end_location_on_protein) // 2]], - y=[row_index + 1.5], - text=[f"{peptide_sequence}
({start_location_on_protein}-{end_location_on_protein})"], - mode="markers", - marker=dict(opacity=0, size=20), # make marker invisible - hoverinfo="text", - hovertemplate="%{text}", - showlegend=False, - )) fig.update_layout( title=f"Protein Coverage of {protein_id}", xaxis_title="Protein Sequence", - yaxis=dict(visible=False, range=[0, 1+len(rows_of_peptide_matches)+1], title=""), + yaxis=dict(visible=False, range=[0, 1 + current_row], title=""), showlegend=False, + bargap=0 ) return dict(plots=[fig]) -def distribute_to_rows(coverage): - coverage.sort(key=lambda x: x[1]) - rows = [] - for peptide_sequence, start_location, end_location in coverage: +def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]: + """ + Distributes peptide matches to rows such that no two peptides overlap in the same row. + Greedy algorithm (see Interval Scheduling Problem). + """ + peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein) + rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])} + for peptide_match in peptide_matches: # find the first row that does not overlap with the current peptide - for row in rows: - if row[-1][1] <= start_location: - row.append((start_location, end_location, peptide_sequence)) + for row in rows[peptide_match.sample]: + if row[-1].end_location_on_protein < peptide_match.start_location_on_protein: + row.append((peptide_match)) break else: - rows.append([(start_location, end_location, peptide_sequence)]) + rows[peptide_match.sample].append([(peptide_match)]) return rows diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index ab8bdc27..765b3047 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -792,6 +792,7 @@ class PlotProteinCoverage(PlotStep): "protein_id", "fasta_df", "peptide_df", + "samples" ] output_keys = [] diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index be9ea58f..38c8668e 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -1103,6 +1103,11 @@ class PlotProteinCoverageForm(MethodForm): choices=[], label="Protein ID", ) + samples = CustomMultipleChoiceField( + choices=[], + label="Samples", + ) + def fill_form(self, run: Run) -> None: self.fields["peptide_df_instance"].choices = fill_helper.get_choices( @@ -1124,9 +1129,12 @@ def fill_form(self, run: Run) -> None: Step, "fasta_df", fasta_df_instance_id )["Protein ID"].unique() peptide_df_protein_ids = peptide_df["Protein ID"].unique() - common_protein_ids = list(set(fasta_protein_ids) & set(peptide_df_protein_ids)) + common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids))) self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids) + peptide_df_samples = peptide_df["Sample"].unique() + self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples) + class FLEXIQuantLFForm(MethodForm): is_dynamic = True From cb227f4622bbe87d817aa64b2d600a0d5466c39d Mon Sep 17 00:00:00 2001 From: henning Date: Thu, 28 Nov 2024 15:32:47 +0100 Subject: [PATCH 05/13] Refactoring and cleanup --- protzilla/data_analysis/protein_coverage.py | 79 +++++++++++---------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 6e4fb04c..6ffe135c 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -18,6 +18,12 @@ class PeptideMatch: sample: str = "" +@dataclass(unsafe_hash=True) +class ProteinHit: + protein_id: str + start_location_on_protein: int + end_location_on_protein: int + def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]: """ @@ -46,40 +52,34 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic def match_peptide_to_protein_ids( peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]], protein_dictionary: dict[str, str] -) -> list[tuple[str, int, int]]: +) -> list[ProteinHit]: """ Matches a peptide sequence to a dictionary of kmers in protein sequences. Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence. """ - # convert the peptide sequence to a list of kmers k = 5 - kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)] - # for now, we only do exact matches, so only the first and last kmer of the peptide are needed - first_kmer = kmers[0] - last_kmer = kmers[-1] + peptide_kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)] + first_kmer, last_kmer = peptide_kmers[0], peptide_kmers[-1] # match the kmers first_kmer_matches = protein_kmer_dictionary.get(first_kmer, []) last_kmer_matches = protein_kmer_dictionary.get(last_kmer, []) - # determine the protein ids that are common to both lists, check if the peptide is in the protein sequence at that - # location and return the protein ids and the start location - hits = [] - for protein_id, start_first_kmer in first_kmer_matches: - expected_start_last_kmer = start_first_kmer + len(peptide_sequence) - k - for _, start_last_kmer in last_kmer_matches: - if protein_id != _: - continue - if start_last_kmer != expected_start_last_kmer: - continue - protein_sequence = protein_dictionary[protein_id] - end_last_kmer = start_first_kmer + len(peptide_sequence) - subsequence = protein_sequence[start_first_kmer:end_last_kmer] - if subsequence != peptide_sequence: - continue - assert len(subsequence) == len( - peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" - assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" - assert peptide_sequence in protein_sequence, f"Peptide not in protein sequence: {peptide_sequence} not in {protein_sequence}" - hits.append((protein_id, start_first_kmer, end_last_kmer)) + hits = [ + ProteinHit(protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence)) + for protein_id, start_first_kmer in first_kmer_matches + for _, start_last_kmer in last_kmer_matches + if protein_id == _ and start_last_kmer == start_first_kmer + len(peptide_sequence) - k + and protein_dictionary[protein_id][ + start_first_kmer:start_first_kmer + len(peptide_sequence)] == peptide_sequence + ] + hits = list(set(hits)) # remove duplicates + # Just sanity checks + for hit in hits: + subsequence = protein_dictionary[hit.protein_id][hit.start_location_on_protein:hit.end_location_on_protein] + assert len(subsequence) == len( + peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" + assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" + assert peptide_sequence in protein_dictionary[ + hit.protein_id], f"Peptide not in protein sequence: {peptide_sequence} not in {protein_dictionary[protein_id]}" return hits @@ -90,8 +90,9 @@ def plot_protein_coverage( Plots the coverage of a protein sequence by peptides. """ # generate the protein kmer dictionary - if samples is None: - samples = [] + if samples is None or samples == []: + raise ValueError("No samples provided.") + reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0] protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) @@ -107,22 +108,22 @@ def plot_protein_coverage( desc="Matching peptides to protein", unit_scale=True, unit="peptide"): protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, - protein_kmer_dictionary=kmer_dict, protein_dictionary=protein_dict) - for prot_id, start_location_on_protein, end_location_on_protein in protein_hits: - # we are only interested in the provided protein id, everything else is discarded - if prot_id != protein_id: - continue + protein_kmer_dictionary=kmer_dict, + protein_dictionary=protein_dict) + # we only care about the hits pertaining to the argument-supplied protein id + protein_hits = filter(lambda x: x.protein_id == protein_id, protein_hits) + for protein_hit in protein_hits: # add the peptide sequence and location to the peptide matches pertaining to the protein id peptide_matches.append( PeptideMatch(peptide_sequence=peptide_sequence, - start_location_on_protein=start_location_on_protein, - end_location_on_protein=end_location_on_protein, + start_location_on_protein=protein_hit.start_location_on_protein, + end_location_on_protein=protein_hit.end_location_on_protein, intensity=log2(intensity), sample=sample) ) # update the coverage of the protein sequence - coverage[start_location_on_protein:end_location_on_protein] = \ - [coverage + 1 for coverage in coverage[start_location_on_protein:end_location_on_protein]] + coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \ + [coverage + 1 for coverage in coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]] if len(peptide_matches) == 0: raise ValueError(f"No peptides matched for protein {protein_id}") @@ -131,7 +132,7 @@ def plot_protein_coverage( # required number of rows (vertical space) rows = distribute_to_rows(peptide_matches) - x_labels = [f"{amino_acid_index} ({amino_acid})" for amino_acid_index, amino_acid in enumerate(protein_sequence)] + x_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)] max_coverage_in_sequence = max(coverage) max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches]) normalized_coverage = [value / max_coverage_in_sequence for value in coverage] @@ -179,7 +180,7 @@ def plot_protein_coverage( # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars x0=x0 - 0.5, y0=y0, x1=x1 - 0.5, y1=y1, - fillcolor=f"rgba({int(255 * (1 - normalized_intensity))}, 0, {int(255 * normalized_intensity)}, 0.5)", + fillcolor=f"rgba({int(255 * (normalized_intensity))}, 0, 0, 1)", opacity=0.5, layer="above", ) From 265db599bfad473cd28eabeebd2034d2ce1e2bae Mon Sep 17 00:00:00 2001 From: henning Date: Thu, 28 Nov 2024 15:57:10 +0100 Subject: [PATCH 06/13] Refactor plot_protein_coverage by extracting _build_coverage_plot function --- protzilla/data_analysis/protein_coverage.py | 44 +++++++++------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 6ffe135c..38584058 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -8,9 +8,9 @@ from numpy import log2 import plotly.graph_objects as go + @dataclass(unsafe_hash=True) class PeptideMatch: - peptide_sequence: str start_location_on_protein: int end_location_on_protein: int @@ -71,7 +71,7 @@ def match_peptide_to_protein_ids( and protein_dictionary[protein_id][ start_first_kmer:start_first_kmer + len(peptide_sequence)] == peptide_sequence ] - hits = list(set(hits)) # remove duplicates + hits = list(set(hits)) # remove duplicates # Just sanity checks for hit in hits: subsequence = protein_dictionary[hit.protein_id][hit.start_location_on_protein:hit.end_location_on_protein] @@ -99,9 +99,8 @@ def plot_protein_coverage( kmer_dict = build_kmer_dictionary(protein_dict, k=5) protein_sequence = protein_dict[protein_id] - protein_sequence_length = len(protein_sequence) peptide_matches = [] - coverage = [0] * protein_sequence_length + coverage = [0] * len(protein_sequence) for sample, peptide_sequence, intensity in tqdm( reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False), @@ -122,16 +121,23 @@ def plot_protein_coverage( sample=sample) ) # update the coverage of the protein sequence - coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \ - [coverage + 1 for coverage in coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]] + increment_coverage(coverage, protein_hit) if len(peptide_matches) == 0: raise ValueError(f"No peptides matched for protein {protein_id}") + # now that all the matches have been determined with their start and end on the protein sequence, we need to find # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the # required number of rows (vertical space) rows = distribute_to_rows(peptide_matches) + fig = _build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows) + + return dict(plots=[fig]) + + +def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, + protein_sequence: str, rows: dict[str, list[list[PeptideMatch]]]) -> go.Figure: x_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)] max_coverage_in_sequence = max(coverage) max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches]) @@ -139,7 +145,6 @@ def plot_protein_coverage( hover_text = [f"Position: {i}
Amino acid: {amino_acid}
Coverage: {coverage}" for i, (coverage, amino_acid) in enumerate(zip(normalized_coverage, protein_sequence))] - fig = go.Figure(data=go.Bar(x=x_labels, y=normalized_coverage, marker=dict( @@ -152,23 +157,8 @@ def plot_protein_coverage( hoverinfo="text", showlegend=False, name="Coverage (normalized to max in Protein Sequence)")) - - # Keep track of the current row position across all samples current_row = 1 - - # Iterate through samples and their rows in order for sample_idx, (sample, sample_rows) in enumerate(rows.items()): - # Add a sample label - fig.add_annotation( - x=-0.1, # slightly to the left of the plot - y=current_row + len(sample_rows) / 2, - text=sample, - showarrow=False, - textangle=-90, - xref='paper', - yref='y' - ) - for row_index, row in enumerate(sample_rows): for peptide_match in row: # add a rectangle for each peptide @@ -215,7 +205,6 @@ def plot_protein_coverage( dash="solid", ) ) - fig.update_layout( title=f"Protein Coverage of {protein_id}", xaxis_title="Protein Sequence", @@ -223,8 +212,13 @@ def plot_protein_coverage( showlegend=False, bargap=0 ) + return fig - return dict(plots=[fig]) + +def increment_coverage(coverage: list[int], protein_hit: ProteinHit) -> None: + coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \ + [coverage + 1 for coverage in + coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]] def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]: @@ -233,7 +227,7 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe Greedy algorithm (see Interval Scheduling Problem). """ peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein) - rows = {sample : [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])} + rows = {sample: [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])} for peptide_match in peptide_matches: # find the first row that does not overlap with the current peptide for row in rows[peptide_match.sample]: From f5ec509379a4cc7843ef3d71349d708ee1321e58 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 9 Dec 2024 14:25:49 +0100 Subject: [PATCH 07/13] New build_plot function for protein coverage plot: using subplots, color min-max-normalization on sample-level --- protzilla/data_analysis/protein_coverage.py | 71 +++++++++++++++++++-- 1 file changed, 67 insertions(+), 4 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 38584058..1a503ce9 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -2,11 +2,15 @@ from docutils.nodes import title from tqdm import tqdm from protzilla.constants.paths import EXTERNAL_DATA_PATH +from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data from protzilla.disk_operator import PickleOperator from dataclasses import dataclass import pandas as pd from numpy import log2 import plotly.graph_objects as go +from plotly.subplots import make_subplots +# import StrEnum +from enum import StrEnum @dataclass(unsafe_hash=True) @@ -24,6 +28,11 @@ class ProteinHit: start_location_on_protein: int end_location_on_protein: int +class IntensityNormalization(StrEnum): + min_max_scaling = "min_max_scaling" + none = "none" + + def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]: """ @@ -131,10 +140,65 @@ def plot_protein_coverage( # required number of rows (vertical space) rows = distribute_to_rows(peptide_matches) - fig = _build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows) + fig = _new_build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows) return dict(plots=[fig]) +def _new_build_coverage_plot( + coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str, + rows: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling +) -> go.Figure: + # one row for each sample + two row for the protein sequence (on top and on the bottom) + number_of_subplots = len(rows) + 1 + protein_sequence_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)] + fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.2, subplot_titles=list(rows.keys()) + [f"Sequencing depth of {protein_id}"]) + # Sequencing depth bar chart + fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False), row=number_of_subplots, col=1) + # Peptides + for subplot_row_index, (sample, rows) in enumerate(rows.items()): + # Intensity scaling for the current sample + current_row_intensities = [peptide_match.intensity for row in rows for peptide_match in row] + max_intensity, min_intensity = max(current_row_intensities), min(current_row_intensities) + def scale_intensity(intensity: float) -> float: + if intensity_normalization == IntensityNormalization.min_max_scaling: + return (intensity - min_intensity) / (max_intensity - min_intensity) + else: + return intensity + + for row_index, row in enumerate(rows): + for peptide_match in row: + x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein + y0, y1 = row_index, row_index + 1 + # add the rectangles + fig.add_shape( + type="rect", + x0=x0 - 0.5, y0=y0, + x1=x1 - 0.5, y1=y1, + fillcolor=f"rgba({int(255 * scale_intensity(peptide_match.intensity))}, 0, 0, 1)", + opacity=0.5, + layer="above", + row=subplot_row_index + 1, col=1 + ) + # add invisible plotly object to the rectangle to show the peptide sequence when hovered over + fig.add_trace(go.Scatter( + x=protein_sequence_labels[ + peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1], + y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), + text=[f"Sample: {sample}
Peptide: {peptide_match.peptide_sequence}
" + f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" + f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence), + mode="markers", + marker=dict(opacity=0, size=15), # make marker invisible + hoverinfo="text", + hovertemplate="%{text}", + showlegend=False, + ), row=subplot_row_index + 1, col=1) + + fig.update_yaxes(range=[0, len(rows)], showticklabels=False, row=subplot_row_index + 1, col=1) + fig.update_xaxes(showticklabels=False, row=subplot_row_index + 1, col=1) + + return fig + def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str, rows: dict[str, list[list[PeptideMatch]]]) -> go.Figure: @@ -148,8 +212,6 @@ def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch fig = go.Figure(data=go.Bar(x=x_labels, y=normalized_coverage, marker=dict( - color=normalized_coverage, - colorscale='Bluered', colorbar=dict(title="Coverage
(normalized)"), line=dict(width=0) ), @@ -210,7 +272,7 @@ def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch xaxis_title="Protein Sequence", yaxis=dict(visible=False, range=[0, 1 + current_row], title=""), showlegend=False, - bargap=0 + bargap=0.2 ) return fig @@ -237,3 +299,4 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe else: rows[peptide_match.sample].append([(peptide_match)]) return rows + From d04024863bc276f607267e07259de32a13f550ea Mon Sep 17 00:00:00 2001 From: henning Date: Tue, 10 Dec 2024 12:17:32 +0100 Subject: [PATCH 08/13] Separate the sequencing depth bar charts for each sample --- protzilla/data_analysis/protein_coverage.py | 72 +++++++++++++-------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 1a503ce9..0a5f514b 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -4,6 +4,7 @@ from protzilla.constants.paths import EXTERNAL_DATA_PATH from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data from protzilla.disk_operator import PickleOperator +from protzilla.constants.colors import PLOT_PRIMARY_COLOR from dataclasses import dataclass import pandas as pd from numpy import log2 @@ -104,33 +105,33 @@ def plot_protein_coverage( reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0] - protein_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) - kmer_dict = build_kmer_dictionary(protein_dict, k=5) + protein_sequence_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) + sequence_kmer_dict = build_kmer_dictionary(protein_sequence_dict, k=5) - protein_sequence = protein_dict[protein_id] + protein_sequence = protein_sequence_dict[protein_id] peptide_matches = [] - coverage = [0] * len(protein_sequence) + coverage_by_sample = {sample : [0] * len(protein_sequence) for sample in samples} - for sample, peptide_sequence, intensity in tqdm( + for sample_name, peptide_sequence, intensity in tqdm( reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False), desc="Matching peptides to protein", unit_scale=True, unit="peptide"): protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, - protein_kmer_dictionary=kmer_dict, - protein_dictionary=protein_dict) + protein_kmer_dictionary=sequence_kmer_dict, + protein_dictionary=protein_sequence_dict) # we only care about the hits pertaining to the argument-supplied protein id - protein_hits = filter(lambda x: x.protein_id == protein_id, protein_hits) - for protein_hit in protein_hits: + filtered_protein_hits = filter(lambda x: x.protein_id == protein_id,protein_hits) + for protein_hit in filtered_protein_hits: # add the peptide sequence and location to the peptide matches pertaining to the protein id peptide_matches.append( PeptideMatch(peptide_sequence=peptide_sequence, start_location_on_protein=protein_hit.start_location_on_protein, end_location_on_protein=protein_hit.end_location_on_protein, intensity=log2(intensity), - sample=sample) + sample=sample_name) ) # update the coverage of the protein sequence - increment_coverage(coverage, protein_hit) + increment_coverage(coverage_by_sample[sample_name], protein_hit) if len(peptide_matches) == 0: raise ValueError(f"No peptides matched for protein {protein_id}") @@ -138,26 +139,40 @@ def plot_protein_coverage( # now that all the matches have been determined with their start and end on the protein sequence, we need to find # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the # required number of rows (vertical space) - rows = distribute_to_rows(peptide_matches) + rows_by_sample = distribute_to_rows(peptide_matches) - fig = _new_build_coverage_plot(coverage, peptide_matches, protein_id, protein_sequence, rows) + fig = _new_build_coverage_plot(coverage_by_sample, peptide_matches, protein_id, protein_sequence, rows_by_sample) return dict(plots=[fig]) def _new_build_coverage_plot( - coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str, - rows: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling + coverages_by_sample: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str, + rows_by_sample: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling ) -> go.Figure: # one row for each sample + two row for the protein sequence (on top and on the bottom) - number_of_subplots = len(rows) + 1 + number_of_subplots = len(rows_by_sample.keys()) + len(coverages_by_sample.keys()) protein_sequence_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)] - fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.2, subplot_titles=list(rows.keys()) + [f"Sequencing depth of {protein_id}"]) - # Sequencing depth bar chart - fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False), row=number_of_subplots, col=1) + subplot_titles = zip( + [f"Peptides of {sample}" for sample in rows_by_sample.keys()], # str_a + [f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys()] # str_b + ) # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it + subplot_titles = [title for titles in subplot_titles for title in titles] + max_coverage_value = get_max_coverage(coverages_by_sample) + + fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.1, subplot_titles=subplot_titles) # Peptides - for subplot_row_index, (sample, rows) in enumerate(rows.items()): + for sample_index, sample_name in enumerate(rows_by_sample.keys()): + peptide_subplot_rows = rows_by_sample[sample_name] + peptides_subplot_index = (2*sample_index) + 1 + coverage = coverages_by_sample[sample_name] + coverage_subplot_index = (2*sample_index) + 2 + + # Sequencing depth bar chart + fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False, marker=dict(color=PLOT_PRIMARY_COLOR)), + row=coverage_subplot_index, col=1) + fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1) # Intensity scaling for the current sample - current_row_intensities = [peptide_match.intensity for row in rows for peptide_match in row] + current_row_intensities = [peptide_match.intensity for row in peptide_subplot_rows for peptide_match in row] max_intensity, min_intensity = max(current_row_intensities), min(current_row_intensities) def scale_intensity(intensity: float) -> float: if intensity_normalization == IntensityNormalization.min_max_scaling: @@ -165,7 +180,7 @@ def scale_intensity(intensity: float) -> float: else: return intensity - for row_index, row in enumerate(rows): + for row_index, row in enumerate(peptide_subplot_rows): for peptide_match in row: x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein y0, y1 = row_index, row_index + 1 @@ -177,14 +192,14 @@ def scale_intensity(intensity: float) -> float: fillcolor=f"rgba({int(255 * scale_intensity(peptide_match.intensity))}, 0, 0, 1)", opacity=0.5, layer="above", - row=subplot_row_index + 1, col=1 + row=peptides_subplot_index, col=1 ) # add invisible plotly object to the rectangle to show the peptide sequence when hovered over fig.add_trace(go.Scatter( x=protein_sequence_labels[ peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1], y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), - text=[f"Sample: {sample}
Peptide: {peptide_match.peptide_sequence}
" + text=[f"Sample: {sample_name}
Peptide: {peptide_match.peptide_sequence}
" f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence), mode="markers", @@ -192,10 +207,10 @@ def scale_intensity(intensity: float) -> float: hoverinfo="text", hovertemplate="%{text}", showlegend=False, - ), row=subplot_row_index + 1, col=1) + ), row=peptides_subplot_index, col=1) - fig.update_yaxes(range=[0, len(rows)], showticklabels=False, row=subplot_row_index + 1, col=1) - fig.update_xaxes(showticklabels=False, row=subplot_row_index + 1, col=1) + fig.update_yaxes(range=[0, len(peptide_subplot_rows)], showticklabels=False, row=peptides_subplot_index, col=1) + fig.update_xaxes(showticklabels=False, row=peptides_subplot_index, col=1) return fig @@ -300,3 +315,6 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe rows[peptide_match.sample].append([(peptide_match)]) return rows + +def get_max_coverage(coverage: dict[list[int]]) -> int: + return max([max(coverage) for coverage in coverage.values()]) \ No newline at end of file From 19a0c257f6d60524ae73d984a69837202ca89a21 Mon Sep 17 00:00:00 2001 From: henning Date: Thu, 12 Dec 2024 16:17:35 +0100 Subject: [PATCH 09/13] Reformatting and refactoring --- protzilla/data_analysis/protein_coverage.py | 407 ++++++++++++-------- 1 file changed, 238 insertions(+), 169 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 0a5f514b..b292d0bd 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -2,7 +2,9 @@ from docutils.nodes import title from tqdm import tqdm from protzilla.constants.paths import EXTERNAL_DATA_PATH -from protzilla.data_analysis.differential_expression_mann_whitney import mann_whitney_test_on_intensity_data +from protzilla.data_analysis.differential_expression_mann_whitney import ( + mann_whitney_test_on_intensity_data, +) from protzilla.disk_operator import PickleOperator from protzilla.constants.colors import PLOT_PRIMARY_COLOR from dataclasses import dataclass @@ -10,6 +12,7 @@ from numpy import log2 import plotly.graph_objects as go from plotly.subplots import make_subplots + # import StrEnum from enum import StrEnum @@ -29,16 +32,19 @@ class ProteinHit: start_location_on_protein: int end_location_on_protein: int + class IntensityNormalization(StrEnum): min_max_scaling = "min_max_scaling" none = "none" - -def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dict[str, list[tuple[str, int]]]: +def build_kmer_dictionary( + protein_dictionary: dict[str, str], k: int = 5 +) -> dict[str, list[tuple[str, int]]]: """ Builds a dictionary of all kmers in a list of protein sequences. The kmers are generated by sliding a window of size - k along the protein sequences. The dictionary maps a the k-mer to a list of tuples containing the protein ID and the index of + k along the protein sequences. The dictionary maps a the k-mer to a list of tuples containing the protein ID and + the index of the k-mer in the protein sequence. The dictionary is saved to disk for future use. """ # if the dictionary already exists, load it from disk @@ -47,10 +53,14 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic kmer_dict = PickleOperator.read(kmer_dict_path) return kmer_dict kmer_dict = {} - for protein_id, protein_sequence in tqdm(protein_dictionary.items(), desc="Building kmer dictionary", - unit_scale=True, unit="protein"): + for protein_id, protein_sequence in tqdm( + protein_dictionary.items(), + desc="Building kmer dictionary", + unit_scale=True, + unit="protein", + ): for i in range(len(protein_sequence) - k + 1): - kmer = protein_sequence[i: i + k] + kmer = protein_sequence[i : i + k] if kmer in kmer_dict: kmer_dict[kmer].append((protein_id, i)) else: @@ -60,41 +70,61 @@ def build_kmer_dictionary(protein_dictionary: dict[str, str], k: int = 5) -> dic def match_peptide_to_protein_ids( - peptide_sequence: str, protein_kmer_dictionary: dict[str, list[tuple[str, int]]], - protein_dictionary: dict[str, str] + peptide_sequence: str, + protein_kmer_dictionary: dict[str, list[tuple[str, int]]], + protein_dictionary: dict[str, str], ) -> list[ProteinHit]: """ Matches a peptide sequence to a dictionary of kmers in protein sequences. - Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence. + Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein + sequence. """ k = 5 - peptide_kmers = [peptide_sequence[i: i + k] for i in range(len(peptide_sequence) - k + 1)] + peptide_kmers = [ + peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1) + ] first_kmer, last_kmer = peptide_kmers[0], peptide_kmers[-1] # match the kmers first_kmer_matches = protein_kmer_dictionary.get(first_kmer, []) last_kmer_matches = protein_kmer_dictionary.get(last_kmer, []) hits = [ - ProteinHit(protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence)) + ProteinHit( + protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence) + ) for protein_id, start_first_kmer in first_kmer_matches for _, start_last_kmer in last_kmer_matches - if protein_id == _ and start_last_kmer == start_first_kmer + len(peptide_sequence) - k - and protein_dictionary[protein_id][ - start_first_kmer:start_first_kmer + len(peptide_sequence)] == peptide_sequence + if protein_id == _ + and start_last_kmer == start_first_kmer + len(peptide_sequence) - k + and protein_dictionary[protein_id][ + start_first_kmer : start_first_kmer + len(peptide_sequence) + ] + == peptide_sequence ] hits = list(set(hits)) # remove duplicates # Just sanity checks for hit in hits: - subsequence = protein_dictionary[hit.protein_id][hit.start_location_on_protein:hit.end_location_on_protein] + subsequence = protein_dictionary[hit.protein_id][ + hit.start_location_on_protein : hit.end_location_on_protein + ] assert len(subsequence) == len( - peptide_sequence), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" - assert subsequence in peptide_sequence, f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: {peptide_sequence}" - assert peptide_sequence in protein_dictionary[ - hit.protein_id], f"Peptide not in protein sequence: {peptide_sequence} not in {protein_dictionary[protein_id]}" + peptide_sequence + ), f"Lengths do not match: {len(subsequence)} != {len(peptide_sequence)}" + assert subsequence in peptide_sequence, ( + f"Subsequence not in peptide sequence:\nA: {subsequence}\nB: " + f"{peptide_sequence}" + ) + assert peptide_sequence in protein_dictionary[hit.protein_id], ( + f"Peptide not in protein sequence: {peptide_sequence} not in " + f"{protein_dictionary[protein_id]}" + ) return hits def plot_protein_coverage( - fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, protein_id: str, samples: list[str] = None + fasta_df: pd.DataFrame, + peptide_df: pd.DataFrame, + protein_id: str, + samples: list[str] = None, ) -> None: """ Plots the coverage of a protein sequence by peptides. @@ -103,32 +133,46 @@ def plot_protein_coverage( if samples is None or samples == []: raise ValueError("No samples provided.") - reduced_peptide_df = peptide_df[peptide_df["Sample"].isin(samples) & peptide_df['Intensity'] > 0] + reduced_peptide_df = peptide_df[ + peptide_df["Sample"].isin(samples) & peptide_df["Intensity"] > 0 + ] - protein_sequence_dict = dict(zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"])) + protein_sequence_dict = dict( + zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]) + ) sequence_kmer_dict = build_kmer_dictionary(protein_sequence_dict, k=5) protein_sequence = protein_sequence_dict[protein_id] peptide_matches = [] - coverage_by_sample = {sample : [0] * len(protein_sequence) for sample in samples} + coverage_by_sample = {sample: [0] * len(protein_sequence) for sample in samples} for sample_name, peptide_sequence, intensity in tqdm( - reduced_peptide_df[['Sample', 'Sequence', 'Intensity']].drop_duplicates().itertuples(index=False), - desc="Matching peptides to protein", unit_scale=True, - unit="peptide"): - protein_hits = match_peptide_to_protein_ids(peptide_sequence=peptide_sequence, - protein_kmer_dictionary=sequence_kmer_dict, - protein_dictionary=protein_sequence_dict) + reduced_peptide_df[["Sample", "Sequence", "Intensity"]] + .drop_duplicates() + .itertuples(index=False), + desc="Matching peptides to protein", + unit_scale=True, + unit="peptide", + ): + protein_hits = match_peptide_to_protein_ids( + peptide_sequence=peptide_sequence, + protein_kmer_dictionary=sequence_kmer_dict, + protein_dictionary=protein_sequence_dict, + ) # we only care about the hits pertaining to the argument-supplied protein id - filtered_protein_hits = filter(lambda x: x.protein_id == protein_id,protein_hits) + filtered_protein_hits = filter( + lambda x: x.protein_id == protein_id, protein_hits + ) for protein_hit in filtered_protein_hits: # add the peptide sequence and location to the peptide matches pertaining to the protein id peptide_matches.append( - PeptideMatch(peptide_sequence=peptide_sequence, - start_location_on_protein=protein_hit.start_location_on_protein, - end_location_on_protein=protein_hit.end_location_on_protein, - intensity=log2(intensity), - sample=sample_name) + PeptideMatch( + peptide_sequence=peptide_sequence, + start_location_on_protein=protein_hit.start_location_on_protein, + end_location_on_protein=protein_hit.end_location_on_protein, + intensity=log2(intensity), + sample=sample_name, + ) ) # update the coverage of the protein sequence increment_coverage(coverage_by_sample[sample_name], protein_hit) @@ -141,174 +185,199 @@ def plot_protein_coverage( # required number of rows (vertical space) rows_by_sample = distribute_to_rows(peptide_matches) - fig = _new_build_coverage_plot(coverage_by_sample, peptide_matches, protein_id, protein_sequence, rows_by_sample) + fig = _new_build_coverage_plot( + coverage_by_sample, + peptide_matches, + protein_id, + protein_sequence, + rows_by_sample, + ) return dict(plots=[fig]) + def _new_build_coverage_plot( - coverages_by_sample: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str, - rows_by_sample: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling + coverages_by_sample: list[int], + peptide_matches: list[PeptideMatch], + protein_id: str, + protein_sequence: str, + rows_by_sample: dict[str, list[list[PeptideMatch]]], + intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling, ) -> go.Figure: # one row for each sample + two row for the protein sequence (on top and on the bottom) number_of_subplots = len(rows_by_sample.keys()) + len(coverages_by_sample.keys()) - protein_sequence_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)] + protein_sequence_labels = [ + f"{amino_acid} - {amino_acid_index} " + for amino_acid_index, amino_acid in enumerate(protein_sequence) + ] subplot_titles = zip( - [f"Peptides of {sample}" for sample in rows_by_sample.keys()], # str_a - [f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys()] # str_b - ) # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it + [f"Peptides of {sample}" for sample in rows_by_sample.keys()], # str_a + [ + f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys() + ], # str_b + ) # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it subplot_titles = [title for titles in subplot_titles for title in titles] max_coverage_value = get_max_coverage(coverages_by_sample) - fig = make_subplots(rows=number_of_subplots, cols=1, shared_xaxes=True, vertical_spacing=0.1, subplot_titles=subplot_titles) + fig = make_subplots( + rows=number_of_subplots, + cols=1, + shared_xaxes=True, + vertical_spacing=0.1, + subplot_titles=subplot_titles, + ) # Peptides for sample_index, sample_name in enumerate(rows_by_sample.keys()): peptide_subplot_rows = rows_by_sample[sample_name] - peptides_subplot_index = (2*sample_index) + 1 - coverage = coverages_by_sample[sample_name] - coverage_subplot_index = (2*sample_index) + 2 - - # Sequencing depth bar chart - fig.add_trace(go.Bar(x=protein_sequence_labels, y=coverage, showlegend=False, marker=dict(color=PLOT_PRIMARY_COLOR)), - row=coverage_subplot_index, col=1) - fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1) + peptides_subplot_index = (2 * sample_index) + 1 + coverage = coverages_by_sample[sample_name] + coverage_subplot_index = (2 * sample_index) + 2 + + add_sequence_depth_to_plot( + fig, + coverage, + coverage_subplot_index, + max_coverage_value, + protein_sequence_labels, + ) + + # Peptide plot # Intensity scaling for the current sample - current_row_intensities = [peptide_match.intensity for row in peptide_subplot_rows for peptide_match in row] - max_intensity, min_intensity = max(current_row_intensities), min(current_row_intensities) - def scale_intensity(intensity: float) -> float: - if intensity_normalization == IntensityNormalization.min_max_scaling: - return (intensity - min_intensity) / (max_intensity - min_intensity) - else: - return intensity - + current_row_intensities = [ + peptide_match.intensity + for row in peptide_subplot_rows + for peptide_match in row + ] + max_intensity, min_intensity = max(current_row_intensities), min( + current_row_intensities + ) + scale_intensity = lambda intensity: (intensity - min_intensity) / ( + max_intensity - min_intensity + ) for row_index, row in enumerate(peptide_subplot_rows): for peptide_match in row: - x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein - y0, y1 = row_index, row_index + 1 - # add the rectangles - fig.add_shape( - type="rect", - x0=x0 - 0.5, y0=y0, - x1=x1 - 0.5, y1=y1, - fillcolor=f"rgba({int(255 * scale_intensity(peptide_match.intensity))}, 0, 0, 1)", - opacity=0.5, - layer="above", - row=peptides_subplot_index, col=1 + normalized_intensity = scale_intensity(peptide_match.intensity) + add_peptide_to_plot( + fig, + normalized_intensity, + peptide_match, + peptides_subplot_index, + protein_sequence_labels, + row_index, ) - # add invisible plotly object to the rectangle to show the peptide sequence when hovered over - fig.add_trace(go.Scatter( - x=protein_sequence_labels[ - peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1], - y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), - text=[f"Sample: {sample_name}
Peptide: {peptide_match.peptide_sequence}
" - f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" - f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence), - mode="markers", - marker=dict(opacity=0, size=15), # make marker invisible - hoverinfo="text", - hovertemplate="%{text}", - showlegend=False, - ), row=peptides_subplot_index, col=1) - - fig.update_yaxes(range=[0, len(peptide_subplot_rows)], showticklabels=False, row=peptides_subplot_index, col=1) - fig.update_xaxes(showticklabels=False, row=peptides_subplot_index, col=1) + + fig.update_yaxes( + range=[0, len(peptide_subplot_rows)], + showticklabels=False, + showgrid=False, + row=peptides_subplot_index, + col=1, + ) + fig.update_xaxes( + showticklabels=False, showgrid=False, row=peptides_subplot_index, col=1 + ) return fig -def _build_coverage_plot(coverage: list[int], peptide_matches: list[PeptideMatch], protein_id: str, - protein_sequence: str, rows: dict[str, list[list[PeptideMatch]]]) -> go.Figure: - x_labels = [f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence)] - max_coverage_in_sequence = max(coverage) - max_intensity = max([peptide_match.intensity for peptide_match in peptide_matches]) - normalized_coverage = [value / max_coverage_in_sequence for value in coverage] - hover_text = [f"Position: {i}
Amino acid: {amino_acid}
Coverage: {coverage}" - for i, (coverage, amino_acid) in - enumerate(zip(normalized_coverage, protein_sequence))] - fig = go.Figure(data=go.Bar(x=x_labels, - y=normalized_coverage, - marker=dict( - colorbar=dict(title="Coverage
(normalized)"), - line=dict(width=0) - ), - hovertext=hover_text, - hoverinfo="text", - showlegend=False, - name="Coverage (normalized to max in Protein Sequence)")) - current_row = 1 - for sample_idx, (sample, sample_rows) in enumerate(rows.items()): - for row_index, row in enumerate(sample_rows): - for peptide_match in row: - # add a rectangle for each peptide - x0, x1 = peptide_match.start_location_on_protein, peptide_match.end_location_on_protein - y0, y1 = current_row, current_row + 1 - normalized_intensity = (peptide_match.intensity) / max_intensity - fig.add_shape( - type="rect", - # the coordinates need to be a bit offset, as otherwise the rectangles would start in the middle of the bars - x0=x0 - 0.5, y0=y0, - x1=x1 - 0.5, y1=y1, - fillcolor=f"rgba({int(255 * (normalized_intensity))}, 0, 0, 1)", - opacity=0.5, - layer="above", - ) - # add invisible plotly object to the rectangle to show the peptide sequence when hovered over - fig.add_trace(go.Scatter( - x=x_labels[peptide_match.start_location_on_protein:peptide_match.end_location_on_protein + 1], - y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), - text=[f"Sample: {sample}
Peptide: {peptide_match.peptide_sequence}
" - f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" - f"Intensity: {peptide_match.intensity}"] * len(peptide_match.peptide_sequence), - mode="markers", - marker=dict(opacity=0, size=30), # make marker invisible - hoverinfo="text", - hovertemplate="%{text}", - showlegend=False, - )) - - # Move to next row - current_row += 1 - - # Add a horizontal line separator between samples (except after the last sample) - if sample_idx < len(rows) - 1: - fig.add_shape( - type="line", - x0=-0.5, - y0=current_row, - x1=len(protein_sequence) - 0.5, - y1=current_row, - line=dict( - color="Gray", - width=2, - dash="solid", - ) - ) - fig.update_layout( - title=f"Protein Coverage of {protein_id}", - xaxis_title="Protein Sequence", - yaxis=dict(visible=False, range=[0, 1 + current_row], title=""), +def add_sequence_depth_to_plot( + fig: go.Figure, + coverage: list[int], + coverage_subplot_index: int, + max_coverage_value: int, + protein_sequence_labels: list[str], +) -> None: + SEQUENCE_DEPTH_BAR_CHART = dict( + x=protein_sequence_labels, + y=coverage, showlegend=False, - bargap=0.2 + marker=dict(color=PLOT_PRIMARY_COLOR), + ) + fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=coverage_subplot_index, col=1) + fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1) + + +def add_peptide_to_plot( + fig: go.Figure, + normalized_intensity: float, + peptide_match: PeptideMatch, + peptides_subplot_index: int, + protein_sequence_labels: list[str], + row_index: int, +) -> None: + x0, x1 = ( + peptide_match.start_location_on_protein, + peptide_match.end_location_on_protein, + ) + y0, y1 = row_index, row_index + 1 + PEPTIDE_SHAPE = dict( + type="rect", + x0=x0 - 0.5, + y0=y0, + x1=x1 - 0.5, + y1=y1, + fillcolor=f"rgba({int(255 * normalized_intensity)}, 0, 0, 1)", + opacity=0.5, + layer="above", + ) + fig.add_shape(**PEPTIDE_SHAPE, row=peptides_subplot_index, col=1) + # add invisible plotly object to the rectangle to show the peptide sequence when hovered over + fig.add_trace( + go.Scatter( + x=protein_sequence_labels[ + peptide_match.start_location_on_protein : peptide_match.end_location_on_protein + + 1 + ], + y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), + text=[ + f"Sample: {peptide_match.sample}
Peptide: {peptide_match.peptide_sequence}
" + f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" + f"Intensity: {peptide_match.intensity}" + ] + * len(peptide_match.peptide_sequence), + mode="markers", + marker=dict(opacity=0, size=15), # make marker invisible + hoverinfo="text", + hovertemplate="%{text}", + showlegend=False, + ), + row=peptides_subplot_index, + col=1, ) - return fig def increment_coverage(coverage: list[int], protein_hit: ProteinHit) -> None: - coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein] = \ - [coverage + 1 for coverage in - coverage[protein_hit.start_location_on_protein:protein_hit.end_location_on_protein]] + coverage[ + protein_hit.start_location_on_protein : protein_hit.end_location_on_protein + ] = [ + coverage + 1 + for coverage in coverage[ + protein_hit.start_location_on_protein : protein_hit.end_location_on_protein + ] + ] -def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[PeptideMatch]]]: +def distribute_to_rows( + peptide_matches: list[PeptideMatch], +) -> dict[list[list[PeptideMatch]]]: """ Distributes peptide matches to rows such that no two peptides overlap in the same row. Greedy algorithm (see Interval Scheduling Problem). """ - peptide_matches.sort(key=lambda peptide_match: peptide_match.start_location_on_protein) - rows = {sample: [] for sample in set([peptide_match.sample for peptide_match in peptide_matches])} + peptide_matches.sort( + key=lambda peptide_match: peptide_match.start_location_on_protein + ) + rows = { + sample: [] + for sample in set([peptide_match.sample for peptide_match in peptide_matches]) + } for peptide_match in peptide_matches: # find the first row that does not overlap with the current peptide for row in rows[peptide_match.sample]: - if row[-1].end_location_on_protein < peptide_match.start_location_on_protein: + if ( + row[-1].end_location_on_protein + < peptide_match.start_location_on_protein + ): row.append((peptide_match)) break else: @@ -317,4 +386,4 @@ def distribute_to_rows(peptide_matches: list[PeptideMatch]) -> dict[list[list[Pe def get_max_coverage(coverage: dict[list[int]]) -> int: - return max([max(coverage) for coverage in coverage.values()]) \ No newline at end of file + return max([max(coverage) for coverage in coverage.values()]) From a7bd9f0f2187b3836d23cdec791bbae25a969212 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 16 Dec 2024 15:00:43 +0100 Subject: [PATCH 10/13] Add color interpolation, combine peptides and sequence depth subplots --- protzilla/constants/colors.py | 35 ++++++- protzilla/data_analysis/protein_coverage.py | 102 +++++++++++++------- 2 files changed, 100 insertions(+), 37 deletions(-) diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py index 0c6600f7..9a92e93c 100644 --- a/protzilla/constants/colors.py +++ b/protzilla/constants/colors.py @@ -11,4 +11,37 @@ """First color in list. Conventionally used for visualizing outliers.""" PLOT_SECONDARY_COLOR = PLOT_COLOR_SEQUENCE[1] -"""Second color in list.""" \ No newline at end of file +"""Second color in list.""" + + +def interpolate_color(color_a, color_b, t): + """ + Interpolate between two RGB color strings based on a float t between 0 and 1. + + Args: + color_a (str): RGB color string in the format "#RRGGBB". + color_b (str): RGB color string in the format "#RRGGBB". + t (float): A float between 0 and 1 representing the interpolation factor. + + Returns: + str: Interpolated color as an RGB string in the format "#RRGGBB". + """ + if not (0 <= t <= 1): + raise ValueError("Interpolation factor t must be between 0 and 1") + + # Convert hex color strings to RGB tuples + def hex_to_rgb(hex_color): + hex_color = hex_color.lstrip("#") + return tuple(int(hex_color[i : i + 2], 16) for i in (0, 2, 4)) + + # Convert RGB tuples back to hex color strings + def rgb_to_hex(rgb): + return f"#{''.join(f'{c:02x}' for c in rgb)}" + + rgb_a = hex_to_rgb(color_a) + rgb_b = hex_to_rgb(color_b) + + # Interpolate each color channel + interpolated_rgb = tuple(int(a + (b - a) * t) for a, b in zip(rgb_a, rgb_b)) + + return rgb_to_hex(interpolated_rgb) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index b292d0bd..89133bfb 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -6,7 +6,11 @@ mann_whitney_test_on_intensity_data, ) from protzilla.disk_operator import PickleOperator -from protzilla.constants.colors import PLOT_PRIMARY_COLOR +from protzilla.constants.colors import ( + PLOT_PRIMARY_COLOR, + PLOT_COLOR_SEQUENCE, + interpolate_color, +) from dataclasses import dataclass import pandas as pd from numpy import log2 @@ -16,6 +20,8 @@ # import StrEnum from enum import StrEnum +INTENSITY_COLORS = ["#FFFFFF", PLOT_COLOR_SEQUENCE[3]] + @dataclass(unsafe_hash=True) class PeptideMatch: @@ -205,18 +211,12 @@ def _new_build_coverage_plot( intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling, ) -> go.Figure: # one row for each sample + two row for the protein sequence (on top and on the bottom) - number_of_subplots = len(rows_by_sample.keys()) + len(coverages_by_sample.keys()) + number_of_subplots = len(rows_by_sample.keys()) protein_sequence_labels = [ f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence) ] - subplot_titles = zip( - [f"Peptides of {sample}" for sample in rows_by_sample.keys()], # str_a - [ - f"Sequencing depth of {sample}" for sample in coverages_by_sample.keys() - ], # str_b - ) # now is (str_a, str_b), (str_a, str_b), ..., so we need to flatten it - subplot_titles = [title for titles in subplot_titles for title in titles] + subplot_titles = [f"Peptides of {sample}" for sample in rows_by_sample.keys()] max_coverage_value = get_max_coverage(coverages_by_sample) fig = make_subplots( @@ -225,57 +225,80 @@ def _new_build_coverage_plot( shared_xaxes=True, vertical_spacing=0.1, subplot_titles=subplot_titles, + specs=[[{"secondary_y": True}] for _ in range(number_of_subplots)], ) # Peptides - for sample_index, sample_name in enumerate(rows_by_sample.keys()): - peptide_subplot_rows = rows_by_sample[sample_name] - peptides_subplot_index = (2 * sample_index) + 1 + for sample_index, sample_name in enumerate(rows_by_sample.keys(), start=1): + peptide_rows = rows_by_sample[sample_name] + sample_subplot_index = sample_index coverage = coverages_by_sample[sample_name] - coverage_subplot_index = (2 * sample_index) + 2 add_sequence_depth_to_plot( fig, coverage, - coverage_subplot_index, + sample_subplot_index, max_coverage_value, protein_sequence_labels, ) # Peptide plot - # Intensity scaling for the current sample - current_row_intensities = [ - peptide_match.intensity - for row in peptide_subplot_rows - for peptide_match in row + current_sample_intensities = [ + peptide_match.intensity for row in peptide_rows for peptide_match in row ] - max_intensity, min_intensity = max(current_row_intensities), min( - current_row_intensities + max_intensity, min_intensity = max(current_sample_intensities), min( + current_sample_intensities ) scale_intensity = lambda intensity: (intensity - min_intensity) / ( max_intensity - min_intensity ) - for row_index, row in enumerate(peptide_subplot_rows): + for row_index, row in enumerate(peptide_rows): for peptide_match in row: - normalized_intensity = scale_intensity(peptide_match.intensity) add_peptide_to_plot( - fig, - normalized_intensity, - peptide_match, - peptides_subplot_index, - protein_sequence_labels, - row_index, + fig=fig, + normalized_intensity=scale_intensity(peptide_match.intensity), + peptide_match=peptide_match, + peptides_subplot_index=sample_subplot_index, + protein_sequence_labels=protein_sequence_labels, + row_index=row_index, + offset=max_coverage_value, + color_a=INTENSITY_COLORS[0], + color_b=INTENSITY_COLORS[1], ) fig.update_yaxes( - range=[0, len(peptide_subplot_rows)], + range=[0, len(peptide_rows) + max_coverage_value], showticklabels=False, showgrid=False, - row=peptides_subplot_index, + row=sample_subplot_index, col=1, ) fig.update_xaxes( - showticklabels=False, showgrid=False, row=peptides_subplot_index, col=1 + showticklabels=False, showgrid=False, row=sample_subplot_index, col=1 ) + # Color legend trace + color_scale = [[0, INTENSITY_COLORS[0]], [1, INTENSITY_COLORS[1]]] + color_legend_trace = go.Scatter( + x=[None], + y=[None], + mode="markers", + marker=dict( + colorscale=color_scale, + showscale=True, + cmin=0, + cmax=1, + colorbar=dict( + title="Intensity of peptide", + x=1.0, # Move further outside the plot + len=0.9, # Increase length of colorbar (70% of subplot height) + thickness=30, # Increase thickness of colorbar + titleside="right", # Move title to the right side of the bar + ), + size=10, + ), + showlegend=False, + ) + + fig.add_trace(color_legend_trace, row=number_of_subplots, col=1) return fig @@ -292,9 +315,11 @@ def add_sequence_depth_to_plot( y=coverage, showlegend=False, marker=dict(color=PLOT_PRIMARY_COLOR), + # hovering should give the amino acid, the position and the coverage + hoverinfo="text", + hovertemplate="%{x}: %{y}", ) fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=coverage_subplot_index, col=1) - fig.update_yaxes(range=[0, max_coverage_value], row=coverage_subplot_index, col=1) def add_peptide_to_plot( @@ -304,20 +329,25 @@ def add_peptide_to_plot( peptides_subplot_index: int, protein_sequence_labels: list[str], row_index: int, + offset: int, + color_a: str = "#FFFFFF", + color_b: str = PLOT_COLOR_SEQUENCE[3], ) -> None: x0, x1 = ( peptide_match.start_location_on_protein, peptide_match.end_location_on_protein, ) - y0, y1 = row_index, row_index + 1 + y0, y1 = row_index + offset, row_index + offset + 1 + # interpolate between the two colors + color = interpolate_color(color_a, color_b, normalized_intensity) PEPTIDE_SHAPE = dict( type="rect", x0=x0 - 0.5, y0=y0, x1=x1 - 0.5, y1=y1, - fillcolor=f"rgba({int(255 * normalized_intensity)}, 0, 0, 1)", - opacity=0.5, + fillcolor=color, + line_color=color, layer="above", ) fig.add_shape(**PEPTIDE_SHAPE, row=peptides_subplot_index, col=1) From 8a3ba5a299fa34ab9b06a7e656e7e686092e4cff Mon Sep 17 00:00:00 2001 From: henning Date: Tue, 17 Dec 2024 15:41:46 +0100 Subject: [PATCH 11/13] Add grouping via metadata and aggregation via median, improve logging --- protzilla/data_analysis/protein_coverage.py | 184 +++++++++++++------- protzilla/methods/data_analysis.py | 6 +- protzilla/steps.py | 5 +- ui/runs/forms/data_analysis.py | 106 ++++++----- ui/runs/forms/fill_helper.py | 4 + 5 files changed, 201 insertions(+), 104 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 89133bfb..6b0e0bb9 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -16,11 +16,23 @@ from numpy import log2 import plotly.graph_objects as go from plotly.subplots import make_subplots +from protzilla.constants.protzilla_logging import logger # import StrEnum from enum import StrEnum INTENSITY_COLORS = ["#FFFFFF", PLOT_COLOR_SEQUENCE[3]] +SEQUENCE_DEPTH_PEPTIDE_SPACING = 1 + + +class IntensityNormalization(StrEnum): + min_max_scaling = "min_max_scaling" + none = "none" + + +class AggregationMethod(StrEnum): + median = "median" + mean = "mean" @dataclass(unsafe_hash=True) @@ -29,7 +41,7 @@ class PeptideMatch: start_location_on_protein: int end_location_on_protein: int intensity: float = 0.0 - sample: str = "" + metadata_group: str = "" @dataclass(unsafe_hash=True) @@ -85,6 +97,8 @@ def match_peptide_to_protein_ids( Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein sequence. """ + if len(peptide_sequence) == 0: + raise ValueError("Peptide sequence is empty.") k = 5 peptide_kmers = [ peptide_sequence[i : i + k] for i in range(len(peptide_sequence) - k + 1) @@ -107,6 +121,7 @@ def match_peptide_to_protein_ids( == peptide_sequence ] hits = list(set(hits)) # remove duplicates + # Just sanity checks for hit in hits: subsequence = protein_dictionary[hit.protein_id][ @@ -129,19 +144,27 @@ def match_peptide_to_protein_ids( def plot_protein_coverage( fasta_df: pd.DataFrame, peptide_df: pd.DataFrame, + metadata_df: pd.DataFrame, protein_id: str, - samples: list[str] = None, + grouping: str, + selected_groups: list[str] = None, + aggregation_method: AggregationMethod = AggregationMethod.median, ) -> None: """ Plots the coverage of a protein sequence by peptides. """ # generate the protein kmer dictionary - if samples is None or samples == []: + if selected_groups is None or selected_groups == []: raise ValueError("No samples provided.") + # add group information to the peptide dataframe + peptide_df = peptide_df.merge(metadata_df, on="Sample") + reduced_peptide_df = peptide_df[ - peptide_df["Sample"].isin(samples) & peptide_df["Intensity"] > 0 + peptide_df[grouping].isin(selected_groups) & peptide_df["Intensity"] > 0 ] + if len(reduced_peptide_df) == 0: + raise ValueError("No peptides found for the samples provided.") protein_sequence_dict = dict( zip(fasta_df["Protein ID"], fasta_df["Protein Sequence"]) @@ -150,16 +173,22 @@ def plot_protein_coverage( protein_sequence = protein_sequence_dict[protein_id] peptide_matches = [] - coverage_by_sample = {sample: [0] * len(protein_sequence) for sample in samples} + coverage_by_group_name = { + sample: [0] * len(protein_sequence) for sample in selected_groups + } - for sample_name, peptide_sequence, intensity in tqdm( - reduced_peptide_df[["Sample", "Sequence", "Intensity"]] + for (group_name, peptide_sequence), row in tqdm( + reduced_peptide_df[[grouping, "Sequence", "Intensity"]] .drop_duplicates() - .itertuples(index=False), + .groupby([grouping, "Sequence"]), desc="Matching peptides to protein", unit_scale=True, unit="peptide", ): + sample = extract_peptide_from_slice(row, aggregation_method) + if sample.empty: + continue + intensity = sample["Intensity"].values[0] protein_hits = match_peptide_to_protein_ids( peptide_sequence=peptide_sequence, protein_kmer_dictionary=sequence_kmer_dict, @@ -177,11 +206,11 @@ def plot_protein_coverage( start_location_on_protein=protein_hit.start_location_on_protein, end_location_on_protein=protein_hit.end_location_on_protein, intensity=log2(intensity), - sample=sample_name, + metadata_group=group_name, ) ) # update the coverage of the protein sequence - increment_coverage(coverage_by_sample[sample_name], protein_hit) + increment_coverage(coverage_by_group_name[group_name], protein_hit) if len(peptide_matches) == 0: raise ValueError(f"No peptides matched for protein {protein_id}") @@ -189,35 +218,38 @@ def plot_protein_coverage( # now that all the matches have been determined with their start and end on the protein sequence, we need to find # an optimal solution for the location of their rectangles in the plot without overlap while minimizing the # required number of rows (vertical space) - rows_by_sample = distribute_to_rows(peptide_matches) - - fig = _new_build_coverage_plot( - coverage_by_sample, - peptide_matches, - protein_id, - protein_sequence, - rows_by_sample, + rows_by_group_name = distribute_to_rows(peptide_matches) + + fig = build_coverage_plot( + coverages_by_group_name=coverage_by_group_name, + peptide_matches=peptide_matches, + protein_id=protein_id, + protein_sequence=protein_sequence, + rows_by_group_name=rows_by_group_name, + grouping=grouping, ) return dict(plots=[fig]) -def _new_build_coverage_plot( - coverages_by_sample: list[int], +def build_coverage_plot( + coverages_by_group_name: list[int], peptide_matches: list[PeptideMatch], protein_id: str, protein_sequence: str, - rows_by_sample: dict[str, list[list[PeptideMatch]]], + rows_by_group_name: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling, + grouping: str = "", ) -> go.Figure: - # one row for each sample + two row for the protein sequence (on top and on the bottom) - number_of_subplots = len(rows_by_sample.keys()) + number_of_subplots = len(rows_by_group_name.keys()) protein_sequence_labels = [ f"{amino_acid} - {amino_acid_index} " for amino_acid_index, amino_acid in enumerate(protein_sequence) ] - subplot_titles = [f"Peptides of {sample}" for sample in rows_by_sample.keys()] - max_coverage_value = get_max_coverage(coverages_by_sample) + subplot_titles = [ + f"Peptides of {group_name}" for group_name in rows_by_group_name.keys() + ] + max_coverage_value = get_max_coverage(coverages_by_group_name) fig = make_subplots( rows=number_of_subplots, @@ -228,25 +260,23 @@ def _new_build_coverage_plot( specs=[[{"secondary_y": True}] for _ in range(number_of_subplots)], ) # Peptides - for sample_index, sample_name in enumerate(rows_by_sample.keys(), start=1): - peptide_rows = rows_by_sample[sample_name] - sample_subplot_index = sample_index - coverage = coverages_by_sample[sample_name] + for group_index, group_name in enumerate(rows_by_group_name.keys(), start=1): + peptide_rows = rows_by_group_name[group_name] + coverage = coverages_by_group_name[group_name] add_sequence_depth_to_plot( - fig, - coverage, - sample_subplot_index, - max_coverage_value, - protein_sequence_labels, + fig=fig, + coverage=coverage, + group_subplot_index=group_index, + protein_sequence_labels=protein_sequence_labels, ) # Peptide plot - current_sample_intensities = [ + current_group_intensities = [ peptide_match.intensity for row in peptide_rows for peptide_match in row ] - max_intensity, min_intensity = max(current_sample_intensities), min( - current_sample_intensities + max_intensity, min_intensity = max(current_group_intensities), min( + current_group_intensities ) scale_intensity = lambda intensity: (intensity - min_intensity) / ( max_intensity - min_intensity @@ -257,25 +287,35 @@ def _new_build_coverage_plot( fig=fig, normalized_intensity=scale_intensity(peptide_match.intensity), peptide_match=peptide_match, - peptides_subplot_index=sample_subplot_index, + group_subplot_index=group_index, protein_sequence_labels=protein_sequence_labels, row_index=row_index, - offset=max_coverage_value, + offset=max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING, color_a=INTENSITY_COLORS[0], color_b=INTENSITY_COLORS[1], + grouping=grouping, ) + fig.update_layout(hovermode="closest") fig.update_yaxes( - range=[0, len(peptide_rows) + max_coverage_value], + range=[ + 0, + len(peptide_rows) + max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING, + ], showticklabels=False, showgrid=False, - row=sample_subplot_index, + row=group_index, col=1, ) - fig.update_xaxes( - showticklabels=False, showgrid=False, row=sample_subplot_index, col=1 - ) - # Color legend trace + fig.update_xaxes(showticklabels=False, showgrid=False, row=group_index, col=1) + fig.update_xaxes(showticklabels=True, row=number_of_subplots, col=1) + + add_intensity_legend(fig) + + return fig + + +def add_intensity_legend(fig): color_scale = [[0, INTENSITY_COLORS[0]], [1, INTENSITY_COLORS[1]]] color_legend_trace = go.Scatter( x=[None], @@ -297,17 +337,13 @@ def _new_build_coverage_plot( ), showlegend=False, ) - - fig.add_trace(color_legend_trace, row=number_of_subplots, col=1) - - return fig + fig.add_trace(color_legend_trace, row=1, col=1) def add_sequence_depth_to_plot( fig: go.Figure, coverage: list[int], - coverage_subplot_index: int, - max_coverage_value: int, + group_subplot_index: int, protein_sequence_labels: list[str], ) -> None: SEQUENCE_DEPTH_BAR_CHART = dict( @@ -315,23 +351,24 @@ def add_sequence_depth_to_plot( y=coverage, showlegend=False, marker=dict(color=PLOT_PRIMARY_COLOR), - # hovering should give the amino acid, the position and the coverage - hoverinfo="text", - hovertemplate="%{x}: %{y}", + # add hover text + hoverinfo="y", + hovertemplate="Sequencing depth at %{x}: %{y}", ) - fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=coverage_subplot_index, col=1) + fig.add_trace(go.Bar(**SEQUENCE_DEPTH_BAR_CHART), row=group_subplot_index, col=1) def add_peptide_to_plot( fig: go.Figure, normalized_intensity: float, peptide_match: PeptideMatch, - peptides_subplot_index: int, + group_subplot_index: int, protein_sequence_labels: list[str], row_index: int, offset: int, color_a: str = "#FFFFFF", color_b: str = PLOT_COLOR_SEQUENCE[3], + grouping: str = "", ) -> None: x0, x1 = ( peptide_match.start_location_on_protein, @@ -350,7 +387,7 @@ def add_peptide_to_plot( line_color=color, layer="above", ) - fig.add_shape(**PEPTIDE_SHAPE, row=peptides_subplot_index, col=1) + fig.add_shape(**PEPTIDE_SHAPE, row=group_subplot_index, col=1) # add invisible plotly object to the rectangle to show the peptide sequence when hovered over fig.add_trace( go.Scatter( @@ -360,7 +397,7 @@ def add_peptide_to_plot( ], y=[(y0 + y1) / 2] * len(peptide_match.peptide_sequence), text=[ - f"Sample: {peptide_match.sample}
Peptide: {peptide_match.peptide_sequence}
" + f"{grouping}: {peptide_match.metadata_group}
Peptide: {peptide_match.peptide_sequence}
" f"({peptide_match.start_location_on_protein}-{peptide_match.end_location_on_protein})
" f"Intensity: {peptide_match.intensity}" ] @@ -371,7 +408,7 @@ def add_peptide_to_plot( hovertemplate="%{text}", showlegend=False, ), - row=peptides_subplot_index, + row=group_subplot_index, col=1, ) @@ -387,6 +424,25 @@ def increment_coverage(coverage: list[int], protein_hit: ProteinHit) -> None: ] +def extract_peptide_from_slice( + slice_df: pd.DataFrame, + aggregation_strategy: AggregationMethod = AggregationMethod.median, +) -> pd.DataFrame: + """ + Given multiple peptide datapoints, return a single peptide datapoint with the intensity value extracted from the + slice of the peptide data. + """ + if len(slice_df) == 1: + return slice_df + if aggregation_strategy == AggregationMethod.median: + slice_df = slice_df.groupby("Sequence").median() + elif aggregation_strategy == AggregationMethod.mean: + slice_df = slice_df.groupby("Sequence").mean() + else: + raise ValueError(f"Unknown strategy: {aggregation_strategy}") + return slice_df + + def distribute_to_rows( peptide_matches: list[PeptideMatch], ) -> dict[list[list[PeptideMatch]]]: @@ -398,12 +454,14 @@ def distribute_to_rows( key=lambda peptide_match: peptide_match.start_location_on_protein ) rows = { - sample: [] - for sample in set([peptide_match.sample for peptide_match in peptide_matches]) + group_name: [] + for group_name in set( + [peptide_match.metadata_group for peptide_match in peptide_matches] + ) } for peptide_match in peptide_matches: # find the first row that does not overlap with the current peptide - for row in rows[peptide_match.sample]: + for row in rows[peptide_match.metadata_group]: if ( row[-1].end_location_on_protein < peptide_match.start_location_on_protein @@ -411,7 +469,7 @@ def distribute_to_rows( row.append((peptide_match)) break else: - rows[peptide_match.sample].append([(peptide_match)]) + rows[peptide_match.metadata_group].append([(peptide_match)]) return rows diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 765b3047..05a0d0fc 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -791,8 +791,11 @@ class PlotProteinCoverage(PlotStep): input_keys = [ "protein_id", "fasta_df", + "metadata_df", "peptide_df", - "samples" + "grouping", + "selected_groups", + "aggregation_method", ] output_keys = [] @@ -806,6 +809,7 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: inputs["peptide_df"] = steps.get_step_output( Step, "peptide_df", inputs["peptide_df_instance"] ) + inputs["metadata_df"] = steps.metadata_df return diff --git a/protzilla/steps.py b/protzilla/steps.py index eecf0d04..eac35b65 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -7,6 +7,7 @@ from enum import Enum from io import BytesIO from pathlib import Path +from protzilla.constants.protzilla_logging import logger import pandas as pd import plotly @@ -455,6 +456,9 @@ def check_instance_identifier(step): and input_key in step.inputs ): return step.inputs[input_key] + logging.warning( + f"No input {input_key} found for step type {step_type} and instance identifier {instance_identifier}" + ) return None def all_steps_in_section(self, section: str) -> list[Step]: @@ -506,7 +510,6 @@ def metadata_df(self) -> pd.DataFrame | None: from protzilla.methods.importing import ImportingStep return self.get_step_output(ImportingStep, "metadata_df") - logging.warning("No metadata_df found in steps") @property def preprocessed_output(self) -> Output: diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 5195f7d5..1bee889b 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -175,12 +175,12 @@ class DifferentialExpressionANOVAForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields[ - "protein_df" - ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_df"].choices = ( + fill_helper.get_choices_for_protein_df_steps(run) + ) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -216,12 +216,12 @@ class DifferentialExpressionTTestForm(MethodForm): group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - self.fields[ - "protein_df" - ].choices = fill_helper.get_choices_for_protein_df_steps(run) - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_df"].choices = ( + fill_helper.get_choices_for_protein_df_steps(run) + ) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -264,9 +264,9 @@ class DifferentialExpressionLinearModelForm(MethodForm): group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -312,13 +312,13 @@ class DifferentialExpressionMannWhitneyOnIntensityForm(MethodForm): group2 = CustomChoiceField(choices=[], label="Group 2") def fill_form(self, run: Run) -> None: - self.fields[ - "protein_df" - ].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields["protein_df"].choices = ( + fill_helper.get_choices_for_protein_df_steps(run) + ) - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -373,9 +373,9 @@ def fill_form(self, run: Run) -> None: run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) @@ -423,13 +423,13 @@ class DifferentialExpressionKruskalWallisOnIntensityForm(MethodForm): ) def fill_form(self, run: Run) -> None: - self.fields[ - "protein_df" - ].choices = fill_helper.get_choices_for_protein_df_steps(run) + self.fields["protein_df"].choices = ( + fill_helper.get_choices_for_protein_df_steps(run) + ) - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -462,9 +462,9 @@ def fill_form(self, run: Run) -> None: self.fields["ptm_df"].choices = fill_helper.to_choices( run.steps.get_instance_identifiers(PTMsPerSample, "ptm_df") ) - self.fields[ - "grouping" - ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + ) grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) self.fields["selected_groups"].choices = fill_helper.to_choices( run.steps.metadata_df[grouping].unique() @@ -1091,6 +1091,9 @@ class ProteinGraphVariationGraphForm(MethodForm): class PlotProteinCoverageForm(MethodForm): + from protzilla.data_analysis.protein_coverage import AggregationMethod + + is_dynamic = True peptide_df_instance = CustomChoiceField( choices=[], label="Step to use peptide data from", @@ -1103,11 +1106,19 @@ class PlotProteinCoverageForm(MethodForm): choices=[], label="Protein ID", ) - samples = CustomMultipleChoiceField( + grouping = CustomChoiceField( choices=[], - label="Samples", + label="Grouping from metadata", + ) + selected_groups = CustomMultipleChoiceField( + choices=[], + label="Select groups / samples to plot", + ) + aggregation_method = CustomChoiceField( + choices=fill_helper.to_choices(AggregationMethod), + label="Aggregation method", + initial=AggregationMethod.median, ) - def fill_form(self, run: Run) -> None: self.fields["peptide_df_instance"].choices = fill_helper.get_choices( @@ -1129,11 +1140,28 @@ def fill_form(self, run: Run) -> None: Step, "fasta_df", fasta_df_instance_id )["Protein ID"].unique() peptide_df_protein_ids = peptide_df["Protein ID"].unique() - common_protein_ids = sorted(list(set(fasta_protein_ids) & set(peptide_df_protein_ids))) + common_protein_ids = sorted( + list(set(fasta_protein_ids) & set(peptide_df_protein_ids)) + ) self.fields["protein_id"].choices = fill_helper.to_choices(common_protein_ids) - peptide_df_samples = peptide_df["Sample"].unique() - self.fields["samples"].choices = fill_helper.to_choices(peptide_df_samples) + # Grouping + self.fields["grouping"].choices = ( + fill_helper.get_choices_for_metadata_non_sample_columns(run) + + [("Sample", "Sample")] + ) + + grouping = self.data.get("grouping", self.fields["grouping"].choices[0][0]) + if grouping == "Sample": + self.fields["selected_groups"].choices = fill_helper.to_choices( + peptide_df["Sample"].unique() + ) + else: + self.fields["selected_groups"].choices = fill_helper.to_choices( + run.steps.metadata_df[grouping].unique() + ) + # if grouping is not Sample, show the aggregation method option + self.toggle_visibility("aggregation_method", grouping != "Sample") class FLEXIQuantLFForm(MethodForm): diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py index eef83be6..d4f86960 100644 --- a/ui/runs/forms/fill_helper.py +++ b/ui/runs/forms/fill_helper.py @@ -1,5 +1,6 @@ from protzilla.run import Run from protzilla.steps import Step +from protzilla.constants.protzilla_logging import logger def to_choices(choices: list[str], required: bool = True) -> list[tuple[str, str]]: @@ -29,6 +30,9 @@ def get_choices( def get_choices_for_metadata_non_sample_columns(run: Run) -> list[tuple[str, str]]: + if run.steps.metadata_df is None: + logger.warning("No metadata_df found in run") + return [] return to_choices( run.steps.metadata_df.columns[ run.steps.metadata_df.columns != "Sample" From e7fc5f5b1e05e7bcaeffe28a5b46de1478aa6aac Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 13 Jan 2025 14:42:07 +0100 Subject: [PATCH 12/13] Error handling and dynamic scaling of peptide boxes to preserve 1:2 ratio with bar plot --- protzilla/data_analysis/protein_coverage.py | 61 +++++++++++++++++---- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/protzilla/data_analysis/protein_coverage.py b/protzilla/data_analysis/protein_coverage.py index 6b0e0bb9..bd77d783 100644 --- a/protzilla/data_analysis/protein_coverage.py +++ b/protzilla/data_analysis/protein_coverage.py @@ -94,8 +94,7 @@ def match_peptide_to_protein_ids( ) -> list[ProteinHit]: """ Matches a peptide sequence to a dictionary of kmers in protein sequences. - Returns a list of tuples containing the protein ID and the start, end location of the peptide in the protein - sequence. + Returns a list of ProteinHit objects containing the protein ID and the start location of the peptide on the protein. """ if len(peptide_sequence) == 0: raise ValueError("Peptide sequence is empty.") @@ -107,6 +106,9 @@ def match_peptide_to_protein_ids( # match the kmers first_kmer_matches = protein_kmer_dictionary.get(first_kmer, []) last_kmer_matches = protein_kmer_dictionary.get(last_kmer, []) + if first_kmer_matches == [] or last_kmer_matches == []: + return [] + hits = [ ProteinHit( protein_id, start_first_kmer, start_first_kmer + len(peptide_sequence) @@ -171,6 +173,9 @@ def plot_protein_coverage( ) sequence_kmer_dict = build_kmer_dictionary(protein_sequence_dict, k=5) + if protein_id not in protein_sequence_dict: + raise ValueError(f"Protein ID {protein_id} not found in protein dictionary.") + protein_sequence = protein_sequence_dict[protein_id] peptide_matches = [] coverage_by_group_name = { @@ -227,6 +232,7 @@ def plot_protein_coverage( protein_sequence=protein_sequence, rows_by_group_name=rows_by_group_name, grouping=grouping, + aggregation_method=aggregation_method, ) return dict(plots=[fig]) @@ -240,6 +246,7 @@ def build_coverage_plot( rows_by_group_name: dict[str, list[list[PeptideMatch]]], intensity_normalization: IntensityNormalization = IntensityNormalization.min_max_scaling, grouping: str = "", + aggregation_method: AggregationMethod = AggregationMethod.median, ) -> go.Figure: number_of_subplots = len(rows_by_group_name.keys()) protein_sequence_labels = [ @@ -264,6 +271,21 @@ def build_coverage_plot( peptide_rows = rows_by_group_name[group_name] coverage = coverages_by_group_name[group_name] + # Calculate the desired heights + max_bar_height = max_coverage_value + desired_peptide_height = max_bar_height * 2 # 1:2 ratio + + # Calculate the height scaling factor for peptides + num_peptide_rows = len(peptide_rows) + if num_peptide_rows > 0: + # Include spacing in the calculation + total_spacing = SEQUENCE_DEPTH_PEPTIDE_SPACING + peptide_box_height = ( + desired_peptide_height - total_spacing + ) / num_peptide_rows + else: + peptide_box_height = 1.0 + add_sequence_depth_to_plot( fig=fig, coverage=coverage, @@ -278,8 +300,10 @@ def build_coverage_plot( max_intensity, min_intensity = max(current_group_intensities), min( current_group_intensities ) - scale_intensity = lambda intensity: (intensity - min_intensity) / ( - max_intensity - min_intensity + scale_intensity = lambda intensity: ( + (intensity - min_intensity) / (max_intensity - min_intensity) + if max_intensity != min_intensity + else 0 ) for row_index, row in enumerate(peptide_rows): for peptide_match in row: @@ -291,17 +315,19 @@ def build_coverage_plot( protein_sequence_labels=protein_sequence_labels, row_index=row_index, offset=max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING, + box_height=peptide_box_height, color_a=INTENSITY_COLORS[0], color_b=INTENSITY_COLORS[1], grouping=grouping, ) fig.update_layout(hovermode="closest") + # Update y-axis range to accommodate both the bar plot and peptide sections + total_height = ( + max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING + desired_peptide_height + ) fig.update_yaxes( - range=[ - 0, - len(peptide_rows) + max_coverage_value + SEQUENCE_DEPTH_PEPTIDE_SPACING, - ], + range=[0, total_height], showticklabels=False, showgrid=False, row=group_index, @@ -310,12 +336,14 @@ def build_coverage_plot( fig.update_xaxes(showticklabels=False, showgrid=False, row=group_index, col=1) fig.update_xaxes(showticklabels=True, row=number_of_subplots, col=1) - add_intensity_legend(fig) + add_intensity_legend(fig, aggregation_method) return fig -def add_intensity_legend(fig): +def add_intensity_legend( + fig, aggregation_method: AggregationMethod = AggregationMethod.median +): color_scale = [[0, INTENSITY_COLORS[0]], [1, INTENSITY_COLORS[1]]] color_legend_trace = go.Scatter( x=[None], @@ -327,7 +355,7 @@ def add_intensity_legend(fig): cmin=0, cmax=1, colorbar=dict( - title="Intensity of peptide", + title=f"Intensity of peptide
(aggregated via {aggregation_method})", x=1.0, # Move further outside the plot len=0.9, # Increase length of colorbar (70% of subplot height) thickness=30, # Increase thickness of colorbar @@ -366,6 +394,7 @@ def add_peptide_to_plot( protein_sequence_labels: list[str], row_index: int, offset: int, + box_height: float, color_a: str = "#FFFFFF", color_b: str = PLOT_COLOR_SEQUENCE[3], grouping: str = "", @@ -374,7 +403,7 @@ def add_peptide_to_plot( peptide_match.start_location_on_protein, peptide_match.end_location_on_protein, ) - y0, y1 = row_index + offset, row_index + offset + 1 + y0, y1 = row_index * box_height + offset, (row_index + 1) * box_height + offset # interpolate between the two colors color = interpolate_color(color_a, color_b, normalized_intensity) PEPTIDE_SHAPE = dict( @@ -450,6 +479,10 @@ def distribute_to_rows( Distributes peptide matches to rows such that no two peptides overlap in the same row. Greedy algorithm (see Interval Scheduling Problem). """ + if len(peptide_matches) == 0: + raise ValueError( + "Attempted to distribute empty list of peptide matches to rows in plot." + ) peptide_matches.sort( key=lambda peptide_match: peptide_match.start_location_on_protein ) @@ -474,4 +507,8 @@ def distribute_to_rows( def get_max_coverage(coverage: dict[list[int]]) -> int: + if len(coverage) == 0: + raise ValueError( + "Cannot calculate maximum coverage value: No coverage data provided." + ) return max([max(coverage) for coverage in coverage.values()]) From a762b20cd5bbc23c06d6319ffaafa98776abf9f5 Mon Sep 17 00:00:00 2001 From: henning Date: Mon, 13 Jan 2025 15:08:49 +0100 Subject: [PATCH 13/13] Tests for protein_coverage --- .../data_analysis/test_protein_coverage.py | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) diff --git a/tests/protzilla/data_analysis/test_protein_coverage.py b/tests/protzilla/data_analysis/test_protein_coverage.py index e69de29b..2339d840 100644 --- a/tests/protzilla/data_analysis/test_protein_coverage.py +++ b/tests/protzilla/data_analysis/test_protein_coverage.py @@ -0,0 +1,285 @@ +import pytest +from protzilla.data_analysis.protein_coverage import ( + distribute_to_rows, + get_max_coverage, + PeptideMatch, +) + +import pytest +import pandas as pd +from protzilla.data_analysis.protein_coverage import ( + extract_peptide_from_slice, + AggregationMethod, +) + +import pytest +from protzilla.data_analysis.protein_coverage import increment_coverage, ProteinHit + +import pytest +from protzilla.data_analysis.protein_coverage import ( + match_peptide_to_protein_ids, + ProteinHit, +) + + +def test_match_peptide_to_protein_ids_empty_peptide(): + with pytest.raises(ValueError, match="Peptide sequence is empty."): + match_peptide_to_protein_ids("", {}, {}) + + +def test_match_peptide_to_protein_ids_no_matches(): + peptide_sequence = "ABCDE" + protein_kmer_dictionary = {"ABCDE": [("protein1", 0)]} + protein_dictionary = {"protein1": "XYZ"} + result = match_peptide_to_protein_ids( + peptide_sequence, protein_kmer_dictionary, protein_dictionary + ) + assert result == [] + + +def test_match_peptide_to_protein_ids_single_match(): + peptide_sequence = "ABCDE" + protein_kmer_dictionary = {"ABCDE": [("protein1", 0)]} + protein_dictionary = {"protein1": "ABCDE"} + result = match_peptide_to_protein_ids( + peptide_sequence, protein_kmer_dictionary, protein_dictionary + ) + expected = [ + ProteinHit( + protein_id="protein1", + start_location_on_protein=0, + end_location_on_protein=5, + ) + ] + assert result == expected + + +def test_match_peptide_to_protein_ids_multiple_matches(): + peptide_sequence = "ABCDE" + protein_kmer_dictionary = {"ABCDE": [("protein1", 0), ("protein2", 0)]} + protein_dictionary = {"protein1": "ABCDE", "protein2": "ABCDE"} + result = match_peptide_to_protein_ids( + peptide_sequence, protein_kmer_dictionary, protein_dictionary + ) + expected = list( + set( + [ + ProteinHit( + protein_id="protein1", + start_location_on_protein=0, + end_location_on_protein=5, + ), + ProteinHit( + protein_id="protein2", + start_location_on_protein=0, + end_location_on_protein=5, + ), + ] + ) + ) + assert result == expected + + +def test_match_peptide_to_protein_ids_partial_match(): + peptide_sequence = "ABCDE" + protein_kmer_dictionary = {"ABCDE": [("protein1", 0)]} + protein_dictionary = {"protein1": "ABCDEXYZ"} + result = match_peptide_to_protein_ids( + peptide_sequence, protein_kmer_dictionary, protein_dictionary + ) + expected = [ + ProteinHit( + protein_id="protein1", + start_location_on_protein=0, + end_location_on_protein=5, + ) + ] + assert result == expected + + +def test_increment_coverage(): + coverage = [0, 0, 0, 0, 0] + protein_hit = ProteinHit( + protein_id="P12345", start_location_on_protein=1, end_location_on_protein=4 + ) + increment_coverage(coverage, protein_hit) + assert coverage == [0, 1, 1, 1, 0] + + +def test_increment_coverage_multiple_hits(): + coverage = [0, 0, 0, 0, 0] + protein_hit1 = ProteinHit( + protein_id="P12345", start_location_on_protein=1, end_location_on_protein=3 + ) + protein_hit2 = ProteinHit( + protein_id="P12345", start_location_on_protein=2, end_location_on_protein=4 + ) + increment_coverage(coverage, protein_hit1) + increment_coverage(coverage, protein_hit2) + assert coverage == [0, 1, 2, 1, 0] + + +def test_increment_coverage_no_overlap(): + coverage = [0, 0, 0, 0, 0] + protein_hit1 = ProteinHit( + protein_id="P12345", start_location_on_protein=0, end_location_on_protein=2 + ) + protein_hit2 = ProteinHit( + protein_id="P12345", start_location_on_protein=3, end_location_on_protein=5 + ) + increment_coverage(coverage, protein_hit1) + increment_coverage(coverage, protein_hit2) + assert coverage == [1, 1, 0, 1, 1] + + +def test_extract_peptide_from_slice_single_entry(): + data = {"Sequence": ["PEPTIDE"], "Intensity": [100]} + df = pd.DataFrame(data) + result = extract_peptide_from_slice(df) + assert result.equals(df) + + +def test_extract_peptide_from_slice_multiple_entries(): + data = {"Sequence": ["PEPTIDE", "PEPTIDE", "PEPTIDE"], "Intensity": [100, 130, 200]} + df = pd.DataFrame(data) + result = extract_peptide_from_slice(df) + expected = pd.DataFrame({"Sequence": "PEPTIDE", "Intensity": [130.0]}) + expected.set_index("Sequence", inplace=True) + assert result.equals(expected) + + +def test_extract_peptide_from_slice_median(): + data = {"Sequence": ["PEPTIDE", "PEPTIDE"], "Intensity": [100, 200]} + df = pd.DataFrame(data) + result = extract_peptide_from_slice(df, AggregationMethod.median) + expected = pd.DataFrame({"Sequence": "PEPTIDE", "Intensity": [150.0]}) + expected.set_index("Sequence", inplace=True) + assert result.equals(expected) + + +def test_extract_peptide_from_slice_mean(): + data = {"Sequence": ["PEPTIDE", "PEPTIDE"], "Intensity": [100, 200]} + df = pd.DataFrame(data) + result = extract_peptide_from_slice(df, AggregationMethod.mean) + expected = pd.DataFrame({"Sequence": "PEPTIDE", "Intensity": [150.0]}) + expected.set_index("Sequence", inplace=True) + assert result.equals(expected) + + +def test_extract_peptide_from_slice_unknown_strategy(): + data = {"Sequence": ["PEPTIDE", "PEPTIDE"], "Intensity": [100, 200]} + df = pd.DataFrame(data) + with pytest.raises(ValueError, match="Unknown strategy: unknown"): + extract_peptide_from_slice(df, "unknown") + + +import pytest +from protzilla.data_analysis.protein_coverage import distribute_to_rows, PeptideMatch + + +def test_distribute_to_rows_empty(): + with pytest.raises( + ValueError, + match="Attempted to distribute empty list of peptide matches to rows in plot.", + ): + distribute_to_rows([]) + + +def test_distribute_to_rows_single_peptide(): + peptide_matches = [ + PeptideMatch( + peptide_sequence="AAA", + start_location_on_protein=0, + end_location_on_protein=3, + metadata_group="group1", + ) + ] + expected = {"group1": [[peptide_matches[0]]]} + assert distribute_to_rows(peptide_matches) == expected + + +def test_distribute_to_rows_non_overlapping(): + peptide_matches = [ + PeptideMatch( + peptide_sequence="AAA", + start_location_on_protein=0, + end_location_on_protein=3, + metadata_group="group1", + ), + PeptideMatch( + peptide_sequence="BBB", + start_location_on_protein=4, + end_location_on_protein=7, + metadata_group="group1", + ), + ] + expected = {"group1": [[peptide_matches[0], peptide_matches[1]]]} + assert distribute_to_rows(peptide_matches) == expected + + +def test_distribute_to_rows_overlapping(): + peptide_matches = [ + PeptideMatch( + peptide_sequence="AAA", + start_location_on_protein=0, + end_location_on_protein=3, + metadata_group="group1", + ), + PeptideMatch( + peptide_sequence="BBB", + start_location_on_protein=2, + end_location_on_protein=5, + metadata_group="group1", + ), + ] + expected = {"group1": [[peptide_matches[0]], [peptide_matches[1]]]} + assert distribute_to_rows(peptide_matches) == expected + + +def test_distribute_to_rows_multiple_groups(): + peptide_matches = [ + PeptideMatch( + peptide_sequence="AAA", + start_location_on_protein=0, + end_location_on_protein=3, + metadata_group="group1", + ), + PeptideMatch( + peptide_sequence="BBB", + start_location_on_protein=4, + end_location_on_protein=7, + metadata_group="group2", + ), + ] + expected = {"group1": [[peptide_matches[0]]], "group2": [[peptide_matches[1]]]} + assert distribute_to_rows(peptide_matches) == expected + + +def test_distribute_to_rows(): + peptide_matches = [ + PeptideMatch("PEPTIDE1", 0, 7, 1.0, "group1"), + PeptideMatch("PEPTIDE2", 8, 15, 1.0, "group1"), + PeptideMatch("PEPTIDE3", 16, 23, 1.0, "group1"), + PeptideMatch("PEPTIDE4", 0, 7, 1.0, "group2"), + PeptideMatch("PEPTIDE5", 8, 15, 1.0, "group2"), + ] + rows = distribute_to_rows(peptide_matches) + assert len(rows["group1"]) == 1 + assert len(rows["group2"]) == 1 + + +def test_get_max_coverage(): + coverage = { + "group1": [1, 2, 3, 4, 5], + "group2": [2, 3, 4, 5, 6], + } + max_coverage = get_max_coverage(coverage) + assert max_coverage == 6 + + +def test_get_max_coverage_empty(): + with pytest.raises( + ValueError, + match="Cannot calculate maximum coverage value: No coverage data provided.", + ): + get_max_coverage({})