Merge pull request #25 from MannLabs/FragPipe_import

Frag pipe import
MannLabs · Sep 1, 2021 · b045b59 · b045b59
2 parents 285a45d + dc5e3bd
commit b045b59
Show file tree

Hide file tree

Showing 21 changed files with 8,113 additions and 128 deletions.
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 
 ## About
 
-AlphaMap is a tool for peptide level MS data exploration. You can load and inspect MS data analyzed by [AlphaPept](https://github.com/MannLabs/alphapept), DIA-NN, MaxQuant or Spectronaut. Uploaded data is processed and formatted for visual inspection of the sequence coverage of any selected protein and its identified post-translational modifications (PTMs). UniProt information is available to directly annotate sequence regions of interest such as protein domains, secondary structures, sequence variants, known PTMs, etc. Additionally, users can select proteases to further evaluate the distribution of proteolytic cleavage sites across a protein sequence. The functionality of AlphaMap can be accessed via an intuitive graphical user interface or – more flexibly – as a Python package that allows its integration into common analysis workflows for data visualization. 
+AlphaMap is a tool for peptide level MS data exploration. You can load and inspect MS data analyzed by [AlphaPept](https://github.com/MannLabs/alphapept), DIA-NN, MaxQuant, Spectronaut or FragPipe. Uploaded data is processed and formatted for visual inspection of the sequence coverage of any selected protein and its identified post-translational modifications (PTMs). UniProt information is available to directly annotate sequence regions of interest such as protein domains, secondary structures, sequence variants, known PTMs, etc. Additionally, users can select proteases to further evaluate the distribution of proteolytic cleavage sites across a protein sequence. The functionality of AlphaMap can be accessed via an intuitive graphical user interface or - more flexibly - as a Python package that allows its integration into common analysis workflows for data visualization. 
 
 
 ## License
@@ -29,8 +29,9 @@ The GUI of AlphaMap is a completely stand-alone tool that requires no knowledge
 * [**Windows**](https://github.com/MannLabs/alphamap/releases/latest/download/alphamap_installer_windows.exe)
 * [**MacOS**](https://github.com/MannLabs/alphamap/releases/latest/download/alphamap_gui_installer_macos.pkg)
 
-***IMPORTANT: Please refer to the [GUI manual](alphamap/data/alphamap_tutorial.pdf) for detailed instructions on the installation, troubleshooting and usage of the stand-alone AlphaMap GUI.***
+***IMPORTANT: Please refer to the [GUI manual](alphamap/data/alphamap_tutorial.pdf) for detailed instructions on the installation, troubleshooting and usage of the stand-alone AlphaMap GUI.*** 
 
+***IMPORTANT***: The one-click-installers on macOS and Windows require **macOS Catalina (10.15) or higher** and **Windows 10**, respectively. For Windows, a system update might be necessary in case older versions do not work. To prevent installation errors on **Windows**, we recommend **uninstalling the previous AlphaMap version before installing a new one**.
 
 ### Pip
 
@@ -102,6 +103,11 @@ To ensure proper formatting of the Spectronaut output, an export scheme is avail
 A reduced example file is also available for [download here](https://github.com/MannLabs/alphamap/releases/download/v0.0.210622-alpha/test_spectronaut_input.tsv).
 <!-- It is not directly clear how to download this individual file from here. Luckily, the two larger ones have a "download" button on the top right -->
 
+### FragPipe
+There are two options to visualize data analyzed by FragPipe:
+1) Upload individual **"peptide.tsv"** files for single MS runs. A reduced example file is available for [download here](https://github.com/MannLabs/alphamap/releases/download/v0.1.3/test_fragpipe_input.tsv).
+
+2) Upload the **"combined_peptide.tsv"** file with the joint information about peptides identified in all runs (there is an option to select the experiment(s)). Be aware that the combined_peptide.tsv does not provide information about PTM localization. PTMs are therefore not shown for this option. A reduced example file is available for [download here](https://github.com/MannLabs/alphamap/releases/download/v0.1.3/combined_peptide.txt).
 
 ## Usage
 

diff --git a/Workflow.ipynb b/Workflow.ipynb
diff --git a/alphamap/_nbdev.py b/alphamap/_nbdev.py
@@ -10,6 +10,8 @@
          "import_alphapept_data": "Importing.ipynb",
          "convert_diann_mq_mod": "Importing.ipynb",
          "import_diann_data": "Importing.ipynb",
+         "convert_fragpipe_mq_mod": "Importing.ipynb",
+         "import_fragpipe_data": "Importing.ipynb",
          "import_data": "Importing.ipynb",
          "expand_protein_ids": "Preprocessing.ipynb",
          "pep_position_helper": "Preprocessing.ipynb",

diff --git a/alphamap/data/alphamap_tutorial.pdf b/alphamap/data/alphamap_tutorial.pdf
diff --git a/alphamap/data/gui_style.css b/alphamap/data/gui_style.css
@@ -23,7 +23,7 @@
 
 .bk-root .bk-btn-primary {
     background-color:  ;
-    font-size: 11px;
+    font-size: 12px;
     font-weight: 700;
     text-transform: uppercase;
     letter-spacing: 1.5px;

diff --git a/alphamap/gui.py b/alphamap/gui.py
@@ -325,7 +325,6 @@ def init_panel():
 upload_button = pn.widgets.Button(
     name='Upload data',
     button_type='primary',
-    css_classes=['button_options'],
     height=40,
     width=170,
     align='center',
@@ -342,7 +341,6 @@ def init_panel():
 visualize_button = pn.widgets.Button(
     name='Visualize protein',
     button_type='primary',
-    css_classes=['button_options'],
     height=40,
     width=170,
     align='center',
@@ -548,9 +546,9 @@ def download_pdf_report():
 ### MAIN PART
 project_description = pn.pane.Markdown(
     """### AlphaMap enables the exploration of proteomic datasets on the peptide level. It is possible to evaluate the sequence coverage of any identified protein and its post-translational modifications (PTMs). AlphaMap further integrates all available UniProt sequence annotations as well as information about proteolytic cleavage sites.""",
-    margin=(10, 0, -20, 0),
+    margin=(0, 0, -20, 0),
     css_classes=['main-part'],
-    width=635
+    width=700
 )
 
 divider_descr = pn.pane.HTML(
@@ -564,7 +562,7 @@ def download_pdf_report():
     """#### How to use AlphaMap:
     1. Select the organism of your proteomic study.
     2. Provide the filepath to your proteomic datasets analyzed by
-    AlphaPept, MaxQuant, Spectronaut or DIA-NN.
+    AlphaPept, MaxQuant, Spectronaut, DIA-NN or FragPipe.
         - Wait for samples to be displayed in the 'Select samples' field.
         - (optional) Select either all samples (default) or any specific
         sample(s) to visualize together as one trace.
@@ -584,7 +582,7 @@ def download_pdf_report():
     """,
     width=530,
     align='start',
-    margin=(20, 80, 0, 10)
+    margin=(0, 80, 0, 10)
 )
 
 alphamap_tutorial = pn.widgets.FileDownload(
@@ -664,6 +662,25 @@ def download_pdf_report():
     margin=(0, 80, 0, 20)
 )
 
+fragpipe_description = pn.pane.Markdown(
+    """
+    There are two options to visualize data analyzed by FragPipe:
+
+    1) Upload individual **"peptide.tsv"** files for single MS runs. In this case, the following columns from the original file are used for visualization:
+    >- Protein ID
+    >- Peptide
+    >- Assigned Modifications
+
+    2) Upload the **"combined_peptide.tsv"** file with the joint information about peptides identified in all runs (there is an option to select the experiment(s)). Be aware that the combined_peptide.tsv does not provide information about PTM localization. PTMs are therefore not shown for this option. The following columns are used for visualization:
+    >- Protein ID
+    >- Sequence
+    >- All 'Spectral Count' columns containing information about individual experiments
+    """,
+    width=530,
+    align='start',
+    margin=(0, 80, 0, 20)
+)
+
 spectronaut_instructions = pn.Card(
     spectronaut_description,
     spectronaut_scheme,
@@ -705,6 +722,16 @@ def download_pdf_report():
     css_classes=['spectronaut_instr']
 )
 
+fragpipe_instructions = pn.Card(
+    fragpipe_description,
+    title='FragPipe instructions',
+    collapsed=True,
+    width=530,
+    align='start',
+    margin=(0, 80, 5, 10),
+    css_classes=['spectronaut_instr']
+)
+
 additional_data_card = pn.Card(
     pn.Row(
         experimental_data_2,
@@ -740,7 +767,7 @@ def download_pdf_report():
     experimental_data_warning,
     experimental_data_sample,
     additional_data_card,
-    margin=(20, 30, 10, 30),
+    margin=(0, 30, 10, 30),
     width=790,
     css_classes=['selection_box'],
 )
@@ -756,7 +783,8 @@ def download_pdf_report():
             spectronaut_instructions,
             maxquant_instructions,
             alphapept_instructions,
-            diann_instructions
+            diann_instructions,
+            fragpipe_instructions
         ),
         selection_box,
         align='center',

diff --git a/alphamap/importing.py b/alphamap/importing.py
@@ -1,7 +1,8 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/Importing.ipynb (unless otherwise specified).
 
 __all__ = ['read_file', 'extract_rawfile_unique_values', 'import_spectronaut_data', 'import_maxquant_data',
-           'convert_ap_mq_mod', 'import_alphapept_data', 'convert_diann_mq_mod', 'import_diann_data', 'import_data']
+           'convert_ap_mq_mod', 'import_alphapept_data', 'convert_diann_mq_mod', 'import_diann_data',
+           'convert_fragpipe_mq_mod', 'import_fragpipe_data', 'import_data']
 
 # Cell
 import os
@@ -69,7 +70,8 @@ def extract_rawfile_unique_values(
     file: str
 ) -> list:
     """Extract the unique raw file names from "R.FileName" (Spectronaut output), "Raw file" (MaxQuant output),
-    "shortname" (AlphaPept output) or "Run" (DIA-NN output) column.
+    "shortname" (AlphaPept output) or "Run" (DIA-NN output) column, or from the headers of the "Spectral Count" columns of
+    the FragPipe combined_peptide.tsv file (which provides no PTM localization).
 
     Args:
         file (str): The name of a file.
@@ -100,7 +102,14 @@ def extract_rawfile_unique_values(
                         filename_col_index = l.index(col)
                         break
                 if not isinstance(filename_col_index, int):
-                    raise ValueError('A column with the raw file names is not in the file.')
+                    # to check the case with the FragPipe peptide.tsv file when we don't have the info about the experiment name
+                    if ("Assigned Modifications" in "".join(l)) and ("Protein ID" in "".join(l)) and ("Peptide" in "".join(l)):
+                        return []
+                    # to check the case with the FragPipe combined_peptide.tsv file when the experiment name is included in the "Spectral Count" column
+                    elif ("Sequence" in "".join(l)) and ("Assigned Modifications" in "".join(l)) and ("Protein ID" in "".join(l)):
+                        return sorted(list(set([col.replace('_', '').replace(' Spectral Count', '') for col in l if 'Spectral Count' in col])))
+                    else:
+                        raise ValueError('A column with the raw file names is not in the file.')
             else:
                 filename_data.append(l[filename_col_index])
             i += 1
@@ -446,6 +455,172 @@ def import_diann_data(
     input_data = input_data.drop_duplicates().reset_index(drop=True)
     return input_data
 
+# Cell
+import re
+
# Mass shift (Da, 4 decimals) reported by FragPipe -> MaxQuant-style label.
# '{}' is filled with the residue, residue group or terminus carrying the modification.
_FRAGPIPE_MOD_LABELS = {
    42.0106: '[Acetyl ({})]',
    -0.9840: '[Amidated ({})]',
    57.0215: '[Carbamidomethyl ({})]',
    43.0058: '[Carbamyl ({})]',
    0.9840: '[Deamidation ({})]',
    79.9663: '[Phospho ({})]',
    -18.0106: '[Dehydrated ({})]',
    39.9949: '[Pyro-carbamidomethyl ({})]',
    -17.0265: '[Gln->pyro-Glu]',
    21.9819: '[Cation:Na ({})]',
    14.0157: '[Methyl ({})]',
    15.9949: '[Oxidation ({})]',
    28.0313: '[Dimethyl ({})]',
    42.047: '[Trimethyl ({})]',
    79.9568: '[Sulfo ({})]',
    305.0682: '[Cys-Cys]',
    114.0429: '[GlyGly ({})]',
    26.0157: '[Delta:H(2)C(2) ({})]',
    119.0041: '[Cysteinyl]',
    47.9847: '[Trioxidation ({})]',
    148.0372: '[Hydroxyproline]',
    31.9898: '[Dioxidation ({})]',
    -48.0034: '[Dethiomethyl ({})]',
    599.2663: '[QQTGG ({})]',
}

# Mass shifts whose label names a whole residue group instead of the single
# modified residue (e.g. phosphorylation on S is written as "Phospho (STY)").
_FRAGPIPE_RESIDUE_GROUPS = {
    0.9840: 'NQ',
    79.9663: 'STY',
    21.9819: 'DE',
    14.0157: 'KR',
    28.0313: 'KR',
    79.9568: 'STY',
    31.9898: 'MW',
    -18.0106: 'ST',
}

# A water loss on glutamate is reported as pyro-Glu rather than as "Dehydrated".
_FRAGPIPE_WATER_LOSS = -18.0106
_FRAGPIPE_PYRO_GLU_LABEL = '[Glu->pyro-Glu]'


def convert_fragpipe_mq_mod(
    sequence: str,
    assigned_modifications: str
) -> str:
    """Convert FragPipe style modifications into MaxQuant style modifications.

    Each entry of ``assigned_modifications`` has the form ``<position><residue>(<mass>)``,
    ``N-term(<mass>)`` or ``C-term(<mass>)``, e.g. ``"N-term(42.0106), 5M(15.9949)"``.
    The label of every entry is inserted directly after the residue it belongs to
    (N-terminal labels are placed in front of the sequence, C-terminal labels after
    the last residue).

    Args:
        sequence (str): The unmodified (naked) peptide sequence.
        assigned_modifications (str): The assigned modifications separated by comma.
            An empty string, a whitespace-only string or a non-string value
            (e.g. None or NaN for an empty table cell) means "no modifications".

    Returns:
        str: The peptide sequence with MaxQuant-style modification labels inserted,
            or the unchanged sequence if there are no modifications.

    Raises:
        KeyError: If a modification mass is not part of the conversion table.
    """
    # Empty cells may arrive as NaN/None instead of '' depending on how the table was read.
    if not isinstance(assigned_modifications, str) or not assigned_modifications.strip():
        return sequence

    # Slot 0 holds N-terminal labels, slot i holds the labels of the i-th residue (1-based).
    modifs_posit = [''] * (len(sequence) + 1)
    for raw_mod in assigned_modifications.split(','):
        mod = raw_mod.strip()
        data = mod.replace(')', '').replace('"', '').split('(')
        if len(data) < 2 or not data[0].strip():
            # Nothing to parse, e.g. a trailing comma or stray quote.
            continue
        mod_pos = data[0].strip()
        # FragPipe may report more than four decimals; the table is keyed on four.
        mod_mass = round(float(data[1]), 4)

        if mod_pos == 'N-term':
            posit, add_aa = 0, 'N-term'
        elif mod_pos == 'C-term':
            posit, add_aa = len(sequence), 'C-term'
        else:
            posit, add_aa = int(mod_pos[:-1]), mod_pos[-1]

        if mod_mass == _FRAGPIPE_WATER_LOSS and add_aa == 'E':
            label = _FRAGPIPE_PYRO_GLU_LABEL
        else:
            try:
                template = _FRAGPIPE_MOD_LABELS[mod_mass]
            except KeyError:
                raise KeyError(
                    f"Unsupported FragPipe modification mass {mod_mass} in '{mod}'."
                ) from None
            group = _FRAGPIPE_RESIDUE_GROUPS.get(mod_mass, '')
            if add_aa in group:
                add_aa = group
            label = template.format(add_aa)

        # Append instead of overwrite so that several modifications on the same
        # slot (e.g. last residue + C-term) are all kept.
        modifs_posit[posit] += label

    # The leading blank is a placeholder "residue" for the N-terminal slot.
    modif_sequence = ''.join(
        residue + labels for residue, labels in zip(' ' + sequence, modifs_posit)
    ).strip()
    return modif_sequence
+
+# Cell
+import pandas as pd
+from typing import Union
+
def import_fragpipe_data(
    file: str,
    sample: Union[str, list, None] = None
) -> pd.DataFrame:
    """Import peptide level data from FragPipe/MSFragger.

    Two file layouts are handled:

    * combined_peptide.tsv ("Sequence", "Protein ID" and one
      "<experiment> Spectral Count" column per experiment). No modification
      information is extracted, so modified_sequence equals naked_sequence.
    * peptide.tsv ("Protein ID", "Peptide", "Assigned Modifications"). It is used
      when no sample is given and the file cannot be read as a combined file.

    Args:
        file (str): The name of a file.
        sample (Union[str, list, None]): The experiment name(s) used to filter a
            combined_peptide file; a peptide is kept if its spectral count is above
            zero in at least one selected experiment. Defaults to None. In this case
            data for all raw files will be extracted.

    Returns:
        pd.DataFrame: A pandas dataframe containing information about: all_protein_ids (str), modified_sequence (str), naked_sequence (str)

    Raises:
        ValueError: If sample is given and the file extension is not .csv, .tsv or
            .txt, or if a requested column is missing from the file.
    """
    file_ext = os.path.splitext(file)[-1]
    # None marks an extension for which no delimiter is known.
    sep = {'.csv': ',', '.tsv': '\t', '.txt': '\t'}.get(file_ext)
    combined_renaming = {"Protein ID": "all_protein_ids", "Sequence": "naked_sequence"}

    data_sub = None
    if sample:
        if sep is None:
            raise ValueError(f"Unsupported file extension '{file_ext}' for {file}.")
        # A single experiment name and a collection of names share one code path.
        samples = [sample] if isinstance(sample, str) else list(sample)
        column_names = [each + ' Spectral Count' for each in samples]
        data = pd.read_csv(
            file, sep=sep, low_memory=False,
            usecols=["Sequence", "Protein ID"] + column_names
        )
        # Keep peptides detected in at least one selected experiment, in file order.
        detected = (data[column_names] > 0).any(axis=1)
        data_sub = data.loc[detected, ["Sequence", "Protein ID"]]
    elif sep is not None:
        try:
            data_sub = pd.read_csv(
                file, sep=sep, low_memory=False, usecols=["Sequence", "Protein ID"]
            )
        except ValueError:
            # pandas raises ValueError when the requested columns are absent:
            # not a combined_peptide file, fall back to the peptide.tsv layout.
            data_sub = None

    if data_sub is not None:
        # The combined file carries no PTM localization: modified == naked sequence.
        data_sub = data_sub.rename(columns=combined_renaming)
        data_sub = data_sub.assign(modified_sequence=data_sub["naked_sequence"])
    else:
        fragpipe_columns = ["Protein ID", "Peptide", "Assigned Modifications"]
        data = read_file(file, fragpipe_columns)
        # Work on a copy so that adding a column does not write into a view of `data`.
        data_sub = data[fragpipe_columns].copy()

        # get modified sequence
        data_sub['modified_sequence'] = [
            convert_fragpipe_mq_mod(peptide, modifications)
            for peptide, modifications in zip(
                data_sub["Peptide"], data_sub["Assigned Modifications"]
            )
        ]

        # rename columns into all_protein_ids and naked_sequence
        data_sub = data_sub.rename(
            columns={"Protein ID": "all_protein_ids", "Peptide": "naked_sequence"}
        )

    input_data = data_sub[["all_protein_ids", "modified_sequence", "naked_sequence"]]
    input_data = input_data.dropna()  # remove missing values
    input_data = input_data.drop_duplicates().reset_index(drop=True)
    return input_data
+
 # Cell
 import pandas as pd
 import re
@@ -494,22 +669,27 @@ def import_data(
 
         uploaded_data_columns = set(l)
         input_info = file
+
     if set(["Proteins","Modified sequence","Raw file"]).issubset(uploaded_data_columns):
         if verbose:
-            print("Import MaxQuant input")
+            print("Import MaxQuant output")
         data = import_maxquant_data(input_info, sample=sample)
     elif set(["PEP.AllOccurringProteinAccessions","EG.ModifiedSequence","R.FileName"]).issubset(uploaded_data_columns):
         if verbose:
-            print("Import Spectronaut input")
+            print("Import Spectronaut output")
         data = import_spectronaut_data(input_info, sample=sample)
     elif set(["protein_group", "sequence", "shortname"]).issubset(uploaded_data_columns):
         if verbose:
-            print("Import AlphaPept input")
+            print("Import AlphaPept output")
         data = import_alphapept_data(input_info, sample=sample)
     elif set(["Protein.Ids", "Modified.Sequence", "Run"]).issubset(uploaded_data_columns):
         if verbose:
-            print("Import DIA-NN input")
+            print("Import DIA-NN output")
         data = import_diann_data(input_info, sample=sample)
+    elif set(["Protein ID", "Assigned Modifications"]).issubset(uploaded_data_columns):
+        if verbose:
+            print("Import FragPipe output")
+        data = import_fragpipe_data(input_info, sample=sample)
     else:
         raise TypeError(f'Input data format for {file} not known.')
     return data