From 7dcacd2bb39608c6f4ee56fb6827c14e4b926e0f Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 29 Jan 2024 13:17:57 +0100 Subject: [PATCH 01/18] Add Ella's charge prediction --- peptdeep/model/charge.py | 108 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 peptdeep/model/charge.py diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py new file mode 100644 index 00000000..b57c6266 --- /dev/null +++ b/peptdeep/model/charge.py @@ -0,0 +1,108 @@ +import pandas as pd +import numpy as np + + +from peptdeep.model.generic_property_prediction import ( + ModelInterface_for_Generic_AASeq_MultiLabelClassification, + Model_for_Generic_AASeq_BinaryClassification_Transformer, + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, + Model_for_Generic_ModAASeq_BinaryClassification_Transformer, +) + +class ChargeModelForAASeq( + ModelInterface_for_Generic_AASeq_MultiLabelClassification +): + def __init__(self, min_charge:int, max_charge:int): + super().__init__( + num_target_values=max_charge-min_charge+1, + model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer, + nlayers=4, hidden_dim=128, dropout=0.1 + ) + + self.target_column_to_predict = "charge_probs" + self.target_column_to_train = "charge_indicators" + self.min_charge = min_charge + self.max_charge = max_charge + self.charge_range = np.arange(min_charge, max_charge+1, dtype=np.int8) + + def predict_charges_for_pep_df(self, + pep_df:pd.DataFrame, + charge_prob=0.3, + drop_probs_column=True + ): + df = self.predict(pep_df) + df["charge"] = self.charge_probs.apply( + lambda x: self.charge_range[x>charge_prob] + ) + df = df.explode("charge").dropna(subset=["charge"]) + if drop_probs_column: + df.drop(columns="charge_probs", inplace=True) + df["charge"] = df.charge.astype(np.int8) + return df + +class ChargeModelForModAASeq( + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification +): + def __init__(self, min_charge:int, max_charge:int): + super().__init__( + num_target_values=max_charge-min_charge+1, + model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, + nlayers=4, hidden_dim=128, dropout=0.1 + ) + + self.target_column_to_predict = "charge_probs" + self.target_column_to_train = "charge_indicators" + self.min_charge = min_charge + self.max_charge = max_charge + self.charge_range = np.arange( + min_charge, max_charge+1, dtype=np.int8 + ) + + def predict_charges_for_pep_df(self, + pep_df:pd.DataFrame, + charge_prob=0.3, + drop_probs_column=True + ): + df = self.predict(pep_df) + df["charge"] = self.charge_probs.apply( + lambda x: self.charge_range[x>charge_prob] + ) + df = df.explode("charge").dropna(subset=["charge"]) + if drop_probs_column: + df.drop(columns="charge_probs", inplace=True) + df["charge"] = df.charge.astype(np.int8) + return df + +def group_psm_df_by_sequence( + psm_df: pd.DataFrame, + min_charge:int, + max_charge:int, +): + return psm_df.groupby("sequence")["charge"].apply( + lambda x: get_charge_indicators(set(x), + min_charge=min_charge, max_charge=max_charge + ) + ).reset_index(drop=False).rename(columns={"charge":"charge_indicators"}) + + +def group_psm_df_by_modseq( + psm_df: pd.DataFrame, + min_charge:int, + max_charge:int, +): + return psm_df.groupby(["sequence","mods","mod_sites"])["charge"].apply( + lambda x: get_charge_indicators(set(x), + min_charge=min_charge, max_charge=max_charge + ) + ).reset_index(drop=False).rename(columns={"charge":"charge_indicators"}) + +def get_charge_indicators( + charge_list, + min_charge:int, + max_charge:int, +): + charge_indicators = np.zeros(max_charge-min_charge+1) + for charge in charge_list: + if charge <= max_charge and charge >= min_charge: + charge_indicators[charge-min_charge] = 1.0 + return charge_indicators \ No newline at end of file From d9b5ad2251ab3e4aaaac04a8726248efad768311 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 29 Jan 2024 15:46:37 +0100 Subject: [PATCH 02/18] test nbs for charge pred --- nbdev_nbs/model/charge.ipynb | 244 +++++++++++++++++++++++++++++++++++ peptdeep/model/charge.py | 34 +++-- 2 files changed, 264 insertions(+), 14 deletions(-) create mode 100644 nbdev_nbs/model/charge.ipynb diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb new file mode 100644 index 00000000..daadf88e --- /dev/null +++ b/nbdev_nbs/model/charge.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencecharge_indicatorsnAAcharge_probs
0ABCDE[1, 0]5[0.7503374, 0.18496446]
1FGHIJK[0, 1]6[0.29431552, 0.5896796]
2LMNOPQ[1, 1]6[0.7241462, 0.6150697]
3RSTUVWXYZ[0, 0]9[0.30657992, 0.22709145]
\n", + "
" + ], + "text/plain": [ + " sequence charge_indicators nAA charge_probs\n", + "0 ABCDE [1, 0] 5 [0.7503374, 0.18496446]\n", + "1 FGHIJK [0, 1] 6 [0.29431552, 0.5896796]\n", + "2 LMNOPQ [1, 1] 6 [0.7241462, 0.6150697]\n", + "3 RSTUVWXYZ [0, 0] 9 [0.30657992, 0.22709145]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.model.charge import *\n", + "import pandas as pd\n", + "\n", + "model = ChargeModelForAASeq(min_charge=1, max_charge=2)\n", + "\n", + "df = pd.DataFrame({\n", + " 'sequence': ['ABCDE','FGHIJK','LMNOPQ','RSTUVWXYZ'],\n", + " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", + "})\n", + "model.train(df)\n", + "model.predict(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencemodsmod_sitescharge_indicatorsnAAcharge_probs
0ABCDE[1, 0]5[0.80226785, 0.19673407]
1FGHIJK[0, 1]6[0.3100456, 0.6222909]
2LMNOPQOxidation@M2[1, 1]6[0.7553099, 0.66014636]
3RSTUVWXYZPhospho@T3[0, 0]9[0.28392678, 0.25133142]
\n", + "
" + ], + "text/plain": [ + " sequence mods mod_sites charge_indicators nAA \\\n", + "0 ABCDE [1, 0] 5 \n", + "1 FGHIJK [0, 1] 6 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 \n", + "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 \n", + "\n", + " charge_probs \n", + "0 [0.80226785, 0.19673407] \n", + "1 [0.3100456, 0.6222909] \n", + "2 [0.7553099, 0.66014636] \n", + "3 [0.28392678, 0.25133142] " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peptdeep.model.charge import *\n", + "import pandas as pd\n", + "\n", + "model = ChargeModelForModAASeq(min_charge=1, max_charge=2)\n", + "\n", + "df = pd.DataFrame({\n", + " 'sequence': ['ABCDE','FGHIJK','LMNOPQ','RSTUVWXYZ'],\n", + " 'mods': ['', '', 'Oxidation@M', 'Phospho@T'],\n", + " 'mod_sites': ['', '', '2', '3'],\n", + " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", + "})\n", + "model.train(df)\n", + "model.predict(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index b57c6266..5c0fc631 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -8,14 +8,17 @@ ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, Model_for_Generic_ModAASeq_BinaryClassification_Transformer, ) - -class ChargeModelForAASeq( - ModelInterface_for_Generic_AASeq_MultiLabelClassification + +class ChargeModelForModAASeq( + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification ): - def __init__(self, min_charge:int, max_charge:int): + """ + ModelInterface for charge prediction for modified peptides + """ + def __init__(self, min_charge:int=1, max_charge:int=6): super().__init__( num_target_values=max_charge-min_charge+1, - model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer, + model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, nlayers=4, hidden_dim=128, dropout=0.1 ) @@ -23,7 +26,9 @@ def __init__(self, min_charge:int, max_charge:int): self.target_column_to_train = "charge_indicators" self.min_charge = min_charge self.max_charge = max_charge - self.charge_range = np.arange(min_charge, max_charge+1, dtype=np.int8) + self.charge_range = np.arange( + min_charge, max_charge+1, dtype=np.int8 + ) def predict_charges_for_pep_df(self, pep_df:pd.DataFrame, @@ -39,14 +44,17 @@ def predict_charges_for_pep_df(self, df.drop(columns="charge_probs", inplace=True) df["charge"] = df.charge.astype(np.int8) return df - -class ChargeModelForModAASeq( - ModelInterface_for_Generic_ModAASeq_MultiLabelClassification + +class ChargeModelForAASeq( + ModelInterface_for_Generic_AASeq_MultiLabelClassification ): - def __init__(self, min_charge:int, max_charge:int): + """ + ModelInterface for charge prediction for amino acid sequence + """ + def __init__(self, min_charge:int=1, max_charge:int=6): super().__init__( num_target_values=max_charge-min_charge+1, - model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, + model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer, nlayers=4, hidden_dim=128, dropout=0.1 ) @@ -54,9 +62,7 @@ def __init__(self, min_charge:int, max_charge:int): self.target_column_to_train = "charge_indicators" self.min_charge = min_charge self.max_charge = max_charge - self.charge_range = np.arange( - min_charge, max_charge+1, dtype=np.int8 - ) + self.charge_range = np.arange(min_charge, max_charge+1, dtype=np.int8) def predict_charges_for_pep_df(self, pep_df:pd.DataFrame, From c0b9b0e38c3f89fb15701b796aee6af63e070acb Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 29 Jan 2024 15:56:17 +0100 Subject: [PATCH 03/18] test predict_charges_for_pep_df --- nbdev_nbs/model/charge.ipynb | 111 +++++++++++++++++++++++------------ peptdeep/model/charge.py | 4 +- 2 files changed, 74 insertions(+), 41 deletions(-) diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb index daadf88e..72b7a0cc 100644 --- a/nbdev_nbs/model/charge.ipynb +++ b/nbdev_nbs/model/charge.ipynb @@ -40,6 +40,7 @@ " charge_indicators\n", " nAA\n", " charge_probs\n", + " charge\n", " \n", " \n", " \n", @@ -48,39 +49,52 @@ " ABCDE\n", " [1, 0]\n", " 5\n", - " [0.7503374, 0.18496446]\n", + " [0.7461448, 0.2694278]\n", + " 1\n", " \n", " \n", " 1\n", " FGHIJK\n", " [0, 1]\n", " 6\n", - " [0.29431552, 0.5896796]\n", + " [0.32061976, 0.63410914]\n", + " 1\n", + " \n", + " \n", + " 1\n", + " FGHIJK\n", + " [0, 1]\n", + " 6\n", + " [0.32061976, 0.63410914]\n", + " 2\n", " \n", " \n", " 2\n", " LMNOPQ\n", " [1, 1]\n", " 6\n", - " [0.7241462, 0.6150697]\n", + " [0.6874503, 0.56040055]\n", + " 1\n", " \n", " \n", - " 3\n", - " RSTUVWXYZ\n", - " [0, 0]\n", - " 9\n", - " [0.30657992, 0.22709145]\n", + " 2\n", + " LMNOPQ\n", + " [1, 1]\n", + " 6\n", + " [0.6874503, 0.56040055]\n", + " 2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sequence charge_indicators nAA charge_probs\n", - "0 ABCDE [1, 0] 5 [0.7503374, 0.18496446]\n", - "1 FGHIJK [0, 1] 6 [0.29431552, 0.5896796]\n", - "2 LMNOPQ [1, 1] 6 [0.7241462, 0.6150697]\n", - "3 RSTUVWXYZ [0, 0] 9 [0.30657992, 0.22709145]" + " sequence charge_indicators nAA charge_probs charge\n", + "0 ABCDE [1, 0] 5 [0.7461448, 0.2694278] 1\n", + "1 FGHIJK [0, 1] 6 [0.32061976, 0.63410914] 1\n", + "1 FGHIJK [0, 1] 6 [0.32061976, 0.63410914] 2\n", + "2 LMNOPQ [1, 1] 6 [0.6874503, 0.56040055] 1\n", + "2 LMNOPQ [1, 1] 6 [0.6874503, 0.56040055] 2" ] }, "execution_count": 2, @@ -94,12 +108,13 @@ "\n", "model = ChargeModelForAASeq(min_charge=1, max_charge=2)\n", "\n", - "df = pd.DataFrame({\n", + "seq_df = pd.DataFrame({\n", " 'sequence': ['ABCDE','FGHIJK','LMNOPQ','RSTUVWXYZ'],\n", " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", "})\n", - "model.train(df)\n", - "model.predict(df)" + "model.train(seq_df)\n", + "model.predict(seq_df)\n", + "model.predict_charges_for_pep_df(seq_df, drop_probs_column=False)" ] }, { @@ -134,6 +149,7 @@ " charge_indicators\n", " nAA\n", " charge_probs\n", + " charge\n", " \n", " \n", " \n", @@ -144,7 +160,18 @@ " \n", " [1, 0]\n", " 5\n", - " [0.80226785, 0.19673407]\n", + " [0.7292267, 0.24495421]\n", + " 1\n", + " \n", + " \n", + " 1\n", + " FGHIJK\n", + " \n", + " \n", + " [0, 1]\n", + " 6\n", + " [0.30077943, 0.5916298]\n", + " 1\n", " \n", " \n", " 1\n", @@ -153,7 +180,8 @@ " \n", " [0, 1]\n", " 6\n", - " [0.3100456, 0.6222909]\n", + " [0.30077943, 0.5916298]\n", + " 2\n", " \n", " \n", " 2\n", @@ -162,33 +190,37 @@ " 2\n", " [1, 1]\n", " 6\n", - " [0.7553099, 0.66014636]\n", + " [0.7352803, 0.60003597]\n", + " 1\n", " \n", " \n", - " 3\n", - " RSTUVWXYZ\n", - " Phospho@T\n", - " 3\n", - " [0, 0]\n", - " 9\n", - " [0.28392678, 0.25133142]\n", + " 2\n", + " LMNOPQ\n", + " Oxidation@M\n", + " 2\n", + " [1, 1]\n", + " 6\n", + " [0.7352803, 0.60003597]\n", + " 2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sequence mods mod_sites charge_indicators nAA \\\n", - "0 ABCDE [1, 0] 5 \n", - "1 FGHIJK [0, 1] 6 \n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 \n", - "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 \n", + " sequence mods mod_sites charge_indicators nAA \\\n", + "0 ABCDE [1, 0] 5 \n", + "1 FGHIJK [0, 1] 6 \n", + "1 FGHIJK [0, 1] 6 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 \n", "\n", - " charge_probs \n", - "0 [0.80226785, 0.19673407] \n", - "1 [0.3100456, 0.6222909] \n", - "2 [0.7553099, 0.66014636] \n", - "3 [0.28392678, 0.25133142] " + " charge_probs charge \n", + "0 [0.7292267, 0.24495421] 1 \n", + "1 [0.30077943, 0.5916298] 1 \n", + "1 [0.30077943, 0.5916298] 2 \n", + "2 [0.7352803, 0.60003597] 1 \n", + "2 [0.7352803, 0.60003597] 2 " ] }, "execution_count": 3, @@ -202,14 +234,15 @@ "\n", "model = ChargeModelForModAASeq(min_charge=1, max_charge=2)\n", "\n", - "df = pd.DataFrame({\n", + "modseq_df = pd.DataFrame({\n", " 'sequence': ['ABCDE','FGHIJK','LMNOPQ','RSTUVWXYZ'],\n", " 'mods': ['', '', 'Oxidation@M', 'Phospho@T'],\n", " 'mod_sites': ['', '', '2', '3'],\n", " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", "})\n", - "model.train(df)\n", - "model.predict(df)" + "model.train(modseq_df)\n", + "model.predict(modseq_df)\n", + "model.predict_charges_for_pep_df(modseq_df, drop_probs_column=False)" ] }, { diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 5c0fc631..5c0bea65 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -36,7 +36,7 @@ def predict_charges_for_pep_df(self, drop_probs_column=True ): df = self.predict(pep_df) - df["charge"] = self.charge_probs.apply( + df["charge"] = df.charge_probs.apply( lambda x: self.charge_range[x>charge_prob] ) df = df.explode("charge").dropna(subset=["charge"]) @@ -70,7 +70,7 @@ def predict_charges_for_pep_df(self, drop_probs_column=True ): df = self.predict(pep_df) - df["charge"] = self.charge_probs.apply( + df["charge"] = df.charge_probs.apply( lambda x: self.charge_range[x>charge_prob] ) df = df.explode("charge").dropna(subset=["charge"]) From d912e570eeacdb45fa448e746044318ff8562205 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 29 Jan 2024 16:02:12 +0100 Subject: [PATCH 04/18] nbdev_clean --- nbdev_nbs/model/charge.ipynb | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb index 72b7a0cc..302041c7 100644 --- a/nbdev_nbs/model/charge.ipynb +++ b/nbdev_nbs/model/charge.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -97,7 +97,7 @@ "2 LMNOPQ [1, 1] 6 [0.6874503, 0.56040055] 2" ] }, - "execution_count": 2, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -223,7 +223,7 @@ "2 [0.7352803, 0.60003597] 2 " ] }, - "execution_count": 3, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -258,18 +258,6 @@ "display_name": "base", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" } }, "nbformat": 4, From 58e7e66f05698a41902897b119cdcbb0e50b554b Mon Sep 17 00:00:00 2001 From: jalew188 Date: Mon, 29 Jan 2024 22:17:25 +0100 Subject: [PATCH 05/18] add_charge for speclib prediction --- nbdev_nbs/model/charge.ipynb | 256 +++- nbdev_nbs/protein/fasta.ipynb | 1321 +++++++++--------- nbs_tests/test_fasta_lib_precursor_lib.ipynb | 109 +- peptdeep/constants/default_settings.yaml | 7 + peptdeep/model/charge.py | 91 +- peptdeep/pretrained_models.py | 16 +- peptdeep/protein/fasta.py | 16 + 7 files changed, 1007 insertions(+), 809 deletions(-) diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb index 302041c7..7755d736 100644 --- a/nbdev_nbs/model/charge.ipynb +++ b/nbdev_nbs/model/charge.ipynb @@ -15,6 +15,13 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + }, { "data": { "text/html": [ @@ -39,7 +46,7 @@ " sequence\n", " charge_indicators\n", " nAA\n", - " charge_probs\n", + " charge_prob\n", " charge\n", " \n", " \n", @@ -49,7 +56,7 @@ " ABCDE\n", " [1, 0]\n", " 5\n", - " [0.7461448, 0.2694278]\n", + " 0.739165\n", " 1\n", " \n", " \n", @@ -57,7 +64,7 @@ " FGHIJK\n", " [0, 1]\n", " 6\n", - " [0.32061976, 0.63410914]\n", + " 0.439334\n", " 1\n", " \n", " \n", @@ -65,7 +72,7 @@ " FGHIJK\n", " [0, 1]\n", " 6\n", - " [0.32061976, 0.63410914]\n", + " 0.627932\n", " 2\n", " \n", " \n", @@ -73,7 +80,7 @@ " LMNOPQ\n", " [1, 1]\n", " 6\n", - " [0.6874503, 0.56040055]\n", + " 0.628110\n", " 1\n", " \n", " \n", @@ -81,7 +88,7 @@ " LMNOPQ\n", " [1, 1]\n", " 6\n", - " [0.6874503, 0.56040055]\n", + " 0.587332\n", " 2\n", " \n", " \n", @@ -89,12 +96,12 @@ "" ], "text/plain": [ - " sequence charge_indicators nAA charge_probs charge\n", - "0 ABCDE [1, 0] 5 [0.7461448, 0.2694278] 1\n", - "1 FGHIJK [0, 1] 6 [0.32061976, 0.63410914] 1\n", - "1 FGHIJK [0, 1] 6 [0.32061976, 0.63410914] 2\n", - "2 LMNOPQ [1, 1] 6 [0.6874503, 0.56040055] 1\n", - "2 LMNOPQ [1, 1] 6 [0.6874503, 0.56040055] 2" + " sequence charge_indicators nAA charge_prob charge\n", + "0 ABCDE [1, 0] 5 0.739165 1\n", + "1 FGHIJK [0, 1] 6 0.439334 1\n", + "1 FGHIJK [0, 1] 6 0.627932 2\n", + "2 LMNOPQ [1, 1] 6 0.628110 1\n", + "2 LMNOPQ [1, 1] 6 0.587332 2" ] }, "execution_count": null, @@ -114,7 +121,7 @@ "})\n", "model.train(seq_df)\n", "model.predict(seq_df)\n", - "model.predict_charges_for_pep_df(seq_df, drop_probs_column=False)" + "model.predict_and_clip_charges(seq_df, charge_prob_cutoff=0.3)" ] }, { @@ -144,11 +151,9 @@ " \n", " \n", " sequence\n", - " mods\n", - " mod_sites\n", " charge_indicators\n", " nAA\n", - " charge_probs\n", + " charge_prob\n", " charge\n", " \n", " \n", @@ -156,21 +161,100 @@ " \n", " 0\n", " ABCDE\n", - " \n", - " \n", " [1, 0]\n", " 5\n", - " [0.7292267, 0.24495421]\n", - " 1\n", + " 0.249596\n", + " 2\n", " \n", " \n", " 1\n", " FGHIJK\n", - " \n", - " \n", " [0, 1]\n", " 6\n", - " [0.30077943, 0.5916298]\n", + " 0.627932\n", + " 2\n", + " \n", + " \n", + " 2\n", + " LMNOPQ\n", + " [1, 1]\n", + " 6\n", + " 0.587332\n", + " 2\n", + " \n", + " \n", + " 3\n", + " RSTUVWXYZ\n", + " [0, 0]\n", + " 9\n", + " 0.260932\n", + " 2\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " sequence charge_indicators nAA charge_prob charge\n", + "0 ABCDE [1, 0] 5 0.249596 2\n", + "1 FGHIJK [0, 1] 6 0.627932 2\n", + "2 LMNOPQ [1, 1] 6 0.587332 2\n", + "3 RSTUVWXYZ [0, 0] 9 0.260932 2" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_charges_as_prob(seq_df, 2, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -180,7 +264,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -190,7 +274,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -200,7 +284,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -208,19 +292,11 @@ "" ], "text/plain": [ - " sequence mods mod_sites charge_indicators nAA \\\n", - "0 ABCDE [1, 0] 5 \n", - "1 FGHIJK [0, 1] 6 \n", - "1 FGHIJK [0, 1] 6 \n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 \n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 \n", - "\n", - " charge_probs charge \n", - "0 [0.7292267, 0.24495421] 1 \n", - "1 [0.30077943, 0.5916298] 1 \n", - "1 [0.30077943, 0.5916298] 2 \n", - "2 [0.7352803, 0.60003597] 1 \n", - "2 [0.7352803, 0.60003597] 2 " + " sequence mods mod_sites charge_indicators nAA charge_prob charge\n", + "0 ABCDE [1, 0] 5 0.814867 1\n", + "1 FGHIJK [0, 1] 6 0.708186 2\n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.712738 1\n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.534221 2" ] }, "execution_count": null, @@ -242,15 +318,111 @@ "})\n", "model.train(modseq_df)\n", "model.predict(modseq_df)\n", - "model.predict_charges_for_pep_df(modseq_df, drop_probs_column=False)" + "model.predict_and_clip_charges(modseq_df, charge_prob_cutoff=0.3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
sequencemodsmod_sitescharge_indicatorsnAAcharge_probcharge
0ABCDE[1, 0]50.8148671
[0, 1]6[0.30077943, 0.5916298]0.7081862
2[1, 1]6[0.7352803, 0.60003597]0.7127381
2[1, 1]6[0.7352803, 0.60003597]0.5342212
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencemodsmod_sitescharge_indicatorsnAAcharge_probcharge
0ABCDE[1, 0]50.2249462
1FGHIJK[0, 1]60.7081862
2LMNOPQOxidation@M2[1, 1]60.5342212
3RSTUVWXYZPhospho@T3[0, 0]90.2782212
\n", + "
" + ], + "text/plain": [ + " sequence mods mod_sites charge_indicators nAA charge_prob \\\n", + "0 ABCDE [1, 0] 5 0.224946 \n", + "1 FGHIJK [0, 1] 6 0.708186 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.534221 \n", + "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 0.278221 \n", + "\n", + " charge \n", + "0 2 \n", + "1 2 \n", + "2 2 \n", + "3 2 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict_charges_as_prob(modseq_df, 2, 4)" + ] } ], "metadata": { diff --git a/nbdev_nbs/protein/fasta.ipynb b/nbdev_nbs/protein/fasta.ipynb index fb7436e2..3332879e 100644 --- a/nbdev_nbs/protein/fasta.ipynb +++ b/nbdev_nbs/protein/fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,16 +18,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], "source": [ "from peptdeep.protein.fasta import *" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -166,8 +174,8 @@ "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm \n", - "0 MABCDEK 0 0 True \\\n", + " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", + "0 MABCDEK 0 0 True \n", "1 LMNOPQR 0;1 0 False \n", "2 LMNOPQRST 0 1 False \n", "3 ABCDEKFGHIJK 0 1 True \n", @@ -189,7 +197,7 @@ "8 False 20 " ] }, - "execution_count": null, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -216,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -268,7 +276,7 @@ "1 yy gene FGHIJKLMNOPQR" ] }, - "execution_count": null, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -279,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -438,8 +446,8 @@ "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm \n", - "0 MABCDEK 0 0 True \\\n", + " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", + "0 MABCDEK 0 0 True \n", "1 LMNOPQR 0;1 0 False \n", "2 LMNOPQRST 0 1 False \n", "3 ABCDEKFGHIJK 0 1 True \n", @@ -461,7 +469,7 @@ "8 False 20 xx " ] }, - "execution_count": null, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -506,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -550,8 +558,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 7\n", " xx\n", " \n", @@ -576,8 +584,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 7\n", " xx\n", " \n", @@ -589,8 +597,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 7\n", " xx\n", " \n", @@ -667,8 +675,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 3;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;3\n", " 12\n", " xx\n", " \n", @@ -680,8 +688,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 13\n", " xx\n", " \n", @@ -706,8 +714,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 13\n", " xx\n", " \n", @@ -719,8 +727,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 13\n", " xx\n", " \n", @@ -810,8 +818,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 3;14\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 14;3\n", " 19\n", " xx\n", " \n", @@ -836,8 +844,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 3;0;14\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;14;3\n", " 19\n", " xx\n", " \n", @@ -849,8 +857,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 3;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;3\n", " 19\n", " xx\n", " \n", @@ -862,8 +870,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 20\n", " xx\n", " \n", @@ -875,8 +883,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;15\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 15;4\n", " 20\n", " xx\n", " \n", @@ -888,8 +896,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M;Oxidation@M\n", - " 4;1;15\n", + " Oxidation@M;Oxidation@M;Carbamidomethyl@C\n", + " 1;15;4\n", " 20\n", " xx\n", " \n", @@ -914,8 +922,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 20\n", " xx\n", " \n", @@ -927,8 +935,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;15\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;15;4\n", " 20\n", " xx\n", " \n", @@ -940,8 +948,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1;15\n", + " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " 0;1;15;4\n", " 20\n", " xx\n", " \n", @@ -953,8 +961,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 20\n", " xx\n", " \n", @@ -964,8 +972,8 @@ "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm \n", - "0 MABCDEK 0 0 True \\\n", + " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", + "0 MABCDEK 0 0 True \n", "1 MABCDEK 0 0 True \n", "2 MABCDEK 0 0 True \n", "3 MABCDEK 0 0 True \n", @@ -998,76 +1006,76 @@ "30 MABCDEKFGHIJKLMNOPQR 0 2 True \n", "31 MABCDEKFGHIJKLMNOPQR 0 2 True \n", "\n", - " is_prot_cterm mods \n", - "0 False Carbamidomethyl@C;Oxidation@M \\\n", + " is_prot_cterm mods \\\n", + "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "3 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M \n", "7 True \n", "8 False Carbamidomethyl@C \n", - "9 False Carbamidomethyl@C;Acetyl@Protein N-term \n", - "10 False Carbamidomethyl@C;Oxidation@M \n", + "9 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "10 False Oxidation@M;Carbamidomethyl@C \n", "11 False Carbamidomethyl@C \n", - "12 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "13 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "12 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "14 True Oxidation@M \n", "15 True \n", "16 True Acetyl@Protein N-term;Oxidation@M \n", "17 True Acetyl@Protein N-term \n", "18 True Oxidation@M \n", "19 True \n", - "20 False Carbamidomethyl@C;Oxidation@M \n", + "20 False Oxidation@M;Carbamidomethyl@C \n", "21 False Carbamidomethyl@C \n", - "22 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "23 False Carbamidomethyl@C;Acetyl@Protein N-term \n", - "24 False Carbamidomethyl@C;Oxidation@M \n", - "25 False Carbamidomethyl@C;Oxidation@M \n", - "26 False Carbamidomethyl@C;Oxidation@M;Oxidation@M \n", + "22 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "23 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "24 False Oxidation@M;Carbamidomethyl@C \n", + "25 False Oxidation@M;Carbamidomethyl@C \n", + "26 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "27 False Carbamidomethyl@C \n", - "28 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "29 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "30 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "31 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "28 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "29 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "30 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", + "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", - "0 4;1 7 xx \n", + "0 1;4 7 xx \n", "1 4 7 xx \n", - "2 4;0;1 7 xx \n", - "3 4;0 7 xx \n", + "2 0;1;4 7 xx \n", + "3 0;4 7 xx \n", "4 2 7 xx;yy gene \n", "5 7 xx;yy gene \n", "6 2 9 xx \n", "7 9 xx \n", "8 3 12 xx \n", - "9 3;0 12 xx \n", - "10 4;1 13 xx \n", + "9 0;3 12 xx \n", + "10 1;4 13 xx \n", "11 4 13 xx \n", - "12 4;0;1 13 xx \n", - "13 4;0 13 xx \n", + "12 0;1;4 13 xx \n", + "13 0;4 13 xx \n", "14 8 13 xx;yy gene \n", "15 13 xx;yy gene \n", "16 0;8 13 xx;yy gene \n", "17 0 13 xx;yy gene \n", "18 8 15 xx \n", "19 15 xx \n", - "20 3;14 19 xx \n", + "20 14;3 19 xx \n", "21 3 19 xx \n", - "22 3;0;14 19 xx \n", - "23 3;0 19 xx \n", - "24 4;1 20 xx \n", - "25 4;15 20 xx \n", - "26 4;1;15 20 xx \n", + "22 0;14;3 19 xx \n", + "23 0;3 19 xx \n", + "24 1;4 20 xx \n", + "25 15;4 20 xx \n", + "26 1;15;4 20 xx \n", "27 4 20 xx \n", - "28 4;0;1 20 xx \n", - "29 4;0;15 20 xx \n", - "30 4;0;1;15 20 xx \n", - "31 4;0 20 xx " + "28 0;1;4 20 xx \n", + "29 0;15;4 20 xx \n", + "30 0;1;15;4 20 xx \n", + "31 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1079,7 +1087,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -1125,7 +1133,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1169,8 +1177,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 7\n", " xx\n", " \n", @@ -1195,8 +1203,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 7\n", " xx\n", " \n", @@ -1208,8 +1216,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 7\n", " xx\n", " \n", @@ -1338,8 +1346,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 3;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;3\n", " 12\n", " xx\n", " \n", @@ -1351,8 +1359,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 13\n", " xx\n", " \n", @@ -1377,8 +1385,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 13\n", " xx\n", " \n", @@ -1390,8 +1398,8 @@ " 1\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 13\n", " xx\n", " \n", @@ -1533,8 +1541,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 3;14\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 14;3\n", " 19\n", " xx\n", " \n", @@ -1559,8 +1567,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 3;0;14\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;14;3\n", " 19\n", " xx\n", " \n", @@ -1572,8 +1580,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 3;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;3\n", " 19\n", " xx\n", " \n", @@ -1585,8 +1593,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 20\n", " xx\n", " \n", @@ -1598,8 +1606,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;15\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 15;4\n", " 20\n", " xx\n", " \n", @@ -1611,8 +1619,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M;Oxidation@M\n", - " 4;1;15\n", + " Oxidation@M;Oxidation@M;Carbamidomethyl@C\n", + " 1;15;4\n", " 20\n", " xx\n", " \n", @@ -1637,8 +1645,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 20\n", " xx\n", " \n", @@ -1650,8 +1658,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;15\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;15;4\n", " 20\n", " xx\n", " \n", @@ -1663,8 +1671,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1;15\n", + " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " 0;1;15;4\n", " 20\n", " xx\n", " \n", @@ -1676,8 +1684,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 20\n", " xx\n", " \n", @@ -1687,8 +1695,8 @@ "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm \n", - "0 MABCDEK 0 0 True \\\n", + " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", + "0 MABCDEK 0 0 True \n", "1 MABCDEK 0 0 True \n", "2 MABCDEK 0 0 True \n", "3 MABCDEK 0 0 True \n", @@ -1729,11 +1737,11 @@ "38 MABCDEKFGHIJKLMNOPQR 0 2 True \n", "39 MABCDEKFGHIJKLMNOPQR 0 2 True \n", "\n", - " is_prot_cterm mods \n", - "0 False Carbamidomethyl@C;Oxidation@M \\\n", + " is_prot_cterm mods \\\n", + "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "3 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", "5 True \n", "6 True Oxidation@M;Phospho@S \n", @@ -1743,11 +1751,11 @@ "10 True Phospho@T \n", "11 True \n", "12 False Carbamidomethyl@C \n", - "13 False Carbamidomethyl@C;Acetyl@Protein N-term \n", - "14 False Carbamidomethyl@C;Oxidation@M \n", + "13 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "14 False Oxidation@M;Carbamidomethyl@C \n", "15 False Carbamidomethyl@C \n", - "16 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "17 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "16 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "17 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "18 True Oxidation@M \n", "19 True \n", "20 True Acetyl@Protein N-term;Oxidation@M \n", @@ -1758,24 +1766,24 @@ "25 True Phospho@S \n", "26 True Phospho@T \n", "27 True \n", - "28 False Carbamidomethyl@C;Oxidation@M \n", + "28 False Oxidation@M;Carbamidomethyl@C \n", "29 False Carbamidomethyl@C \n", - "30 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "31 False Carbamidomethyl@C;Acetyl@Protein N-term \n", - "32 False Carbamidomethyl@C;Oxidation@M \n", - "33 False Carbamidomethyl@C;Oxidation@M \n", - "34 False Carbamidomethyl@C;Oxidation@M;Oxidation@M \n", + "30 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "31 False Acetyl@Protein N-term;Carbamidomethyl@C \n", + "32 False Oxidation@M;Carbamidomethyl@C \n", + "33 False Oxidation@M;Carbamidomethyl@C \n", + "34 False Oxidation@M;Oxidation@M;Carbamidomethyl@C \n", "35 False Carbamidomethyl@C \n", - "36 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "37 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "38 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "39 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "36 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "37 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "38 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", + "39 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "\n", " mod_sites nAA proteins genes \n", - "0 4;1 7 xx \n", + "0 1;4 7 xx \n", "1 4 7 xx \n", - "2 4;0;1 7 xx \n", - "3 4;0 7 xx \n", + "2 0;1;4 7 xx \n", + "3 0;4 7 xx \n", "4 2 7 xx;yy gene \n", "5 7 xx;yy gene \n", "6 2;8 9 xx \n", @@ -1785,11 +1793,11 @@ "10 9 9 xx \n", "11 9 xx \n", "12 3 12 xx \n", - "13 3;0 12 xx \n", - "14 4;1 13 xx \n", + "13 0;3 12 xx \n", + "14 1;4 13 xx \n", "15 4 13 xx \n", - "16 4;0;1 13 xx \n", - "17 4;0 13 xx \n", + "16 0;1;4 13 xx \n", + "17 0;4 13 xx \n", "18 8 13 xx;yy gene \n", "19 13 xx;yy gene \n", "20 0;8 13 xx;yy gene \n", @@ -1800,21 +1808,21 @@ "25 14 15 xx \n", "26 15 15 xx \n", "27 15 xx \n", - "28 3;14 19 xx \n", + "28 14;3 19 xx \n", "29 3 19 xx \n", - "30 3;0;14 19 xx \n", - "31 3;0 19 xx \n", - "32 4;1 20 xx \n", - "33 4;15 20 xx \n", - "34 4;1;15 20 xx \n", + "30 0;14;3 19 xx \n", + "31 0;3 19 xx \n", + "32 1;4 20 xx \n", + "33 15;4 20 xx \n", + "34 1;15;4 20 xx \n", "35 4 20 xx \n", - "36 4;0;1 20 xx \n", - "37 4;0;15 20 xx \n", - "38 4;0;1;15 20 xx \n", - "39 4;0 20 xx " + "36 0;1;4 20 xx \n", + "37 0;15;4 20 xx \n", + "38 0;1;15;4 20 xx \n", + "39 0;4 20 xx " ] }, - "execution_count": null, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1828,7 +1836,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1873,8 +1881,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 4;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;4\n", " 7\n", " xx\n", " \n", @@ -1901,8 +1909,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4\n", " 7\n", " xx\n", " \n", @@ -1915,8 +1923,8 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 4;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;4\n", " 7\n", " xx\n", " \n", @@ -1971,8 +1979,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1;7;13\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;4;7;13\n", " 20\n", " xx\n", " \n", @@ -1985,8 +1993,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;15;7;13\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;15;4;7;13\n", " 20\n", " xx\n", " \n", @@ -1999,8 +2007,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 4;0;1;15;7;13\n", + " Acetyl@Protein N-term;Oxidation@M;Oxidation@M;...\n", + " 0;1;15;4;7;13\n", " 20\n", " xx\n", " \n", @@ -2013,8 +2021,8 @@ " 2\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth...\n", - " 4;0;7;13\n", + " Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth...\n", + " 0;4;7;13\n", " 20\n", " xx\n", " \n", @@ -2026,8 +2034,8 @@ "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm \n", - "0 MABCDEK 0 0 True \\\n", + " sequence protein_idxes miss_cleavage is_prot_nterm \\\n", + "0 MABCDEK 0 0 True \n", "1 MABCDEK 0 0 True \n", "2 MABCDEK 0 0 True \n", "3 MABCDEK 0 0 True \n", @@ -2039,36 +2047,36 @@ "118 MABCDEKFGHIJKLMNOPQR 0 2 True \n", "119 MABCDEKFGHIJKLMNOPQR 0 2 True \n", "\n", - " is_prot_cterm mods \n", - "0 False Carbamidomethyl@C;Oxidation@M \\\n", + " is_prot_cterm mods \\\n", + "0 False Oxidation@M;Carbamidomethyl@C \n", "1 False Carbamidomethyl@C \n", - "2 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "3 False Carbamidomethyl@C;Acetyl@Protein N-term \n", + "2 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "3 False Acetyl@Protein N-term;Carbamidomethyl@C \n", "4 True Oxidation@M \n", ".. ... ... \n", "115 False Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t... \n", - "116 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "117 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "118 False Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... \n", - "119 False Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth... \n", + "116 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "117 False Acetyl@Protein N-term;Oxidation@M;Carbamidomet... \n", + "118 False Acetyl@Protein N-term;Oxidation@M;Oxidation@M;... \n", + "119 False Acetyl@Protein N-term;Carbamidomethyl@C;Dimeth... \n", "\n", " mod_sites nAA proteins genes labeling_channel \n", - "0 4;1 7 xx none \n", + "0 1;4 7 xx none \n", "1 4 7 xx none \n", - "2 4;0;1 7 xx none \n", - "3 4;0 7 xx none \n", + "2 0;1;4 7 xx none \n", + "3 0;4 7 xx none \n", "4 2 7 xx;yy gene none \n", ".. ... ... ... ... ... \n", "115 4;0;7;13 20 xx heavy \n", - "116 4;0;1;7;13 20 xx heavy \n", - "117 4;0;15;7;13 20 xx heavy \n", - "118 4;0;1;15;7;13 20 xx heavy \n", - "119 4;0;7;13 20 xx heavy \n", + "116 0;1;4;7;13 20 xx heavy \n", + "117 0;15;4;7;13 20 xx heavy \n", + "118 0;1;15;4;7;13 20 xx heavy \n", + "119 0;4;7;13 20 xx heavy \n", "\n", "[120 rows x 11 columns]" ] }, - "execution_count": null, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -2085,7 +2093,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -2094,7 +2102,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -2129,16 +2137,16 @@ " decoy\n", " charge\n", " ...\n", - " isotope_apex_mz\n", - " isotope_right_most_mz\n", + " i_5\n", + " mono_isotope_idx\n", " rt_pred\n", " rt_norm_pred\n", " ccs_pred\n", " mobility_pred\n", - " frag_stop_idx\n", " nce\n", " instrument\n", " frag_start_idx\n", + " frag_stop_idx\n", " \n", " \n", " \n", @@ -2155,16 +2163,16 @@ " 0\n", " 2\n", " ...\n", - " 481.739834\n", - " 482.241484\n", + " 0.001232\n", + " 0\n", " 0.021263\n", " 0.021263\n", " 318.941895\n", " 0.785035\n", - " 7\n", " 30.0\n", " Lumos\n", " 0\n", + " 7\n", " \n", " \n", " 1\n", @@ -2179,16 +2187,16 @@ " 0\n", " 2\n", " ...\n", - " 473.742377\n", - " 474.244027\n", + " 0.001173\n", + " 0\n", " 0.092409\n", " 0.092409\n", " 317.660034\n", " 0.781693\n", - " 14\n", " 30.0\n", " Lumos\n", " 7\n", + " 14\n", " \n", " \n", " 2\n", @@ -2203,16 +2211,16 @@ " 0\n", " 2\n", " ...\n", - " 487.200207\n", - " 487.701857\n", - " 0.032798\n", - " 0.032798\n", - " 329.176941\n", + " 0.001409\n", + " 0\n", + " 0.032797\n", + " 0.032797\n", + " 329.177002\n", " 0.810355\n", - " 21\n", " 30.0\n", " Lumos\n", " 14\n", + " 21\n", " \n", " \n", " 3\n", @@ -2221,22 +2229,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 2;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;2\n", " 8\n", " 0\n", " 2\n", " ...\n", - " 508.205490\n", - " 509.208790\n", + " 0.001604\n", + " 0\n", " 0.109105\n", " 0.109105\n", - " 342.048767\n", + " 342.048706\n", " 0.842529\n", - " 28\n", " 30.0\n", " Lumos\n", " 21\n", + " 28\n", " \n", " \n", " 4\n", @@ -2251,16 +2259,16 @@ " 1\n", " 2\n", " ...\n", - " 481.739834\n", - " 482.241484\n", + " 0.001232\n", + " 0\n", " 0.044289\n", " 0.044289\n", - " 321.865784\n", + " 321.865723\n", " 0.792231\n", - " 35\n", " 30.0\n", " Lumos\n", " 28\n", + " 35\n", " \n", " \n", " 5\n", @@ -2275,16 +2283,16 @@ " 1\n", " 2\n", " ...\n", - " 473.742377\n", - " 474.244027\n", + " 0.001173\n", + " 0\n", " 0.158330\n", " 0.158330\n", " 323.465607\n", " 0.795979\n", - " 42\n", " 30.0\n", " Lumos\n", " 35\n", + " 42\n", " \n", " \n", " 6\n", @@ -2299,16 +2307,16 @@ " 1\n", " 2\n", " ...\n", - " 487.200207\n", - " 487.701857\n", - " 0.016275\n", - " 0.016275\n", + " 0.001409\n", + " 0\n", + " 0.016274\n", + " 0.016274\n", " 328.831970\n", " 0.809506\n", - " 49\n", " 30.0\n", " Lumos\n", " 42\n", + " 49\n", " \n", " \n", " 7\n", @@ -2317,22 +2325,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 6;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;6\n", " 8\n", " 1\n", " 2\n", " ...\n", - " 508.205490\n", - " 509.208790\n", + " 0.001604\n", + " 0\n", " 0.119288\n", " 0.119288\n", - " 339.180786\n", + " 339.180847\n", " 0.835465\n", - " 56\n", " 30.0\n", " Lumos\n", " 49\n", + " 56\n", " \n", " \n", " 8\n", @@ -2341,22 +2349,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 3;1\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 1;3\n", " 9\n", " 0\n", " 2\n", " ...\n", - " 560.717907\n", - " 561.721207\n", + " 0.003490\n", + " 0\n", " 0.048364\n", " 0.048364\n", - " 351.815094\n", + " 351.815063\n", " 0.867675\n", - " 64\n", " 30.0\n", " Lumos\n", " 56\n", + " 64\n", " \n", " \n", " 9\n", @@ -2371,16 +2379,16 @@ " 0\n", " 2\n", " ...\n", - " 552.720450\n", - " 553.723750\n", + " 0.003395\n", + " 0\n", " 0.081848\n", " 0.081848\n", " 353.857971\n", " 0.872560\n", - " 72\n", " 30.0\n", " Lumos\n", " 64\n", + " 72\n", " \n", " \n", " 10\n", @@ -2389,22 +2397,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 3;0;1\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;1;3\n", " 9\n", " 0\n", " 2\n", " ...\n", - " 581.723190\n", - " 582.726490\n", + " 0.003824\n", + " 0\n", " 0.204708\n", " 0.204708\n", - " 362.488403\n", + " 362.488342\n", " 0.894392\n", - " 80\n", " 30.0\n", " Lumos\n", " 72\n", + " 80\n", " \n", " \n", " 11\n", @@ -2413,22 +2421,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 3;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;3\n", " 9\n", " 0\n", " 2\n", " ...\n", - " 573.725732\n", - " 574.729032\n", + " 0.003724\n", + " 0\n", " 0.279585\n", " 0.279585\n", " 361.742859\n", " 0.892406\n", - " 88\n", " 30.0\n", " Lumos\n", " 80\n", + " 88\n", " \n", " \n", " 12\n", @@ -2437,22 +2445,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Oxidation@M\n", - " 6;8\n", + " Oxidation@M;Carbamidomethyl@C\n", + " 8;6\n", " 9\n", " 1\n", " 2\n", " ...\n", - " 560.717907\n", - " 561.721207\n", + " 0.003490\n", + " 0\n", " 0.015205\n", " 0.015205\n", - " 354.745117\n", + " 354.745087\n", " 0.874901\n", - " 96\n", " 30.0\n", " Lumos\n", " 88\n", + " 96\n", " \n", " \n", " 13\n", @@ -2467,16 +2475,16 @@ " 1\n", " 2\n", " ...\n", - " 552.720450\n", - " 553.723750\n", + " 0.003395\n", + " 0\n", " 0.084342\n", " 0.084342\n", - " 355.178955\n", + " 355.178925\n", " 0.875817\n", - " 104\n", " 30.0\n", " Lumos\n", " 96\n", + " 104\n", " \n", " \n", " 14\n", @@ -2485,22 +2493,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...\n", - " 6;0;8\n", + " Acetyl@Protein N-term;Oxidation@M;Carbamidomet...\n", + " 0;8;6\n", " 9\n", " 1\n", " 2\n", " ...\n", - " 581.723190\n", - " 582.726490\n", + " 0.003824\n", + " 0\n", " 0.134268\n", " 0.134268\n", " 363.174927\n", " 0.896086\n", - " 112\n", " 30.0\n", " Lumos\n", " 104\n", + " 112\n", " \n", " \n", " 15\n", @@ -2509,22 +2517,22 @@ " 0\n", " True\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term\n", - " 6;0\n", + " Acetyl@Protein N-term;Carbamidomethyl@C\n", + " 0;6\n", " 9\n", " 1\n", " 2\n", " ...\n", - " 573.725732\n", - " 574.729032\n", + " 0.003724\n", + " 0\n", " 0.263092\n", " 0.263092\n", - " 366.395264\n", + " 366.395203\n", " 0.903884\n", - " 120\n", " 30.0\n", " Lumos\n", " 112\n", + " 120\n", " \n", " \n", " 16\n", @@ -2539,16 +2547,16 @@ " 0\n", " 2\n", " ...\n", - " 678.863889\n", - " 679.867189\n", + " 0.003411\n", + " 0\n", " 0.243750\n", " 0.243750\n", " 401.572327\n", " 0.992497\n", - " 130\n", " 30.0\n", " Lumos\n", " 120\n", + " 130\n", " \n", " \n", " 17\n", @@ -2563,16 +2571,16 @@ " 0\n", " 3\n", " ...\n", - " 452.911685\n", - " 453.580551\n", + " 0.003411\n", + " 0\n", " 0.243750\n", " 0.243750\n", " 470.914978\n", " 0.775925\n", - " 140\n", " 30.0\n", " Lumos\n", " 130\n", + " 140\n", " \n", " \n", " 18\n", @@ -2587,16 +2595,16 @@ " 0\n", " 2\n", " ...\n", - " 670.866431\n", - " 671.869731\n", + " 0.003305\n", + " 0\n", " 0.299989\n", " 0.299989\n", - " 402.828186\n", + " 402.828217\n", " 0.995481\n", - " 150\n", " 30.0\n", " Lumos\n", " 140\n", + " 150\n", " \n", " \n", " 19\n", @@ -2611,16 +2619,16 @@ " 0\n", " 3\n", " ...\n", - " 447.580046\n", - " 448.248913\n", + " 0.003305\n", + " 0\n", " 0.299989\n", " 0.299989\n", - " 474.472504\n", + " 474.472473\n", " 0.781693\n", - " 160\n", " 30.0\n", " Lumos\n", " 150\n", + " 160\n", " \n", " \n", " 20\n", @@ -2635,16 +2643,16 @@ " 0\n", " 2\n", " ...\n", - " 699.869171\n", - " 700.872471\n", + " 0.003771\n", + " 0\n", " 0.416815\n", " 0.416815\n", " 406.307281\n", " 1.004504\n", - " 170\n", " 30.0\n", " Lumos\n", " 160\n", + " 170\n", " \n", " \n", " 21\n", @@ -2659,16 +2667,16 @@ " 0\n", " 3\n", " ...\n", - " 466.915206\n", - " 467.584073\n", + " 0.003771\n", + " 0\n", " 0.416815\n", " 0.416815\n", - " 463.901062\n", + " 463.901123\n", " 0.764600\n", - " 180\n", " 30.0\n", " Lumos\n", " 170\n", + " 180\n", " \n", " \n", " 22\n", @@ -2683,16 +2691,16 @@ " 0\n", " 2\n", " ...\n", - " 691.871714\n", - " 692.875014\n", + " 0.003660\n", + " 0\n", " 0.498515\n", " 0.498515\n", - " 407.171875\n", + " 407.171814\n", " 1.006527\n", - " 190\n", " 30.0\n", " Lumos\n", " 180\n", + " 190\n", " \n", " \n", " 23\n", @@ -2707,16 +2715,16 @@ " 0\n", " 3\n", " ...\n", - " 461.583568\n", - " 462.252435\n", + " 0.003660\n", + " 0\n", " 0.498515\n", " 0.498515\n", - " 468.311951\n", + " 468.311920\n", " 0.771782\n", - " 200\n", " 30.0\n", " Lumos\n", " 190\n", + " 200\n", " \n", " \n", " 24\n", @@ -2731,16 +2739,16 @@ " 1\n", " 2\n", " ...\n", - " 678.863889\n", - " 679.867189\n", + " 0.003411\n", + " 0\n", " 0.339134\n", " 0.339134\n", - " 400.909943\n", - " 0.990860\n", - " 210\n", + " 400.909912\n", + " 0.990859\n", " 30.0\n", " Lumos\n", " 200\n", + " 210\n", " \n", " \n", " 25\n", @@ -2755,16 +2763,16 @@ " 1\n", " 3\n", " ...\n", - " 452.911685\n", - " 453.580551\n", + " 0.003411\n", + " 0\n", " 0.339134\n", " 0.339134\n", " 478.989624\n", " 0.789230\n", - " 220\n", " 30.0\n", " Lumos\n", " 210\n", + " 220\n", " \n", " \n", " 26\n", @@ -2779,16 +2787,16 @@ " 1\n", " 2\n", " ...\n", - " 670.866431\n", - " 671.869731\n", + " 0.003305\n", + " 0\n", " 0.352144\n", " 0.352144\n", - " 402.555054\n", + " 402.555023\n", " 0.994806\n", - " 230\n", " 30.0\n", " Lumos\n", " 220\n", + " 230\n", " \n", " \n", " 27\n", @@ -2803,16 +2811,16 @@ " 1\n", " 3\n", " ...\n", - " 447.580046\n", - " 448.248913\n", + " 0.003305\n", + " 0\n", " 0.352144\n", " 0.352144\n", " 482.206787\n", " 0.794435\n", - " 240\n", " 30.0\n", " Lumos\n", " 230\n", + " 240\n", " \n", " \n", " 28\n", @@ -2827,16 +2835,16 @@ " 1\n", " 2\n", " ...\n", - " 699.869171\n", - " 700.872471\n", + " 0.003771\n", + " 0\n", " 0.406691\n", " 0.406691\n", - " 414.260376\n", + " 414.260437\n", " 1.024166\n", - " 250\n", " 30.0\n", " Lumos\n", " 240\n", + " 250\n", " \n", " \n", " 29\n", @@ -2851,16 +2859,16 @@ " 1\n", " 3\n", " ...\n", - " 466.915206\n", - " 467.584073\n", + " 0.003771\n", + " 0\n", " 0.406691\n", " 0.406691\n", - " 470.269623\n", + " 470.269653\n", " 0.775096\n", - " 260\n", " 30.0\n", " Lumos\n", " 250\n", + " 260\n", " \n", " \n", " 30\n", @@ -2875,16 +2883,16 @@ " 1\n", " 2\n", " ...\n", - " 691.871714\n", - " 692.875014\n", + " 0.003660\n", + " 0\n", " 0.462864\n", " 0.462864\n", " 417.726074\n", " 1.032617\n", - " 270\n", " 30.0\n", " Lumos\n", " 260\n", + " 270\n", " \n", " \n", " 31\n", @@ -2899,16 +2907,16 @@ " 1\n", " 3\n", " ...\n", - " 461.583568\n", - " 462.252435\n", + " 0.003660\n", + " 0\n", " 0.462864\n", " 0.462864\n", - " 469.226746\n", + " 469.226685\n", " 0.773290\n", - " 280\n", " 30.0\n", " Lumos\n", " 270\n", + " 280\n", " \n", " \n", " 32\n", @@ -2923,16 +2931,16 @@ " 1\n", " 2\n", " ...\n", - " 772.903742\n", - " 773.907042\n", + " 0.004945\n", + " 0\n", " 0.277093\n", " 0.277093\n", " 421.076538\n", " 1.041983\n", - " 292\n", " 30.0\n", " Lumos\n", " 280\n", + " 292\n", " \n", " \n", " 33\n", @@ -2947,16 +2955,16 @@ " 1\n", " 3\n", " ...\n", - " 515.604920\n", - " 516.273787\n", + " 0.004945\n", + " 0\n", " 0.277093\n", " 0.277093\n", - " 490.627563\n", + " 490.627533\n", " 0.809400\n", - " 304\n", " 30.0\n", " Lumos\n", " 292\n", + " 304\n", " \n", " \n", " 34\n", @@ -2971,16 +2979,16 @@ " 1\n", " 2\n", " ...\n", - " 764.906285\n", - " 765.909585\n", + " 0.004819\n", + " 0\n", " 0.336550\n", " 0.336550\n", - " 423.214264\n", + " 423.214233\n", " 1.047176\n", - " 316\n", " 30.0\n", " Lumos\n", " 304\n", + " 316\n", " \n", " \n", " 35\n", @@ -2995,16 +3003,16 @@ " 1\n", " 3\n", " ...\n", - " 510.273282\n", - " 510.942149\n", + " 0.004819\n", + " 0\n", " 0.336550\n", " 0.336550\n", " 487.170013\n", " 0.803621\n", - " 328\n", " 30.0\n", " Lumos\n", " 316\n", + " 328\n", " \n", " \n", " 36\n", @@ -3019,16 +3027,16 @@ " 0\n", " 2\n", " ...\n", - " 772.903742\n", - " 773.907042\n", + " 0.004945\n", + " 0\n", " 0.218114\n", " 0.218114\n", " 415.696411\n", " 1.028670\n", - " 340\n", " 30.0\n", " Lumos\n", " 328\n", + " 340\n", " \n", " \n", " 37\n", @@ -3043,16 +3051,16 @@ " 0\n", " 3\n", " ...\n", - " 515.604920\n", - " 516.273787\n", + " 0.004945\n", + " 0\n", " 0.218114\n", " 0.218114\n", " 473.192200\n", " 0.780636\n", - " 352\n", " 30.0\n", " Lumos\n", " 340\n", + " 352\n", " \n", " \n", " 38\n", @@ -3067,16 +3075,16 @@ " 0\n", " 2\n", " ...\n", - " 764.906285\n", - " 765.909585\n", + " 0.004819\n", + " 0\n", " 0.252718\n", " 0.252718\n", " 416.934204\n", " 1.031637\n", - " 364\n", " 30.0\n", " Lumos\n", " 352\n", + " 364\n", " \n", " \n", " 39\n", @@ -3091,25 +3099,25 @@ " 0\n", " 3\n", " ...\n", - " 510.273282\n", - " 510.942149\n", + " 0.004819\n", + " 0\n", " 0.252718\n", " 0.252718\n", - " 477.759796\n", + " 477.759888\n", " 0.788098\n", - " 376\n", " 30.0\n", " Lumos\n", " 364\n", + " 376\n", " \n", " \n", "\n", - "

40 rows × 27 columns

\n", + "

40 rows × 26 columns

\n", "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm is_prot_cterm \n", - "0 LMNPQRST 0 1 False True \\\n", + " sequence protein_idxes miss_cleavage is_prot_nterm is_prot_cterm \\\n", + "0 LMNPQRST 0 1 False True \n", "1 LMNPQRST 0 1 False True \n", "2 ACDESTYK 0 0 True False \n", "3 ACDESTYK 0 0 True False \n", @@ -3150,23 +3158,23 @@ "38 FGHIKLMNPQRST 0 2 False True \n", "39 FGHIKLMNPQRST 0 2 False True \n", "\n", - " mods mod_sites nAA decoy \n", - "0 Oxidation@M 2 8 0 \\\n", + " mods mod_sites nAA decoy \\\n", + "0 Oxidation@M 2 8 0 \n", "1 8 0 \n", "2 Carbamidomethyl@C 2 8 0 \n", - "3 Carbamidomethyl@C;Acetyl@Protein N-term 2;0 8 0 \n", + "3 Acetyl@Protein N-term;Carbamidomethyl@C 0;2 8 0 \n", "4 Oxidation@M 6 8 1 \n", "5 8 1 \n", "6 Carbamidomethyl@C 6 8 1 \n", - "7 Carbamidomethyl@C;Acetyl@Protein N-term 6;0 8 1 \n", - "8 Carbamidomethyl@C;Oxidation@M 3;1 9 0 \n", + "7 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 8 1 \n", + "8 Oxidation@M;Carbamidomethyl@C 1;3 9 0 \n", "9 Carbamidomethyl@C 3 9 0 \n", - "10 Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... 3;0;1 9 0 \n", - "11 Carbamidomethyl@C;Acetyl@Protein N-term 3;0 9 0 \n", - "12 Carbamidomethyl@C;Oxidation@M 6;8 9 1 \n", + "10 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;1;3 9 0 \n", + "11 Acetyl@Protein N-term;Carbamidomethyl@C 0;3 9 0 \n", + "12 Oxidation@M;Carbamidomethyl@C 8;6 9 1 \n", "13 Carbamidomethyl@C 6 9 1 \n", - "14 Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat... 6;0;8 9 1 \n", - "15 Carbamidomethyl@C;Acetyl@Protein N-term 6;0 9 1 \n", + "14 Acetyl@Protein N-term;Oxidation@M;Carbamidomet... 0;8;6 9 1 \n", + "15 Acetyl@Protein N-term;Carbamidomethyl@C 0;6 9 1 \n", "16 Oxidation@M 7 11 0 \n", "17 Oxidation@M 7 11 0 \n", "18 11 0 \n", @@ -3192,136 +3200,94 @@ "38 13 0 \n", "39 13 0 \n", "\n", - " charge ... isotope_apex_mz isotope_right_most_mz rt_pred \n", - "0 2 ... 481.739834 482.241484 0.021263 \\\n", - "1 2 ... 473.742377 474.244027 0.092409 \n", - "2 2 ... 487.200207 487.701857 0.032798 \n", - "3 2 ... 508.205490 509.208790 0.109105 \n", - "4 2 ... 481.739834 482.241484 0.044289 \n", - "5 2 ... 473.742377 474.244027 0.158330 \n", - "6 2 ... 487.200207 487.701857 0.016275 \n", - "7 2 ... 508.205490 509.208790 0.119288 \n", - "8 2 ... 560.717907 561.721207 0.048364 \n", - "9 2 ... 552.720450 553.723750 0.081848 \n", - "10 2 ... 581.723190 582.726490 0.204708 \n", - "11 2 ... 573.725732 574.729032 0.279585 \n", - "12 2 ... 560.717907 561.721207 0.015205 \n", - "13 2 ... 552.720450 553.723750 0.084342 \n", - "14 2 ... 581.723190 582.726490 0.134268 \n", - "15 2 ... 573.725732 574.729032 0.263092 \n", - "16 2 ... 678.863889 679.867189 0.243750 \n", - "17 3 ... 452.911685 453.580551 0.243750 \n", - "18 2 ... 670.866431 671.869731 0.299989 \n", - "19 3 ... 447.580046 448.248913 0.299989 \n", - "20 2 ... 699.869171 700.872471 0.416815 \n", - "21 3 ... 466.915206 467.584073 0.416815 \n", - "22 2 ... 691.871714 692.875014 0.498515 \n", - "23 3 ... 461.583568 462.252435 0.498515 \n", - "24 2 ... 678.863889 679.867189 0.339134 \n", - "25 3 ... 452.911685 453.580551 0.339134 \n", - "26 2 ... 670.866431 671.869731 0.352144 \n", - "27 3 ... 447.580046 448.248913 0.352144 \n", - "28 2 ... 699.869171 700.872471 0.406691 \n", - "29 3 ... 466.915206 467.584073 0.406691 \n", - "30 2 ... 691.871714 692.875014 0.462864 \n", - "31 3 ... 461.583568 462.252435 0.462864 \n", - "32 2 ... 772.903742 773.907042 0.277093 \n", - "33 3 ... 515.604920 516.273787 0.277093 \n", - "34 2 ... 764.906285 765.909585 0.336550 \n", - "35 3 ... 510.273282 510.942149 0.336550 \n", - "36 2 ... 772.903742 773.907042 0.218114 \n", - "37 3 ... 515.604920 516.273787 0.218114 \n", - "38 2 ... 764.906285 765.909585 0.252718 \n", - "39 3 ... 510.273282 510.942149 0.252718 \n", - "\n", - " rt_norm_pred ccs_pred mobility_pred frag_stop_idx nce instrument \n", - "0 0.021263 318.941895 0.785035 7 30.0 Lumos \\\n", - "1 0.092409 317.660034 0.781693 14 30.0 Lumos \n", - "2 0.032798 329.176941 0.810355 21 30.0 Lumos \n", - "3 0.109105 342.048767 0.842529 28 30.0 Lumos \n", - "4 0.044289 321.865784 0.792231 35 30.0 Lumos \n", - "5 0.158330 323.465607 0.795979 42 30.0 Lumos \n", - "6 0.016275 328.831970 0.809506 49 30.0 Lumos \n", - "7 0.119288 339.180786 0.835465 56 30.0 Lumos \n", - "8 0.048364 351.815094 0.867675 64 30.0 Lumos \n", - "9 0.081848 353.857971 0.872560 72 30.0 Lumos \n", - "10 0.204708 362.488403 0.894392 80 30.0 Lumos \n", - "11 0.279585 361.742859 0.892406 88 30.0 Lumos \n", - "12 0.015205 354.745117 0.874901 96 30.0 Lumos \n", - "13 0.084342 355.178955 0.875817 104 30.0 Lumos \n", - "14 0.134268 363.174927 0.896086 112 30.0 Lumos \n", - "15 0.263092 366.395264 0.903884 120 30.0 Lumos \n", - "16 0.243750 401.572327 0.992497 130 30.0 Lumos \n", - "17 0.243750 470.914978 0.775925 140 30.0 Lumos \n", - "18 0.299989 402.828186 0.995481 150 30.0 Lumos \n", - "19 0.299989 474.472504 0.781693 160 30.0 Lumos \n", - "20 0.416815 406.307281 1.004504 170 30.0 Lumos \n", - "21 0.416815 463.901062 0.764600 180 30.0 Lumos \n", - "22 0.498515 407.171875 1.006527 190 30.0 Lumos \n", - "23 0.498515 468.311951 0.771782 200 30.0 Lumos \n", - "24 0.339134 400.909943 0.990860 210 30.0 Lumos \n", - "25 0.339134 478.989624 0.789230 220 30.0 Lumos \n", - "26 0.352144 402.555054 0.994806 230 30.0 Lumos \n", - "27 0.352144 482.206787 0.794435 240 30.0 Lumos \n", - "28 0.406691 414.260376 1.024166 250 30.0 Lumos \n", - "29 0.406691 470.269623 0.775096 260 30.0 Lumos \n", - "30 0.462864 417.726074 1.032617 270 30.0 Lumos \n", - "31 0.462864 469.226746 0.773290 280 30.0 Lumos \n", - "32 0.277093 421.076538 1.041983 292 30.0 Lumos \n", - "33 0.277093 490.627563 0.809400 304 30.0 Lumos \n", - "34 0.336550 423.214264 1.047176 316 30.0 Lumos \n", - "35 0.336550 487.170013 0.803621 328 30.0 Lumos \n", - "36 0.218114 415.696411 1.028670 340 30.0 Lumos \n", - "37 0.218114 473.192200 0.780636 352 30.0 Lumos \n", - "38 0.252718 416.934204 1.031637 364 30.0 Lumos \n", - "39 0.252718 477.759796 0.788098 376 30.0 Lumos \n", + " charge ... i_5 mono_isotope_idx rt_pred rt_norm_pred \\\n", + "0 2 ... 0.001232 0 0.021263 0.021263 \n", + "1 2 ... 0.001173 0 0.092409 0.092409 \n", + "2 2 ... 0.001409 0 0.032797 0.032797 \n", + "3 2 ... 0.001604 0 0.109105 0.109105 \n", + "4 2 ... 0.001232 0 0.044289 0.044289 \n", + "5 2 ... 0.001173 0 0.158330 0.158330 \n", + "6 2 ... 0.001409 0 0.016274 0.016274 \n", + "7 2 ... 0.001604 0 0.119288 0.119288 \n", + "8 2 ... 0.003490 0 0.048364 0.048364 \n", + "9 2 ... 0.003395 0 0.081848 0.081848 \n", + "10 2 ... 0.003824 0 0.204708 0.204708 \n", + "11 2 ... 0.003724 0 0.279585 0.279585 \n", + "12 2 ... 0.003490 0 0.015205 0.015205 \n", + "13 2 ... 0.003395 0 0.084342 0.084342 \n", + "14 2 ... 0.003824 0 0.134268 0.134268 \n", + "15 2 ... 0.003724 0 0.263092 0.263092 \n", + "16 2 ... 0.003411 0 0.243750 0.243750 \n", + "17 3 ... 0.003411 0 0.243750 0.243750 \n", + "18 2 ... 0.003305 0 0.299989 0.299989 \n", + "19 3 ... 0.003305 0 0.299989 0.299989 \n", + "20 2 ... 0.003771 0 0.416815 0.416815 \n", + "21 3 ... 0.003771 0 0.416815 0.416815 \n", + "22 2 ... 0.003660 0 0.498515 0.498515 \n", + "23 3 ... 0.003660 0 0.498515 0.498515 \n", + "24 2 ... 0.003411 0 0.339134 0.339134 \n", + "25 3 ... 0.003411 0 0.339134 0.339134 \n", + "26 2 ... 0.003305 0 0.352144 0.352144 \n", + "27 3 ... 0.003305 0 0.352144 0.352144 \n", + "28 2 ... 0.003771 0 0.406691 0.406691 \n", + "29 3 ... 0.003771 0 0.406691 0.406691 \n", + "30 2 ... 0.003660 0 0.462864 0.462864 \n", + "31 3 ... 0.003660 0 0.462864 0.462864 \n", + "32 2 ... 0.004945 0 0.277093 0.277093 \n", + "33 3 ... 0.004945 0 0.277093 0.277093 \n", + "34 2 ... 0.004819 0 0.336550 0.336550 \n", + "35 3 ... 0.004819 0 0.336550 0.336550 \n", + "36 2 ... 0.004945 0 0.218114 0.218114 \n", + "37 3 ... 0.004945 0 0.218114 0.218114 \n", + "38 2 ... 0.004819 0 0.252718 0.252718 \n", + "39 3 ... 0.004819 0 0.252718 0.252718 \n", "\n", - " frag_start_idx \n", - "0 0 \n", - "1 7 \n", - "2 14 \n", - "3 21 \n", - "4 28 \n", - "5 35 \n", - "6 42 \n", - "7 49 \n", - "8 56 \n", - "9 64 \n", - "10 72 \n", - "11 80 \n", - "12 88 \n", - "13 96 \n", - "14 104 \n", - "15 112 \n", - "16 120 \n", - "17 130 \n", - "18 140 \n", - "19 150 \n", - "20 160 \n", - "21 170 \n", - "22 180 \n", - "23 190 \n", - "24 200 \n", - "25 210 \n", - "26 220 \n", - "27 230 \n", - "28 240 \n", - "29 250 \n", - "30 260 \n", - "31 270 \n", - "32 280 \n", - "33 292 \n", - "34 304 \n", - "35 316 \n", - "36 328 \n", - "37 340 \n", - "38 352 \n", - "39 364 \n", + " ccs_pred mobility_pred nce instrument frag_start_idx frag_stop_idx \n", + "0 318.941895 0.785035 30.0 Lumos 0 7 \n", + "1 317.660034 0.781693 30.0 Lumos 7 14 \n", + "2 329.177002 0.810355 30.0 Lumos 14 21 \n", + "3 342.048706 0.842529 30.0 Lumos 21 28 \n", + "4 321.865723 0.792231 30.0 Lumos 28 35 \n", + "5 323.465607 0.795979 30.0 Lumos 35 42 \n", + "6 328.831970 0.809506 30.0 Lumos 42 49 \n", + "7 339.180847 0.835465 30.0 Lumos 49 56 \n", + "8 351.815063 0.867675 30.0 Lumos 56 64 \n", + "9 353.857971 0.872560 30.0 Lumos 64 72 \n", + "10 362.488342 0.894392 30.0 Lumos 72 80 \n", + "11 361.742859 0.892406 30.0 Lumos 80 88 \n", + "12 354.745087 0.874901 30.0 Lumos 88 96 \n", + "13 355.178925 0.875817 30.0 Lumos 96 104 \n", + "14 363.174927 0.896086 30.0 Lumos 104 112 \n", + "15 366.395203 0.903884 30.0 Lumos 112 120 \n", + "16 401.572327 0.992497 30.0 Lumos 120 130 \n", + "17 470.914978 0.775925 30.0 Lumos 130 140 \n", + "18 402.828217 0.995481 30.0 Lumos 140 150 \n", + "19 474.472473 0.781693 30.0 Lumos 150 160 \n", + "20 406.307281 1.004504 30.0 Lumos 160 170 \n", + "21 463.901123 0.764600 30.0 Lumos 170 180 \n", + "22 407.171814 1.006527 30.0 Lumos 180 190 \n", + "23 468.311920 0.771782 30.0 Lumos 190 200 \n", + "24 400.909912 0.990859 30.0 Lumos 200 210 \n", + "25 478.989624 0.789230 30.0 Lumos 210 220 \n", + "26 402.555023 0.994806 30.0 Lumos 220 230 \n", + "27 482.206787 0.794435 30.0 Lumos 230 240 \n", + "28 414.260437 1.024166 30.0 Lumos 240 250 \n", + "29 470.269653 0.775096 30.0 Lumos 250 260 \n", + "30 417.726074 1.032617 30.0 Lumos 260 270 \n", + "31 469.226685 0.773290 30.0 Lumos 270 280 \n", + "32 421.076538 1.041983 30.0 Lumos 280 292 \n", + "33 490.627533 0.809400 30.0 Lumos 292 304 \n", + "34 423.214233 1.047176 30.0 Lumos 304 316 \n", + "35 487.170013 0.803621 30.0 Lumos 316 328 \n", + "36 415.696411 1.028670 30.0 Lumos 328 340 \n", + "37 473.192200 0.780636 30.0 Lumos 340 352 \n", + "38 416.934204 1.031637 30.0 Lumos 352 364 \n", + "39 477.759888 0.788098 30.0 Lumos 364 376 \n", "\n", - "[40 rows x 27 columns]" + "[40 rows x 26 columns]" ] }, - "execution_count": null, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -3352,15 +3318,15 @@ "_lib.predict_all()\n", "assert (_lib.precursor_df.decoy==1).any()\n", "assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values\n", - "assert 'isotope_apex_offset' in _lib.precursor_df.columns\n", - "assert 'isotope_apex_intensity' in _lib.precursor_df.columns\n", + "assert 'i_0' in _lib.precursor_df.columns\n", + "assert 'i_1' in _lib.precursor_df.columns\n", "assert ~_lib.precursor_df.sequence.str.contains('B').any()\n", "_lib.precursor_df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -3395,16 +3361,16 @@ " decoy\n", " charge\n", " ...\n", - " isotope_apex_mz\n", - " isotope_right_most_mz\n", + " i_5\n", + " mono_isotope_idx\n", " rt_pred\n", " rt_norm_pred\n", " ccs_pred\n", " mobility_pred\n", - " frag_stop_idx\n", " nce\n", " instrument\n", " frag_start_idx\n", + " frag_stop_idx\n", " \n", " \n", " \n", @@ -3421,64 +3387,64 @@ " 0\n", " 2\n", " ...\n", - " 495.755484\n", - " 496.257134\n", + " 0.001352\n", + " 0\n", " 0.242660\n", " 0.242660\n", - " 345.390869\n", - " 0.850475\n", - " 7\n", + " 345.390839\n", + " 0.850135\n", " 30.0\n", " Lumos\n", " 0\n", + " 7\n", " \n", " \n", " 1\n", - " YTSEDCAK\n", - " 0\n", + " LMNPQRST\n", " 0\n", - " True\n", + " 1\n", " False\n", - " Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth...\n", - " 6;0;8\n", + " True\n", + " Dimethyl:2H(6)13C(2)@Any N-term\n", + " 0\n", " 8\n", - " 1\n", + " 0\n", " 2\n", " ...\n", - " 526.243325\n", - " 526.744975\n", - " 0.106988\n", - " 0.106988\n", - " 347.019043\n", - " 0.855165\n", - " 14\n", + " 0.027430\n", + " 2\n", + " 0.063860\n", + " 0.063860\n", + " 313.133270\n", + " 0.770554\n", " 30.0\n", " Lumos\n", " 7\n", + " 14\n", " \n", " \n", " 2\n", - " YTSEDCAK\n", - " 0\n", + " LMNPQRST\n", " 0\n", - " True\n", + " 1\n", " False\n", - " Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...\n", - " 6;0;8\n", + " True\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " 2;0\n", " 8\n", - " 1\n", + " 0\n", " 2\n", " ...\n", - " 523.275878\n", - " 523.777528\n", - " 0.009153\n", - " 0.009153\n", - " 331.465332\n", - " 0.816775\n", - " 21\n", + " 0.027954\n", + " 2\n", + " 0.017637\n", + " 0.017637\n", + " 314.302277\n", + " 0.773615\n", " 30.0\n", " Lumos\n", " 14\n", + " 21\n", " \n", " \n", " 3\n", @@ -3487,22 +3453,22 @@ " 1\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term\n", - " 0\n", + " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", + " 6;0\n", " 8\n", " 1\n", " 2\n", " ...\n", - " 491.780212\n", - " 492.281862\n", - " 0.152593\n", - " 0.152593\n", - " 320.333069\n", - " 0.788686\n", - " 28\n", + " 0.027954\n", + " 2\n", + " 0.040846\n", + " 0.040846\n", + " 319.400330\n", + " 0.786163\n", " 30.0\n", " Lumos\n", " 21\n", + " 28\n", " \n", " \n", " 4\n", @@ -3511,22 +3477,22 @@ " 1\n", " False\n", " True\n", - " Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term\n", - " 6;0\n", + " Dimethyl:2H(6)13C(2)@Any N-term\n", + " 0\n", " 8\n", " 1\n", " 2\n", " ...\n", - " 499.777669\n", - " 500.279319\n", - " 0.040845\n", - " 0.040845\n", - " 319.400391\n", - " 0.786564\n", - " 35\n", + " 0.027430\n", + " 2\n", + " 0.152593\n", + " 0.152593\n", + " 320.333069\n", + " 0.788271\n", " 30.0\n", " Lumos\n", " 28\n", + " 35\n", " \n", " \n", " ...\n", @@ -3553,103 +3519,103 @@ " ...\n", " \n", " \n", - " 83\n", - " FGHIKLMNPQRST\n", + " 75\n", + " SRQPNMLKIHGFT\n", " 0\n", " 2\n", " False\n", " True\n", " Dimethyl@Any N-term;Dimethyl@K\n", - " 0;5\n", + " 0;8\n", " 13\n", - " 0\n", + " 1\n", " 2\n", " ...\n", - " 792.937585\n", - " 793.940885\n", - " 0.636318\n", - " 0.636318\n", - " 428.658142\n", - " 1.060983\n", - " 792\n", + " 0.005469\n", + " 0\n", + " 0.620949\n", + " 0.620949\n", + " 430.461273\n", + " 1.065108\n", " 30.0\n", " Lumos\n", - " 780\n", + " 692\n", + " 704\n", " \n", " \n", - " 84\n", - " FGHIKLMNPQRST\n", + " 76\n", + " SRQPNMLKIHGFT\n", " 0\n", " 2\n", " False\n", " True\n", - " Dimethyl@Any N-term;Dimethyl@K\n", - " 0;5\n", + " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " 6;0;8\n", " 13\n", - " 0\n", + " 1\n", " 3\n", " ...\n", - " 528.960816\n", - " 529.629682\n", - " 0.636318\n", - " 0.636318\n", - " 482.273010\n", - " 0.795796\n", - " 804\n", + " 0.005604\n", + " 0\n", + " 0.468698\n", + " 0.468698\n", + " 482.796692\n", + " 0.796481\n", " 30.0\n", " Lumos\n", - " 792\n", + " 704\n", + " 716\n", " \n", " \n", - " 85\n", - " FGHIKLMNPQRST\n", + " 77\n", + " SRQPNMLKIHGFT\n", " 0\n", " 2\n", " False\n", " True\n", - " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", - " 0;5\n", + " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", + " 6;0;8\n", " 13\n", - " 0\n", - " 3\n", + " 1\n", + " 2\n", " ...\n", - " 534.323729\n", - " 534.992596\n", - " 0.206957\n", - " 0.206957\n", - " 478.660187\n", - " 0.789903\n", - " 816\n", + " 0.005604\n", + " 0\n", + " 0.468698\n", + " 0.468698\n", + " 428.150757\n", + " 1.059489\n", " 30.0\n", " Lumos\n", - " 804\n", + " 716\n", + " 728\n", " \n", " \n", - " 86\n", - " SRQPNMLKIHGFT\n", + " 78\n", + " FGHIKLMNPQRST\n", " 0\n", " 2\n", " False\n", " True\n", - " Oxidation@M;Dimethyl@Any N-term;Dimethyl@K\n", - " 6;0;8\n", + " Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...\n", + " 0;5\n", " 13\n", - " 1\n", + " 0\n", " 2\n", " ...\n", - " 800.935042\n", - " 801.938342\n", - " 0.468698\n", - " 0.468698\n", - " 428.150787\n", - " 1.059819\n", - " 828\n", + " 0.058123\n", + " 2\n", + " 0.206957\n", + " 0.206957\n", + " 412.858307\n", + " 1.021552\n", " 30.0\n", " Lumos\n", - " 816\n", + " 728\n", + " 740\n", " \n", " \n", - " 87\n", + " 79\n", " FGHIKLMNPQRST\n", " 0\n", " 2\n", @@ -3659,94 +3625,81 @@ " 0;5\n", " 13\n", " 0\n", - " 4\n", + " 3\n", " ...\n", - " 400.994616\n", - " 401.496266\n", + " 0.058123\n", + " 2\n", " 0.206957\n", " 0.206957\n", - " 605.993408\n", - " 0.750029\n", - " 840\n", + " 478.660187\n", + " 0.789583\n", " 30.0\n", " Lumos\n", - " 828\n", + " 740\n", + " 752\n", " \n", " \n", "\n", - "

88 rows × 28 columns

\n", + "

80 rows × 27 columns

\n", "" ], "text/plain": [ - " sequence protein_idxes miss_cleavage is_prot_nterm is_prot_cterm \n", - "0 LMNPQRST 0 1 False True \\\n", - "1 YTSEDCAK 0 0 True False \n", - "2 YTSEDCAK 0 0 True False \n", + " sequence protein_idxes miss_cleavage is_prot_nterm is_prot_cterm \\\n", + "0 LMNPQRST 0 1 False True \n", + "1 LMNPQRST 0 1 False True \n", + "2 LMNPQRST 0 1 False True \n", "3 SRQPNMLT 0 1 False True \n", "4 SRQPNMLT 0 1 False True \n", ".. ... ... ... ... ... \n", - "83 FGHIKLMNPQRST 0 2 False True \n", - "84 FGHIKLMNPQRST 0 2 False True \n", - "85 FGHIKLMNPQRST 0 2 False True \n", - "86 SRQPNMLKIHGFT 0 2 False True \n", - "87 FGHIKLMNPQRST 0 2 False True \n", + "75 SRQPNMLKIHGFT 0 2 False True \n", + "76 SRQPNMLKIHGFT 0 2 False True \n", + "77 SRQPNMLKIHGFT 0 2 False True \n", + "78 FGHIKLMNPQRST 0 2 False True \n", + "79 FGHIKLMNPQRST 0 2 False True \n", "\n", - " mods mod_sites nAA decoy \n", - "0 Oxidation@M;Dimethyl@Any N-term 2;0 8 0 \\\n", - "1 Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth... 6;0;8 8 1 \n", - "2 Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t... 6;0;8 8 1 \n", - "3 Dimethyl:2H(6)13C(2)@Any N-term 0 8 1 \n", - "4 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 6;0 8 1 \n", + " mods mod_sites nAA decoy \\\n", + "0 Oxidation@M;Dimethyl@Any N-term 2;0 8 0 \n", + "1 Dimethyl:2H(6)13C(2)@Any N-term 0 8 0 \n", + "2 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 2;0 8 0 \n", + "3 Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term 6;0 8 1 \n", + "4 Dimethyl:2H(6)13C(2)@Any N-term 0 8 1 \n", ".. ... ... ... ... \n", - "83 Dimethyl@Any N-term;Dimethyl@K 0;5 13 0 \n", - "84 Dimethyl@Any N-term;Dimethyl@K 0;5 13 0 \n", - "85 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", - "86 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", - "87 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", + "75 Dimethyl@Any N-term;Dimethyl@K 0;8 13 1 \n", + "76 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", + "77 Oxidation@M;Dimethyl@Any N-term;Dimethyl@K 6;0;8 13 1 \n", + "78 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", + "79 Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)... 0;5 13 0 \n", "\n", - " charge ... isotope_apex_mz isotope_right_most_mz rt_pred \n", - "0 2 ... 495.755484 496.257134 0.242660 \\\n", - "1 2 ... 526.243325 526.744975 0.106988 \n", - "2 2 ... 523.275878 523.777528 0.009153 \n", - "3 2 ... 491.780212 492.281862 0.152593 \n", - "4 2 ... 499.777669 500.279319 0.040845 \n", - ".. ... ... ... ... ... \n", - "83 2 ... 792.937585 793.940885 0.636318 \n", - "84 3 ... 528.960816 529.629682 0.636318 \n", - "85 3 ... 534.323729 534.992596 0.206957 \n", - "86 2 ... 800.935042 801.938342 0.468698 \n", - "87 4 ... 400.994616 401.496266 0.206957 \n", + " charge ... i_5 mono_isotope_idx rt_pred rt_norm_pred \\\n", + "0 2 ... 0.001352 0 0.242660 0.242660 \n", + "1 2 ... 0.027430 2 0.063860 0.063860 \n", + "2 2 ... 0.027954 2 0.017637 0.017637 \n", + "3 2 ... 0.027954 2 0.040846 0.040846 \n", + "4 2 ... 0.027430 2 0.152593 0.152593 \n", + ".. ... ... ... ... ... ... \n", + "75 2 ... 0.005469 0 0.620949 0.620949 \n", + "76 3 ... 0.005604 0 0.468698 0.468698 \n", + "77 2 ... 0.005604 0 0.468698 0.468698 \n", + "78 2 ... 0.058123 2 0.206957 0.206957 \n", + "79 3 ... 0.058123 2 0.206957 0.206957 \n", "\n", - " rt_norm_pred ccs_pred mobility_pred frag_stop_idx nce instrument \n", - "0 0.242660 345.390869 0.850475 7 30.0 Lumos \\\n", - "1 0.106988 347.019043 0.855165 14 30.0 Lumos \n", - "2 0.009153 331.465332 0.816775 21 30.0 Lumos \n", - "3 0.152593 320.333069 0.788686 28 30.0 Lumos \n", - "4 0.040845 319.400391 0.786564 35 30.0 Lumos \n", - ".. ... ... ... ... ... ... \n", - "83 0.636318 428.658142 1.060983 792 30.0 Lumos \n", - "84 0.636318 482.273010 0.795796 804 30.0 Lumos \n", - "85 0.206957 478.660187 0.789903 816 30.0 Lumos \n", - "86 0.468698 428.150787 1.059819 828 30.0 Lumos \n", - "87 0.206957 605.993408 0.750029 840 30.0 Lumos \n", + " ccs_pred mobility_pred nce instrument frag_start_idx frag_stop_idx \n", + "0 345.390839 0.850135 30.0 Lumos 0 7 \n", + "1 313.133270 0.770554 30.0 Lumos 7 14 \n", + "2 314.302277 0.773615 30.0 Lumos 14 21 \n", + "3 319.400330 0.786163 30.0 Lumos 21 28 \n", + "4 320.333069 0.788271 30.0 Lumos 28 35 \n", + ".. ... ... ... ... ... ... \n", + "75 430.461273 1.065108 30.0 Lumos 692 704 \n", + "76 482.796692 0.796481 30.0 Lumos 704 716 \n", + "77 428.150757 1.059489 30.0 Lumos 716 728 \n", + "78 412.858307 1.021552 30.0 Lumos 728 740 \n", + "79 478.660187 0.789583 30.0 Lumos 740 752 \n", "\n", - " frag_start_idx \n", - "0 0 \n", - "1 7 \n", - "2 14 \n", - "3 21 \n", - "4 28 \n", - ".. ... \n", - "83 780 \n", - "84 792 \n", - "85 804 \n", - "86 816 \n", - "87 828 \n", - "\n", - "[88 rows x 28 columns]" + "[80 rows x 27 columns]" ] }, - "execution_count": null, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3760,8 +3713,8 @@ "_lib.predict_all()\n", "assert (_lib.precursor_df.decoy==1).any()\n", "assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values\n", - "assert 'isotope_apex_offset' in _lib.precursor_df.columns\n", - "assert 'isotope_apex_intensity' in _lib.precursor_df.columns\n", + "assert 'i_0' in _lib.precursor_df.columns\n", + "assert 'i_1' in _lib.precursor_df.columns\n", "assert ~_lib.precursor_df.sequence.str.contains('B').any()\n", "_lib.precursor_df" ] @@ -3779,6 +3732,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" } }, "nbformat": 4, diff --git a/nbs_tests/test_fasta_lib_precursor_lib.ipynb b/nbs_tests/test_fasta_lib_precursor_lib.ipynb index 5e8e76a0..d1ee398c 100644 --- a/nbs_tests/test_fasta_lib_precursor_lib.ipynb +++ b/nbs_tests/test_fasta_lib_precursor_lib.ipynb @@ -5,13 +5,20 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-01-24 16:54:02> WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_cbynx21w. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 640,486,821,888\n", - "2024-01-24 16:54:02> WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n", - "2024-01-24 16:54:02> \n" + "2024-01-29 22:16:22> WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_oan6nfyd. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 638,180,737,024\n", + "2024-01-29 22:16:22> WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n", + "2024-01-29 22:16:22> \n" ] } ], @@ -31,59 +38,59 @@ "name": "stdout", "output_type": "stream", "text": [ - "2024-01-24 16:54:03> [PeptDeep] Running library task ...\n", - "2024-01-24 16:54:03> Input files (precursor_table): ['precursor_table.csv']\n", - "2024-01-24 16:54:03> Platform information:\n", - "2024-01-24 16:54:03> system - Darwin\n", - "2024-01-24 16:54:03> release - 23.2.0\n", - "2024-01-24 16:54:03> version - 14.2.1\n", - "2024-01-24 16:54:03> machine - arm64\n", - "2024-01-24 16:54:03> processor - arm\n", - "2024-01-24 16:54:03> cpu count - 10\n", - "2024-01-24 16:54:03> ram - 34.6/64.0 Gb (available/total)\n", - "2024-01-24 16:54:03> \n", - "2024-01-24 16:54:03> Python information:\n", - "2024-01-24 16:54:03> alphabase - 1.2.0\n", - "2024-01-24 16:54:03> alphabase> - \n", - "2024-01-24 16:54:03> alpharaw - 0.2.0\n", - "2024-01-24 16:54:03> alpharaw> - \n", - "2024-01-24 16:54:03> biopython - 1.79\n", - "2024-01-24 16:54:03> click - 8.1.3\n", - "2024-01-24 16:54:03> lxml - 4.9.1\n", - "2024-01-24 16:54:03> numba - 0.58.1\n", - "2024-01-24 16:54:03> numpy - 1.26.3\n", - "2024-01-24 16:54:03> pandas - 2.1.4\n", - "2024-01-24 16:54:03> peptdeep - 1.1.4\n", - "2024-01-24 16:54:03> psutil - 5.9.2\n", - "2024-01-24 16:54:03> pyteomics - 4.5.6\n", - "2024-01-24 16:54:03> python - 3.9.12\n", - "2024-01-24 16:54:03> scikit-learn - 1.1.2\n", - "2024-01-24 16:54:03> streamlit - 1.30.0\n", - "2024-01-24 16:54:03> streamlit-aggrid - 0.3.3\n", - "2024-01-24 16:54:03> streamlit> - \n", - "2024-01-24 16:54:03> torch - 2.0.0\n", - "2024-01-24 16:54:03> tqdm - 4.64.0\n", - "2024-01-24 16:54:03> transformers - 4.28.1\n", - "2024-01-24 16:54:03> \n", - "2024-01-24 16:54:05> xxx/library.tsv does not exist, use default IRT_PEPTIDE_DF to translate irt\n", - "2024-01-24 16:54:05> Generating the spectral library ...\n", - "2024-01-24 16:54:05> Loaded 3 precursors.\n", - "2024-01-24 16:54:05> Predicting RT/IM/MS2 for 3 precursors ...\n", - "2024-01-24 16:54:05> Predicting RT ...\n" + "2024-01-29 22:16:23> [PeptDeep] Running library task ...\n", + "2024-01-29 22:16:23> Input files (precursor_table): ['precursor_table.csv']\n", + "2024-01-29 22:16:23> Platform information:\n", + "2024-01-29 22:16:23> system - Darwin\n", + "2024-01-29 22:16:23> release - 23.2.0\n", + "2024-01-29 22:16:23> version - 14.2.1\n", + "2024-01-29 22:16:23> machine - arm64\n", + "2024-01-29 22:16:23> processor - arm\n", + "2024-01-29 22:16:23> cpu count - 10\n", + "2024-01-29 22:16:23> ram - 35.3/64.0 Gb (available/total)\n", + "2024-01-29 22:16:23> \n", + "2024-01-29 22:16:23> Python information:\n", + "2024-01-29 22:16:23> alphabase - 1.2.0\n", + "2024-01-29 22:16:23> alphabase> - \n", + "2024-01-29 22:16:23> alpharaw - 0.2.0\n", + "2024-01-29 22:16:23> alpharaw> - \n", + "2024-01-29 22:16:23> biopython - 1.79\n", + "2024-01-29 22:16:23> click - 8.1.3\n", + "2024-01-29 22:16:23> lxml - 4.9.1\n", + "2024-01-29 22:16:23> numba - 0.58.1\n", + "2024-01-29 22:16:23> numpy - 1.26.3\n", + "2024-01-29 22:16:23> pandas - 2.1.4\n", + "2024-01-29 22:16:23> peptdeep - 1.1.4\n", + "2024-01-29 22:16:23> psutil - 5.9.2\n", + "2024-01-29 22:16:23> pyteomics - 4.5.6\n", + "2024-01-29 22:16:23> python - 3.9.12\n", + "2024-01-29 22:16:23> scikit-learn - 1.1.2\n", + "2024-01-29 22:16:23> streamlit - 1.30.0\n", + "2024-01-29 22:16:23> streamlit-aggrid - 0.3.3\n", + "2024-01-29 22:16:23> streamlit> - \n", + "2024-01-29 22:16:23> torch - 2.0.0\n", + "2024-01-29 22:16:23> tqdm - 4.64.0\n", + "2024-01-29 22:16:23> transformers - 4.28.1\n", + "2024-01-29 22:16:23> \n", + "2024-01-29 22:16:25> xxx/library.tsv does not exist, use default IRT_PEPTIDE_DF to translate irt\n", + "2024-01-29 22:16:25> Generating the spectral library ...\n", + "2024-01-29 22:16:25> Loaded 3 precursors.\n", + "2024-01-29 22:16:25> Predicting RT/IM/MS2 for 3 precursors ...\n", + "2024-01-29 22:16:25> Predicting RT ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 3/3 [00:00<00:00, 238.62it/s]" + "100%|██████████| 3/3 [00:00<00:00, 319.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-01-24 16:54:05> Predicting mobility ...\n" + "2024-01-29 22:16:25> Predicting mobility ...\n" ] }, { @@ -91,14 +98,14 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 3/3 [00:00<00:00, 393.66it/s]" + "100%|██████████| 3/3 [00:00<00:00, 387.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-01-24 16:54:05> Predicting MS2 ...\n" + "2024-01-29 22:16:25> Predicting MS2 ...\n" ] }, { @@ -106,17 +113,17 @@ "output_type": "stream", "text": [ "\n", - "100%|██████████| 3/3 [00:00<00:00, 176.88it/s]" + "100%|██████████| 3/3 [00:00<00:00, 175.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2024-01-24 16:54:05> End predicting RT/IM/MS2\n", - "2024-01-24 16:54:05> Predicting the spectral library with 3 precursors and 0.00M fragments used 0.5398 GB memory\n", - "2024-01-24 16:54:05> Saving HDF library to /Users/wenfengzeng/peptdeep/spec_libs/predict.speclib.hdf ...\n", - "2024-01-24 16:54:05> Library generated!!\n" + "2024-01-29 22:16:25> End predicting RT/IM/MS2\n", + "2024-01-29 22:16:25> Predicting the spectral library with 3 precursors and 0.00M fragments used 0.5076 GB memory\n", + "2024-01-29 22:16:25> Saving HDF library to /Users/wenfengzeng/peptdeep/spec_libs/predict.speclib.hdf ...\n", + "2024-01-29 22:16:25> Library generated!!\n" ] }, { diff --git a/peptdeep/constants/default_settings.yaml b/peptdeep/constants/default_settings.yaml index 31ed1fa1..a90c1f10 100644 --- a/peptdeep/constants/default_settings.yaml +++ b/peptdeep/constants/default_settings.yaml @@ -72,6 +72,13 @@ model_mgr: external_ms2_model: '' external_rt_model: '' external_ccs_model: '' + charge_model_type: seq + charge_model_choices: + - seq + - modseq + charge_model_file: '' + charge_prob_cutoff: 0.3 + use_predicted_charge_in_speclib: True # if True, it ignores min/max_precursor_charge in `library` instrument_group: ThermoTOF: ThermoTOF Astral: ThermoTOF diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 5c0bea65..281a0c5d 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -1,16 +1,60 @@ import pandas as pd import numpy as np - from peptdeep.model.generic_property_prediction import ( ModelInterface_for_Generic_AASeq_MultiLabelClassification, Model_for_Generic_AASeq_BinaryClassification_Transformer, ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, Model_for_Generic_ModAASeq_BinaryClassification_Transformer, ) + +class ChargeModelInterface: + def predict_charges_as_prob(self, + pep_df:pd.DataFrame, + min_precursor_charge:int, + max_precursor_charge:int, + ): + df = self.predict(pep_df.copy()) + df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) + df["charge"] = [self.charge_range[ + min_precursor_charge-self.min_predict_charge: + max_precursor_charge-self.min_predict_charge+1 + ]]*len(df) + df["charge_prob"] = df.charge_prob.apply( + lambda x: x[ + min_precursor_charge-self.min_predict_charge: + max_precursor_charge-self.min_predict_charge+1 + ] + ) + df = df.explode( + ["charge","charge_prob"], ignore_index=True + ).dropna(subset=["charge"]) + df["charge"] = df.charge.astype(np.int8) + df["charge_prob"] = df.charge_prob.astype(np.float32) + return df + + def predict_and_clip_charges(self, + pep_df:pd.DataFrame, + charge_prob_cutoff:float, + ): + df = self.predict(pep_df.copy()) + df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) + df["charge"] = df.charge_prob.apply( + lambda x: self.charge_range[x>charge_prob_cutoff] + ) + df["charge_prob"] = df.charge_prob.apply( + lambda x: x[x>charge_prob_cutoff] + ) + df = df.explode( + ["charge","charge_prob"], ignore_index=True + ).dropna(subset=["charge"]) + df["charge"] = df.charge.astype(np.int8) + df["charge_prob"] = df.charge_prob.astype(np.float32) + return df class ChargeModelForModAASeq( - ModelInterface_for_Generic_ModAASeq_MultiLabelClassification + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, + ChargeModelInterface ): """ ModelInterface for charge prediction for modified peptides @@ -24,29 +68,15 @@ def __init__(self, min_charge:int=1, max_charge:int=6): self.target_column_to_predict = "charge_probs" self.target_column_to_train = "charge_indicators" - self.min_charge = min_charge - self.max_charge = max_charge + self.min_predict_charge = min_charge + self.max_predict_charge = max_charge self.charge_range = np.arange( min_charge, max_charge+1, dtype=np.int8 ) - - def predict_charges_for_pep_df(self, - pep_df:pd.DataFrame, - charge_prob=0.3, - drop_probs_column=True - ): - df = self.predict(pep_df) - df["charge"] = df.charge_probs.apply( - lambda x: self.charge_range[x>charge_prob] - ) - df = df.explode("charge").dropna(subset=["charge"]) - if drop_probs_column: - df.drop(columns="charge_probs", inplace=True) - df["charge"] = df.charge.astype(np.int8) - return df class ChargeModelForAASeq( - ModelInterface_for_Generic_AASeq_MultiLabelClassification + ModelInterface_for_Generic_AASeq_MultiLabelClassification, + ChargeModelInterface ): """ ModelInterface for charge prediction for amino acid sequence @@ -60,24 +90,11 @@ def __init__(self, min_charge:int=1, max_charge:int=6): self.target_column_to_predict = "charge_probs" self.target_column_to_train = "charge_indicators" - self.min_charge = min_charge - self.max_charge = max_charge - self.charge_range = np.arange(min_charge, max_charge+1, dtype=np.int8) - - def predict_charges_for_pep_df(self, - pep_df:pd.DataFrame, - charge_prob=0.3, - drop_probs_column=True - ): - df = self.predict(pep_df) - df["charge"] = df.charge_probs.apply( - lambda x: self.charge_range[x>charge_prob] + self.min_predict_charge = min_charge + self.max_predict_charge = max_charge + self.charge_range = np.arange( + min_charge, max_charge+1, dtype=np.int8 ) - df = df.explode("charge").dropna(subset=["charge"]) - if drop_probs_column: - df.drop(columns="charge_probs", inplace=True) - df["charge"] = df.charge.astype(np.int8) - return df def group_psm_df_by_sequence( psm_df: pd.DataFrame, diff --git a/peptdeep/pretrained_models.py b/peptdeep/pretrained_models.py index 704ea711..d119362a 100644 --- a/peptdeep/pretrained_models.py +++ b/peptdeep/pretrained_models.py @@ -9,6 +9,7 @@ import logging import shutil import ssl +import typing from pickle import UnpicklingError import torch.multiprocessing as mp if sys.platform.lower().startswith("linux"): @@ -43,6 +44,7 @@ ) from peptdeep.model.rt import AlphaRTModel from peptdeep.model.ccs import AlphaCCSModel +from peptdeep.model.charge import ChargeModelForAASeq, ChargeModelForModAASeq from peptdeep.utils import ( uniform_sampling, evaluate_linear_regression ) @@ -299,12 +301,25 @@ def __init__(self, self.rt_model:AlphaRTModel = AlphaRTModel(device=device) self.ccs_model:AlphaCCSModel = AlphaCCSModel(device=device) self.load_installed_models() + + self.charge_model:typing.Union[ChargeModelForAASeq,ChargeModelForModAASeq] = None + self.reset_by_global_settings(reload_models=False) def reset_by_global_settings(self, reload_models=True, ): mgr_settings = global_settings['model_mgr'] + + if os.path.isfile(mgr_settings['charge_model_file']): + if mgr_settings['charge_model_type'] == 'modseq': + self.charge_model = ChargeModelForModAASeq() + else: + self.charge_model = ChargeModelForAASeq() + self.charge_model.load(mgr_settings['charge_model_file']) + self.charge_prob_cutoff = mgr_settings['charge_prob_cutoff'] + self.use_predicted_charge_in_speclib = mgr_settings['use_predicted_charge_in_speclib'] + if reload_models: self.load_installed_models(mgr_settings['model_type']) self.load_external_models( @@ -1141,4 +1156,3 @@ def refine_df(df): process_num = process_num, mp_batch_size=mp_batch_size, ) - diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index aede97b1..eaefd0c9 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -172,3 +172,19 @@ def __init__(self, else: print("Oops, `PredictSpecLibFasta.model_manager` is None, while it should not happen") self.model_manager = model_manager + + def add_charge(self): + if self.model_manager.charge_model is None: + super().add_charge() + else: + if self.model_manager.use_predicted_charge_in_speclib: + self._precursor_df = self.model_manager.charge_model.predict_and_clip_charges( + self.precursor_df, + charge_prob_cutoff=self.model_manager.charge_prob_cutoff + ) + else: + self._precursor_df = self.model_manager.charge_model.predict_charges_as_prob( + self.precursor_df, + min_precursor_charge=self.min_precursor_charge, + max_precursor_charge=self.max_precursor_charge + ) From 83bf54e65fb7ada6787f5d0edd1957827eb6364c Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 30 Jan 2024 00:13:52 +0100 Subject: [PATCH 06/18] multilabel must predict_in_order --- nbdev_nbs/model/charge.ipynb | 308 +++++++++++++++--- nbdev_nbs/protein/fasta.ipynb | 54 ++- peptdeep/model/charge.py | 14 + peptdeep/model/generic_property_prediction.py | 44 ++- 4 files changed, 311 insertions(+), 109 deletions(-) diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb index 7755d736..7522bd5c 100644 --- a/nbdev_nbs/model/charge.ipynb +++ b/nbdev_nbs/model/charge.ipynb @@ -56,39 +56,31 @@ " ABCDE\n", " [1, 0]\n", " 5\n", - " 0.739165\n", + " 0.693502\n", " 1\n", " \n", " \n", - " 1\n", - " FGHIJK\n", - " [0, 1]\n", - " 6\n", - " 0.439334\n", - " 1\n", - " \n", - " \n", - " 1\n", + " 2\n", " FGHIJK\n", " [0, 1]\n", " 6\n", - " 0.627932\n", + " 0.462107\n", " 2\n", " \n", " \n", - " 2\n", + " 3\n", " LMNOPQ\n", " [1, 1]\n", " 6\n", - " 0.628110\n", + " 0.544402\n", " 1\n", " \n", " \n", - " 2\n", + " 4\n", " LMNOPQ\n", " [1, 1]\n", " 6\n", - " 0.587332\n", + " 0.394243\n", " 2\n", " \n", " \n", @@ -97,11 +89,10 @@ ], "text/plain": [ " sequence charge_indicators nAA charge_prob charge\n", - "0 ABCDE [1, 0] 5 0.739165 1\n", - "1 FGHIJK [0, 1] 6 0.439334 1\n", - "1 FGHIJK [0, 1] 6 0.627932 2\n", - "2 LMNOPQ [1, 1] 6 0.628110 1\n", - "2 LMNOPQ [1, 1] 6 0.587332 2" + "0 ABCDE [1, 0] 5 0.693502 1\n", + "2 FGHIJK [0, 1] 6 0.462107 2\n", + "3 LMNOPQ [1, 1] 6 0.544402 1\n", + "4 LMNOPQ [1, 1] 6 0.394243 2" ] }, "execution_count": null, @@ -116,11 +107,10 @@ "model = ChargeModelForAASeq(min_charge=1, max_charge=2)\n", "\n", "seq_df = pd.DataFrame({\n", - " 'sequence': ['ABCDE','FGHIJK','LMNOPQ','RSTUVWXYZ'],\n", - " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", + " 'sequence': ['ABCDE','FGHIJK','LMNOPQ','RSTUVWXYZ','HIJKL'],\n", + " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0],[0,0]],\n", "})\n", "model.train(seq_df)\n", - "model.predict(seq_df)\n", "model.predict_and_clip_charges(seq_df, charge_prob_cutoff=0.3)" ] }, @@ -163,31 +153,39 @@ " ABCDE\n", " [1, 0]\n", " 5\n", - " 0.249596\n", + " 0.186966\n", " 2\n", " \n", " \n", " 1\n", + " HIJKL\n", + " [0, 0]\n", + " 5\n", + " 0.253555\n", + " 2\n", + " \n", + " \n", + " 2\n", " FGHIJK\n", " [0, 1]\n", " 6\n", - " 0.627932\n", + " 0.462107\n", " 2\n", " \n", " \n", - " 2\n", + " 3\n", " LMNOPQ\n", " [1, 1]\n", " 6\n", - " 0.587332\n", + " 0.394243\n", " 2\n", " \n", " \n", - " 3\n", + " 4\n", " RSTUVWXYZ\n", " [0, 0]\n", " 9\n", - " 0.260932\n", + " 0.129340\n", " 2\n", " \n", " \n", @@ -196,10 +194,11 @@ ], "text/plain": [ " sequence charge_indicators nAA charge_prob charge\n", - "0 ABCDE [1, 0] 5 0.249596 2\n", - "1 FGHIJK [0, 1] 6 0.627932 2\n", - "2 LMNOPQ [1, 1] 6 0.587332 2\n", - "3 RSTUVWXYZ [0, 0] 9 0.260932 2" + "0 ABCDE [1, 0] 5 0.186966 2\n", + "1 HIJKL [0, 0] 5 0.253555 2\n", + "2 FGHIJK [0, 1] 6 0.462107 2\n", + "3 LMNOPQ [1, 1] 6 0.394243 2\n", + "4 RSTUVWXYZ [0, 0] 9 0.129340 2" ] }, "execution_count": null, @@ -211,6 +210,103 @@ "model.predict_charges_as_prob(seq_df, 2, 4)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencecharge_indicatorsnAAchargecharge_prob
0ABCDE[1, 0]510.693502
1HIJKL[0, 0]510.196651
2FGHIJK[0, 1]610.148395
3LMNOPQ[1, 1]610.544402
4RSTUVWXYZ[0, 0]910.132826
\n", + "
" + ], + "text/plain": [ + " sequence charge_indicators nAA charge charge_prob\n", + "0 ABCDE [1, 0] 5 1 0.693502\n", + "1 HIJKL [0, 0] 5 1 0.196651\n", + "2 FGHIJK [0, 1] 6 1 0.148395\n", + "3 LMNOPQ [1, 1] 6 1 0.544402\n", + "4 RSTUVWXYZ [0, 0] 9 1 0.132826" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seq_df['charge'] = 1\n", + "model.predict_prob_for_charge(seq_df)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -254,7 +350,7 @@ " \n", " [1, 0]\n", " 5\n", - " 0.814867\n", + " 0.794292\n", " 1\n", " \n", " \n", @@ -264,7 +360,7 @@ " \n", " [0, 1]\n", " 6\n", - " 0.708186\n", + " 0.654079\n", " 2\n", " \n", " \n", @@ -274,17 +370,17 @@ " 2\n", " [1, 1]\n", " 6\n", - " 0.712738\n", + " 0.747879\n", " 1\n", " \n", " \n", - " 2\n", + " 3\n", " LMNOPQ\n", " Oxidation@M\n", " 2\n", " [1, 1]\n", " 6\n", - " 0.534221\n", + " 0.691985\n", " 2\n", " \n", " \n", @@ -293,10 +389,10 @@ ], "text/plain": [ " sequence mods mod_sites charge_indicators nAA charge_prob charge\n", - "0 ABCDE [1, 0] 5 0.814867 1\n", - "1 FGHIJK [0, 1] 6 0.708186 2\n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.712738 1\n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.534221 2" + "0 ABCDE [1, 0] 5 0.794292 1\n", + "1 FGHIJK [0, 1] 6 0.654079 2\n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.747879 1\n", + "3 LMNOPQ Oxidation@M 2 [1, 1] 6 0.691985 2" ] }, "execution_count": null, @@ -317,7 +413,6 @@ " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", "})\n", "model.train(modseq_df)\n", - "model.predict(modseq_df)\n", "model.predict_and_clip_charges(modseq_df, charge_prob_cutoff=0.3)" ] }, @@ -364,7 +459,7 @@ " \n", " [1, 0]\n", " 5\n", - " 0.224946\n", + " 0.199697\n", " 2\n", " \n", " \n", @@ -374,7 +469,7 @@ " \n", " [0, 1]\n", " 6\n", - " 0.708186\n", + " 0.654079\n", " 2\n", " \n", " \n", @@ -384,7 +479,7 @@ " 2\n", " [1, 1]\n", " 6\n", - " 0.534221\n", + " 0.691985\n", " 2\n", " \n", " \n", @@ -394,7 +489,7 @@ " 3\n", " [0, 0]\n", " 9\n", - " 0.278221\n", + " 0.282058\n", " 2\n", " \n", " \n", @@ -403,10 +498,10 @@ ], "text/plain": [ " sequence mods mod_sites charge_indicators nAA charge_prob \\\n", - "0 ABCDE [1, 0] 5 0.224946 \n", - "1 FGHIJK [0, 1] 6 0.708186 \n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.534221 \n", - "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 0.278221 \n", + "0 ABCDE [1, 0] 5 0.199697 \n", + "1 FGHIJK [0, 1] 6 0.654079 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.691985 \n", + "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 0.282058 \n", "\n", " charge \n", "0 2 \n", @@ -423,6 +518,117 @@ "source": [ "model.predict_charges_as_prob(modseq_df, 2, 4)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencemodsmod_sitescharge_indicatorsnAAchargecharge_prob
0ABCDE[1, 0]510.794292
1FGHIJK[0, 1]610.191645
2LMNOPQOxidation@M2[1, 1]610.747879
3RSTUVWXYZPhospho@T3[0, 0]910.188159
\n", + "
" + ], + "text/plain": [ + " sequence mods mod_sites charge_indicators nAA charge \\\n", + "0 ABCDE [1, 0] 5 1 \n", + "1 FGHIJK [0, 1] 6 1 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 1 \n", + "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 1 \n", + "\n", + " charge_prob \n", + "0 0.794292 \n", + "1 0.191645 \n", + "2 0.747879 \n", + "3 0.188159 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modseq_df['charge'] = 1\n", + "model.predict_prob_for_charge(modseq_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/nbdev_nbs/protein/fasta.ipynb b/nbdev_nbs/protein/fasta.ipynb index 3332879e..1256ad16 100644 --- a/nbdev_nbs/protein/fasta.ipynb +++ b/nbdev_nbs/protein/fasta.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -197,7 +197,7 @@ "8 False 20 " ] }, - "execution_count": 3, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -276,7 +276,7 @@ "1 yy gene FGHIJKLMNOPQR" ] }, - "execution_count": 4, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -287,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -469,7 +469,7 @@ "8 False 20 xx " ] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -482,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1075,7 +1075,7 @@ "31 0;4 20 xx " ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1087,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1133,7 +1133,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1822,7 +1822,7 @@ "39 0;4 20 xx " ] }, - "execution_count": 9, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -1836,7 +1836,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -2076,7 +2076,7 @@ "[120 rows x 11 columns]" ] }, - "execution_count": 10, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -2093,7 +2093,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2102,7 +2102,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -3287,7 +3287,7 @@ "[40 rows x 26 columns]" ] }, - "execution_count": 13, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -3326,7 +3326,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -3699,7 +3699,7 @@ "[80 rows x 27 columns]" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -3732,18 +3732,6 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" } }, "nbformat": 4, diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 281a0c5d..ded7df06 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -9,6 +9,8 @@ ) class ChargeModelInterface: + def __init__(self): + raise TypeError("The abstract interface class cannot be initialized") def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, @@ -32,6 +34,18 @@ def predict_charges_as_prob(self, df["charge"] = df.charge.astype(np.int8) df["charge_prob"] = df.charge_prob.astype(np.float32) return df + + def predict_prob_for_charge(self, + precursor_df:pd.DataFrame, + ): + if "charge" not in precursor_df.columns: + raise KeyError("precursor_df must contain `charge` column") + precursor_df = self.predict(precursor_df) + precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( + lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 + ).astype(np.float32) + precursor_df.drop(columns="charge_probs", inplace=True) + return precursor_df def predict_and_clip_charges(self, pep_df:pd.DataFrame, diff --git a/peptdeep/model/generic_property_prediction.py b/peptdeep/model/generic_property_prediction.py index 3777143d..0d7b04c6 100644 --- a/peptdeep/model/generic_property_prediction.py +++ b/peptdeep/model/generic_property_prediction.py @@ -3,7 +3,7 @@ import numpy as np import peptdeep.model.building_block as building_block -from peptdeep.model.model_interface import ModelInterface +from peptdeep.model.model_interface import ModelInterface, is_precursor_sorted ASCII_NUM=128 @@ -435,6 +435,12 @@ def _get_targets_from_batch_df(self, batch_df, **kwargs): np.stack(batch_df[self.target_column_to_train].values), dtype=torch.float32 ) + + def _check_predict_in_order(self, precursor_df:pd.DataFrame): + if not is_precursor_sorted(precursor_df): + # multilabel prediction can only predict in order + precursor_df.sort_values("nAA", inplace=True) + precursor_df.reset_index(drop=True, inplace=True) def _prepare_predict_data_df(self, precursor_df, **kwargs): precursor_df[self.target_column_to_predict] = [ @@ -443,18 +449,9 @@ def _prepare_predict_data_df(self, precursor_df, **kwargs): self.predict_df = precursor_df def _set_batch_predict_data(self, batch_df, predict_values, **kwargs): - if self._predict_in_order: - self.predict_df.loc[:,self.target_column_to_predict].values[ - batch_df.index.values[0]:batch_df.index.values[-1]+1 - ] = list(predict_values) - else: - # self.predict_df.loc[ - # batch_df.index,self.target_column_to_predict - # ] = [val.tolist() for val in predict_values] - - # fail to assign list of list/ndarray by .loc, use for loop instead (slow) - for idx,val in zip(batch_df.index.values,predict_values): - self.predict_df.loc[idx,self.target_column_to_predict] = val + self.predict_df.loc[:,self.target_column_to_predict].values[ + batch_df.index.values[0]:batch_df.index.values[-1]+1 + ] = list(predict_values) class ModelInterface_for_Generic_ModAASeq_MultiLabelClassification( ModelInterface_for_Generic_ModAASeq_BinaryClassification @@ -481,6 +478,12 @@ def _get_targets_from_batch_df(self, batch_df, **kwargs): np.stack(batch_df[self.target_column_to_train].values), dtype=torch.float32 ) + + def _check_predict_in_order(self, precursor_df:pd.DataFrame): + if not is_precursor_sorted(precursor_df): + # multilabel prediction can only predict in order + precursor_df.sort_values("nAA", inplace=True) + precursor_df.reset_index(drop=True, inplace=True) def _prepare_predict_data_df(self, precursor_df, **kwargs): precursor_df[self.target_column_to_predict] = [ @@ -489,18 +492,9 @@ def _prepare_predict_data_df(self, precursor_df, **kwargs): self.predict_df = precursor_df def _set_batch_predict_data(self, batch_df, predict_values, **kwargs): - if self._predict_in_order: - self.predict_df.loc[:,self.target_column_to_predict].values[ - batch_df.index.values[0]:batch_df.index.values[-1]+1 - ] = list(predict_values) - else: - # self.predict_df.loc[ - # batch_df.index,self.target_column_to_predict - # ] = [val.tolist() for val in predict_values] - - # fail to assign list of list/ndarray by .loc, use for loop instead (slow) - for idx,val in zip(batch_df.index.values,predict_values): - self.predict_df.loc[idx,self.target_column_to_predict] = val + self.predict_df.loc[:,self.target_column_to_predict].values[ + batch_df.index.values[0]:batch_df.index.values[-1]+1 + ] = list(predict_values) # alias ModelInterface_for_Generic_AASeq_MultiTargetClassification = ModelInterface_for_Generic_AASeq_MultiLabelClassification From 830ce15de65be17a714e288036832144ccc6208d Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 30 Jan 2024 00:26:03 +0100 Subject: [PATCH 07/18] remove interface class --- peptdeep/model/charge.py | 102 +++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index ded7df06..7952d22d 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -7,10 +7,28 @@ ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, Model_for_Generic_ModAASeq_BinaryClassification_Transformer, ) + +class ChargeModelForModAASeq( + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, +): + """ + ModelInterface for charge prediction for modified peptides + """ + def __init__(self, min_charge:int=1, max_charge:int=6): + super().__init__( + num_target_values=max_charge-min_charge+1, + model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, + nlayers=4, hidden_dim=128, dropout=0.1 + ) + + self.target_column_to_predict = "charge_probs" + self.target_column_to_train = "charge_indicators" + self.min_predict_charge = min_charge + self.max_predict_charge = max_charge + self.charge_range = np.arange( + min_charge, max_charge+1, dtype=np.int8 + ) -class ChargeModelInterface: - def __init__(self): - raise TypeError("The abstract interface class cannot be initialized") def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, @@ -65,32 +83,9 @@ def predict_and_clip_charges(self, df["charge"] = df.charge.astype(np.int8) df["charge_prob"] = df.charge_prob.astype(np.float32) return df - -class ChargeModelForModAASeq( - ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, - ChargeModelInterface -): - """ - ModelInterface for charge prediction for modified peptides - """ - def __init__(self, min_charge:int=1, max_charge:int=6): - super().__init__( - num_target_values=max_charge-min_charge+1, - model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, - nlayers=4, hidden_dim=128, dropout=0.1 - ) - - self.target_column_to_predict = "charge_probs" - self.target_column_to_train = "charge_indicators" - self.min_predict_charge = min_charge - self.max_predict_charge = max_charge - self.charge_range = np.arange( - min_charge, max_charge+1, dtype=np.int8 - ) class ChargeModelForAASeq( ModelInterface_for_Generic_AASeq_MultiLabelClassification, - ChargeModelInterface ): """ ModelInterface for charge prediction for amino acid sequence @@ -110,6 +105,61 @@ def __init__(self, min_charge:int=1, max_charge:int=6): min_charge, max_charge+1, dtype=np.int8 ) + def predict_charges_as_prob(self, + pep_df:pd.DataFrame, + min_precursor_charge:int, + max_precursor_charge:int, + ): + df = self.predict(pep_df.copy()) + df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) + df["charge"] = [self.charge_range[ + min_precursor_charge-self.min_predict_charge: + max_precursor_charge-self.min_predict_charge+1 + ]]*len(df) + df["charge_prob"] = df.charge_prob.apply( + lambda x: x[ + min_precursor_charge-self.min_predict_charge: + max_precursor_charge-self.min_predict_charge+1 + ] + ) + df = df.explode( + ["charge","charge_prob"], ignore_index=True + ).dropna(subset=["charge"]) + df["charge"] = df.charge.astype(np.int8) + df["charge_prob"] = df.charge_prob.astype(np.float32) + return df + + def predict_prob_for_charge(self, + precursor_df:pd.DataFrame, + ): + if "charge" not in precursor_df.columns: + raise KeyError("precursor_df must contain `charge` column") + precursor_df = self.predict(precursor_df) + precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( + lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 + ).astype(np.float32) + precursor_df.drop(columns="charge_probs", inplace=True) + return precursor_df + + def predict_and_clip_charges(self, + pep_df:pd.DataFrame, + charge_prob_cutoff:float, + ): + df = self.predict(pep_df.copy()) + df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) + df["charge"] = df.charge_prob.apply( + lambda x: self.charge_range[x>charge_prob_cutoff] + ) + df["charge_prob"] = df.charge_prob.apply( + lambda x: x[x>charge_prob_cutoff] + ) + df = df.explode( + ["charge","charge_prob"], ignore_index=True + ).dropna(subset=["charge"]) + df["charge"] = df.charge.astype(np.int8) + df["charge_prob"] = df.charge_prob.astype(np.float32) + return df + def group_psm_df_by_sequence( psm_df: pd.DataFrame, min_charge:int, From 549ef6a19ce322ab2a25cef0493a88431a04fca2 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 30 Jan 2024 00:40:28 +0100 Subject: [PATCH 08/18] add batch_size and verbose --- peptdeep/model/charge.py | 43 ++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 7952d22d..5912a608 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -33,8 +33,14 @@ def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, max_precursor_charge:int, + batch_size=1024, + verbose=False, ): - df = self.predict(pep_df.copy()) + df = self.predict( + pep_df.copy(), + batch_size=batch_size, + verbose=verbose, + ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = [self.charge_range[ min_precursor_charge-self.min_predict_charge: @@ -55,10 +61,15 @@ def predict_charges_as_prob(self, def predict_prob_for_charge(self, precursor_df:pd.DataFrame, + batch_size=1024, + verbose=False, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict(precursor_df) + precursor_df = self.predict(precursor_df, + batch_size=batch_size, + verbose=verbose, + ) precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 ).astype(np.float32) @@ -68,8 +79,13 @@ def predict_prob_for_charge(self, def predict_and_clip_charges(self, pep_df:pd.DataFrame, charge_prob_cutoff:float, + batch_size=1024, + verbose=False, ): - df = self.predict(pep_df.copy()) + df = self.predict(pep_df.copy(), + batch_size=batch_size, + verbose=verbose, + ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( lambda x: self.charge_range[x>charge_prob_cutoff] @@ -109,8 +125,13 @@ def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, max_precursor_charge:int, + batch_size=1024, + verbose=False, ): - df = self.predict(pep_df.copy()) + df = self.predict(pep_df.copy(), + batch_size=batch_size, + verbose=verbose, + ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = [self.charge_range[ min_precursor_charge-self.min_predict_charge: @@ -131,10 +152,15 @@ def predict_charges_as_prob(self, def predict_prob_for_charge(self, precursor_df:pd.DataFrame, + batch_size=1024, + verbose=False, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict(precursor_df) + precursor_df = self.predict(precursor_df, + batch_size=batch_size, + verbose=verbose, + ) precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 ).astype(np.float32) @@ -144,8 +170,13 @@ def predict_prob_for_charge(self, def predict_and_clip_charges(self, pep_df:pd.DataFrame, charge_prob_cutoff:float, + batch_size=1024, + verbose=False, ): - df = self.predict(pep_df.copy()) + df = self.predict(pep_df.copy(), + batch_size=batch_size, + verbose=verbose, + ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( lambda x: self.charge_range[x>charge_prob_cutoff] From edfc759f7e5f516387c96a4a35af7a8ab840743d Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 30 Jan 2024 00:46:47 +0100 Subject: [PATCH 09/18] batch_size and verbose as properties --- peptdeep/model/charge.py | 55 +++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 5912a608..344f56ae 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -28,18 +28,18 @@ def __init__(self, min_charge:int=1, max_charge:int=6): self.charge_range = np.arange( min_charge, max_charge+1, dtype=np.int8 ) + self.predict_batch_size = 1024 + self.predict_verbose = False def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, max_precursor_charge:int, - batch_size=1024, - verbose=False, ): df = self.predict( pep_df.copy(), - batch_size=batch_size, - verbose=verbose, + batch_size=self.predict_batch_size, + verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = [self.charge_range[ @@ -61,14 +61,13 @@ def predict_charges_as_prob(self, def predict_prob_for_charge(self, precursor_df:pd.DataFrame, - batch_size=1024, - verbose=False, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict(precursor_df, - batch_size=batch_size, - verbose=verbose, + precursor_df = self.predict( + precursor_df, + batch_size=self.predict_batch_size, + verbose=self.predict_verbose, ) precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 @@ -79,12 +78,11 @@ def predict_prob_for_charge(self, def predict_and_clip_charges(self, pep_df:pd.DataFrame, charge_prob_cutoff:float, - batch_size=1024, - verbose=False, ): - df = self.predict(pep_df.copy(), - batch_size=batch_size, - verbose=verbose, + df = self.predict( + pep_df.copy(), + batch_size=self.predict_batch_size, + verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( @@ -120,17 +118,18 @@ def __init__(self, min_charge:int=1, max_charge:int=6): self.charge_range = np.arange( min_charge, max_charge+1, dtype=np.int8 ) + self.predict_batch_size = 1024 + self.predict_verbose = False def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, max_precursor_charge:int, - batch_size=1024, - verbose=False, ): - df = self.predict(pep_df.copy(), - batch_size=batch_size, - verbose=verbose, + df = self.predict( + pep_df.copy(), + batch_size=self.predict_batch_size, + verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = [self.charge_range[ @@ -152,14 +151,13 @@ def predict_charges_as_prob(self, def predict_prob_for_charge(self, precursor_df:pd.DataFrame, - batch_size=1024, - verbose=False, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict(precursor_df, - batch_size=batch_size, - verbose=verbose, + precursor_df = self.predict( + precursor_df, + batch_size=self.predict_batch_size, + verbose=self.predict_verbose, ) precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 @@ -170,12 +168,11 @@ def predict_prob_for_charge(self, def predict_and_clip_charges(self, pep_df:pd.DataFrame, charge_prob_cutoff:float, - batch_size=1024, - verbose=False, ): - df = self.predict(pep_df.copy(), - batch_size=batch_size, - verbose=verbose, + df = self.predict( + pep_df.copy(), + batch_size=self.predict_batch_size, + verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( From 8f9039634a9ac1da2931d113af307829adfe3932 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 30 Jan 2024 01:51:33 +0100 Subject: [PATCH 10/18] predict_mp for charges --- nbdev_nbs/model/charge.ipynb | 218 +++++++++++++++++------ peptdeep/constants/default_settings.yaml | 1 + peptdeep/model/charge.py | 20 +-- peptdeep/model/model_interface.py | 2 +- peptdeep/pretrained_models.py | 1 + peptdeep/protein/fasta.py | 1 + 6 files changed, 174 insertions(+), 69 deletions(-) diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb index 7522bd5c..176de010 100644 --- a/nbdev_nbs/model/charge.ipynb +++ b/nbdev_nbs/model/charge.ipynb @@ -22,6 +22,21 @@ "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" ] }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicting with multiprocessing ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00ABCDE\n", " [1, 0]\n", " 5\n", - " 0.693502\n", + " 0.725907\n", " 1\n", " \n", " \n", + " 1\n", + " HIJKL\n", + " [0, 0]\n", + " 5\n", + " 0.304970\n", + " 2\n", + " \n", + " \n", " 2\n", " FGHIJK\n", " [0, 1]\n", " 6\n", - " 0.462107\n", + " 0.376047\n", " 2\n", " \n", " \n", @@ -72,7 +95,7 @@ " LMNOPQ\n", " [1, 1]\n", " 6\n", - " 0.544402\n", + " 0.611807\n", " 1\n", " \n", " \n", @@ -80,7 +103,7 @@ " LMNOPQ\n", " [1, 1]\n", " 6\n", - " 0.394243\n", + " 0.434347\n", " 2\n", " \n", " \n", @@ -89,10 +112,11 @@ ], "text/plain": [ " sequence charge_indicators nAA charge_prob charge\n", - "0 ABCDE [1, 0] 5 0.693502 1\n", - "2 FGHIJK [0, 1] 6 0.462107 2\n", - "3 LMNOPQ [1, 1] 6 0.544402 1\n", - "4 LMNOPQ [1, 1] 6 0.394243 2" + "0 ABCDE [1, 0] 5 0.725907 1\n", + "1 HIJKL [0, 0] 5 0.304970 2\n", + "2 FGHIJK [0, 1] 6 0.376047 2\n", + "3 LMNOPQ [1, 1] 6 0.611807 1\n", + "4 LMNOPQ [1, 1] 6 0.434347 2" ] }, "execution_count": null, @@ -119,6 +143,21 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicting with multiprocessing ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00ABCDE\n", " [1, 0]\n", " 5\n", - " 0.186966\n", + " 0.297068\n", " 2\n", " \n", " \n", @@ -161,7 +200,7 @@ " HIJKL\n", " [0, 0]\n", " 5\n", - " 0.253555\n", + " 0.304970\n", " 2\n", " \n", " \n", @@ -169,7 +208,7 @@ " FGHIJK\n", " [0, 1]\n", " 6\n", - " 0.462107\n", + " 0.376047\n", " 2\n", " \n", " \n", @@ -177,7 +216,7 @@ " LMNOPQ\n", " [1, 1]\n", " 6\n", - " 0.394243\n", + " 0.434347\n", " 2\n", " \n", " \n", @@ -185,7 +224,7 @@ " RSTUVWXYZ\n", " [0, 0]\n", " 9\n", - " 0.129340\n", + " 0.206633\n", " 2\n", " \n", " \n", @@ -194,11 +233,11 @@ ], "text/plain": [ " sequence charge_indicators nAA charge_prob charge\n", - "0 ABCDE [1, 0] 5 0.186966 2\n", - "1 HIJKL [0, 0] 5 0.253555 2\n", - "2 FGHIJK [0, 1] 6 0.462107 2\n", - "3 LMNOPQ [1, 1] 6 0.394243 2\n", - "4 RSTUVWXYZ [0, 0] 9 0.129340 2" + "0 ABCDE [1, 0] 5 0.297068 2\n", + "1 HIJKL [0, 0] 5 0.304970 2\n", + "2 FGHIJK [0, 1] 6 0.376047 2\n", + "3 LMNOPQ [1, 1] 6 0.434347 2\n", + "4 RSTUVWXYZ [0, 0] 9 0.206633 2" ] }, "execution_count": null, @@ -215,6 +254,21 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicting with multiprocessing ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00[1, 0]\n", " 5\n", " 1\n", - " 0.693502\n", + " 0.725907\n", " \n", " \n", " 1\n", @@ -258,7 +312,7 @@ " [0, 0]\n", " 5\n", " 1\n", - " 0.196651\n", + " 0.131404\n", " \n", " \n", " 2\n", @@ -266,7 +320,7 @@ " [0, 1]\n", " 6\n", " 1\n", - " 0.148395\n", + " 0.126947\n", " \n", " \n", " 3\n", @@ -274,7 +328,7 @@ " [1, 1]\n", " 6\n", " 1\n", - " 0.544402\n", + " 0.611807\n", " \n", " \n", " 4\n", @@ -282,7 +336,7 @@ " [0, 0]\n", " 9\n", " 1\n", - " 0.132826\n", + " 0.162039\n", " \n", " \n", "\n", @@ -290,11 +344,11 @@ ], "text/plain": [ " sequence charge_indicators nAA charge charge_prob\n", - "0 ABCDE [1, 0] 5 1 0.693502\n", - "1 HIJKL [0, 0] 5 1 0.196651\n", - "2 FGHIJK [0, 1] 6 1 0.148395\n", - "3 LMNOPQ [1, 1] 6 1 0.544402\n", - "4 RSTUVWXYZ [0, 0] 9 1 0.132826" + "0 ABCDE [1, 0] 5 1 0.725907\n", + "1 HIJKL [0, 0] 5 1 0.131404\n", + "2 FGHIJK [0, 1] 6 1 0.126947\n", + "3 LMNOPQ [1, 1] 6 1 0.611807\n", + "4 RSTUVWXYZ [0, 0] 9 1 0.162039" ] }, "execution_count": null, @@ -312,6 +366,21 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicting with multiprocessing ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", " [1, 0]\n", " 5\n", - " 0.794292\n", + " 0.733565\n", " 1\n", " \n", " \n", @@ -360,27 +429,37 @@ " \n", " [0, 1]\n", " 6\n", - " 0.654079\n", - " 2\n", + " 0.350178\n", + " 1\n", " \n", " \n", " 2\n", + " FGHIJK\n", + " \n", + " \n", + " [0, 1]\n", + " 6\n", + " 0.650386\n", + " 2\n", + " \n", + " \n", + " 3\n", " LMNOPQ\n", " Oxidation@M\n", " 2\n", " [1, 1]\n", " 6\n", - " 0.747879\n", + " 0.742781\n", " 1\n", " \n", " \n", - " 3\n", + " 4\n", " LMNOPQ\n", " Oxidation@M\n", " 2\n", " [1, 1]\n", " 6\n", - " 0.691985\n", + " 0.617950\n", " 2\n", " \n", " \n", @@ -389,10 +468,11 @@ ], "text/plain": [ " sequence mods mod_sites charge_indicators nAA charge_prob charge\n", - "0 ABCDE [1, 0] 5 0.794292 1\n", - "1 FGHIJK [0, 1] 6 0.654079 2\n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.747879 1\n", - "3 LMNOPQ Oxidation@M 2 [1, 1] 6 0.691985 2" + "0 ABCDE [1, 0] 5 0.733565 1\n", + "1 FGHIJK [0, 1] 6 0.350178 1\n", + "2 FGHIJK [0, 1] 6 0.650386 2\n", + "3 LMNOPQ Oxidation@M 2 [1, 1] 6 0.742781 1\n", + "4 LMNOPQ Oxidation@M 2 [1, 1] 6 0.617950 2" ] }, "execution_count": null, @@ -421,6 +501,21 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicting with multiprocessing ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", " [1, 0]\n", " 5\n", - " 0.199697\n", + " 0.244911\n", " 2\n", " \n", " \n", @@ -469,7 +564,7 @@ " \n", " [0, 1]\n", " 6\n", - " 0.654079\n", + " 0.650386\n", " 2\n", " \n", " \n", @@ -479,7 +574,7 @@ " 2\n", " [1, 1]\n", " 6\n", - " 0.691985\n", + " 0.617950\n", " 2\n", " \n", " \n", @@ -489,7 +584,7 @@ " 3\n", " [0, 0]\n", " 9\n", - " 0.282058\n", + " 0.228520\n", " 2\n", " \n", " \n", @@ -498,10 +593,10 @@ ], "text/plain": [ " sequence mods mod_sites charge_indicators nAA charge_prob \\\n", - "0 ABCDE [1, 0] 5 0.199697 \n", - "1 FGHIJK [0, 1] 6 0.654079 \n", - "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.691985 \n", - "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 0.282058 \n", + "0 ABCDE [1, 0] 5 0.244911 \n", + "1 FGHIJK [0, 1] 6 0.650386 \n", + "2 LMNOPQ Oxidation@M 2 [1, 1] 6 0.617950 \n", + "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 0.228520 \n", "\n", " charge \n", "0 2 \n", @@ -524,6 +619,21 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicting with multiprocessing ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00[1, 0]\n", " 5\n", " 1\n", - " 0.794292\n", + " 0.733565\n", " \n", " \n", " 1\n", @@ -573,7 +683,7 @@ " [0, 1]\n", " 6\n", " 1\n", - " 0.191645\n", + " 0.350178\n", " \n", " \n", " 2\n", @@ -583,7 +693,7 @@ " [1, 1]\n", " 6\n", " 1\n", - " 0.747879\n", + " 0.742781\n", " \n", " \n", " 3\n", @@ -593,7 +703,7 @@ " [0, 0]\n", " 9\n", " 1\n", - " 0.188159\n", + " 0.294222\n", " \n", " \n", "\n", @@ -607,10 +717,10 @@ "3 RSTUVWXYZ Phospho@T 3 [0, 0] 9 1 \n", "\n", " charge_prob \n", - "0 0.794292 \n", - "1 0.191645 \n", - "2 0.747879 \n", - "3 0.188159 " + "0 0.733565 \n", + "1 0.350178 \n", + "2 0.742781 \n", + "3 0.294222 " ] }, "execution_count": null, diff --git a/peptdeep/constants/default_settings.yaml b/peptdeep/constants/default_settings.yaml index a90c1f10..a7e7845d 100644 --- a/peptdeep/constants/default_settings.yaml +++ b/peptdeep/constants/default_settings.yaml @@ -100,6 +100,7 @@ model_mgr: predict: batch_size_ms2: 512 batch_size_rt_ccs: 1024 + batch_size_charge: 1024 verbose: True multiprocessing: True transfer: diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 344f56ae..74e4871a 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -29,17 +29,15 @@ def __init__(self, min_charge:int=1, max_charge:int=6): min_charge, max_charge+1, dtype=np.int8 ) self.predict_batch_size = 1024 - self.predict_verbose = False def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, max_precursor_charge:int, ): - df = self.predict( + df = self.predict_mp( pep_df.copy(), batch_size=self.predict_batch_size, - verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = [self.charge_range[ @@ -64,10 +62,9 @@ def predict_prob_for_charge(self, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict( + precursor_df = self.predict_mp( precursor_df, batch_size=self.predict_batch_size, - verbose=self.predict_verbose, ) precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 @@ -79,10 +76,9 @@ def predict_and_clip_charges(self, pep_df:pd.DataFrame, charge_prob_cutoff:float, ): - df = self.predict( + df = self.predict_mp( pep_df.copy(), batch_size=self.predict_batch_size, - verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( @@ -119,17 +115,15 @@ def __init__(self, min_charge:int=1, max_charge:int=6): min_charge, max_charge+1, dtype=np.int8 ) self.predict_batch_size = 1024 - self.predict_verbose = False def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, max_precursor_charge:int, ): - df = self.predict( + df = self.predict_mp( pep_df.copy(), batch_size=self.predict_batch_size, - verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = [self.charge_range[ @@ -154,10 +148,9 @@ def predict_prob_for_charge(self, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict( + precursor_df = self.predict_mp( precursor_df, batch_size=self.predict_batch_size, - verbose=self.predict_verbose, ) precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 @@ -169,10 +162,9 @@ def predict_and_clip_charges(self, pep_df:pd.DataFrame, charge_prob_cutoff:float, ): - df = self.predict( + df = self.predict_mp( pep_df.copy(), batch_size=self.predict_batch_size, - verbose=self.predict_verbose, ) df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( diff --git a/peptdeep/model/model_interface.py b/peptdeep/model/model_interface.py index 9d88bc7d..f8f65b13 100644 --- a/peptdeep/model/model_interface.py +++ b/peptdeep/model/model_interface.py @@ -421,7 +421,7 @@ def predict_mp(self, return self.predict( precursor_df, batch_size=batch_size, - verbose=False, + verbose=True, **kwargs ) diff --git a/peptdeep/pretrained_models.py b/peptdeep/pretrained_models.py index d119362a..cb733d54 100644 --- a/peptdeep/pretrained_models.py +++ b/peptdeep/pretrained_models.py @@ -317,6 +317,7 @@ def reset_by_global_settings(self, else: self.charge_model = ChargeModelForAASeq() self.charge_model.load(mgr_settings['charge_model_file']) + self.charge_model.predict_batch_size = mgr_settings['predict']['batch_size_charge'] self.charge_prob_cutoff = mgr_settings['charge_prob_cutoff'] self.use_predicted_charge_in_speclib = mgr_settings['use_predicted_charge_in_speclib'] diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index eaefd0c9..b2a9ba90 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -177,6 +177,7 @@ def add_charge(self): if self.model_manager.charge_model is None: super().add_charge() else: + print("Predicting charge states ...") if self.model_manager.use_predicted_charge_in_speclib: self._precursor_df = self.model_manager.charge_model.predict_and_clip_charges( self.precursor_df, From 3e868d49313af59d2bd0786039a92cdfce0c6f1b Mon Sep 17 00:00:00 2001 From: jalew188 Date: Tue, 30 Jan 2024 10:17:05 +0100 Subject: [PATCH 11/18] _ChargeModelInterface for common methods --- peptdeep/model/charge.py | 113 ++++++++++----------------------------- 1 file changed, 27 insertions(+), 86 deletions(-) diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 74e4871a..429a2e95 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -7,29 +7,10 @@ ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, Model_for_Generic_ModAASeq_BinaryClassification_Transformer, ) - -class ChargeModelForModAASeq( - ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, -): - """ - ModelInterface for charge prediction for modified peptides - """ - def __init__(self, min_charge:int=1, max_charge:int=6): - super().__init__( - num_target_values=max_charge-min_charge+1, - model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, - nlayers=4, hidden_dim=128, dropout=0.1 - ) - - self.target_column_to_predict = "charge_probs" - self.target_column_to_train = "charge_indicators" - self.min_predict_charge = min_charge - self.max_predict_charge = max_charge - self.charge_range = np.arange( - min_charge, max_charge+1, dtype=np.int8 - ) - self.predict_batch_size = 1024 +class _ChargeModelInterface: + def __init__(self, *args, **kwargs): + raise TypeError("Interface class cannot be instantiated.") def predict_charges_as_prob(self, pep_df:pd.DataFrame, min_precursor_charge:int, @@ -93,9 +74,33 @@ def predict_and_clip_charges(self, df["charge"] = df.charge.astype(np.int8) df["charge_prob"] = df.charge_prob.astype(np.float32) return df + +class ChargeModelForModAASeq( + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, + _ChargeModelInterface +): + """ + ModelInterface for charge prediction for modified peptides + """ + def __init__(self, min_charge:int=1, max_charge:int=6): + super().__init__( + num_target_values=max_charge-min_charge+1, + model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, + nlayers=4, hidden_dim=128, dropout=0.1 + ) + + self.target_column_to_predict = "charge_probs" + self.target_column_to_train = "charge_indicators" + self.min_predict_charge = min_charge + self.max_predict_charge = max_charge + self.charge_range = np.arange( + min_charge, max_charge+1, dtype=np.int8 + ) + self.predict_batch_size = 1024 class ChargeModelForAASeq( ModelInterface_for_Generic_AASeq_MultiLabelClassification, + _ChargeModelInterface ): """ ModelInterface for charge prediction for amino acid sequence @@ -116,70 +121,6 @@ def __init__(self, min_charge:int=1, max_charge:int=6): ) self.predict_batch_size = 1024 - def predict_charges_as_prob(self, - pep_df:pd.DataFrame, - min_precursor_charge:int, - max_precursor_charge:int, - ): - df = self.predict_mp( - pep_df.copy(), - batch_size=self.predict_batch_size, - ) - df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) - df["charge"] = [self.charge_range[ - min_precursor_charge-self.min_predict_charge: - max_precursor_charge-self.min_predict_charge+1 - ]]*len(df) - df["charge_prob"] = df.charge_prob.apply( - lambda x: x[ - min_precursor_charge-self.min_predict_charge: - max_precursor_charge-self.min_predict_charge+1 - ] - ) - df = df.explode( - ["charge","charge_prob"], ignore_index=True - ).dropna(subset=["charge"]) - df["charge"] = df.charge.astype(np.int8) - df["charge_prob"] = df.charge_prob.astype(np.float32) - return df - - def predict_prob_for_charge(self, - precursor_df:pd.DataFrame, - ): - if "charge" not in precursor_df.columns: - raise KeyError("precursor_df must contain `charge` column") - precursor_df = self.predict_mp( - precursor_df, - batch_size=self.predict_batch_size, - ) - precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( - lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 - ).astype(np.float32) - precursor_df.drop(columns="charge_probs", inplace=True) - return precursor_df - - def predict_and_clip_charges(self, - pep_df:pd.DataFrame, - charge_prob_cutoff:float, - ): - df = self.predict_mp( - pep_df.copy(), - batch_size=self.predict_batch_size, - ) - df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) - df["charge"] = df.charge_prob.apply( - lambda x: self.charge_range[x>charge_prob_cutoff] - ) - df["charge_prob"] = df.charge_prob.apply( - lambda x: x[x>charge_prob_cutoff] - ) - df = df.explode( - ["charge","charge_prob"], ignore_index=True - ).dropna(subset=["charge"]) - df["charge"] = df.charge.astype(np.int8) - df["charge_prob"] = df.charge_prob.astype(np.float32) - return df - def group_psm_df_by_sequence( psm_df: pd.DataFrame, min_charge:int, From b1b673ce93cce223bb059ba6743ccad26a636afe Mon Sep 17 00:00:00 2001 From: jalew188 Date: Wed, 31 Jan 2024 10:42:03 +0100 Subject: [PATCH 12/18] CHROE: logging for MS files --- peptdeep/pipeline_api.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/peptdeep/pipeline_api.py b/peptdeep/pipeline_api.py index db91d9b4..d6849975 100644 --- a/peptdeep/pipeline_api.py +++ b/peptdeep/pipeline_api.py @@ -192,6 +192,15 @@ def match_psms()->Tuple[ ms2_file_list ) + logging.info( + f"{len(ms2_file_dict)} MS files for fragment extraction: \n" + + "\n".join([ + f" - {raw_name} : {_path}" for raw_name, _path + in ms2_file_dict.items() + ]) + + "\n" + ) + psm_df = psm_df[ psm_df.raw_name.isin(ms2_file_dict) ].reset_index(drop=True) From 00815b3b0585e7f2e671d74d1bd191c3d6095749 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Wed, 31 Jan 2024 13:49:03 +0100 Subject: [PATCH 13/18] CHROE: logging charge pred --- peptdeep/protein/fasta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index b2a9ba90..7e849ef9 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -177,7 +177,7 @@ def add_charge(self): if self.model_manager.charge_model is None: super().add_charge() else: - print("Predicting charge states ...") + print(f"Predicting charge states for {len(self.precursor_df)} peptides ...") if self.model_manager.use_predicted_charge_in_speclib: self._precursor_df = self.model_manager.charge_model.predict_and_clip_charges( self.precursor_df, From c697a69f45c2ad10cd8edaa2360d453b5925c032 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Wed, 31 Jan 2024 20:22:40 +0100 Subject: [PATCH 14/18] min/max charge for charge pred in speclib --- nbdev_nbs/model/charge.ipynb | 14 ++++++++++++-- peptdeep/model/charge.py | 7 ++++++- peptdeep/protein/fasta.py | 2 ++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/nbdev_nbs/model/charge.ipynb b/nbdev_nbs/model/charge.ipynb index 176de010..6f9fc898 100644 --- a/nbdev_nbs/model/charge.ipynb +++ b/nbdev_nbs/model/charge.ipynb @@ -135,7 +135,12 @@ " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0],[0,0]],\n", "})\n", "model.train(seq_df)\n", - "model.predict_and_clip_charges(seq_df, charge_prob_cutoff=0.3)" + "model.predict_and_clip_charges(\n", + " seq_df, \n", + " min_precursor_charge=model.min_predict_charge,\n", + " max_precursor_charge=model.max_predict_charge,\n", + " charge_prob_cutoff=0.3\n", + ")" ] }, { @@ -493,7 +498,12 @@ " 'charge_indicators': [[1,0],[0,1],[1,1],[0,0]],\n", "})\n", "model.train(modseq_df)\n", - "model.predict_and_clip_charges(modseq_df, charge_prob_cutoff=0.3)" + "model.predict_and_clip_charges(\n", + " modseq_df, \n", + " min_precursor_charge=model.min_predict_charge,\n", + " max_precursor_charge=model.max_predict_charge,\n", + " charge_prob_cutoff=0.3\n", + ")" ] }, { diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 429a2e95..ba459c73 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -55,6 +55,8 @@ def predict_prob_for_charge(self, def predict_and_clip_charges(self, pep_df:pd.DataFrame, + min_precursor_charge:int, + max_precursor_charge:int, charge_prob_cutoff:float, ): df = self.predict_mp( @@ -69,9 +71,12 @@ def predict_and_clip_charges(self, lambda x: x[x>charge_prob_cutoff] ) df = df.explode( - ["charge","charge_prob"], ignore_index=True + ["charge","charge_prob"] ).dropna(subset=["charge"]) df["charge"] = df.charge.astype(np.int8) + df = df.query( + f"charge>={min_precursor_charge} and charge<={max_precursor_charge}" + ).reset_index(drop=True) df["charge_prob"] = df.charge_prob.astype(np.float32) return df diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index 7e849ef9..49ccd270 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -181,6 +181,8 @@ def add_charge(self): if self.model_manager.use_predicted_charge_in_speclib: self._precursor_df = self.model_manager.charge_model.predict_and_clip_charges( self.precursor_df, + min_precursor_charge=self.min_precursor_charge, + max_precursor_charge=self.max_precursor_charge, charge_prob_cutoff=self.model_manager.charge_prob_cutoff ) else: From 61b6a3fd751fa791a3febca0d8adca94bfb02605 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Wed, 31 Jan 2024 22:17:53 +0100 Subject: [PATCH 15/18] irt_pep_df for irt translation --- peptdeep/model/rt.py | 21 +++++++++++++-------- peptdeep/spec_lib/predict_lib.py | 6 ++++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/peptdeep/model/rt.py b/peptdeep/model/rt.py index 9254a014..c863b125 100644 --- a/peptdeep/model/rt.py +++ b/peptdeep/model/rt.py @@ -162,18 +162,23 @@ def _get_features_from_batch_df(self, ) def add_irt_column_to_precursor_df(self, - precursor_df: pd.DataFrame + precursor_df: pd.DataFrame, + irt_pep_df:pd.DataFrame = None, ): - print(f"Predict RT for {len(IRT_PEPTIDE_DF)} iRT precursors.") - self.predict(IRT_PEPTIDE_DF) - eval_df = evaluate_linear_regression(IRT_PEPTIDE_DF, "rt_pred", y="irt") + if irt_pep_df is None: + irt_pep_df = IRT_PEPTIDE_DF + print(f"Predict RT for {len(irt_pep_df)} iRT precursors.") + self.predict(irt_pep_df) + if "irt" not in irt_pep_df.columns: + irt_pep_df["irt"] = irt_pep_df["rt"] + eval_df = evaluate_linear_regression(irt_pep_df, "rt_pred", y="irt") print("Linear regression of `rt_pred` to `irt`:") print(eval_df) # simple linear regression - # rt_pred_mean = IRT_PEPTIDE_DF.rt_pred.mean() - # irt_mean = IRT_PEPTIDE_DF.irt.mean() - # x = IRT_PEPTIDE_DF.rt_pred.values - rt_pred_mean - # y = IRT_PEPTIDE_DF.irt.values - irt_mean + # rt_pred_mean = irt_pep_df.rt_pred.mean() + # irt_mean = irt_pep_df.irt.mean() + # x = irt_pep_df.rt_pred.values - rt_pred_mean + # y = irt_pep_df.irt.values - irt_mean # slope = np.sum(x*y)/np.sum(x*x) # intercept = irt_mean - slope*rt_pred_mean # end linear regression diff --git a/peptdeep/spec_lib/predict_lib.py b/peptdeep/spec_lib/predict_lib.py index e43068ee..b136890e 100644 --- a/peptdeep/spec_lib/predict_lib.py +++ b/peptdeep/spec_lib/predict_lib.py @@ -90,9 +90,11 @@ def set_precursor_and_fragment(self, if col not in self.charged_frag_types ], inplace=True) - def translate_rt_to_irt_pred(self): + def translate_rt_to_irt_pred(self, irt_pep_df:pd.DataFrame = None): """ Add 'irt_pred' into columns based on 'rt_pred' """ - return self.model_manager.rt_model.add_irt_column_to_precursor_df(self._precursor_df) + return self.model_manager.rt_model.add_irt_column_to_precursor_df( + self._precursor_df, irt_pep_df=irt_pep_df + ) def predict_all(self, min_required_precursor_num_for_mp:int=2000, From 17a2ad7d57e470d38e2616689965fce4aba68707 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 22 Feb 2024 22:19:13 +0100 Subject: [PATCH 16/18] FIX match_psms() --- peptdeep/pipeline_api.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/peptdeep/pipeline_api.py b/peptdeep/pipeline_api.py index d6849975..46e50889 100644 --- a/peptdeep/pipeline_api.py +++ b/peptdeep/pipeline_api.py @@ -135,8 +135,9 @@ def match_psms()->Tuple[ Returns ------- - Tuple[pd.DataFrame,pd.DataFrame] - pd.DataFrame: the PSM DataFrame, and + Tuple[pd.DataFrame,pd.DataFrame,pd.DataFrame] + pd.DataFrame: the PSM DataFrame + pd.DataFrame: the fragment mz DataFrame pd.DataFrame: the matched fragment intensity DataFrame """ mgr_settings = global_settings['model_mgr'] @@ -325,7 +326,7 @@ def transfer_learn(verbose=True): dfs, frag_inten_dfs ) elif len(mgr_settings['transfer']['ms_files'])>0: - psm_df, frag_df = match_psms() + psm_df, _, frag_df = match_psms() else: psm_df = import_psm_df( mgr_settings['transfer']['psm_files'], From 516ffe99016985dcaec1e5c8985cb367d3162a37 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 22 Feb 2024 22:19:27 +0100 Subject: [PATCH 17/18] =?UTF-8?q?Bump=20version:=201.1.5=20=E2=86=92=201.1?= =?UTF-8?q?.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- peptdeep/__init__.py | 2 +- release/one_click_linux_gui/control | 2 +- release/one_click_linux_gui/create_installer_linux.sh | 2 +- release/one_click_macos_gui/Info.plist | 4 ++-- release/one_click_macos_gui/create_installer_macos.sh | 4 ++-- release/one_click_macos_gui/distribution.xml | 2 +- release/one_click_windows_gui/create_installer_windows.sh | 2 +- release/one_click_windows_gui/peptdeep_innoinstaller.iss | 2 +- settings.ini | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 7060a748..fe93a21e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.1.5 +current_version = 1.1.6 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/docs/conf.py b/docs/conf.py index e70be6ba..8b82fc4b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,7 +23,7 @@ copyright = '2022, Mann Labs, MPIB' author = 'Mann Labs, MPIB' -release = "1.1.5" +release = "1.1.6" # -- General configuration --------------------------------------------------- diff --git a/peptdeep/__init__.py b/peptdeep/__init__.py index 659c11c4..61c5edb3 100644 --- a/peptdeep/__init__.py +++ b/peptdeep/__init__.py @@ -11,7 +11,7 @@ # pass __project__ = "peptdeep" -__version__ = "1.1.5" +__version__ = "1.1.6" __license__ = "Apache 2.0" __description__ = "The AlphaX deep learning framework for Proteomics" __author__ = "Mann Labs" diff --git a/release/one_click_linux_gui/control b/release/one_click_linux_gui/control index 808fbc4f..16d6fa36 100644 --- a/release/one_click_linux_gui/control +++ b/release/one_click_linux_gui/control @@ -1,5 +1,5 @@ Package: peptdeep -Version: 1.1.5 +Version: 1.1.6 Architecture: all Maintainer: Mann Labs Description: peptdeep diff --git a/release/one_click_linux_gui/create_installer_linux.sh b/release/one_click_linux_gui/create_installer_linux.sh index 12048b49..e8820f69 100644 --- a/release/one_click_linux_gui/create_installer_linux.sh +++ b/release/one_click_linux_gui/create_installer_linux.sh @@ -17,7 +17,7 @@ python setup.py sdist bdist_wheel # Setting up the local package cd release/one_click_linux_gui # Make sure you include the required extra packages and always use the stable or very-stable options! -pip install "../../dist/peptdeep-1.1.5-py3-none-any.whl[stable]" +pip install "../../dist/peptdeep-1.1.6-py3-none-any.whl[stable]" if [ "$1" == "CPU" ]; then pip install torch -U --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/release/one_click_macos_gui/Info.plist b/release/one_click_macos_gui/Info.plist index 12f67d9b..fd3a740b 100644 --- a/release/one_click_macos_gui/Info.plist +++ b/release/one_click_macos_gui/Info.plist @@ -9,9 +9,9 @@ CFBundleIconFile alpha_logo.icns CFBundleIdentifier - peptdeep.1.1.5 + peptdeep.1.1.6 CFBundleShortVersionString - 1.1.5 + 1.1.6 CFBundleInfoDictionaryVersion 6.0 CFBundleName diff --git a/release/one_click_macos_gui/create_installer_macos.sh b/release/one_click_macos_gui/create_installer_macos.sh index 5792fef5..42fcd8a0 100644 --- a/release/one_click_macos_gui/create_installer_macos.sh +++ b/release/one_click_macos_gui/create_installer_macos.sh @@ -20,7 +20,7 @@ python setup.py sdist bdist_wheel # Setting up the local package cd release/one_click_macos_gui -pip install "../../dist/peptdeep-1.1.5-py3-none-any.whl[stable]" +pip install "../../dist/peptdeep-1.1.6-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pip install pyinstaller @@ -40,5 +40,5 @@ cp ../../LICENSE.txt Resources/LICENSE.txt cp ../logos/alpha_logo.png Resources/alpha_logo.png chmod 777 scripts/* -pkgbuild --root dist/peptdeep --identifier de.mpg.biochem.peptdeep.app --version 1.1.5 --install-location /Applications/peptdeep.app --scripts scripts peptdeep.pkg +pkgbuild --root dist/peptdeep --identifier de.mpg.biochem.peptdeep.app --version 1.1.6 --install-location /Applications/peptdeep.app --scripts scripts peptdeep.pkg productbuild --distribution distribution.xml --resources Resources --package-path peptdeep.pkg dist/peptdeep_gui_installer_macos.pkg diff --git a/release/one_click_macos_gui/distribution.xml b/release/one_click_macos_gui/distribution.xml index acfb6db2..a65bc9b1 100644 --- a/release/one_click_macos_gui/distribution.xml +++ b/release/one_click_macos_gui/distribution.xml @@ -1,6 +1,6 @@ - peptdeep 1.1.5 + peptdeep 1.1.6 diff --git a/release/one_click_windows_gui/create_installer_windows.sh b/release/one_click_windows_gui/create_installer_windows.sh index 31658a88..f6675c8d 100644 --- a/release/one_click_windows_gui/create_installer_windows.sh +++ b/release/one_click_windows_gui/create_installer_windows.sh @@ -17,7 +17,7 @@ python setup.py sdist bdist_wheel # Setting up the local package cd release/one_click_windows_gui # Make sure you include the required extra packages and always use the stable or very-stable options! -pip install "../../dist/peptdeep-1.1.5-py3-none-any.whl[stable]" +pip install "../../dist/peptdeep-1.1.6-py3-none-any.whl[stable]" # Creating the stand-alone pyinstaller folder pip install pyinstaller diff --git a/release/one_click_windows_gui/peptdeep_innoinstaller.iss b/release/one_click_windows_gui/peptdeep_innoinstaller.iss index 8e7f2e3d..7be43673 100644 --- a/release/one_click_windows_gui/peptdeep_innoinstaller.iss +++ b/release/one_click_windows_gui/peptdeep_innoinstaller.iss @@ -2,7 +2,7 @@ ; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES! #define MyAppName "peptdeep" -#define MyAppVersion "1.1.5" +#define MyAppVersion "1.1.6" #define MyAppPublisher "Max Planck Institute of Biochemistry and the University of Copenhagen, Mann Labs" #define MyAppURL "https://github.com/MannLabs/peptdeep" #define MyAppExeName "peptdeep_gui.exe" diff --git a/settings.ini b/settings.ini index b46b3b6e..51805e24 100644 --- a/settings.ini +++ b/settings.ini @@ -5,7 +5,7 @@ ### Python library ### repo = alphapeptdeep lib_name = peptdeep -version = 1.1.5 +version = 1.1.6 min_python = 3.7 license = apache2 From 439cee2624ff541afc1c2ac600897464c4295a8b Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 22 Feb 2024 22:27:50 +0100 Subject: [PATCH 18/18] FIX #138 --- peptdeep/pipeline_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/peptdeep/pipeline_api.py b/peptdeep/pipeline_api.py index 46e50889..4e500afc 100644 --- a/peptdeep/pipeline_api.py +++ b/peptdeep/pipeline_api.py @@ -326,7 +326,7 @@ def transfer_learn(verbose=True): dfs, frag_inten_dfs ) elif len(mgr_settings['transfer']['ms_files'])>0: - psm_df, _, frag_df = match_psms() + psm_df, frag_mz_df, frag_df = match_psms() else: psm_df = import_psm_df( mgr_settings['transfer']['psm_files'],