From 6f5b05d49d8b7a7abed38539fbc2f5d4f0875f7a Mon Sep 17 00:00:00 2001 From: jalew188 Date: Wed, 17 Jul 2024 16:50:06 +0200 Subject: [PATCH] Modify descriptions --- docs/nbs/tutorial_dev_basic_definations.ipynb | 1668 ----------------- ...ipynb => tutorial_basic_definations.ipynb} | 317 ++-- ...nb => tutorial_dataframe_structures.ipynb} | 8 +- ...pynb => tutorial_spectral_libraries.ipynb} | 0 4 files changed, 211 insertions(+), 1782 deletions(-) delete mode 100644 docs/nbs/tutorial_dev_basic_definations.ipynb rename docs/tutorials/{basic_definations.ipynb => tutorial_basic_definations.ipynb} (96%) rename docs/tutorials/{dataframe_structures.ipynb => tutorial_dataframe_structures.ipynb} (99%) rename docs/tutorials/{spectral_libraries.ipynb => tutorial_spectral_libraries.ipynb} (100%) diff --git a/docs/nbs/tutorial_dev_basic_definations.ipynb b/docs/nbs/tutorial_dev_basic_definations.ipynb deleted file mode 100644 index 34e02e8..0000000 --- a/docs/nbs/tutorial_dev_basic_definations.ipynb +++ /dev/null @@ -1,1668 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tutorial for Dev: Basic Definations\n", - "\n", - "This notebook introduces low-level functionalities use in AlphaBase to developers." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Atoms/Elements\n", - "\n", - "The masses of all amino acids and modifications are calculated from their atom compositions.\n", - "\n", - "The atom information are defined in https://github.com/MannLabs/alphabase/blob/main/alphabase/constants/const_files/nist_element.yaml which is parsed from NIST, see https://github.com/MannLabs/alphabase/blob/main/nbs/nist_chem_to_yaml.ipynb.\n", - "\n", - "After adding some heavy isotopes, including 13C, 15N, 2H, and 18O, we obtain 109 kinds of atoms:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abundancemass
13C[0.01, 0.99][12.0, 13.00335483507]
14N[0.996337, 0.003663][14.00307400443, 15.00010889888]
15N[0.01, 0.99][14.00307400443, 15.00010889888]
18O[0.005, 0.005, 0.99][15.99491461957, 16.9991317565, 17.99915961286]
2H[0.01, 0.99][1.00782503223, 2.01410177812]
.........
Xe[0.000952, 0.00089, 0.019102, 0.264006, 0.0407...[123.905892, 125.9042983, 127.903531, 128.9047...
Y[1.0][88.9058403]
Yb[0.00123, 0.02982, 0.1409, 0.2168, 0.16103, 0....[167.9338896, 169.9347664, 170.9363302, 171.93...
Zn[0.4917, 0.2773, 0.0404, 0.1845, 0.0061][63.92914201, 65.92603381, 66.92712775, 67.924...
Zr[0.5145, 0.1122, 0.1715, 0.1738, 0.028][89.9046977, 90.9056396, 91.9050347, 93.906310...
\n", - "

109 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " abundance \\\n", - "13C [0.01, 0.99] \n", - "14N [0.996337, 0.003663] \n", - "15N [0.01, 0.99] \n", - "18O [0.005, 0.005, 0.99] \n", - "2H [0.01, 0.99] \n", - ".. ... \n", - "Xe [0.000952, 0.00089, 0.019102, 0.264006, 0.0407... \n", - "Y [1.0] \n", - "Yb [0.00123, 0.02982, 0.1409, 0.2168, 0.16103, 0.... \n", - "Zn [0.4917, 0.2773, 0.0404, 0.1845, 0.0061] \n", - "Zr [0.5145, 0.1122, 0.1715, 0.1738, 0.028] \n", - "\n", - " mass \n", - "13C [12.0, 13.00335483507] \n", - "14N [14.00307400443, 15.00010889888] \n", - "15N [14.00307400443, 15.00010889888] \n", - "18O [15.99491461957, 16.9991317565, 17.99915961286] \n", - "2H [1.00782503223, 2.01410177812] \n", - ".. ... \n", - "Xe [123.905892, 125.9042983, 127.903531, 128.9047... \n", - "Y [88.9058403] \n", - "Yb [167.9338896, 169.9347664, 170.9363302, 171.93... \n", - "Zn [63.92914201, 65.92603381, 66.92712775, 67.924... \n", - "Zr [89.9046977, 90.9056396, 91.9050347, 93.906310... \n", - "\n", - "[109 rows x 2 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "from alphabase.constants.atom import CHEM_INFO_DICT\n", - "pd.DataFrame().from_dict(CHEM_INFO_DICT, orient='index')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And their mono-isotopic mass are in `CHEM_MONO_MASS` (dict):" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0
13C13.003355
14N14.003074
15N15.000109
18O17.999160
2H2.014102
......
Xe131.904155
Y88.905840
Yb173.938866
Zn63.929142
Zr89.904698
\n", - "

109 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " 0\n", - "13C 13.003355\n", - "14N 14.003074\n", - "15N 15.000109\n", - "18O 17.999160\n", - "2H 2.014102\n", - ".. ...\n", - "Xe 131.904155\n", - "Y 88.905840\n", - "Yb 173.938866\n", - "Zn 63.929142\n", - "Zr 89.904698\n", - "\n", - "[109 rows x 1 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.atom import CHEM_MONO_MASS\n", - "pd.DataFrame().from_dict(CHEM_MONO_MASS, orient='index')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These atom masses are used to calculate the masses of amino acids, modifications, and then subsequent masses of peptides and fragments." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Commonly used molecular masses" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1.007276467, 1.0033, 17.02654910112, 18.01056468403)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.atom import (\n", - " MASS_PROTON, MASS_ISOTOPE, MASS_NH3, MASS_H2O\n", - ")\n", - "MASS_PROTON, MASS_ISOTOPE, MASS_NH3, MASS_H2O" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Amino Acids" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
aaformulamass
65AC(3)H(5)N(1)O(1)S(0)7.103711e+01
66BC(1000000)1.200000e+07
67CC(3)H(5)N(1)O(1)S(1)1.030092e+02
68DC(4)H(5)N(1)O(3)S(0)1.150269e+02
69EC(5)H(7)N(1)O(3)S(0)1.290426e+02
70FC(9)H(9)N(1)O(1)S(0)1.470684e+02
71GC(2)H(3)N(1)O(1)S(0)5.702146e+01
72HC(6)H(7)N(3)O(1)S(0)1.370589e+02
73IC(6)H(11)N(1)O(1)S(0)1.130841e+02
74JC(6)H(11)N(1)O(1)S(0)1.130841e+02
75KC(6)H(12)N(2)O(1)S(0)1.280950e+02
76LC(6)H(11)N(1)O(1)S(0)1.130841e+02
77MC(5)H(9)N(1)O(1)S(1)1.310405e+02
78NC(4)H(6)N(2)O(2)S(0)1.140429e+02
79OC(12)H(19)N(3)O(2)2.371477e+02
80PC(5)H(7)N(1)O(1)S(0)9.705276e+01
81QC(5)H(8)N(2)O(2)S(0)1.280586e+02
82RC(6)H(12)N(4)O(1)S(0)1.561011e+02
83SC(3)H(5)N(1)O(2)S(0)8.703203e+01
84TC(4)H(7)N(1)O(2)S(0)1.010477e+02
85UC(3)H(5)N(1)O(1)Se(1)1.509536e+02
86VC(5)H(9)N(1)O(1)S(0)9.906841e+01
87WC(11)H(10)N(2)O(1)S(0)1.860793e+02
88XC(1000000)1.200000e+07
89YC(9)H(9)N(1)O(2)S(0)1.630633e+02
90ZC(1000000)1.200000e+07
\n", - "
" - ], - "text/plain": [ - " aa formula mass\n", - "65 A C(3)H(5)N(1)O(1)S(0) 7.103711e+01\n", - "66 B C(1000000) 1.200000e+07\n", - "67 C C(3)H(5)N(1)O(1)S(1) 1.030092e+02\n", - "68 D C(4)H(5)N(1)O(3)S(0) 1.150269e+02\n", - "69 E C(5)H(7)N(1)O(3)S(0) 1.290426e+02\n", - "70 F C(9)H(9)N(1)O(1)S(0) 1.470684e+02\n", - "71 G C(2)H(3)N(1)O(1)S(0) 5.702146e+01\n", - "72 H C(6)H(7)N(3)O(1)S(0) 1.370589e+02\n", - "73 I C(6)H(11)N(1)O(1)S(0) 1.130841e+02\n", - "74 J C(6)H(11)N(1)O(1)S(0) 1.130841e+02\n", - "75 K C(6)H(12)N(2)O(1)S(0) 1.280950e+02\n", - "76 L C(6)H(11)N(1)O(1)S(0) 1.130841e+02\n", - "77 M C(5)H(9)N(1)O(1)S(1) 1.310405e+02\n", - "78 N C(4)H(6)N(2)O(2)S(0) 1.140429e+02\n", - "79 O C(12)H(19)N(3)O(2) 2.371477e+02\n", - "80 P C(5)H(7)N(1)O(1)S(0) 9.705276e+01\n", - "81 Q C(5)H(8)N(2)O(2)S(0) 1.280586e+02\n", - "82 R C(6)H(12)N(4)O(1)S(0) 1.561011e+02\n", - "83 S C(3)H(5)N(1)O(2)S(0) 8.703203e+01\n", - "84 T C(4)H(7)N(1)O(2)S(0) 1.010477e+02\n", - "85 U C(3)H(5)N(1)O(1)Se(1) 1.509536e+02\n", - "86 V C(5)H(9)N(1)O(1)S(0) 9.906841e+01\n", - "87 W C(11)H(10)N(2)O(1)S(0) 1.860793e+02\n", - "88 X C(1000000) 1.200000e+07\n", - "89 Y C(9)H(9)N(1)O(2)S(0) 1.630633e+02\n", - "90 Z C(1000000) 1.200000e+07" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.aa import AA_DF\n", - "AA_DF.loc[ord('A'):ord('Z')]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From `AA_DF`, we can see that amino acids are encoded by ASCII (128 characters). 65==ord('A'), ..., 90==ord('Z'). 
Unicode strings can be fastly converted to ascii int32 values using numpy:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([65, 66, 67, 88, 89, 90], dtype=int32)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "\n", - "np.array(['ABCXYZ']).view(np.int32)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But users does not need to know this, as we provided easy to use functionalities to get residue masses from sequences." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Calculate AA masses in batch" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[131.04048509, 71.03711379, 103.00918496, 115.02694302,\n", - " 129.04259309, 147.06841391, 57.02146372],\n", - " [131.04048509, 71.03711379, 128.09496302, 115.02694302,\n", - " 129.04259309, 147.06841391, 57.02146372],\n", - " [131.04048509, 71.03711379, 128.09496302, 115.02694302,\n", - " 129.04259309, 147.06841391, 156.10111102]])" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.aa import calc_AA_masses_for_same_len_seqs\n", - "calc_AA_masses_for_same_len_seqs(\n", - " [\n", - " 'MACDEFG', 'MAKDEFG', 'MAKDEFR'\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Modifications\n", - "\n", - "In AlphaBase, we used `mod_name@aa` to represent a modification, the `mod_name` is from UniMod. 
We also used `mod_name@Protein_N-term`, `mod_name@Any_N-term` and `mod_name@Any_C-term` for terminal modifications, which follow the UniMod terminal name schema.\n", - "\n", - "The default modification TSV is stored in `alphabase/constants/const_files/modification.tsv`, users can add more modifications into the tsv file (only `mod_name` and `composition` colums are required). Please https://github.com/MannLabs/alphabase/blob/main/alphabase/constants/const_files/modification.tsv." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mod_nameunimod_massunimod_avge_masscompositionunimod_modlossmodloss_compositionclassificationunimod_idmodloss_importancemassmodloss_originalmodloss
mod_name
Acetyl@TAcetyl@T42.01056542.0367H(2)C(2)O(1)0.0Post-translational10.042.0105650.00.0
Acetyl@Protein_N-termAcetyl@Protein_N-term42.01056542.0367H(2)C(2)O(1)0.0Post-translational10.042.0105650.00.0
Acetyl@SAcetyl@S42.01056542.0367H(2)C(2)O(1)0.0Post-translational10.042.0105650.00.0
Acetyl@CAcetyl@C42.01056542.0367H(2)C(2)O(1)0.0Post-translational10.042.0105650.00.0
Acetyl@Any_N-termAcetyl@Any_N-term42.01056542.0367H(2)C(2)O(1)0.0Multiple10.042.0105650.00.0
.......................................
TMTpro_zero@KTMTpro_zero@K295.189592295.3773H(25)C(15)N(3)O(3)0.0Chemical derivative20170.0295.1895920.00.0
TMTpro_zero@TTMTpro_zero@T295.189592295.3773H(25)C(15)N(3)O(3)0.0Chemical derivative20170.0295.1895920.00.0
Andro-H2O@CAndro-H2O@C332.198760332.4339H(28)C(20)O(4)0.0Chemical derivative20250.0332.1987590.00.0
His+O(2)@HHis+O(2)@H169.048741169.1381H(7)C(6)N(3)O(3)0.0Post-translational20270.0169.0487410.00.0
GlyGly@KGlyGly@K114.042927114.1026H(6)C(4)N(2)O(2)0.0Post-translational1211000000.0114.0429270.00.0
\n", - "

2685 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " mod_name unimod_mass unimod_avge_mass \\\n", - "mod_name \n", - "Acetyl@T Acetyl@T 42.010565 42.0367 \n", - "Acetyl@Protein_N-term Acetyl@Protein_N-term 42.010565 42.0367 \n", - "Acetyl@S Acetyl@S 42.010565 42.0367 \n", - "Acetyl@C Acetyl@C 42.010565 42.0367 \n", - "Acetyl@Any_N-term Acetyl@Any_N-term 42.010565 42.0367 \n", - "... ... ... ... \n", - "TMTpro_zero@K TMTpro_zero@K 295.189592 295.3773 \n", - "TMTpro_zero@T TMTpro_zero@T 295.189592 295.3773 \n", - "Andro-H2O@C Andro-H2O@C 332.198760 332.4339 \n", - "His+O(2)@H His+O(2)@H 169.048741 169.1381 \n", - "GlyGly@K GlyGly@K 114.042927 114.1026 \n", - "\n", - " composition unimod_modloss modloss_composition \\\n", - "mod_name \n", - "Acetyl@T H(2)C(2)O(1) 0.0 \n", - "Acetyl@Protein_N-term H(2)C(2)O(1) 0.0 \n", - "Acetyl@S H(2)C(2)O(1) 0.0 \n", - "Acetyl@C H(2)C(2)O(1) 0.0 \n", - "Acetyl@Any_N-term H(2)C(2)O(1) 0.0 \n", - "... ... ... ... \n", - "TMTpro_zero@K H(25)C(15)N(3)O(3) 0.0 \n", - "TMTpro_zero@T H(25)C(15)N(3)O(3) 0.0 \n", - "Andro-H2O@C H(28)C(20)O(4) 0.0 \n", - "His+O(2)@H H(7)C(6)N(3)O(3) 0.0 \n", - "GlyGly@K H(6)C(4)N(2)O(2) 0.0 \n", - "\n", - " classification unimod_id modloss_importance \\\n", - "mod_name \n", - "Acetyl@T Post-translational 1 0.0 \n", - "Acetyl@Protein_N-term Post-translational 1 0.0 \n", - "Acetyl@S Post-translational 1 0.0 \n", - "Acetyl@C Post-translational 1 0.0 \n", - "Acetyl@Any_N-term Multiple 1 0.0 \n", - "... ... ... ... \n", - "TMTpro_zero@K Chemical derivative 2017 0.0 \n", - "TMTpro_zero@T Chemical derivative 2017 0.0 \n", - "Andro-H2O@C Chemical derivative 2025 0.0 \n", - "His+O(2)@H Post-translational 2027 0.0 \n", - "GlyGly@K Post-translational 121 1000000.0 \n", - "\n", - " mass modloss_original modloss \n", - "mod_name \n", - "Acetyl@T 42.010565 0.0 0.0 \n", - "Acetyl@Protein_N-term 42.010565 0.0 0.0 \n", - "Acetyl@S 42.010565 0.0 0.0 \n", - "Acetyl@C 42.010565 0.0 0.0 \n", - "Acetyl@Any_N-term 42.010565 0.0 0.0 \n", - "... ... ... ... 
\n", - "TMTpro_zero@K 295.189592 0.0 0.0 \n", - "TMTpro_zero@T 295.189592 0.0 0.0 \n", - "Andro-H2O@C 332.198759 0.0 0.0 \n", - "His+O(2)@H 169.048741 0.0 0.0 \n", - "GlyGly@K 114.042927 0.0 0.0 \n", - "\n", - "[2685 rows x 12 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.modification import MOD_DF\n", - "MOD_DF" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Modification sites\n", - "\n", - "In alphabase, we use 0 and -1 to represent modification site of N-term and C-term, respectively. For other modification sites, we use 1 to n." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([42.01056468, 0. , 57.02146372, 0. , 0. ,\n", - " 0. , 0. ])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.modification import calc_modification_mass\n", - "sequence = 'MACDEFG'\n", - "mod_names = ['Acetyl@Any_N-term', 'Carbamidomethyl@C']\n", - "mod_sites = [0,3]\n", - "calc_modification_mass(\n", - " nAA=len(sequence),\n", - " mod_names=mod_names,\n", - " mod_sites=mod_sites\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The modifications on the first amino acid and N-term will be added." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([58.0054793, 0. , 0. , 0. , 0. ,\n", - " 0. , 0. 
])" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sequence = 'MAKDEFG'\n", - "mod_names = ['Acetyl@Any_N-term', 'Oxidation@M']\n", - "mod_sites = [0,1]\n", - "calc_modification_mass(\n", - " nAA=len(sequence),\n", - " mod_names=mod_names,\n", - " mod_sites=mod_sites\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Multiple modification at a single site is supported, for example, in the following example, `K3` contains both `GlyGly@K` and `Dimethyl@K`:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0. , 0. , 142.07422757, 0. ,\n", - " 0. , 0. , 0. ])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sequence = 'MAKDEFR'\n", - "mod_names = ['GlyGly@K', 'Dimethyl@K']\n", - "mod_sites = [3,3]\n", - "calc_modification_mass(\n", - " nAA=len(sequence),\n", - " mod_names=mod_names,\n", - " mod_sites=mod_sites\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Caculate modification masses in batch" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 42.01056468, 0. , 57.02146372, 0. ,\n", - " 0. , 0. , 0. ],\n", - " [ 58.0054793 , 0. , 0. , 0. ,\n", - " 0. , 0. , 0. ],\n", - " [ 0. , 0. , 142.07422757, 0. ,\n", - " 0. , 0. , 0. 
]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.modification import calc_mod_masses_for_same_len_seqs\n", - "calc_mod_masses_for_same_len_seqs(\n", - " nAA=7,\n", - " mod_names_list=[\n", - " ['Acetyl@Any_N-term', 'Carbamidomethyl@C'],\n", - " ['Acetyl@Any_N-term', 'Oxidation@M'],\n", - " ['GlyGly@K', 'Dimethyl@K'],\n", - " ],\n", - " mod_sites_list=[\n", - " [0, 3],\n", - " [0, 1],\n", - " [3, 3],\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Mass calculation functionalities" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Calculate AA and modification masses in batch" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[173.05104977, 71.03711379, 160.03064868, 115.02694302,\n", - " 129.04259309, 147.06841391, 57.02146372],\n", - " [189.04596439, 71.03711379, 128.09496302, 115.02694302,\n", - " 129.04259309, 147.06841391, 57.02146372],\n", - " [131.04048509, 71.03711379, 270.16919059, 115.02694302,\n", - " 129.04259309, 147.06841391, 156.10111102]])" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.aa import calc_AA_masses_for_same_len_seqs\n", - "from alphabase.constants.modification import calc_mod_masses_for_same_len_seqs\n", - "mod_masses = calc_mod_masses_for_same_len_seqs(\n", - " nAA=7,\n", - " mod_names_list=[\n", - " ['Acetyl@Any_N-term', 'Carbamidomethyl@C'],\n", - " ['Acetyl@Any_N-term', 'Oxidation@M'],\n", - " ['GlyGly@K', 'Dimethyl@K'],\n", - " ],\n", - " mod_sites_list=[\n", - " [0, 3],\n", - " [0, 1],\n", - " [3, 3],\n", - " ]\n", - ")\n", - "aa_masses = calc_AA_masses_for_same_len_seqs(\n", - " [\n", - " 'MACDEFG', 'MAKDEFG', 'MAKDEFR'\n", - " ]\n", - ")\n", - "mod_masses+aa_masses" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "#### np.cumsum to get b-ion neutral masses" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 173.05104977, 244.08816356, 404.11881224, 519.14575526,\n", - " 648.18834835, 795.25676227, 852.27822599],\n", - " [ 189.04596439, 260.08307818, 388.17804119, 503.20498422,\n", - " 632.24757731, 779.31599122, 836.33745494],\n", - " [ 131.04048509, 202.07759887, 472.24678946, 587.27373248,\n", - " 716.31632557, 863.38473949, 1019.48585051]])" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "np.cumsum(aa_masses+mod_masses, axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Mass functionalities in 'mass_calc'\n", - "\n", - "The functionalities for peptide and fragment neutral masses have been implement in `alphabase.peptide.mass_calc`:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 870.28879067, 854.34801962, 1037.49641519])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.peptide.mass_calc import calc_peptide_masses_for_same_len_seqs\n", - "\n", - "peptide_masses = calc_peptide_masses_for_same_len_seqs(\n", - " ['MACDEFG', 'MAKDEFG', 'MAKDEFR'],\n", - " mod_list=[\n", - " 'Acetyl@Any_N-term;Carbamidomethyl@C',\n", - " 'Acetyl@Any_N-term;Oxidation@M',\n", - " 'GlyGly@K;Dimethyl@K',\n", - " ],\n", - ")\n", - "peptide_masses" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 870.28879067, 854.34801962, 1037.49641519])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from 
alphabase.peptide.mass_calc import calc_b_y_and_peptide_masses_for_same_len_seqs\n", - "b_masses, y_masses, peptide_masses = calc_b_y_and_peptide_masses_for_same_len_seqs(\n", - " ['MACDEFG', 'MAKDEFG', 'MAKDEFR'],\n", - " mod_list=[\n", - " ['Acetyl@Any_N-term', 'Carbamidomethyl@C'],\n", - " ['Acetyl@Any_N-term', 'Oxidation@M'],\n", - " ['GlyGly@K', 'Dimethyl@K'],\n", - " ],\n", - " site_list=[\n", - " [0, 3],\n", - " [0, 1],\n", - " [3, 3],\n", - " ],\n", - ")\n", - "peptide_masses" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[173.05104977, 244.08816356, 404.11881224, 519.14575526,\n", - " 648.18834835, 795.25676227],\n", - " [189.04596439, 260.08307818, 388.17804119, 503.20498422,\n", - " 632.24757731, 779.31599122],\n", - " [131.04048509, 202.07759887, 472.24678946, 587.27373248,\n", - " 716.31632557, 863.38473949]])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "b_masses" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[697.2377409 , 626.20062711, 466.16997843, 351.14303541,\n", - " 222.10044232, 75.0320284 ],\n", - " [665.30205523, 594.26494145, 466.16997843, 351.14303541,\n", - " 222.10044232, 75.0320284 ],\n", - " [906.45593011, 835.41881632, 565.24962574, 450.22268271,\n", - " 321.18008962, 174.11167571]])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_masses" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Isotope distribution\n", - "\n", - "`alphabase.constants.isotope.IsotopeDistribution` will calculate the isotope distribution and the mono-isotopic idx in the distribution for a given atom composition. \n", - "\n", - "What is the mono-isotopic idx (mono_idx)? 
For an atom, the `mono_idx` points to the highest abundance isotope, so the value is `round(mass of highest isotope - mass of first isotope)`." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abundancemassmono_idx
13C[0.01, 0.99][12.0, 13.00335483507]1
14N[0.996337, 0.003663][14.00307400443, 15.00010889888]0
15N[0.01, 0.99][14.00307400443, 15.00010889888]1
18O[0.005, 0.005, 0.99][15.99491461957, 16.9991317565, 17.99915961286]2
2H[0.01, 0.99][1.00782503223, 2.01410177812]1
............
Xe[0.000952, 0.00089, 0.019102, 0.264006, 0.0407...[123.905892, 125.9042983, 127.903531, 128.9047...8
Y[1.0][88.9058403]0
Yb[0.00123, 0.02982, 0.1409, 0.2168, 0.16103, 0....[167.9338896, 169.9347664, 170.9363302, 171.93...6
Zn[0.4917, 0.2773, 0.0404, 0.1845, 0.0061][63.92914201, 65.92603381, 66.92712775, 67.924...0
Zr[0.5145, 0.1122, 0.1715, 0.1738, 0.028][89.9046977, 90.9056396, 91.9050347, 93.906310...0
\n", - "

109 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " abundance \\\n", - "13C [0.01, 0.99] \n", - "14N [0.996337, 0.003663] \n", - "15N [0.01, 0.99] \n", - "18O [0.005, 0.005, 0.99] \n", - "2H [0.01, 0.99] \n", - ".. ... \n", - "Xe [0.000952, 0.00089, 0.019102, 0.264006, 0.0407... \n", - "Y [1.0] \n", - "Yb [0.00123, 0.02982, 0.1409, 0.2168, 0.16103, 0.... \n", - "Zn [0.4917, 0.2773, 0.0404, 0.1845, 0.0061] \n", - "Zr [0.5145, 0.1122, 0.1715, 0.1738, 0.028] \n", - "\n", - " mass mono_idx \n", - "13C [12.0, 13.00335483507] 1 \n", - "14N [14.00307400443, 15.00010889888] 0 \n", - "15N [14.00307400443, 15.00010889888] 1 \n", - "18O [15.99491461957, 16.9991317565, 17.99915961286] 2 \n", - "2H [1.00782503223, 2.01410177812] 1 \n", - ".. ... ... \n", - "Xe [123.905892, 125.9042983, 127.903531, 128.9047... 8 \n", - "Y [88.9058403] 0 \n", - "Yb [167.9338896, 169.9347664, 170.9363302, 171.93... 6 \n", - "Zn [63.92914201, 65.92603381, 66.92712775, 67.924... 0 \n", - "Zr [89.9046977, 90.9056396, 91.9050347, 93.906310... 0 \n", - "\n", - "[109 rows x 3 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "from alphabase.constants.atom import CHEM_INFO_DICT\n", - "atom_df = pd.DataFrame().from_dict(CHEM_INFO_DICT, orient='index')\n", - "def get_mono(masses_abundances):\n", - " masses, abundances = masses_abundances\n", - " return round(masses[np.argmax(abundances)]-masses[0])\n", - "atom_df['mono_idx'] = atom_df[['mass','abundance']].apply(\n", - " get_mono, axis=1\n", - ")\n", - "atom_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`mono_idx` of an atom composition refers to the sum of the `mono_idx` of all atoms. In AlphaBase, `alphabase.constants.isotope.IsotopeDistribution` calculate both isotope abundance and `mono_idx`. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For example, `Fe`'s `mono_idx` is 2," - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "abundance [0.05845, 0.91754, 0.02119, 0.00282]\n", - "mass [53.93960899, 55.93493633, 56.93539284, 57.933...\n", - "mono_idx 2\n", - "Name: Fe, dtype: object" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "atom_df.loc['Fe']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So `C(1)Fe(1)`'s `mono_idx` is also 2:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([5.78245850e-02, 6.25415000e-04, 9.07722322e-01, 3.07809450e-02,\n", - " 3.01655900e-03, 3.01740000e-05, 0.00000000e+00, 0.00000000e+00,\n", - " 0.00000000e+00, 0.00000000e+00]),\n", - " 2)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.isotope import IsotopeDistribution, parse_formula\n", - "iso = IsotopeDistribution()\n", - "iso.calc_formula_distribution(\n", - " [('C',1),('Fe',1)]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But `13C(1)Fe(1)`'s `mono_idx` should be 3:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([5.845000e-04, 5.786550e-02, 9.175400e-03, 9.085765e-01,\n", - " 2.100630e-02, 2.791800e-03, 0.000000e+00, 0.000000e+00,\n", - " 0.000000e+00, 0.000000e+00]),\n", - " 3)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iso.calc_formula_distribution(\n", - " [('13C',1),('Fe',1)]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `mono_idx` for most of the atom 
compositions is 0, no matter how big the compositions are." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('C', 100), ('H', 100), ('O', 50), ('Na', 1)]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from alphabase.constants.isotope import IsotopeDistribution, parse_formula\n", - "iso = IsotopeDistribution()\n", - "\n", - "formula = 'C(100)H(100)O(50)Na(1)'\n", - "formula = parse_formula(formula)\n", - "formula" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> `mono` isotope is not the `highest` isotope!!!" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(0,\n", - " 1,\n", - " array([2.98521241e-01, 3.31991573e-01, 2.13532938e-01, 1.00604878e-01,\n", - " 3.82856126e-02, 1.23872292e-02, 3.51773755e-03, 8.95830236e-04,\n", - " 2.07763024e-04, 4.43944472e-05]))" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dist, mono = iso.calc_formula_distribution(formula)\n", - "mono, dist.argmax(), dist" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All these low-level functionalities have been integrated into DataFrame functionalities, see `tutorial_dev_dataframes.ipynb` or `Tutorial for Dev: Peptide and Fragment DataFrames`" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.3 ('base')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": 
"8a3b27e141e49c996c9b863f8707e97aabd49c4a7e8445b9b783b34e4a21a9b2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/tutorials/basic_definations.ipynb b/docs/tutorials/tutorial_basic_definations.ipynb similarity index 96% rename from docs/tutorials/basic_definations.ipynb rename to docs/tutorials/tutorial_basic_definations.ipynb index a7fe1ba..bd7f14d 100644 --- a/docs/tutorials/basic_definations.ipynb +++ b/docs/tutorials/tutorial_basic_definations.ipynb @@ -6,11 +6,11 @@ "source": [ "# Tutorial: Basic Definations and Settings\n", "\n", - "Measuring m/z values is the very elemental function of MS technologies, therefore the calculation of mass values for a peptide and its fragments becomes the most essential part in MS-based computational tools. AlphaBase calculates all mass values from atoms. And the masses of amino acids and modifications are calculated from their atom compositions, repectively. Eventually, the masses of peptides or precursors as well as their fragments can be calculated by the amino acid sequences with or without modifications (See figure below).\n", + "Measuring m/z values is the elemental function of MS technologies, therefore the calculation of mass values for a peptide and its fragments becomes the most essential part in MS-based computational tools. AlphaBase calculates all mass values from atoms. And the masses of amino acids and modifications are calculated from their atom compositions, respectively. 
Eventually, the masses of peptides or precursors as well as their fragments can be calculated by the amino acid sequences with or without modifications (See figure below).\n", "\n", "Calculating masses from atoms makes it much easier to switch between unlabeled and heavy-labeled peptides, as we did in Steller MS for 15N-labeled peptides as the reference for targeted proteomics (https://www.biorxiv.org/content/10.1101/2024.06.02.597029v2.full).\n", "\n", - "The other advantage of starting from atoms is that AlphaBase can calculate isotope distributions of peptides based on a pre-defined isotope distribution list of atoms (e.g., NIST atom table in https://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl). The isotope information has been applied in our AlphaDIA search engine to boost the identification of DIA-MS data (https://www.biorxiv.org/content/10.1101/2024.05.28.596182v1)." + "The other advantage of starting from atoms is that AlphaBase can calculate isotope distributions of peptides based on a pre-defined isotope distribution list of atoms (e.g., NIST atom table in https://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl). The isotope information has been applied in our alphaDIA search engine to boost the identification of DIA-MS data (https://www.biorxiv.org/content/10.1101/2024.05.28.596182v1)." 
] }, { @@ -687,6 +687,7 @@ " modloss_composition\n", " classification\n", " unimod_id\n", + " smiles\n", " modloss_importance\n", " mass\n", " modloss_original\n", @@ -706,6 +707,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -719,6 +721,7 @@ " \n", " Post-translational\n", " 1\n", + " \n", " 0.0\n", " 42.010565\n", " 0.0\n", @@ -734,6 +737,7 @@ " \n", " Post-translational\n", " 1\n", + " \n", " 0.0\n", " 42.010565\n", " 0.0\n", @@ -749,6 +753,7 @@ " \n", " Post-translational\n", " 1\n", + " \n", " 0.0\n", " 42.010565\n", " 0.0\n", @@ -764,6 +769,7 @@ " \n", " Post-translational\n", " 1\n", + " \n", " 0.0\n", " 42.010565\n", " 0.0\n", @@ -779,6 +785,7 @@ " \n", " Multiple\n", " 1\n", + " \n", " 0.0\n", " 42.010565\n", " 0.0\n", @@ -798,64 +805,69 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " TMTpro_zero@K\n", - " TMTpro_zero@K\n", - " 295.189592\n", - " 295.3773\n", - " H(25)C(15)N(3)O(3)\n", + " NQTGG@K\n", + " NQTGG@K\n", + " 457.192111\n", + " 457.4384\n", + " H(27)C(17)N(7)O(8)\n", " 0.0\n", " \n", - " Chemical derivative\n", - " 2017\n", + " Other\n", + " 2084\n", + " \n", " 0.0\n", - " 295.189592\n", + " 457.192111\n", " 0.0\n", " 0.0\n", " \n", " \n", - " TMTpro_zero@T\n", - " TMTpro_zero@T\n", - " 295.189592\n", - " 295.3773\n", - " H(25)C(15)N(3)O(3)\n", + " DVFQQQTGG@K\n", + " DVFQQQTGG@K\n", + " 960.430110\n", + " 960.9865\n", + " H(60)C(41)N(12)O(15)\n", " 0.0\n", " \n", - " Chemical derivative\n", - " 2017\n", + " Other\n", + " 2085\n", + " \n", " 0.0\n", - " 295.189592\n", + " 960.430109\n", " 0.0\n", " 0.0\n", " \n", " \n", - " Andro-H2O@C\n", - " Andro-H2O@C\n", - " 332.198760\n", - " 332.4339\n", - " H(28)C(20)O(4)\n", + " iST-NHS_specific_cysteine_modification@C\n", + " iST-NHS_specific_cysteine_modification@C\n", + " 113.084064\n", + " 113.1576\n", + " H(11)C(6)N(1)O(1)\n", " 0.0\n", " \n", " Chemical derivative\n", - " 2025\n", + " 2086\n", + " \n", " 0.0\n", - " 332.198759\n", + " 113.084064\n", " 0.0\n", " 0.0\n", 
" \n", " \n", - " His+O(2)@H\n", - " His+O(2)@H\n", - " 169.048741\n", - " 169.1381\n", - " H(7)C(6)N(3)O(3)\n", + " Label:13C(2)15N(1)@G\n", + " Label:13C(2)15N(1)@G\n", + " 3.003745\n", + " 2.9787\n", + " C(-2)13C(2)N(-1)15N(1)\n", " 0.0\n", " \n", - " Post-translational\n", - " 2027\n", + " Isotopic label\n", + " 2088\n", + " \n", " 0.0\n", - " 169.048741\n", + " 3.003745\n", " 0.0\n", " 0.0\n", " \n", @@ -869,6 +881,7 @@ " \n", " Multiple\n", " 121\n", + " \n", " 1000000.0\n", " 114.042927\n", " 0.0\n", @@ -876,67 +889,123 @@ " \n", " \n", "\n", - "

2685 rows × 12 columns

\n", + "

2772 rows × 13 columns

\n", "" ], "text/plain": [ - " mod_name unimod_mass unimod_avge_mass \\\n", - "mod_name \n", - "Acetyl@T Acetyl@T 42.010565 42.0367 \n", - "Acetyl@Protein_N-term Acetyl@Protein_N-term 42.010565 42.0367 \n", - "Acetyl@S Acetyl@S 42.010565 42.0367 \n", - "Acetyl@C Acetyl@C 42.010565 42.0367 \n", - "Acetyl@Any_N-term Acetyl@Any_N-term 42.010565 42.0367 \n", - "... ... ... ... \n", - "TMTpro_zero@K TMTpro_zero@K 295.189592 295.3773 \n", - "TMTpro_zero@T TMTpro_zero@T 295.189592 295.3773 \n", - "Andro-H2O@C Andro-H2O@C 332.198760 332.4339 \n", - "His+O(2)@H His+O(2)@H 169.048741 169.1381 \n", - "GlyGly@K GlyGly@K 114.042927 114.1026 \n", + " mod_name \\\n", + "mod_name \n", + "Acetyl@T Acetyl@T \n", + "Acetyl@Protein_N-term Acetyl@Protein_N-term \n", + "Acetyl@S Acetyl@S \n", + "Acetyl@C Acetyl@C \n", + "Acetyl@Any_N-term Acetyl@Any_N-term \n", + "... ... \n", + "NQTGG@K NQTGG@K \n", + "DVFQQQTGG@K DVFQQQTGG@K \n", + "iST-NHS_specific_cysteine_modification@C iST-NHS_specific_cysteine_modification@C \n", + "Label:13C(2)15N(1)@G Label:13C(2)15N(1)@G \n", + "GlyGly@K GlyGly@K \n", + "\n", + " unimod_mass unimod_avge_mass \\\n", + "mod_name \n", + "Acetyl@T 42.010565 42.0367 \n", + "Acetyl@Protein_N-term 42.010565 42.0367 \n", + "Acetyl@S 42.010565 42.0367 \n", + "Acetyl@C 42.010565 42.0367 \n", + "Acetyl@Any_N-term 42.010565 42.0367 \n", + "... ... ... \n", + "NQTGG@K 457.192111 457.4384 \n", + "DVFQQQTGG@K 960.430110 960.9865 \n", + "iST-NHS_specific_cysteine_modification@C 113.084064 113.1576 \n", + "Label:13C(2)15N(1)@G 3.003745 2.9787 \n", + "GlyGly@K 114.042927 114.1026 \n", + "\n", + " composition \\\n", + "mod_name \n", + "Acetyl@T H(2)C(2)O(1) \n", + "Acetyl@Protein_N-term H(2)C(2)O(1) \n", + "Acetyl@S H(2)C(2)O(1) \n", + "Acetyl@C H(2)C(2)O(1) \n", + "Acetyl@Any_N-term H(2)C(2)O(1) \n", + "... ... 
\n", + "NQTGG@K H(27)C(17)N(7)O(8) \n", + "DVFQQQTGG@K H(60)C(41)N(12)O(15) \n", + "iST-NHS_specific_cysteine_modification@C H(11)C(6)N(1)O(1) \n", + "Label:13C(2)15N(1)@G C(-2)13C(2)N(-1)15N(1) \n", + "GlyGly@K H(6)C(4)N(2)O(2) \n", + "\n", + " unimod_modloss modloss_composition \\\n", + "mod_name \n", + "Acetyl@T 0.0 \n", + "Acetyl@Protein_N-term 0.0 \n", + "Acetyl@S 0.0 \n", + "Acetyl@C 0.0 \n", + "Acetyl@Any_N-term 0.0 \n", + "... ... ... \n", + "NQTGG@K 0.0 \n", + "DVFQQQTGG@K 0.0 \n", + "iST-NHS_specific_cysteine_modification@C 0.0 \n", + "Label:13C(2)15N(1)@G 0.0 \n", + "GlyGly@K 0.0 \n", + "\n", + " classification unimod_id \\\n", + "mod_name \n", + "Acetyl@T Post-translational 1 \n", + "Acetyl@Protein_N-term Post-translational 1 \n", + "Acetyl@S Post-translational 1 \n", + "Acetyl@C Post-translational 1 \n", + "Acetyl@Any_N-term Multiple 1 \n", + "... ... ... \n", + "NQTGG@K Other 2084 \n", + "DVFQQQTGG@K Other 2085 \n", + "iST-NHS_specific_cysteine_modification@C Chemical derivative 2086 \n", + "Label:13C(2)15N(1)@G Isotopic label 2088 \n", + "GlyGly@K Multiple 121 \n", "\n", - " composition unimod_modloss modloss_composition \\\n", - "mod_name \n", - "Acetyl@T H(2)C(2)O(1) 0.0 \n", - "Acetyl@Protein_N-term H(2)C(2)O(1) 0.0 \n", - "Acetyl@S H(2)C(2)O(1) 0.0 \n", - "Acetyl@C H(2)C(2)O(1) 0.0 \n", - "Acetyl@Any_N-term H(2)C(2)O(1) 0.0 \n", - "... ... ... ... \n", - "TMTpro_zero@K H(25)C(15)N(3)O(3) 0.0 \n", - "TMTpro_zero@T H(25)C(15)N(3)O(3) 0.0 \n", - "Andro-H2O@C H(28)C(20)O(4) 0.0 \n", - "His+O(2)@H H(7)C(6)N(3)O(3) 0.0 \n", - "GlyGly@K H(6)C(4)N(2)O(2) 0.0 \n", + " smiles modloss_importance \\\n", + "mod_name \n", + "Acetyl@T 0.0 \n", + "Acetyl@Protein_N-term 0.0 \n", + "Acetyl@S 0.0 \n", + "Acetyl@C 0.0 \n", + "Acetyl@Any_N-term 0.0 \n", + "... ... ... 
\n", + "NQTGG@K 0.0 \n", + "DVFQQQTGG@K 0.0 \n", + "iST-NHS_specific_cysteine_modification@C 0.0 \n", + "Label:13C(2)15N(1)@G 0.0 \n", + "GlyGly@K 1000000.0 \n", "\n", - " classification unimod_id modloss_importance \\\n", - "mod_name \n", - "Acetyl@T Post-translational 1 0.0 \n", - "Acetyl@Protein_N-term Post-translational 1 0.0 \n", - "Acetyl@S Post-translational 1 0.0 \n", - "Acetyl@C Post-translational 1 0.0 \n", - "Acetyl@Any_N-term Multiple 1 0.0 \n", - "... ... ... ... \n", - "TMTpro_zero@K Chemical derivative 2017 0.0 \n", - "TMTpro_zero@T Chemical derivative 2017 0.0 \n", - "Andro-H2O@C Chemical derivative 2025 0.0 \n", - "His+O(2)@H Post-translational 2027 0.0 \n", - "GlyGly@K Multiple 121 1000000.0 \n", + " mass modloss_original \\\n", + "mod_name \n", + "Acetyl@T 42.010565 0.0 \n", + "Acetyl@Protein_N-term 42.010565 0.0 \n", + "Acetyl@S 42.010565 0.0 \n", + "Acetyl@C 42.010565 0.0 \n", + "Acetyl@Any_N-term 42.010565 0.0 \n", + "... ... ... \n", + "NQTGG@K 457.192111 0.0 \n", + "DVFQQQTGG@K 960.430109 0.0 \n", + "iST-NHS_specific_cysteine_modification@C 113.084064 0.0 \n", + "Label:13C(2)15N(1)@G 3.003745 0.0 \n", + "GlyGly@K 114.042927 0.0 \n", "\n", - " mass modloss_original modloss \n", - "mod_name \n", - "Acetyl@T 42.010565 0.0 0.0 \n", - "Acetyl@Protein_N-term 42.010565 0.0 0.0 \n", - "Acetyl@S 42.010565 0.0 0.0 \n", - "Acetyl@C 42.010565 0.0 0.0 \n", - "Acetyl@Any_N-term 42.010565 0.0 0.0 \n", - "... ... ... ... \n", - "TMTpro_zero@K 295.189592 0.0 0.0 \n", - "TMTpro_zero@T 295.189592 0.0 0.0 \n", - "Andro-H2O@C 332.198759 0.0 0.0 \n", - "His+O(2)@H 169.048741 0.0 0.0 \n", - "GlyGly@K 114.042927 0.0 0.0 \n", + " modloss \n", + "mod_name \n", + "Acetyl@T 0.0 \n", + "Acetyl@Protein_N-term 0.0 \n", + "Acetyl@S 0.0 \n", + "Acetyl@C 0.0 \n", + "Acetyl@Any_N-term 0.0 \n", + "... ... 
\n", + "NQTGG@K 0.0 \n", + "DVFQQQTGG@K 0.0 \n", + "iST-NHS_specific_cysteine_modification@C 0.0 \n", + "Label:13C(2)15N(1)@G 0.0 \n", + "GlyGly@K 0.0 \n", "\n", - "[2685 rows x 12 columns]" + "[2772 rows x 13 columns]" ] }, "execution_count": 8, @@ -1026,7 +1095,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Multiple modification at a single site is supported, for example, in the following example, `K3` contains both `GlyGly@K` and `Dimethyl@K`:" + "Multiple modifications at a single site are supported; in the following example, `K3` contains both `GlyGly@K` and `Dimethyl@K`:" ] }, { @@ -1323,9 +1392,14 @@ "source": [ "### Isotope distribution\n", "\n", - "`alphabase.constants.isotope.IsotopeDistribution` will calculate the isotope distribution and the mono-isotopic idx in the distribution for a given atom composition. \n", - "\n", - "What is the mono-isotopic idx (mono_idx)? For an atom, the `mono_idx` points to the highest abundance isotope, so the value is `round(mass of highest isotope - mass of first isotope)`." + "`alphabase.constants.isotope.IsotopeDistribution` will calculate the isotope distribution and the mono-isotopic idx in the distribution for a given atom composition. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For an atom, the mono-isotopic idx (`mono_idx`) points to the highest abundance isotope, so the value is `round(mass of highest isotope - mass of first isotope)`." ] }, { @@ -1490,7 +1564,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For example, `Fe`'s `mono_idx` is 2," + "For example, `Fe`'s `mono_idx` is 2 (mass from 53.94 to 55.93), " ] }, { @@ -1586,7 +1660,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `mono_idx` for most of the atom compositions is 0, no matter how big the compositions are." 
+ "For unlabeled compositions built only from atoms whose own `mono_idx` is 0 (unlike `Fe` above), the `mono_idx` is always 0, no matter how big the compositions are. 
This means the `mono` isotope is not necessarily the `highest` isotope peak, especially when the composition gets larger. Here are three examples, from a small composition to larger ones; we can see that the index of the highest peak moves from 0 to 2." ] }, { @@ -1597,7 +1671,10 @@ { "data": { "text/plain": [ - "[('C', 100), ('H', 100), ('O', 50), ('Na', 1)]" + "('mono=0, highest=0',\n", + " array([5.53058051e-01, 3.06480210e-01, 1.06031073e-01, 2.73885413e-02,\n", + " 5.79597328e-03, 1.05055134e-03, 1.67897345e-04, 2.41173838e-05,\n", + " 3.15729577e-06, 3.80635657e-07]))" ] }, "execution_count": 23, @@ -1609,49 +1686,69 @@ "from alphabase.constants.isotope import IsotopeDistribution, parse_formula\n", "iso = IsotopeDistribution()\n", "\n", - "formula = 'C(100)H(100)O(50)Na(1)'\n", + "formula = 'C(50)H(50)O(20)Na(1)'\n", "formula = parse_formula(formula)\n", - "formula" + "dist, mono = iso.calc_formula_distribution(formula)\n", + "f\"mono={mono}, highest={dist.argmax()}\", dist" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 24, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('mono=0, highest=1',\n", + " array([3.21124792e-01, 3.53459703e-01, 2.05844502e-01, 8.38383715e-02,\n", + " 2.66913129e-02, 7.04911613e-03, 1.60206285e-03, 3.21190201e-04,\n", + " 5.78218885e-05, 9.47198919e-06]))" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "> `mono` isotope is not the `highest` isotope!!!" 
+ "formula = 'C(100)H(100)O(20)Na(1)'\n", + "formula = parse_formula(formula)\n", + "dist, mono = iso.calc_formula_distribution(formula)\n", + "f\"mono={mono}, highest={dist.argmax()}\", dist" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(0,\n", - " 1,\n", - " array([2.98521241e-01, 3.31991573e-01, 2.13532938e-01, 1.00604878e-01,\n", - " 3.82856126e-02, 1.23872292e-02, 3.51773755e-03, 8.95830236e-04,\n", - " 2.07763024e-04, 4.43944472e-05]))" + "('mono=0, highest=2',\n", + " array([0.10312113, 0.22700935, 0.25713731, 0.19936063, 0.11878142,\n", + " 0.05791123, 0.02402947, 0.00871637, 0.00281814, 0.00082412]))" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "formula = 'C(200)H(200)O(40)Na(1)'\n", + "formula = parse_formula(formula)\n", "dist, mono = iso.calc_formula_distribution(formula)\n", - "mono, dist.argmax(), dist" + "f\"mono={mono}, highest={dist.argmax()}\", dist" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, - "source": [ - "All these low-level functionalities have been integrated into DataFrame functionalities, see `tutorial_dev_dataframes.ipynb` or `Tutorial for Dev: Peptide and Fragment DataFrames`" - ] + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/tutorials/dataframe_structures.ipynb b/docs/tutorials/tutorial_dataframe_structures.ipynb similarity index 99% rename from docs/tutorials/dataframe_structures.ipynb rename to docs/tutorials/tutorial_dataframe_structures.ipynb index f95a50a..1234557 100644 --- a/docs/tutorials/dataframe_structures.ipynb +++ b/docs/tutorials/tutorial_dataframe_structures.ipynb @@ -6,7 +6,7 @@ "source": [ "# Tutorial: Peptide and Fragment DataFrames\n", "\n", - "We use dataframe, a tabular-like data structure. 
" + "We use dataframes, a table-like data structure, to represent peptides and fragments." ] }, { @@ -134,12 +134,12 @@ "source": [ "## Fragment DataFrame\n", "\n", - "Fragment is also orginized in dataframe structure. The column names of the dataframe represent the fragment type, wich schema `Type[_LossType]_z[n]`, where:\n", + "Fragments are also organized in a dataframe structure. The column names of the dataframe represent the fragment type, with the schema `Type[_LossType]_zn`, where:\n", " - `Type` can be `b,y,c,z`\n", " - `_LossType` can be `_modloss`, `_H2O`, or `_NH3`, this is optional.\n", - " - `z[n]` is the charge state. If precursor charge is less than `n`.\n", + " - `zn` is the charge state, for example `z1`.\n", "\n", - "For example:" + "Here are some examples:" ] }, { diff --git a/docs/tutorials/spectral_libraries.ipynb b/docs/tutorials/tutorial_spectral_libraries.ipynb similarity index 100% rename from docs/tutorials/spectral_libraries.ipynb rename to docs/tutorials/tutorial_spectral_libraries.ipynb