diff --git a/docs/tutorials/tutorial_basic_definations.ipynb b/docs/tutorials/tutorial_basic_definitions.ipynb similarity index 100% rename from docs/tutorials/tutorial_basic_definations.ipynb rename to docs/tutorials/tutorial_basic_definitions.ipynb diff --git a/docs/tutorials/tutorial_dataframe_structures.ipynb b/docs/tutorials/tutorial_dataframe_structures.ipynb index 1234557..ac5f5fa 100644 --- a/docs/tutorials/tutorial_dataframe_structures.ipynb +++ b/docs/tutorials/tutorial_dataframe_structures.ipynb @@ -6,7 +6,7 @@ "source": [ "# Tutorial: Peptide and Fragment DataFrames\n", "\n", - "We use dataframe, a tabular-like data structure to represent peptides and fragments." + "We use dataframe, a tabular-like data structure to represent peptides and fragments. The dataframe structure is easy to read from human's perspective, and efficient for input and output from machine's perspective." ] }, { @@ -144,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -673,7 +673,7 @@ "33 175.118958 157.108383 0.000000 159.100235 " ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -703,7 +703,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -732,14 +732,6 @@ " mod_sites\n", " charge\n", " nAA\n", - " precursor_mz\n", - " i_0\n", - " i_1\n", - " i_2\n", - " i_3\n", - " i_4\n", - " i_5\n", - " mono_isotope_idx\n", " frag_start_idx\n", " frag_stop_idx\n", " \n", @@ -752,14 +744,6 @@ " 2\n", " 1\n", " 8\n", - " 1019.461492\n", - " 0.544890\n", - " 0.294208\n", - " 0.116900\n", - " 0.034340\n", - " 0.008077\n", - " 0.001584\n", - " 0\n", " 0\n", " 7\n", " \n", @@ -770,14 +754,6 @@ " \n", " 2\n", " 9\n", - " 532.757692\n", - " 0.527839\n", - " 0.300826\n", - " 0.123018\n", - " 0.037359\n", - " 0.009104\n", - " 0.001854\n", - " 0\n", " 7\n", " 15\n", " \n", @@ -788,14 +764,6 @@ " 3;6\n", " 3\n", " 
20\n", - " 808.337166\n", - " 0.271028\n", - " 0.323775\n", - " 0.225641\n", - " 0.115441\n", - " 0.047553\n", - " 0.016561\n", - " 0\n", " 15\n", " 34\n", " \n", @@ -809,18 +777,13 @@ "1 APDEFMNIK 2 9 \n", "2 WDSEFMNTIRAAAAKDDDDR Phospho@S;Oxidation@M 3;6 3 20 \n", "\n", - " precursor_mz i_0 i_1 i_2 i_3 i_4 i_5 \\\n", - "0 1019.461492 0.544890 0.294208 0.116900 0.034340 0.008077 0.001584 \n", - "1 532.757692 0.527839 0.300826 0.123018 0.037359 0.009104 0.001854 \n", - "2 808.337166 0.271028 0.323775 0.225641 0.115441 0.047553 0.016561 \n", - "\n", - " mono_isotope_idx frag_start_idx frag_stop_idx \n", - "0 0 0 7 \n", - "1 0 7 15 \n", - "2 0 15 34 " + " frag_start_idx frag_stop_idx \n", + "0 0 7 \n", + "1 7 15 \n", + "2 15 34 " ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -831,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -1143,7 +1106,7 @@ "33 175.118958 157.108383 0.000000 159.100235 " ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -1154,6 +1117,324 @@ "stop = df['frag_stop_idx'].values[pep_id]\n", "frag_mz_df.iloc[start:stop]" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Relying on several fragment dataframes (e.g., m/z and intensity dataframes) may not be convenient in some situations, especially when we need to operate on subsets of the dataframes. Therefore, alphabase also provides a flattened fragment dataframe structure to store all fragment information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from alphabase.peptide.fragment import flatten_fragments\n", + "import numpy as np\n", + "\n", + "precursor_df, flat_frag_df = flatten_fragments(\n", + " precursor_df=df, \n", + " fragment_mz_df=frag_mz_df, \n", + " fragment_intensity_df=pd.DataFrame(\n", + " np.zeros_like(frag_mz_df.values),\n", + " columns=frag_mz_df.columns\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencemodsmod_siteschargenAAfrag_start_idxfrag_stop_idxflat_frag_start_idxflat_frag_stop_idx
0ACDEFHIKCarbamidomethyl@C21807049
1APDEFMNIK2971549113
2WDSEFMNTIRAAAAKDDDDRPhospho@S;Oxidation@M3;63201534113267
\n", + "
" + ], + "text/plain": [ + " sequence mods mod_sites charge nAA \\\n", + "0 ACDEFHIK Carbamidomethyl@C 2 1 8 \n", + "1 APDEFMNIK 2 9 \n", + "2 WDSEFMNTIRAAAAKDDDDR Phospho@S;Oxidation@M 3;6 3 20 \n", + "\n", + " frag_start_idx frag_stop_idx flat_frag_start_idx flat_frag_stop_idx \n", + "0 0 7 0 49 \n", + "1 7 15 49 113 \n", + "2 15 34 113 267 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precursor_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mzintensitytypeloss_typechargenumberposition
044.0494770.0970110
172.0443880.0980110
289.0709380.0990110
3974.4036250.01200170
4948.4243770.01210170
........................
2621124.9462890.098021918
263201.0982210.012001118
264175.1189580.012101118
265157.1083830.0121181118
266159.1002350.012201118
\n", + "

267 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " mz intensity type loss_type charge number position\n", + "0 44.049477 0.0 97 0 1 1 0\n", + "1 72.044388 0.0 98 0 1 1 0\n", + "2 89.070938 0.0 99 0 1 1 0\n", + "3 974.403625 0.0 120 0 1 7 0\n", + "4 948.424377 0.0 121 0 1 7 0\n", + ".. ... ... ... ... ... ... ...\n", + "262 1124.946289 0.0 98 0 2 19 18\n", + "263 201.098221 0.0 120 0 1 1 18\n", + "264 175.118958 0.0 121 0 1 1 18\n", + "265 157.108383 0.0 121 18 1 1 18\n", + "266 159.100235 0.0 122 0 1 1 18\n", + "\n", + "[267 rows x 7 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flat_frag_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The flattened fragment dataframe contains `mz`, `intensity`, `type`, `loss_type`, `charge`, `number`, and `position` columns; other columns can be flexibly added. All columns are converted to numeric values for better processing with the numpy and numba packages. For instance, `type` is the ASCII code of the `abc/xyz` ion type: `a`=97, `b`=98, `c`=99, `x`=120, `y`=121, and `z`=122. Losses are also converted to numbers; therefore, water loss becomes `18`, and phospho loss becomes `98`. \n", + "\n", + "Similar to `frag_start_idx` and `frag_stop_idx`, we use `flat_frag_start_idx` and `flat_frag_stop_idx` to keep the connection between the precursor dataframe and the flattened fragment dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {