diff --git a/docs/tutorials/tutorial_basic_definations.ipynb b/docs/tutorials/tutorial_basic_definitions.ipynb
similarity index 100%
rename from docs/tutorials/tutorial_basic_definations.ipynb
rename to docs/tutorials/tutorial_basic_definitions.ipynb
diff --git a/docs/tutorials/tutorial_dataframe_structures.ipynb b/docs/tutorials/tutorial_dataframe_structures.ipynb
index 1234557..ac5f5fa 100644
--- a/docs/tutorials/tutorial_dataframe_structures.ipynb
+++ b/docs/tutorials/tutorial_dataframe_structures.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Tutorial: Peptide and Fragment DataFrames\n",
"\n",
- "We use dataframe, a tabular-like data structure to represent peptides and fragments."
+ "We use dataframe, a tabular-like data structure to represent peptides and fragments. The dataframe structure is easy to read from human's perspective, and efficient for input and output from machine's perspective."
]
},
{
@@ -144,7 +144,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -673,7 +673,7 @@
"33 175.118958 157.108383 0.000000 159.100235 "
]
},
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -703,7 +703,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -732,14 +732,6 @@
"
mod_sites | \n",
" charge | \n",
" nAA | \n",
- " precursor_mz | \n",
- " i_0 | \n",
- " i_1 | \n",
- " i_2 | \n",
- " i_3 | \n",
- " i_4 | \n",
- " i_5 | \n",
- " mono_isotope_idx | \n",
" frag_start_idx | \n",
" frag_stop_idx | \n",
" \n",
@@ -752,14 +744,6 @@
" 2 | \n",
" 1 | \n",
" 8 | \n",
- " 1019.461492 | \n",
- " 0.544890 | \n",
- " 0.294208 | \n",
- " 0.116900 | \n",
- " 0.034340 | \n",
- " 0.008077 | \n",
- " 0.001584 | \n",
- " 0 | \n",
" 0 | \n",
" 7 | \n",
" \n",
@@ -770,14 +754,6 @@
" | \n",
" 2 | \n",
" 9 | \n",
- " 532.757692 | \n",
- " 0.527839 | \n",
- " 0.300826 | \n",
- " 0.123018 | \n",
- " 0.037359 | \n",
- " 0.009104 | \n",
- " 0.001854 | \n",
- " 0 | \n",
" 7 | \n",
" 15 | \n",
" \n",
@@ -788,14 +764,6 @@
" 3;6 | \n",
" 3 | \n",
" 20 | \n",
- " 808.337166 | \n",
- " 0.271028 | \n",
- " 0.323775 | \n",
- " 0.225641 | \n",
- " 0.115441 | \n",
- " 0.047553 | \n",
- " 0.016561 | \n",
- " 0 | \n",
" 15 | \n",
" 34 | \n",
" \n",
@@ -809,18 +777,13 @@
"1 APDEFMNIK 2 9 \n",
"2 WDSEFMNTIRAAAAKDDDDR Phospho@S;Oxidation@M 3;6 3 20 \n",
"\n",
- " precursor_mz i_0 i_1 i_2 i_3 i_4 i_5 \\\n",
- "0 1019.461492 0.544890 0.294208 0.116900 0.034340 0.008077 0.001584 \n",
- "1 532.757692 0.527839 0.300826 0.123018 0.037359 0.009104 0.001854 \n",
- "2 808.337166 0.271028 0.323775 0.225641 0.115441 0.047553 0.016561 \n",
- "\n",
- " mono_isotope_idx frag_start_idx frag_stop_idx \n",
- "0 0 0 7 \n",
- "1 0 7 15 \n",
- "2 0 15 34 "
+ " frag_start_idx frag_stop_idx \n",
+ "0 0 7 \n",
+ "1 7 15 \n",
+ "2 15 34 "
]
},
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -831,7 +794,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -1143,7 +1106,7 @@
"33 175.118958 157.108383 0.000000 159.100235 "
]
},
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -1154,6 +1117,324 @@
"stop = df['frag_stop_idx'].values[pep_id]\n",
"frag_mz_df.iloc[start:stop]"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Using on several fragment dataframes (e.g., m/z and intensity dataframes) may be not convinient in some situations, especially when we need to operate subsets of the dataframes. Therefore, alphabase also provides a flattened fragment dataframe strucutre to store all fragment information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from alphabase.peptide.fragment import flatten_fragments\n",
+ "import numpy as np\n",
+ "\n",
+ "precursor_df, flat_frag_df = flatten_fragments(\n",
+ " precursor_df=df, \n",
+ " fragment_mz_df=frag_mz_df, \n",
+ " fragment_intensity_df=pd.DataFrame(\n",
+ " np.zeros_like(frag_mz_df.values),\n",
+ " columns=frag_mz_df.columns\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sequence | \n",
+ " mods | \n",
+ " mod_sites | \n",
+ " charge | \n",
+ " nAA | \n",
+ " frag_start_idx | \n",
+ " frag_stop_idx | \n",
+ " flat_frag_start_idx | \n",
+ " flat_frag_stop_idx | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ACDEFHIK | \n",
+ " Carbamidomethyl@C | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " APDEFMNIK | \n",
+ " | \n",
+ " | \n",
+ " 2 | \n",
+ " 9 | \n",
+ " 7 | \n",
+ " 15 | \n",
+ " 49 | \n",
+ " 113 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " WDSEFMNTIRAAAAKDDDDR | \n",
+ " Phospho@S;Oxidation@M | \n",
+ " 3;6 | \n",
+ " 3 | \n",
+ " 20 | \n",
+ " 15 | \n",
+ " 34 | \n",
+ " 113 | \n",
+ " 267 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sequence mods mod_sites charge nAA \\\n",
+ "0 ACDEFHIK Carbamidomethyl@C 2 1 8 \n",
+ "1 APDEFMNIK 2 9 \n",
+ "2 WDSEFMNTIRAAAAKDDDDR Phospho@S;Oxidation@M 3;6 3 20 \n",
+ "\n",
+ " frag_start_idx frag_stop_idx flat_frag_start_idx flat_frag_stop_idx \n",
+ "0 0 7 0 49 \n",
+ "1 7 15 49 113 \n",
+ "2 15 34 113 267 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "precursor_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mz | \n",
+ " intensity | \n",
+ " type | \n",
+ " loss_type | \n",
+ " charge | \n",
+ " number | \n",
+ " position | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 44.049477 | \n",
+ " 0.0 | \n",
+ " 97 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 72.044388 | \n",
+ " 0.0 | \n",
+ " 98 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 89.070938 | \n",
+ " 0.0 | \n",
+ " 99 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 974.403625 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 948.424377 | \n",
+ " 0.0 | \n",
+ " 121 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 262 | \n",
+ " 1124.946289 | \n",
+ " 0.0 | \n",
+ " 98 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 19 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 263 | \n",
+ " 201.098221 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 264 | \n",
+ " 175.118958 | \n",
+ " 0.0 | \n",
+ " 121 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 265 | \n",
+ " 157.108383 | \n",
+ " 0.0 | \n",
+ " 121 | \n",
+ " 18 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 266 | \n",
+ " 159.100235 | \n",
+ " 0.0 | \n",
+ " 122 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
267 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mz intensity type loss_type charge number position\n",
+ "0 44.049477 0.0 97 0 1 1 0\n",
+ "1 72.044388 0.0 98 0 1 1 0\n",
+ "2 89.070938 0.0 99 0 1 1 0\n",
+ "3 974.403625 0.0 120 0 1 7 0\n",
+ "4 948.424377 0.0 121 0 1 7 0\n",
+ ".. ... ... ... ... ... ... ...\n",
+ "262 1124.946289 0.0 98 0 2 19 18\n",
+ "263 201.098221 0.0 120 0 1 1 18\n",
+ "264 175.118958 0.0 121 0 1 1 18\n",
+ "265 157.108383 0.0 121 18 1 1 18\n",
+ "266 159.100235 0.0 122 0 1 1 18\n",
+ "\n",
+ "[267 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "flat_frag_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For the flattened fragment dataframe, it contains `mz`, `intensity`, `type`, `loss_type`, `charge`, `number`, and `position` columns, other columns can be flexibly added. All columns are converted to numeric values for better processing in numpy and numba package. For instance , `type` is the ASCII code of `abc/xyz` ions, `a`=97, `b`=98, `c`=99, `x`=120, `y`=121, and `z`=122. Losses are also converted to numbers as well, therefore, Water loss becomes `18`, and phospho loss becomes `98`. \n",
+ "\n",
+ "And similar to `frag_start_idx` and `frag_stop_idx`, we use `flat_frag_start_idx` and `flat_frag_stop_idx` to keep the connection between the precursor dataframe and the flattened fragment dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {