From 8bd12ad160d2bce4bcba6cb6ad78a4e57bcc585a Mon Sep 17 00:00:00 2001 From: Colin Wood <68213641+colinvwood@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:05:05 -0700 Subject: [PATCH] Add jupyter notebooks for qPCR figures, culturing figures, qPCR limit of detection and quantification (#6) * jupyter notebooks for qPCR figures, culturing figures, qPCR limit of detection/quantification analysis * dependencies * move notebooks to their own directory * no longer duplicate rows in metadata * metadata changes * increase font sizes, other small changes --- ...-manuscript-figures-qiime2-tiny-2024.5.yml | 14 +- .../notebooks/culturing-plots.ipynb | 494 +++++++++++++ .../notebooks/lod-analysis.ipynb | 158 ++++ .../notebooks/qcpr-plots.ipynb | 672 ++++++++++++++++++ 4 files changed, 1333 insertions(+), 5 deletions(-) create mode 100644 gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb create mode 100644 gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb create mode 100644 gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb diff --git a/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml b/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml index 822342e..dcf220f 100644 --- a/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml +++ b/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml @@ -1,11 +1,15 @@ channels: -- https://packages.qiime2.org/qiime2/2024.5/tiny/passed -- https://packages.qiime2.org/qiime2/2024.5/amplicon/passed -- conda-forge -- bioconda + - https://packages.qiime2.org/qiime2/2024.5/tiny/passed + - https://packages.qiime2.org/qiime2/2024.5/amplicon/passed + - conda-forge + - bioconda dependencies: - qiime2-tiny - q2-diversity - pip - pip: - - gut_to_soil_manuscript_figures@git+https://github.com/caporaso-lab/gut-to-soil-manuscript-figures.git + - altair + - vl-convert-python + - openpyxl + - xlrd + - 
gut_to_soil_manuscript_figures@git+https://github.com/caporaso-lab/gut-to-soil-manuscript-figures.git
diff --git a/gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb b/gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb
new file mode 100644
index 0000000..3b272dd
--- /dev/null
+++ b/gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb
@@ -0,0 +1,494 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "443093ee-9c82-4bfe-a463-77c63ccdfda4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import statistics\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import altair as alt\n",
+ "import vl_convert as vlc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "dc7b93df-b2c0-4961-b19d-0efeab002305",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE METADATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "md_fp = ''\n",
+ "md = pd.read_csv(md_fp, sep='\\t', dtype={'sample-id': str})\n",
+ "\n",
+ "# remove comment lines\n",
+ "md = md[~ md['sample-id'].str.startswith('#')]\n",
+ "\n",
+ "md = md.rename({'sample-id': 'sample_id'}, axis=1)\n",
+ "\n",
+ "md = md[['sample_id', 'Bucket', 'Composting Time Point', 'SampleType']]\n",
+ "\n",
+ "# drop nan & week-0 rows of HEC\n",
+ "# NOTE(review): original compared 'Composting Time Point' to the SampleType\n",
+ "# string (always False, so the filter was a no-op) and relied on\n",
+ "# `isna() | col > 0`, which parses as `(isna() | col) > 0` because `|`\n",
+ "# binds tighter than `>`; fixed to match the comment's intent.\n",
+ "md = md[~(\n",
+ "    (md['SampleType'] == 'Human Excrement Compost') &\n",
+ "    (md['Composting Time Point'].isna() | (md['Composting Time Point'] == 0))\n",
+ ")]\n",
+ "\n",
+ "# keep only HEC observations\n",
+ "md = md[md['SampleType'] == 'Human Excrement Compost']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "2aa626c2-d63d-4b1a-b50d-a7137da07260",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# RECAST WEEK AND BUCKET COLUMNS TO INTEGER\n",
+ "\n",
+ "md['Bucket'] = md['Bucket'].astype('Int64')\n",
+ "md['Composting Time Point'] = md['Composting Time Point'].astype('Int64')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": 
"e9093958-04d4-4233-bdb2-ed0f3f81c542", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idBucketPreliminary analysis of presenceResults from preliminary analysisSerial Dilution 0.1Serial Dilution 0.01Serial Dilution 0.001Serial Dilution 0.0001MPN/gLower 95% Confidence Limit (MPN/g)Upper 95% Confidence Limit (MPN/g)Lower 95% Confidence Limit (MPN/g) .1Upper 95% Confidence Limit (MPN/g).1Composting Time PointSampleTypeDate
0772b5c411(+/+)Y(+/+/+/+)(+/+/+/+)(-/-/-/-)(-/-/-/-)239.79163.784849.37238.676964.6721Compost Post-Roll2021-10-04
1e706dd011(+/+)Y(+/+/+/+)(+/+/+/+)(-/-/-/-)(-/-/-/-)239.79163.784849.37238.676964.6723Compost Post-Roll2021-10-13
2600569a71(+/+)Y(+/+/+/-)(-/-/-/-)(-/-/-/-)(-/-/-/-)11.4511.72429.6992.19143.7445Compost Post-Roll2021-10-28
3b9f1a9d71(+/-)Y(-/-/-/-)(-/-/-/-)(-/-/-/-)(-/-/-/-)000007Compost Post-Roll2021-11-11
43983043e1(+/+)Y(+/+/-/-)(-/-/-/-)(-/-/-/-)(-/-/-/-)6.0610.38616.260.65326.5979Compost Post-Roll2021-11-24
\n", + "
" + ], + "text/plain": [ + " sample_id Bucket Preliminary analysis of presence \\\n", + "0 772b5c41 1 (+/+) \n", + "1 e706dd01 1 (+/+) \n", + "2 600569a7 1 (+/+) \n", + "3 b9f1a9d7 1 (+/-) \n", + "4 3983043e 1 (+/+) \n", + "\n", + " Results from preliminary analysis Serial Dilution 0.1 Serial Dilution 0.01 \\\n", + "0 Y (+/+/+/+) (+/+/+/+) \n", + "1 Y (+/+/+/+) (+/+/+/+) \n", + "2 Y (+/+/+/-) (-/-/-/-) \n", + "3 Y (-/-/-/-) (-/-/-/-) \n", + "4 Y (+/+/-/-) (-/-/-/-) \n", + "\n", + " Serial Dilution 0.001 Serial Dilution 0.0001 MPN/g \\\n", + "0 (-/-/-/-) (-/-/-/-) 239.791 \n", + "1 (-/-/-/-) (-/-/-/-) 239.791 \n", + "2 (-/-/-/-) (-/-/-/-) 11.451 \n", + "3 (-/-/-/-) (-/-/-/-) 0 \n", + "4 (-/-/-/-) (-/-/-/-) 6.061 \n", + "\n", + " Lower 95% Confidence Limit (MPN/g) Upper 95% Confidence Limit (MPN/g) \\\n", + "0 63.784 849.372 \n", + "1 63.784 849.372 \n", + "2 1.724 29.699 \n", + "3 0 0 \n", + "4 0.386 16.26 \n", + "\n", + " Lower 95% Confidence Limit (MPN/g) .1 Upper 95% Confidence Limit (MPN/g).1 \\\n", + "0 38.676 964.672 \n", + "1 38.676 964.672 \n", + "2 2.191 43.744 \n", + "3 0 0 \n", + "4 0.653 26.597 \n", + "\n", + " Composting Time Point SampleType Date \n", + "0 1 Compost Post-Roll 2021-10-04 \n", + "1 3 Compost Post-Roll 2021-10-13 \n", + "2 5 Compost Post-Roll 2021-10-28 \n", + "3 7 Compost Post-Roll 2021-11-11 \n", + "4 9 Compost Post-Roll 2021-11-24 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# IMPORT CULUTURING DATA\n", + "\n", + "# NOTE: change me\n", + "culturing_fp = ''\n", + "culture = pd.read_csv(culturing_fp, sep='\\t', skiprows=[1])\n", + "\n", + "# remove comment lines\n", + "culture = culture[~ culture['sample-id'].str.startswith('#')]\n", + "\n", + "culture = culture.rename({'sample-id': 'sample_id'}, axis='columns')\n", + "\n", + "culture.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b88b1884-df2b-4659-af93-68d6a01d1773", + "metadata": {}, + 
"outputs": [], + "source": [ + "# SUBSET CULTURING DATA\n", + "\n", + "culture = culture[['sample_id', 'MPN/g']]\n", + "culture = culture.astype({'MPN/g': float})" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ff11aa18-2c6c-4afb-a454-f0a6d4a6b9ad", + "metadata": {}, + "outputs": [], + "source": [ + "# DROP NA CULTURING VALUES\n", + "\n", + "culture = culture[culture['MPN/g'].notna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "762d31d2-040f-44f0-b87a-c2486f2dee10", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 384 entries, 0 to 383\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sample_id 384 non-null object \n", + " 1 MPN/g 384 non-null float64\n", + " 2 Bucket 384 non-null Int64 \n", + " 3 Composting Time Point 384 non-null Int64 \n", + " 4 SampleType 384 non-null object \n", + "dtypes: Int64(2), float64(1), object(2)\n", + "memory usage: 15.9+ KB\n" + ] + } + ], + "source": [ + "# MERGE CULTURING DATA, METADATA\n", + "\n", + "culture_md = culture.merge(md, on='sample_id', how='inner')\n", + "\n", + "culture_md.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9e54c8fd-4ced-4987-8082-d739067d8358", + "metadata": {}, + "outputs": [], + "source": [ + "# CALCULATE PER-BUCKET MOVING AVERAGES\n", + "\n", + "# sort by week for moving average calculation\n", + "culture_md = culture_md.sort_values('Composting Time Point')\n", + "\n", + "# averaging period\n", + "window = 4\n", + "\n", + "culture_md['MPN/g Moving Average'] = (\n", + " culture_md.groupby(['Bucket', 'SampleType'])['MPN/g']\n", + " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1f106022-e861-479b-8ce2-f0264c0bf4dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + 
"\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "line = alt.Chart(culture_md).mark_line().encode(\n", + " x=alt.X('Composting Time Point').scale(domain=(0, 52), nice=False),\n", + " y=alt.Y('MPN/g').scale(type='linear').title('MPN/g'),\n", + ").transform_filter(\n", + " (alt.datum['SampleType'] == 'Human Excrement Compost')\n", + ").facet(\n", + " facet='Bucket',\n", + " columns=5,\n", + " title=alt.Title(\n", + " 'Culturing MPN/g, By Bucket'\n", + " )\n", + ").resolve_scale(\n", + " y='independent',\n", + " x='independent'\n", + ").configure_axis(\n", + " labelFontSize=15,\n", + " titleFontSize=15,\n", + ").configure_axisY(\n", + " format='e'\n", + ").configure_header(\n", + " titleFontSize=16,\n", + " labelFontSize=15\n", + ").configure_title(\n", + " fontSize=20\n", + ")\n", + "\n", + "line" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abe139a1-39d6-4710-8c15-44736c4fe7e8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb b/gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb new file mode 100644 index 0000000..0b72210 --- /dev/null +++ b/gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import re" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# PARSE QPCR DATA, which exist in multiple sheets\n", + "\n", + "# NOTE: change me\n", + "ecoli_cperf_fp = ''\n", + "\n", + "sheets = pd.read_excel(ecoli_cperf_fp, sheet_name=None)\n", + "df = pd.DataFrame()\n", + "for sheet_name, sheet in sheets.items():\n", + " df = pd.concat([df, sheet], axis=0)\n", + "\n", + "df['Sample Name'] = df['Sample Name'].astype(str)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[['Target Name', 'Sample Name', 'CT', 'Ct Mean', 'Quantity', 'Quantity Mean', 'Quantity SD']]\n", + "\n", + "# include only standards\n", + "df = df[df['Sample Name'].str.contains('Gblock')]\n", + "\n", + "# split concentration from sample name\n", + "df[['sample-name', 'concentration']] = df['Sample Name'].str.split('CTL', expand=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# extract standard concentrations from cells\n", + "\n", + "# 3.041 or 3.041E1, 3.041E2, etc.\n", + "conc_pattern = re.compile(r'3\\.041(E[0-9])?')\n", + "\n", + "def parse_conc(row):\n", + " match = conc_pattern.search(row['Sample Name'])\n", + " if not match:\n", + " match = conc_pattern.search(row['Target Name'])\n", + " if not match:\n", + " raise ValueError('concentration not found')\n", + "\n", + " row['Concentration'] = match.group()\n", + "\n", + " return row\n", + "\n", + "df = df.apply(parse_conc, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.041 0.558824\n", + "3.041E1 1.000000\n", + "3.041E2 1.000000\n", + "3.041E3 1.000000\n", + "3.041E4 1.000000\n", + "3.041E5 1.000000\n", + "3.041E6 1.000000\n", + "3.041E7 0.985294\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "# calculate proportion amplified per 
standard concentration\n", + "conc_grouped = df.groupby('Concentration')\n", + "proportions = pd.Series()\n", + "for conc, g_df in conc_grouped:\n", + " counts = g_df['CT'].value_counts(dropna=False)\n", + " assert np.nan not in counts\n", + "\n", + " if 'Undetermined' in counts:\n", + " undetermined = counts['Undetermined']\n", + " else:\n", + " undetermined = 0\n", + "\n", + " proportions[conc] = 1 - ( undetermined / counts.sum() )\n", + "\n", + "print(proportions)\n", + "\n", + "\n", + "# IMPORTANT: The limit of detection (LOD) is between 30.41 and 3.041 copies.\n", + "# We can detect 30.41 copies with 100% confidence, and\n", + "# 3.041 copies with ~55% confidence. The number of copies which\n", + "# we can detect with 95% confidence (LOD definition) thus lies\n", + "# between these two values.\n", + "#\n", + "# Separately, the limit of quantification (LOQ) was determined\n", + "# by Nate Stone and Megan Ruby to be 304.1 copies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb b/gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb new file mode 100644 index 0000000..c3e567b --- /dev/null +++ b/gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + 
"import math\n",
+ "import statistics\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import altair as alt\n",
+ "import vl_convert as vlc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE METADATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "md_fp = ''\n",
+ "md = pd.read_csv(md_fp, sep='\\t', dtype={'sample-id': str})\n",
+ "\n",
+ "# remove comment lines\n",
+ "md = md[~ md['sample-id'].str.startswith('#')]\n",
+ "\n",
+ "md = md.rename({'sample-id': 'sample_id'}, axis=1)\n",
+ "\n",
+ "md = md[['sample_id', 'Bucket', 'Composting Time Point', 'SampleType']]\n",
+ "\n",
+ "# drop nan & week-0 rows of HEC\n",
+ "# NOTE(review): original compared 'Composting Time Point' to the SampleType\n",
+ "# string (always False, so the filter was a no-op) and relied on\n",
+ "# `isna() | col > 0`, which parses as `(isna() | col) > 0` because `|`\n",
+ "# binds tighter than `>`; fixed to match the comment's intent.\n",
+ "md = md[~(\n",
+ "    (md['SampleType'] == 'Human Excrement Compost') &\n",
+ "    (md['Composting Time Point'].isna() | (md['Composting Time Point'] == 0))\n",
+ ")]\n",
+ "\n",
+ "# keep only HE and HEC observations\n",
+ "md = md[md['SampleType'].isin(['Human Excrement Compost', 'Human Excrement'])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE BACTQUANT DATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "bq_fp = ''\n",
+ "bq = pd.read_excel(\n",
+ "    bq_fp,\n",
+ "    sheet_name='MasterFile',\n",
+ "    dtype={'Sample ID': str}\n",
+ ")\n",
+ "bq = bq[['Sample ID', 'Copy #']]\n",
+ "bq = bq.rename(\n",
+ "    {'Sample ID': 'sample_id', 'Copy #': 'bq_copy_number'},\n",
+ "    axis=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE QPCR DATA, which exist in multiple sheets\n",
+ "\n",
+ "# NOTE: change me\n",
+ "ecoli_cperf_fp = ''\n",
+ "\n",
+ "sheets = pd.read_excel(ecoli_cperf_fp, sheet_name=None)\n",
+ "ecoli_cperf = pd.DataFrame()\n",
+ "for sheet_name, sheet in sheets.items():\n",
+ "    ecoli_cperf = pd.concat([ecoli_cperf, sheet], axis=0)\n",
+ "\n",
+ "ecoli_cperf['Sample Name'] = ecoli_cperf['Sample Name'].astype(str)\n",
+ "\n",
+ 
"ecoli_cperf = ecoli_cperf[['Sample Name', 'Target Name', 'Quantity Mean', 'Quantity SD']]\n", + "ecoli_cperf = ecoli_cperf.rename(\n", + " {\n", + " 'Sample Name': 'sample_id', 'Target Name': 'target_name',\n", + " 'Quantity Mean': 'quantity_mean', 'Quantity SD': 'quantity_sd'\n", + " },\n", + " axis=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# averaging already done in excel; keep only one row per triplicate set\n", + "ecoli_cperf = ecoli_cperf.groupby(['sample_id', 'target_name']).head(1)\n", + "\n", + "# replace missing quantities with 0 for convenience\n", + "# IMPORTANT: we do not know whether these quantifications were 0,\n", + "# only that there are less than the limit of detection\n", + "ecoli_cperf = ecoli_cperf.fillna({'quantity_mean': 0})\n", + "\n", + "# drop NTCs\n", + "ecoli_cperf = ecoli_cperf[ecoli_cperf.sample_id != 'NTC']" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "target_name\n", + "Cperf23S 1052\n", + "EcoliIUD 1052\n", + "EcoliIUD3.041E1 1\n", + "EcoliIUD3.041E2 1\n", + "EcoliIUD3.041E3 1\n", + "EcoliIUD3.041E4 1\n", + "EcoliIUD3.041E5 1\n", + "EcoliIUD3.041E6 1\n", + "EcoliIUD3.041E7 1\n", + "Cperf23S3.041 1\n", + "Cperf23S3.041E1 1\n", + "Cperf23S3.041E2 1\n", + "Cperf23S3.041E3 1\n", + "Cperf23S3.041E4 1\n", + "Cperf23S3.041E5 1\n", + "Cperf23S3.041E6 1\n", + "Cperf23S3.041E7 1\n", + "EcoliIUD3.041 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# TODO: looks like there were some data entry errors for the assay name\n", + "print( ecoli_cperf.target_name.value_counts() )\n", + "\n", + "# get rid of them for now\n", + "ecoli_cperf = ecoli_cperf[ ecoli_cperf.target_name.isin(['EcoliIUD', 'Cperf23S']) ]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# PIVOT ASSAY 
MEASUREMENTS TO COLUMNS\n", + "\n", + "ecoli_cperf = ecoli_cperf.pivot(index='sample_id', columns='target_name')\n", + "ecoli_cperf.columns = ecoli_cperf.columns.to_flat_index().map(lambda c: '_'.join(c))\n", + "ecoli_cperf.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# MERGE ECOLI/CPERF WITH METADATA\n", + "\n", + "ecoli_cperf_md = ecoli_cperf.merge(md, how='right', on='sample_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# ADD LIMIT OF QUANTIFICATION AND LIMIT OF DETECTION VALUES\n", + "\n", + "ecoli_cperf_md['loq'] = 302\n", + "ecoli_cperf_md['lod'] = 30.2" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# NORMALIZE ECOLI, CPERF QUANTIFICATIONS BY BACTQUANT\n", + "\n", + "merged = ecoli_cperf_md.merge(right=bq, on='sample_id', how='inner')\n", + "\n", + "merged['cperf_normalized'] = merged.quantity_mean_Cperf23S / merged.bq_copy_number\n", + "merged['ecoli_normalized'] = merged.quantity_mean_EcoliIUD / merged.bq_copy_number\n", + "\n", + "# normalize limit of quantification as well\n", + "merged['loq_normalized'] = merged.loq / merged.bq_copy_number\n", + "\n", + "ecoli_cperf_md = merged" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# RECAST WEEK AND BUCKET COLUMNS TO INTEGER\n", + "\n", + "ecoli_cperf_md['Bucket'] = ecoli_cperf_md['Bucket'].astype('Int64')\n", + "ecoli_cperf_md['Composting Time Point'] = ecoli_cperf_md['Composting Time Point'].astype('Int64')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 815 entries, 0 to 814\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- 
----- \n", + " 0 sample_id 815 non-null object \n", + " 1 quantity_mean_Cperf23S 812 non-null float64\n", + " 2 quantity_mean_EcoliIUD 812 non-null float64\n", + " 3 quantity_sd_Cperf23S 725 non-null float64\n", + " 4 quantity_sd_EcoliIUD 409 non-null float64\n", + " 5 Bucket 815 non-null Int64 \n", + " 6 Composting Time Point 770 non-null Int64 \n", + " 7 SampleType 815 non-null object \n", + " 8 loq 815 non-null int64 \n", + " 9 lod 815 non-null float64\n", + " 10 bq_copy_number 813 non-null float64\n", + " 11 cperf_normalized 810 non-null float64\n", + " 12 ecoli_normalized 810 non-null float64\n", + " 13 loq_normalized 813 non-null float64\n", + "dtypes: Int64(2), float64(9), int64(1), object(2)\n", + "memory usage: 90.9+ KB\n" + ] + } + ], + "source": [ + "# SUBSET TO BUCKETS OF INTEREST\n", + "\n", + "buckets_to_keep = list(range(1, 8)) + list(range(9, 17))\n", + "ecoli_cperf_md = ecoli_cperf_md[ecoli_cperf_md['Bucket'].isin(buckets_to_keep)]\n", + "\n", + "ecoli_cperf_md.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# RENAME QUANTITY COLUMNS\n", + "\n", + "ecoli_cperf_md = ecoli_cperf_md.rename(\n", + " {\n", + " 'quantity_mean_EcoliIUD': 'ecoli_quantity_mean',\n", + " 'quantity_mean_Cperf23S': 'cperf_quantity_mean'\n", + " },\n", + " axis=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# CALCULATE MOVING AVERAGES\n", + "\n", + "# sort by week for moving average calculation\n", + "ecoli_cperf_md = ecoli_cperf_md.sort_values('Composting Time Point')\n", + "\n", + "# averaging period\n", + "window = 4\n", + "\n", + "# standard moving averages\n", + "ecoli_cperf_md['ecoli_mov_avg'] = (\n", + " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['ecoli_quantity_mean']\n", + " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n", + ")\n", + "ecoli_cperf_md['cperf_mov_avg'] = (\n", + " 
ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['cperf_quantity_mean']\n", + " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n", + ")\n", + "\n", + "# bactquant-normalized moving averages\n", + "ecoli_cperf_md['ecoli_normalized_mov_avg'] = (\n", + " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['ecoli_normalized']\n", + " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n", + ")\n", + "ecoli_cperf_md['cperf_normalized_mov_avg'] = (\n", + " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['cperf_normalized']\n", + " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# ADD PSEUDO COUNTS FOR LOG SCALE\n", + "\n", + "def calculate_pseudo_count(df, column):\n", + " minimum = df[df[column] > 0][column].min()\n", + " pseudo_count_exp = math.floor(math.log10(minimum)) - 1\n", + " return 1 * 10**(pseudo_count_exp)\n", + "\n", + "replace = [\n", + " 'ecoli_mov_avg', 'ecoli_quantity_mean', 'cperf_mov_avg', 'cperf_quantity_mean',\n", + " 'ecoli_normalized_mov_avg', 'ecoli_normalized', 'cperf_normalized_mov_avg', 'cperf_normalized',\n", + "]\n", + "for replace_column in replace:\n", + " pseudo_count = calculate_pseudo_count(ecoli_cperf_md, replace_column)\n", + " ecoli_cperf_md.loc[:, replace_column] += pseudo_count" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# FILL FECAL SAMPLE TIME POINTS AS 0\n", + "\n", + "ecoli_cperf_md.loc[ecoli_cperf_md['SampleType'] == 'Human Excrement', 'Composting Time Point'] = \\\n", + " ecoli_cperf_md.loc[ecoli_cperf_md['SampleType'] == 'Human Excrement', 'Composting Time Point'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# RENAME qPCR COLUMNS FOR BETTER LEGIBILITY\n", + "renames = {\n", + " 'cperf_mov_avg': 'C 
perfringens Moving Average',\n", + " 'ecoli_mov_avg': 'E coli Moving Average',\n", + " 'cperf_normalized_mov_avg': 'C perfringens Moving Average (Normalized)',\n", + " 'ecoli_normalized_mov_avg': 'E coli Moving Average (Normalized)',\n", + " 'loq': 'Limit of Quantification',\n", + " 'loq_normalized': 'Limit of Quantification (Normalized)',\n", + " 'SampleType': 'Sample Type',\n", + "}\n", + "ecoli_cperf_md = ecoli_cperf_md.rename(renames, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOTE: change me\n", + "y_title = 'E coli Copy Number (Normalized)'\n", + "line_y_value = 'E coli Moving Average (Normalized)'\n", + "point_y_value = 'ecoli_normalized'\n", + "\n", + "pseudo_count = ecoli_cperf_md[line_y_value].min()\n", + "\n", + "line = alt.Chart(ecoli_cperf_md).transform_fold(\n", + " [line_y_value, 'Limit of Quantification (Normalized)'],\n", + " as_=['Line', 'Amount'],\n", + ").mark_line().encode(\n", + " x=alt.X('Composting Time Point').scale(domain=(0, 52), nice=False),\n", + " y=alt.Y('Amount', type='quantitative').scale(type='log').title(y_title),\n", + " color='Line:N',\n", + " strokeDash='Line:N',\n", + ").transform_filter(\n", + " (alt.datum['Sample Type'] == 'Human Excrement Compost')\n", + ")\n", + "\n", + "point = alt.Chart(ecoli_cperf_md).mark_point(\n", + " filled=True,\n", + " size=75,\n", + " color='black',\n", + " fillOpacity=0.85\n", + ").encode(\n", + " x=alt.X('Composting Time Point'),\n", + " y=alt.Y(point_y_value).scale(type='log'),\n", + " shape='Sample Type'\n", + ").transform_filter(\n", + " (alt.datum['Sample Type'] == 'Human Excrement')\n", + " #(alt.datum.ecoli_quantity_mean < 1_000_000)\n", + ")\n", + "\n", + "alt.layer(line, point).facet(\n", + " facet='Bucket',\n", + " columns=5,\n", + " title=alt.Title(\n", + " f'{y_title}, By Bucket',\n", + " subtitle=[\n", + " 'moving average, period of 4',\n", + " f'pseudo-count of {pseudo_count} added to all data points',\n", + " ]\n", + " )\n", + ").resolve_scale(\n", + " y='independent',\n", + " x='independent'\n", + ").configure_legend(\n", + " labelLimit=0,\n", + " titleFontSize=18,\n", + " labelFontSize=16,\n", + " orient='bottom',\n", + " padding=25\n", + ").configure_axis(\n", + " labelFontSize=15,\n", + " titleFontSize=15,\n", + ").configure_axisY(\n", + " format='e'\n", + ").configure_header(\n", + " 
titleFontSize=16,\n", + " labelFontSize=15\n", + ").configure_title(\n", + " fontSize=20,\n", + " subtitleFontSize=18\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# BACTQUANT ONLY PLOTS\n", + "\n", + "bq = alt.Chart(ecoli_cperf_md).mark_line().encode(\n", + " x=alt.X('Composting Time Point').scale(domain=(0, 52), nice=False),\n", + " y=alt.Y('bq_copy_number').title('BactQuant Copy Number').scale(type='log'),\n", + ").transform_filter(\n", + " (alt.datum['Sample Type'] == 'Human Excrement Compost')\n", + ").facet(\n", + " facet='Bucket',\n", + " columns=5,\n", + " title=alt.Title(\n", + " 'BactQuant Copy Number, By Bucket',\n", + " #subtitle=['moving average, period=4',]\n", + " )\n", + ").resolve_scale(\n", + " y='independent',\n", + " x='independent'\n", + ").configure_axis(\n", + " labelFontSize=15,\n", + " titleFontSize=15,\n", + ").configure_axisY(\n", + " format='e'\n", + ").configure_header(\n", + " titleFontSize=16,\n", + " labelFontSize=15\n", + ").configure_title(\n", + " fontSize=20\n", + ")\n", + "\n", + "bq" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}