From 8bd12ad160d2bce4bcba6cb6ad78a4e57bcc585a Mon Sep 17 00:00:00 2001
From: Colin Wood <68213641+colinvwood@users.noreply.github.com>
Date: Mon, 4 Nov 2024 15:05:05 -0700
Subject: [PATCH] Add jupyter notebooks for qPCR figures, culturing figures,
qPCR limit of detection and quantification (#6)
* jupyter notebooks for qPCR figures, culturing figures, qPCR limit of detection/quantification analysis
* dependencies
* move notebooks to their own directory
* no longer duplicate rows in metadata
* metadata changes
* increase font sizes, other small changes
---
...-manuscript-figures-qiime2-tiny-2024.5.yml | 14 +-
.../notebooks/culturing-plots.ipynb | 494 +++++++++++++
.../notebooks/lod-analysis.ipynb | 158 ++++
.../notebooks/qcpr-plots.ipynb | 672 ++++++++++++++++++
4 files changed, 1333 insertions(+), 5 deletions(-)
create mode 100644 gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb
create mode 100644 gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb
create mode 100644 gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb
diff --git a/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml b/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml
index 822342e..dcf220f 100644
--- a/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml
+++ b/environments/gut-to-soil-manuscript-figures-qiime2-tiny-2024.5.yml
@@ -1,11 +1,15 @@
channels:
-- https://packages.qiime2.org/qiime2/2024.5/tiny/passed
-- https://packages.qiime2.org/qiime2/2024.5/amplicon/passed
-- conda-forge
-- bioconda
+ - https://packages.qiime2.org/qiime2/2024.5/tiny/passed
+ - https://packages.qiime2.org/qiime2/2024.5/amplicon/passed
+ - conda-forge
+ - bioconda
dependencies:
- qiime2-tiny
- q2-diversity
- pip
- pip:
- - gut_to_soil_manuscript_figures@git+https://github.com/caporaso-lab/gut-to-soil-manuscript-figures.git
+ - altair
+ - vl-convert-python
+ - openpyxl
+ - xlrd
+ - gut_to_soil_manuscript_figures@git+https://github.com/caporaso-lab/gut-to-soil-manuscript-figures.git
diff --git a/gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb b/gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb
new file mode 100644
index 0000000..3b272dd
--- /dev/null
+++ b/gut_to_soil_manuscript_figures/notebooks/culturing-plots.ipynb
@@ -0,0 +1,494 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "443093ee-9c82-4bfe-a463-77c63ccdfda4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import statistics\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import altair as alt\n",
+ "import vl_convert as vlc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "dc7b93df-b2c0-4961-b19d-0efeab002305",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE METADATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "md_fp = ''\n",
+ "md = pd.read_csv(md_fp, sep='\\t', dtype={'sample-id': str})\n",
+ "\n",
+ "# remove comment lines\n",
+ "md = md[~ md['sample-id'].str.startswith('#')]\n",
+ "\n",
+ "md = md.rename({'sample-id': 'sample_id'}, axis=1)\n",
+ "\n",
+ "md = md[['sample_id', 'Bucket', 'Composting Time Point', 'SampleType']]\n",
+ "\n",
+ "# drop nan & week-0 rows of HEC\n",
+ "md = md[~(\n",
+ " (md['SampleType'] == 'Human Excrement Compost') &\n",
+ " (md['Composting Time Point'].isna() | (md['Composting Time Point'] == 0))\n",
+ ")]\n",
+ "\n",
+ "# keep only HEC observations\n",
+ "md = md[md['SampleType'] == 'Human Excrement Compost']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "2aa626c2-d63d-4b1a-b50d-a7137da07260",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# RECAST WEEK AND BUCKET COLUMNS TO INTEGER\n",
+ "\n",
+ "md['Bucket'] = md['Bucket'].astype('Int64')\n",
+ "md['Composting Time Point'] = md['Composting Time Point'].astype('Int64')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e9093958-04d4-4233-bdb2-ed0f3f81c542",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sample_id | \n",
+ " Bucket | \n",
+ " Preliminary analysis of presence | \n",
+ " Results from preliminary analysis | \n",
+ " Serial Dilution 0.1 | \n",
+ " Serial Dilution 0.01 | \n",
+ " Serial Dilution 0.001 | \n",
+ " Serial Dilution 0.0001 | \n",
+ " MPN/g | \n",
+ " Lower 95% Confidence Limit (MPN/g) | \n",
+ " Upper 95% Confidence Limit (MPN/g) | \n",
+ " Lower 95% Confidence Limit (MPN/g) .1 | \n",
+ " Upper 95% Confidence Limit (MPN/g).1 | \n",
+ " Composting Time Point | \n",
+ " SampleType | \n",
+ " Date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 772b5c41 | \n",
+ " 1 | \n",
+ " (+/+) | \n",
+ " Y | \n",
+ " (+/+/+/+) | \n",
+ " (+/+/+/+) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " 239.791 | \n",
+ " 63.784 | \n",
+ " 849.372 | \n",
+ " 38.676 | \n",
+ " 964.672 | \n",
+ " 1 | \n",
+ " Compost Post-Roll | \n",
+ " 2021-10-04 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " e706dd01 | \n",
+ " 1 | \n",
+ " (+/+) | \n",
+ " Y | \n",
+ " (+/+/+/+) | \n",
+ " (+/+/+/+) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " 239.791 | \n",
+ " 63.784 | \n",
+ " 849.372 | \n",
+ " 38.676 | \n",
+ " 964.672 | \n",
+ " 3 | \n",
+ " Compost Post-Roll | \n",
+ " 2021-10-13 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 600569a7 | \n",
+ " 1 | \n",
+ " (+/+) | \n",
+ " Y | \n",
+ " (+/+/+/-) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " 11.451 | \n",
+ " 1.724 | \n",
+ " 29.699 | \n",
+ " 2.191 | \n",
+ " 43.744 | \n",
+ " 5 | \n",
+ " Compost Post-Roll | \n",
+ " 2021-10-28 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " b9f1a9d7 | \n",
+ " 1 | \n",
+ " (+/-) | \n",
+ " Y | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " Compost Post-Roll | \n",
+ " 2021-11-11 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3983043e | \n",
+ " 1 | \n",
+ " (+/+) | \n",
+ " Y | \n",
+ " (+/+/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " (-/-/-/-) | \n",
+ " 6.061 | \n",
+ " 0.386 | \n",
+ " 16.26 | \n",
+ " 0.653 | \n",
+ " 26.597 | \n",
+ " 9 | \n",
+ " Compost Post-Roll | \n",
+ " 2021-11-24 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sample_id Bucket Preliminary analysis of presence \\\n",
+ "0 772b5c41 1 (+/+) \n",
+ "1 e706dd01 1 (+/+) \n",
+ "2 600569a7 1 (+/+) \n",
+ "3 b9f1a9d7 1 (+/-) \n",
+ "4 3983043e 1 (+/+) \n",
+ "\n",
+ " Results from preliminary analysis Serial Dilution 0.1 Serial Dilution 0.01 \\\n",
+ "0 Y (+/+/+/+) (+/+/+/+) \n",
+ "1 Y (+/+/+/+) (+/+/+/+) \n",
+ "2 Y (+/+/+/-) (-/-/-/-) \n",
+ "3 Y (-/-/-/-) (-/-/-/-) \n",
+ "4 Y (+/+/-/-) (-/-/-/-) \n",
+ "\n",
+ " Serial Dilution 0.001 Serial Dilution 0.0001 MPN/g \\\n",
+ "0 (-/-/-/-) (-/-/-/-) 239.791 \n",
+ "1 (-/-/-/-) (-/-/-/-) 239.791 \n",
+ "2 (-/-/-/-) (-/-/-/-) 11.451 \n",
+ "3 (-/-/-/-) (-/-/-/-) 0 \n",
+ "4 (-/-/-/-) (-/-/-/-) 6.061 \n",
+ "\n",
+ " Lower 95% Confidence Limit (MPN/g) Upper 95% Confidence Limit (MPN/g) \\\n",
+ "0 63.784 849.372 \n",
+ "1 63.784 849.372 \n",
+ "2 1.724 29.699 \n",
+ "3 0 0 \n",
+ "4 0.386 16.26 \n",
+ "\n",
+ " Lower 95% Confidence Limit (MPN/g) .1 Upper 95% Confidence Limit (MPN/g).1 \\\n",
+ "0 38.676 964.672 \n",
+ "1 38.676 964.672 \n",
+ "2 2.191 43.744 \n",
+ "3 0 0 \n",
+ "4 0.653 26.597 \n",
+ "\n",
+ " Composting Time Point SampleType Date \n",
+ "0 1 Compost Post-Roll 2021-10-04 \n",
+ "1 3 Compost Post-Roll 2021-10-13 \n",
+ "2 5 Compost Post-Roll 2021-10-28 \n",
+ "3 7 Compost Post-Roll 2021-11-11 \n",
+ "4 9 Compost Post-Roll 2021-11-24 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# IMPORT CULTURING DATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "culturing_fp = ''\n",
+ "culture = pd.read_csv(culturing_fp, sep='\\t', skiprows=[1])\n",
+ "\n",
+ "# remove comment lines\n",
+ "culture = culture[~ culture['sample-id'].str.startswith('#')]\n",
+ "\n",
+ "culture = culture.rename({'sample-id': 'sample_id'}, axis='columns')\n",
+ "\n",
+ "culture.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "b88b1884-df2b-4659-af93-68d6a01d1773",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# SUBSET CULTURING DATA\n",
+ "\n",
+ "culture = culture[['sample_id', 'MPN/g']]\n",
+ "culture = culture.astype({'MPN/g': float})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "ff11aa18-2c6c-4afb-a454-f0a6d4a6b9ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# DROP NA CULTURING VALUES\n",
+ "\n",
+ "culture = culture[culture['MPN/g'].notna()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "762d31d2-040f-44f0-b87a-c2486f2dee10",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 384 entries, 0 to 383\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sample_id 384 non-null object \n",
+ " 1 MPN/g 384 non-null float64\n",
+ " 2 Bucket 384 non-null Int64 \n",
+ " 3 Composting Time Point 384 non-null Int64 \n",
+ " 4 SampleType 384 non-null object \n",
+ "dtypes: Int64(2), float64(1), object(2)\n",
+ "memory usage: 15.9+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# MERGE CULTURING DATA, METADATA\n",
+ "\n",
+ "culture_md = culture.merge(md, on='sample_id', how='inner')\n",
+ "\n",
+ "culture_md.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "9e54c8fd-4ced-4987-8082-d739067d8358",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CALCULATE PER-BUCKET MOVING AVERAGES\n",
+ "\n",
+ "# sort by week for moving average calculation\n",
+ "culture_md = culture_md.sort_values('Composting Time Point')\n",
+ "\n",
+ "# averaging period\n",
+ "window = 4\n",
+ "\n",
+ "culture_md['MPN/g Moving Average'] = (\n",
+ " culture_md.groupby(['Bucket', 'SampleType'])['MPN/g']\n",
+ " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "1f106022-e861-479b-8ce2-f0264c0bf4dd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "line = alt.Chart(culture_md).mark_line().encode(\n",
+ " x=alt.X('Composting Time Point').scale(domain=(0, 52), nice=False),\n",
+ " y=alt.Y('MPN/g').scale(type='linear').title('MPN/g'),\n",
+ ").transform_filter(\n",
+ " (alt.datum['SampleType'] == 'Human Excrement Compost')\n",
+ ").facet(\n",
+ " facet='Bucket',\n",
+ " columns=5,\n",
+ " title=alt.Title(\n",
+ " 'Culturing MPN/g, By Bucket'\n",
+ " )\n",
+ ").resolve_scale(\n",
+ " y='independent',\n",
+ " x='independent'\n",
+ ").configure_axis(\n",
+ " labelFontSize=15,\n",
+ " titleFontSize=15,\n",
+ ").configure_axisY(\n",
+ " format='e'\n",
+ ").configure_header(\n",
+ " titleFontSize=16,\n",
+ " labelFontSize=15\n",
+ ").configure_title(\n",
+ " fontSize=20\n",
+ ")\n",
+ "\n",
+ "line"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abe139a1-39d6-4710-8c15-44736c4fe7e8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb b/gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb
new file mode 100644
index 0000000..0b72210
--- /dev/null
+++ b/gut_to_soil_manuscript_figures/notebooks/lod-analysis.ipynb
@@ -0,0 +1,158 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE QPCR DATA, which exist in multiple sheets\n",
+ "\n",
+ "# NOTE: change me\n",
+ "ecoli_cperf_fp = ''\n",
+ "\n",
+ "sheets = pd.read_excel(ecoli_cperf_fp, sheet_name=None)\n",
+ "df = pd.DataFrame()\n",
+ "for sheet_name, sheet in sheets.items():\n",
+ " df = pd.concat([df, sheet], axis=0)\n",
+ "\n",
+ "df['Sample Name'] = df['Sample Name'].astype(str)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df[['Target Name', 'Sample Name', 'CT', 'Ct Mean', 'Quantity', 'Quantity Mean', 'Quantity SD']]\n",
+ "\n",
+ "# include only standards\n",
+ "df = df[df['Sample Name'].str.contains('Gblock')]\n",
+ "\n",
+ "# split concentration from sample name\n",
+ "df[['sample-name', 'concentration']] = df['Sample Name'].str.split('CTL', expand=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extract standard concentrations from cells\n",
+ "\n",
+ "# 3.041 or 3.041E1, 3.041E2, etc.\n",
+ "conc_pattern = re.compile(r'3\\.041(E[0-9])?')\n",
+ "\n",
+ "def parse_conc(row):\n",
+ " match = conc_pattern.search(row['Sample Name'])\n",
+ " if not match:\n",
+ " match = conc_pattern.search(row['Target Name'])\n",
+ " if not match:\n",
+ " raise ValueError('concentration not found')\n",
+ "\n",
+ " row['Concentration'] = match.group()\n",
+ "\n",
+ " return row\n",
+ "\n",
+ "df = df.apply(parse_conc, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3.041 0.558824\n",
+ "3.041E1 1.000000\n",
+ "3.041E2 1.000000\n",
+ "3.041E3 1.000000\n",
+ "3.041E4 1.000000\n",
+ "3.041E5 1.000000\n",
+ "3.041E6 1.000000\n",
+ "3.041E7 0.985294\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# calculate proportion amplified per standard concentration\n",
+ "conc_grouped = df.groupby('Concentration')\n",
+ "proportions = pd.Series(dtype=float)\n",
+ "for conc, g_df in conc_grouped:\n",
+ " counts = g_df['CT'].value_counts(dropna=False)\n",
+ " assert np.nan not in counts\n",
+ "\n",
+ " if 'Undetermined' in counts:\n",
+ " undetermined = counts['Undetermined']\n",
+ " else:\n",
+ " undetermined = 0\n",
+ "\n",
+ " proportions[conc] = 1 - ( undetermined / counts.sum() )\n",
+ "\n",
+ "print(proportions)\n",
+ "\n",
+ "\n",
+ "# IMPORTANT: The limit of detection (LOD) is between 30.41 and 3.041 copies.\n",
+ "# We can detect 30.41 copies with 100% confidence, and\n",
+ "# 3.041 copies with ~55% confidence. The number of copies which\n",
+ "# we can detect with 95% confidence (LOD definition) thus lies\n",
+ "# between these two values.\n",
+ "#\n",
+ "# Separately, the limit of quantification (LOQ) was determined\n",
+ "# by Nate Stone and Megan Ruby to be 304.1 copies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb b/gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb
new file mode 100644
index 0000000..c3e567b
--- /dev/null
+++ b/gut_to_soil_manuscript_figures/notebooks/qcpr-plots.ipynb
@@ -0,0 +1,672 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import statistics\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import altair as alt\n",
+ "import vl_convert as vlc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE METADATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "md_fp = ''\n",
+ "md = pd.read_csv(md_fp, sep='\\t', dtype={'sample-id': str})\n",
+ "\n",
+ "# remove comment lines\n",
+ "md = md[~ md['sample-id'].str.startswith('#')]\n",
+ "\n",
+ "md = md.rename({'sample-id': 'sample_id'}, axis=1)\n",
+ "\n",
+ "md = md[['sample_id', 'Bucket', 'Composting Time Point', 'SampleType']]\n",
+ "\n",
+ "# drop nan & week-0 rows of HEC\n",
+ "md = md[~(\n",
+ " (md['SampleType'] == 'Human Excrement Compost') &\n",
+ " (md['Composting Time Point'].isna() | (md['Composting Time Point'] == 0))\n",
+ ")]\n",
+ "\n",
+ "# keep only HE and HEC observations\n",
+ "md = md[md['SampleType'].isin(['Human Excrement Compost', 'Human Excrement'])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE BACTQUANT DATA\n",
+ "\n",
+ "# NOTE: change me\n",
+ "bq_fp = ''\n",
+ "bq = pd.read_excel(\n",
+ " bq_fp,\n",
+ " sheet_name='MasterFile',\n",
+ " dtype={'Sample ID': str}\n",
+ ")\n",
+ "bq = bq[['Sample ID', 'Copy #']]\n",
+ "bq = bq.rename(\n",
+ " {'Sample ID': 'sample_id', 'Copy #': 'bq_copy_number'},\n",
+ " axis=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PARSE QPCR DATA, which exist in multiple sheets\n",
+ "\n",
+ "# NOTE: change me\n",
+ "ecoli_cperf_fp = ''\n",
+ "\n",
+ "sheets = pd.read_excel(ecoli_cperf_fp, sheet_name=None)\n",
+ "ecoli_cperf = pd.DataFrame()\n",
+ "for sheet_name, sheet in sheets.items():\n",
+ " ecoli_cperf = pd.concat([ecoli_cperf, sheet], axis=0)\n",
+ "\n",
+ "ecoli_cperf['Sample Name'] = ecoli_cperf['Sample Name'].astype(str)\n",
+ "\n",
+ "ecoli_cperf = ecoli_cperf[['Sample Name', 'Target Name', 'Quantity Mean', 'Quantity SD']]\n",
+ "ecoli_cperf = ecoli_cperf.rename(\n",
+ " {\n",
+ " 'Sample Name': 'sample_id', 'Target Name': 'target_name',\n",
+ " 'Quantity Mean': 'quantity_mean', 'Quantity SD': 'quantity_sd'\n",
+ " },\n",
+ " axis=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# averaging already done in excel; keep only one row per triplicate set\n",
+ "ecoli_cperf = ecoli_cperf.groupby(['sample_id', 'target_name']).head(1)\n",
+ "\n",
+ "# replace missing quantities with 0 for convenience\n",
+ "# IMPORTANT: we do not know whether these quantifications were 0,\n",
+ "# only that there are less than the limit of detection\n",
+ "ecoli_cperf = ecoli_cperf.fillna({'quantity_mean': 0})\n",
+ "\n",
+ "# drop NTCs\n",
+ "ecoli_cperf = ecoli_cperf[ecoli_cperf.sample_id != 'NTC']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "target_name\n",
+ "Cperf23S 1052\n",
+ "EcoliIUD 1052\n",
+ "EcoliIUD3.041E1 1\n",
+ "EcoliIUD3.041E2 1\n",
+ "EcoliIUD3.041E3 1\n",
+ "EcoliIUD3.041E4 1\n",
+ "EcoliIUD3.041E5 1\n",
+ "EcoliIUD3.041E6 1\n",
+ "EcoliIUD3.041E7 1\n",
+ "Cperf23S3.041 1\n",
+ "Cperf23S3.041E1 1\n",
+ "Cperf23S3.041E2 1\n",
+ "Cperf23S3.041E3 1\n",
+ "Cperf23S3.041E4 1\n",
+ "Cperf23S3.041E5 1\n",
+ "Cperf23S3.041E6 1\n",
+ "Cperf23S3.041E7 1\n",
+ "EcoliIUD3.041 1\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# TODO: looks like there were some data entry errors for the assay name\n",
+ "print( ecoli_cperf.target_name.value_counts() )\n",
+ "\n",
+ "# get rid of them for now\n",
+ "ecoli_cperf = ecoli_cperf[ ecoli_cperf.target_name.isin(['EcoliIUD', 'Cperf23S']) ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PIVOT ASSAY MEASUREMENTS TO COLUMNS\n",
+ "\n",
+ "ecoli_cperf = ecoli_cperf.pivot(index='sample_id', columns='target_name')\n",
+ "ecoli_cperf.columns = ecoli_cperf.columns.to_flat_index().map(lambda c: '_'.join(c))\n",
+ "ecoli_cperf.reset_index(inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# MERGE ECOLI/CPERF WITH METADATA\n",
+ "\n",
+ "ecoli_cperf_md = ecoli_cperf.merge(md, how='right', on='sample_id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ADD LIMIT OF QUANTIFICATION AND LIMIT OF DETECTION VALUES\n",
+ "\n",
+ "ecoli_cperf_md['loq'] = 304.1\n",
+ "ecoli_cperf_md['lod'] = 30.41\n",
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NORMALIZE ECOLI, CPERF QUANTIFICATIONS BY BACTQUANT\n",
+ "\n",
+ "merged = ecoli_cperf_md.merge(right=bq, on='sample_id', how='inner')\n",
+ "\n",
+ "merged['cperf_normalized'] = merged.quantity_mean_Cperf23S / merged.bq_copy_number\n",
+ "merged['ecoli_normalized'] = merged.quantity_mean_EcoliIUD / merged.bq_copy_number\n",
+ "\n",
+ "# normalize limit of quantification as well\n",
+ "merged['loq_normalized'] = merged.loq / merged.bq_copy_number\n",
+ "\n",
+ "ecoli_cperf_md = merged"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# RECAST WEEK AND BUCKET COLUMNS TO INTEGER\n",
+ "\n",
+ "ecoli_cperf_md['Bucket'] = ecoli_cperf_md['Bucket'].astype('Int64')\n",
+ "ecoli_cperf_md['Composting Time Point'] = ecoli_cperf_md['Composting Time Point'].astype('Int64')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 815 entries, 0 to 814\n",
+ "Data columns (total 14 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 sample_id 815 non-null object \n",
+ " 1 quantity_mean_Cperf23S 812 non-null float64\n",
+ " 2 quantity_mean_EcoliIUD 812 non-null float64\n",
+ " 3 quantity_sd_Cperf23S 725 non-null float64\n",
+ " 4 quantity_sd_EcoliIUD 409 non-null float64\n",
+ " 5 Bucket 815 non-null Int64 \n",
+ " 6 Composting Time Point 770 non-null Int64 \n",
+ " 7 SampleType 815 non-null object \n",
+ " 8 loq 815 non-null int64 \n",
+ " 9 lod 815 non-null float64\n",
+ " 10 bq_copy_number 813 non-null float64\n",
+ " 11 cperf_normalized 810 non-null float64\n",
+ " 12 ecoli_normalized 810 non-null float64\n",
+ " 13 loq_normalized 813 non-null float64\n",
+ "dtypes: Int64(2), float64(9), int64(1), object(2)\n",
+ "memory usage: 90.9+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# SUBSET TO BUCKETS OF INTEREST\n",
+ "\n",
+ "buckets_to_keep = list(range(1, 8)) + list(range(9, 17))\n",
+ "ecoli_cperf_md = ecoli_cperf_md[ecoli_cperf_md['Bucket'].isin(buckets_to_keep)]\n",
+ "\n",
+ "ecoli_cperf_md.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# RENAME QUANTITY COLUMNS\n",
+ "\n",
+ "ecoli_cperf_md = ecoli_cperf_md.rename(\n",
+ " {\n",
+ " 'quantity_mean_EcoliIUD': 'ecoli_quantity_mean',\n",
+ " 'quantity_mean_Cperf23S': 'cperf_quantity_mean'\n",
+ " },\n",
+ " axis=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CALCULATE MOVING AVERAGES\n",
+ "\n",
+ "# sort by week for moving average calculation\n",
+ "ecoli_cperf_md = ecoli_cperf_md.sort_values('Composting Time Point')\n",
+ "\n",
+ "# averaging period\n",
+ "window = 4\n",
+ "\n",
+ "# standard moving averages\n",
+ "ecoli_cperf_md['ecoli_mov_avg'] = (\n",
+ " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['ecoli_quantity_mean']\n",
+ " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n",
+ ")\n",
+ "ecoli_cperf_md['cperf_mov_avg'] = (\n",
+ " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['cperf_quantity_mean']\n",
+ " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n",
+ ")\n",
+ "\n",
+ "# bactquant-normalized moving averages\n",
+ "ecoli_cperf_md['ecoli_normalized_mov_avg'] = (\n",
+ " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['ecoli_normalized']\n",
+ " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n",
+ ")\n",
+ "ecoli_cperf_md['cperf_normalized_mov_avg'] = (\n",
+ " ecoli_cperf_md.groupby(['Bucket', 'SampleType'])['cperf_normalized']\n",
+ " .transform(lambda x: x.rolling(window=window, min_periods=1).mean())\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ADD PSEUDO COUNTS FOR LOG SCALE\n",
+ "\n",
+ "def calculate_pseudo_count(df, column):\n",
+ " minimum = df[df[column] > 0][column].min()\n",
+ " pseudo_count_exp = math.floor(math.log10(minimum)) - 1\n",
+ " return 1 * 10**(pseudo_count_exp)\n",
+ "\n",
+ "replace = [\n",
+ " 'ecoli_mov_avg', 'ecoli_quantity_mean', 'cperf_mov_avg', 'cperf_quantity_mean',\n",
+ " 'ecoli_normalized_mov_avg', 'ecoli_normalized', 'cperf_normalized_mov_avg', 'cperf_normalized',\n",
+ "]\n",
+ "for replace_column in replace:\n",
+ " pseudo_count = calculate_pseudo_count(ecoli_cperf_md, replace_column)\n",
+ " ecoli_cperf_md.loc[:, replace_column] += pseudo_count"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# FILL FECAL SAMPLE TIME POINTS AS 0\n",
+ "\n",
+ "ecoli_cperf_md.loc[ecoli_cperf_md['SampleType'] == 'Human Excrement', 'Composting Time Point'] = \\\n",
+ " ecoli_cperf_md.loc[ecoli_cperf_md['SampleType'] == 'Human Excrement', 'Composting Time Point'].fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# RENAME qPCR COLUMNS FOR BETTER LEGIBILITY\n",
+ "renames = {\n",
+ " 'cperf_mov_avg': 'C perfringens Moving Average',\n",
+ " 'ecoli_mov_avg': 'E coli Moving Average',\n",
+ " 'cperf_normalized_mov_avg': 'C perfringens Moving Average (Normalized)',\n",
+ " 'ecoli_normalized_mov_avg': 'E coli Moving Average (Normalized)',\n",
+ " 'loq': 'Limit of Quantification',\n",
+ " 'loq_normalized': 'Limit of Quantification (Normalized)',\n",
+ " 'SampleType': 'Sample Type',\n",
+ "}\n",
+ "ecoli_cperf_md = ecoli_cperf_md.rename(renames, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# NOTE: change me\n",
+ "y_title = 'E coli Copy Number (Normalized)'\n",
+ "line_y_value = 'E coli Moving Average (Normalized)'\n",
+ "point_y_value = 'ecoli_normalized'\n",
+ "\n",
+ "pseudo_count = ecoli_cperf_md[line_y_value].min()\n",
+ "\n",
+ "line = alt.Chart(ecoli_cperf_md).transform_fold(\n",
+ " [line_y_value, 'Limit of Quantification (Normalized)'],\n",
+ " as_=['Line', 'Amount'],\n",
+ ").mark_line().encode(\n",
+ " x=alt.X('Composting Time Point').scale(domain=(0, 52), nice=False),\n",
+ " y=alt.Y('Amount', type='quantitative').scale(type='log').title(y_title),\n",
+ " color='Line:N',\n",
+ " strokeDash='Line:N',\n",
+ ").transform_filter(\n",
+ " (alt.datum['Sample Type'] == 'Human Excrement Compost')\n",
+ ")\n",
+ "\n",
+ "point = alt.Chart(ecoli_cperf_md).mark_point(\n",
+ " filled=True,\n",
+ " size=75,\n",
+ " color='black',\n",
+ " fillOpacity=0.85\n",
+ ").encode(\n",
+ " x=alt.X('Composting Time Point'),\n",
+ " y=alt.Y(point_y_value).scale(type='log'),\n",
+ " shape='Sample Type'\n",
+ ").transform_filter(\n",
+ " (alt.datum['Sample Type'] == 'Human Excrement')\n",
+ " #(alt.datum.ecoli_quantity_mean < 1_000_000)\n",
+ ")\n",
+ "\n",
+ "alt.layer(line, point).facet(\n",
+ " facet='Bucket',\n",
+ " columns=5,\n",
+ " title=alt.Title(\n",
+ " f'{y_title}, By Bucket',\n",
+ " subtitle=[\n",
+ " 'moving average, period of 4',\n",
+ " f'pseudo-count of {pseudo_count} added to all data points',\n",
+ " ]\n",
+ " )\n",
+ ").resolve_scale(\n",
+ " y='independent',\n",
+ " x='independent'\n",
+ ").configure_legend(\n",
+ " labelLimit=0,\n",
+ " titleFontSize=18,\n",
+ " labelFontSize=16,\n",
+ " orient='bottom',\n",
+ " padding=25\n",
+ ").configure_axis(\n",
+ " labelFontSize=15,\n",
+ " titleFontSize=15,\n",
+ ").configure_axisY(\n",
+ " format='e'\n",
+ ").configure_header(\n",
+ " titleFontSize=16,\n",
+ " labelFontSize=15\n",
+ ").configure_title(\n",
+ " fontSize=20,\n",
+ " subtitleFontSize=18\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# BACTQUANT ONLY PLOTS\n",
+ "\n",
+ "bq = alt.Chart(ecoli_cperf_md).mark_line().encode(\n",
+ " x=alt.X('Composting Time Point').scale(domain=(0, 52), nice=False),\n",
+ " y=alt.Y('bq_copy_number').title('BactQuant Copy Number').scale(type='log'),\n",
+ ").transform_filter(\n",
+ " (alt.datum['Sample Type'] == 'Human Excrement Compost')\n",
+ ").facet(\n",
+ " facet='Bucket',\n",
+ " columns=5,\n",
+ " title=alt.Title(\n",
+ " 'BactQuant Copy Number, By Bucket',\n",
+ " #subtitle=['moving average, period=4',]\n",
+ " )\n",
+ ").resolve_scale(\n",
+ " y='independent',\n",
+ " x='independent'\n",
+ ").configure_axis(\n",
+ " labelFontSize=15,\n",
+ " titleFontSize=15,\n",
+ ").configure_axisY(\n",
+ " format='e'\n",
+ ").configure_header(\n",
+ " titleFontSize=16,\n",
+ " labelFontSize=15\n",
+ ").configure_title(\n",
+ " fontSize=20\n",
+ ")\n",
+ "\n",
+ "bq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}