marbl-ecosys · mgrover1 · Sep 15, 2021 · Sep 15, 2021
diff --git a/environment/environment.yml b/environment/environment.yml
@@ -54,4 +54,5 @@ dependencies:
 - pip:
   - git+https://github.com/NCAR/esmlab.git
   - git+https://github.com/NCAR/esmlab-regrid.git
+  - ecgtools
 
diff --git a/notebooks/_config_calc.yml b/notebooks/_config_calc.yml
@@ -1,6 +1,7 @@
 project_kernel: cesm2-marbl
 notebooks:
     pre_notebooks:
+      - catalog-generation
       - _data-Ncycle
       - _data-nutrient-plots
       - _data-mld-obs
@@ -23,7 +24,8 @@ notebooks:
       - transient-fgco2
       - transient-biological-pump
 
-esm_collection: data/campaign-cesm2-cmip6-timeseries.json
+esm_data_dir: /glade/scratch/mclong/cesm2-marbl-data_nc
+esm_collection: data/cesm2-cmip6-timeseries.json
 cache_dir: /glade/p/cgd/oce/projects/cesm2-marbl/funnel-cache
 data_collections:
     epoch_mean:

diff --git a/notebooks/catalog-generation.ipynb b/notebooks/catalog-generation.ipynb
@@ -0,0 +1,195 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fc8d1154-c8a2-40f0-b218-035912eef869",
+   "metadata": {},
+   "source": [
+    "# Generate an [Intake-ESM](https://intake-esm.readthedocs.io/en/latest/) Catalog Using [ECGTools](https://ecgtools.readthedocs.io/en/latest/)\n",
+    "In this notebook, we use the data directory specified in `_config_calc.yml` to build a data catalog to be used throughout the analysis."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "6809d32c-440c-4e19-9250-eea90f0e7e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import yaml\n",
+    "\n",
+    "from ecgtools import Builder\n",
+    "from ecgtools.parsers.cesm import parse_cesm_timeseries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "afd8b5bc-2307-423c-87ec-8a0bee002ea9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('_config_calc.yml') as fid:\n",
+    "    config_dict = yaml.load(fid, Loader=yaml.Loader)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b4e00d81-a5ea-4b44-967e-18e3a8ffe4ce",
+   "metadata": {},
+   "source": [
+    "## Set Up the Builder\n",
+    "We set up the builder object here, pointing it at the data directory (`esm_data_dir`) specified in the `_config_calc.yml` file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "bd7a2533-2fa5-4d7e-8fca-4fdf981421c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "b = Builder(config_dict['esm_data_dir'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3302e629-5a20-4b90-b805-ecbb57300086",
+   "metadata": {},
+   "source": [
+    "## Build the Catalog\n",
+    "When we build the catalog, we tell the builder to use the `parse_cesm_timeseries` parser."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "52a229db-3b87-447d-a914-4aa41d69e385",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n",
+      "[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished\n",
+      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n",
+      "[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.3s\n",
+      "[Parallel(n_jobs=-1)]: Done  43 out of  43 | elapsed:    1.7s finished\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Builder(root_path=PosixPath('/glade/scratch/mclong/cesm2-marbl-data_nc'), extension='.nc', depth=0, exclude_patterns=None, njobs=-1)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "b.build(parse_cesm_timeseries)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "95de1cf4-573f-4144-b3a2-a76d5b4d91cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_experiment_to_dataframe(df):\n",
+    "    case_split = df.case.str.split('.', expand=True)\n",
+    "    experiment = case_split.iloc[:, 1] + '.' + case_split.iloc[:, 2]\n",
+    "    df['experiment'] = experiment.fillna('historical')\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "fe73027a-4c34-4b37-89c1-13e29908e4db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "b.df = add_experiment_to_dataframe(b.df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c183e135-7809-48bb-9d04-67bf83f17fb6",
+   "metadata": {},
+   "source": [
+    "## Save the Catalog\n",
+    "Now that we have built the catalog, let's save it to disk, using the file name specified in `_config_calc.yml`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "5f02662c-f43f-42c4-8a13-f867a63cc3b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved catalog location: data/cesm2-cmip6-timeseries.json and data/cesm2-cmip6-timeseries.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "b.save(\n",
+    "    config_dict['esm_collection'],\n",
+    "    # Column name including filepath\n",
+    "    path_column_name='path',\n",
+    "    # Column name including variables\n",
+    "    variable_column_name='variable',\n",
+    "    # Data file format - could be netcdf or zarr (in this case, netcdf)\n",
+    "    data_format=\"netcdf\",\n",
+    "    # Which attributes to groupby when reading in variables using intake-esm\n",
+    "    groupby_attrs=[\"component\", \"experiment\", \"stream\"],\n",
+    "    # Aggregations which are fed into xarray when reading in data using intake\n",
+    "    aggregations=[\n",
+    "        {\n",
+    "            \"type\": \"join_existing\",\n",
+    "            \"attribute_name\": \"time_range\",\n",
+    "            \"options\": {\"dim\": \"time\", \"coords\": \"minimal\", \"compat\": \"override\"},\n",
+    "        }\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fe95b3f-70c5-4c66-9731-3f5233b118fe",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:miniconda3-cesm2-marbl]",
+   "language": "python",
+   "name": "conda-env-miniconda3-cesm2-marbl-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}