Updated NYCTaxi-E2E notebook

Additions to NYCTaxi-E2E notebook addressing issues on rapidsai-community#214
vilmara · Oct 29, 2019 · 08848d2 · 08848d2
1 parent 02567e4
commit 08848d2
Showing 1 changed file with 73 additions and 3 deletions.
diff --git a/intermediate_notebooks/E2E/taxi/NYCTaxi-E2E.ipynb b/intermediate_notebooks/E2E/taxi/NYCTaxi-E2E.ipynb
@@ -39,6 +39,51 @@
     "client"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAPIDS Memory Manager (RMM) Functionality"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# RAPIDS Memory Manager (RMM) helper functions\n",
+    "import rmm\n",
+    "from rmm import rmm_config as rmm_cfg\n",
+    "\n",
+    "def initialize_rmm_pool():\n",
+    "    rmm_cfg.use_pool_allocator = True\n",
+    "    return rmm.initialize()\n",
+    "\n",
+    "def initialize_rmm_no_pool():\n",
+    "    rmm_cfg.use_pool_allocator = False\n",
+    "    return rmm.initialize()\n",
+    "\n",
+    "def finalize_rmm():\n",
+    "    return rmm.finalize()\n",
+    "\n",
+    "def run_dask_task(func, **kwargs):\n",
+    "    task = func(**kwargs)\n",
+    "    return task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the GPU memory pool (finalize RMM first so it can be re-initialized with pooling enabled)\n",
+    "\n",
+    "client.run(finalize_rmm)\n",
+    "client.run(initialize_rmm_pool) "
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -309,8 +354,9 @@
     "                      outcols=dict(day_of_week=np.float32),\n",
     "                      kwargs=dict())\n",
     "    \n",
+    "    # XGBoost currently can't consume boolean columns from cuDF (they are specialized to an Arrow-style bitset), so cast to int32\n",
+    "    df['is_weekend'] = (df['day_of_week']<2).astype(np.int32)\n",
     "    \n",
-    "    df = df.drop('day_of_week')\n",    
     "    return df"
    ]
   },
@@ -392,7 +438,6 @@
     "  'silent': True,\n",
     "  'verbose_eval': True,\n",
     "  'tree_method':'gpu_hist',\n",
-    "  'n_gpus': 1\n",
     "}\n",
     "\n",
     "trained_model = dxgb_gpu.train(client, params, X_train, Y_train, num_boost_round=100)"
@@ -423,12 +468,21 @@
     "    return df.partitions[nonempty]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pick a Test Set"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "%%time\n",
+    "\n",
     "X_test = taxi_df.query('day >= 25').persist()\n",
     "X_test = drop_empty_partitions(X_test)\n",
     "\n",
@@ -438,6 +492,9 @@
     "# Drop the fare amount from X_test\n",
     "X_test = X_test[X_test.columns.difference(['fare_amount'])]\n",
     "\n",
+    "# This won't return until all data is in GPU memory\n",
+    "done = wait([X_test, Y_test])\n",
+    "\n",
     "# display test set size\n",
     "len(X_test)"
    ]
@@ -448,6 +505,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "%%time\n",
+    "\n",
     "# generate predictions on the test set\n",
     "Y_test['prediction'] = dxgb_gpu.predict(client, trained_model, X_test)"
    ]
@@ -543,6 +602,17 @@
     "math.sqrt(Y_test.squared_error.mean().compute())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Finalize the GPU memory pool\n",
+    "\n",
+    "client.run(finalize_rmm)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -577,7 +647,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.6.7"
   }
  },
  "nbformat": 4,