chrismattmann · bedirian · Apr 9, 2024
diff --git a/tika-geoparsing.ipynb b/tika-geoparsing.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7215081c-b46f-4b70-b61c-316465095422",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done parsing all sightings. Enhanced data written to /Users/pardibedirian/Downloads/bfro_with_new_cols_enhanced.tsv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import csv\n",
+    "import os\n",
+    "import subprocess\n",
+    "import shlex\n",
+    "import tempfile\n",
+    "\n",
+    "# Define the paths to the files and directories we'll use\n",
+    "home = os.getenv('HOME')\n",
+    "downloads_path = os.path.join(home, 'Downloads')\n",
+    "classpath = f\"{downloads_path}/tika-app-2.9.1.jar:{downloads_path}/tika-parser-nlp-package-2.9.1.jar:{home}/src/location-ner-model\"\n",
+    "sightings_file_path = os.path.join(downloads_path, 'bfro_with_new_cols.tsv')\n",
+    "output_file_path = os.path.join(downloads_path, 'part7_bfro_with_new_cols.tsv')\n",
+    "\n",
+    "# Function to call Tika GeoTopicParser and parse the output\n",
+    "def parse_sighting_with_tika(file_path):\n",
+    "    \"\"\"Run TikaCLI with ``-m`` on ``file_path`` and collect the geographic fields.\n",
+    "\n",
+    "    Args:\n",
+    "        file_path: Path of the file handed to ``org.apache.tika.cli.TikaCLI``.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict: Whichever of ``Geographic_NAME``, ``Geographic_LATITUDE`` and\n",
+    "        ``Geographic_LONGITUDE`` appear in Tika's stdout, each mapped to the\n",
+    "        stripped text that follows the key. Empty when none are present.\n",
+    "    \"\"\"\n",
+    "    import sys  # local import: only needed to report Tika failures on stderr\n",
+    "\n",
+    "    # Build argv directly instead of formatting a command string and shlex-splitting\n",
+    "    # it, so classpath entries or file paths containing spaces/quotes stay intact.\n",
+    "    args = [\n",
+    "        'java', '-classpath', classpath,\n",
+    "        'org.apache.tika.cli.TikaCLI', '-m', str(file_path),\n",
+    "    ]\n",
+    "    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
+    "    if result.returncode != 0:\n",
+    "        # stderr is captured, so without this a failing JVM would be invisible and the\n",
+    "        # caller would just see an empty dict. Stay best-effort: warn, don't raise.\n",
+    "        stderr_tail = result.stderr.decode('utf-8', errors='replace').strip()[-500:]\n",
+    "        print(\n",
+    "            f'Tika exited with status {result.returncode} for {file_path}: {stderr_tail}',\n",
+    "            file=sys.stderr,\n",
+    "        )\n",
+    "    # errors='replace' keeps a single undecodable byte from aborting the whole run\n",
+    "    output = result.stdout.decode('utf-8', errors='replace')\n",
+    "\n",
+    "    location_data = {}\n",
+    "    for line in output.splitlines():\n",
+    "        # The first matching key wins for a line; later lines overwrite earlier values\n",
+    "        for key in ('Geographic_NAME', 'Geographic_LATITUDE', 'Geographic_LONGITUDE'):\n",
+    "            marker = f'{key}:'\n",
+    "            if marker in line:\n",
+    "                # Split on the key itself so a colon earlier in the line cannot\n",
+    "                # shift the extracted value\n",
+    "                location_data[key] = line.split(marker, 1)[1].strip()\n",
+    "                break\n",
+    "    return location_data\n",
+    "\n",
+    "# Process the sightings and write the results to a new TSV\n",
+    "geo_columns = ['Geographic_NAME', 'Geographic_LATITUDE', 'Geographic_LONGITUDE']\n",
+    "# Columns whose text is concatenated and handed to Tika for geoparsing\n",
+    "description_columns = ['County', 'State', 'Location Details', 'Headline', 'Nearest Town', 'Nearest Road']\n",
+    "\n",
+    "# newline='' on both files lets the csv module handle line endings itself,\n",
+    "# including newlines embedded in quoted fields.\n",
+    "with open(sightings_file_path, mode='r', encoding='utf-8', newline='') as infile, \\\n",
+    "     open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:\n",
+    "\n",
+    "    reader = csv.DictReader(infile, delimiter='\\t')\n",
+    "    # fieldnames is None for an empty input file; also avoid duplicating the geo\n",
+    "    # columns when the input already carries them (e.g. re-running on enhanced data).\n",
+    "    input_columns = list(reader.fieldnames or [])\n",
+    "    fieldnames = input_columns + [col for col in geo_columns if col not in input_columns]\n",
+    "    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\\t')\n",
+    "    writer.writeheader()\n",
+    "\n",
+    "    for row in reader:\n",
+    "        # Combine relevant columns from the TSV into a string to write to a .geot file.\n",
+    "        # DictReader fills missing trailing fields with None; substitute '' so the\n",
+    "        # literal text 'None' is not fed to the geoparser.\n",
+    "        sighting_description = ' '.join(row[col] or '' for col in description_columns)\n",
+    "\n",
+    "        # Create a temporary .geot file with this sighting's description.\n",
+    "        # NOTE(review): the .geot suffix is presumably what routes the file to Tika's\n",
+    "        # GeoTopicParser - confirm. UTF-8 is explicit to match the input encoding.\n",
+    "        temp_file = tempfile.NamedTemporaryFile(\n",
+    "            mode='w', suffix='.geot', delete=False, encoding='utf-8'\n",
+    "        )\n",
+    "        temp_file_path = temp_file.name\n",
+    "        try:\n",
+    "            # Close the file before Tika reads it\n",
+    "            with temp_file:\n",
+    "                temp_file.write(sighting_description)\n",
+    "\n",
+    "            # Call the function to parse this .geot file\n",
+    "            location_data = parse_sighting_with_tika(temp_file_path)\n",
+    "        finally:\n",
+    "            # Clean up the temporary file even when writing or parsing fails\n",
+    "            os.remove(temp_file_path)\n",
+    "\n",
+    "        # Merge the original row data with the new location data; geo columns Tika\n",
+    "        # did not report are left for DictWriter to fill with its default ''.\n",
+    "        row.update(location_data)\n",
+    "\n",
+    "        # Write the merged data to the new TSV\n",
+    "        writer.writerow(row)\n",
+    "\n",
+    "print(f\"Done parsing all sightings. Enhanced data written to {output_file_path}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba1d42fd-fae8-4a94-ab30-43e6101b3765",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "234299fe-3f14-475f-aa9e-5bd412b52ca9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}