Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add tika-geoparsing Jupyter notebook for geolocation parsing integration #27

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions tika-geoparsing.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "7215081c-b46f-4b70-b61c-316465095422",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done parsing all sightings. Enhanced data written to /Users/pardibedirian/Downloads/bfro_with_new_cols_enhanced.tsv\n"
]
}
],
"source": [
"import csv\n",
"import os\n",
"import subprocess\n",
"import shlex\n",
"import tempfile\n",
"\n",
"# Define the paths to the files and directories we'll use\n",
"home = os.getenv('HOME')\n",
"downloads_path = os.path.join(home, 'Downloads')\n",
"classpath = f\"{downloads_path}/tika-app-2.9.1.jar:{downloads_path}/tika-parser-nlp-package-2.9.1.jar:{home}/src/location-ner-model\"\n",
"sightings_file_path = os.path.join(downloads_path, 'bfro_with_new_cols.tsv')\n",
"output_file_path = os.path.join(downloads_path, 'part7_bfro_with_new_cols.tsv')\n",
"\n",
"# Function to call Tika GeoTopicParser and parse the output\n",
"def parse_sighting_with_tika(file_path):\n",
" command = f\"java -classpath {classpath} org.apache.tika.cli.TikaCLI -m {file_path}\"\n",
" args = shlex.split(command)\n",
" result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
" output = result.stdout.decode()\n",
"\n",
" location_data = {}\n",
" for line in output.splitlines():\n",
" if 'Geographic_NAME:' in line:\n",
" location_data['Geographic_NAME'] = line.split(':', 1)[1].strip()\n",
" elif 'Geographic_LATITUDE:' in line:\n",
" location_data['Geographic_LATITUDE'] = line.split(':', 1)[1].strip()\n",
" elif 'Geographic_LONGITUDE:' in line:\n",
" location_data['Geographic_LONGITUDE'] = line.split(':', 1)[1].strip()\n",
" return location_data\n",
"\n",
"# Process the sightings and write the results to a new TSV\n",
"with open(sightings_file_path, mode='r', encoding='utf-8') as infile, \\\n",
" open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:\n",
"\n",
" reader = csv.DictReader(infile, delimiter='\\t')\n",
" fieldnames = reader.fieldnames + ['Geographic_NAME', 'Geographic_LATITUDE', 'Geographic_LONGITUDE']\n",
" writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\\t')\n",
" writer.writeheader()\n",
"\n",
" for row_index, row in enumerate(reader):\n",
" # Combine relevant columns from the TSV into a string to write to a .geot file\n",
" sighting_description = (\n",
" f\"{row['County']} {row['State']} {row['Location Details']} \"\n",
" f\"{row['Headline']} {row['Nearest Town']} {row['Nearest Road']}\"\n",
" )\n",
"\n",
" # Create a temporary .geot file with this sighting's description\n",
" with tempfile.NamedTemporaryFile(mode='w+', suffix='.geot', delete=False) as temp_file:\n",
" temp_file.write(sighting_description)\n",
" temp_file_path = temp_file.name\n",
"\n",
" # Call the function to parse this .geot file\n",
" location_data = parse_sighting_with_tika(temp_file_path)\n",
"\n",
" # Merge the original row data with the new location data\n",
" row.update(location_data)\n",
"\n",
" # Write the merged data to the new TSV\n",
" writer.writerow(row)\n",
"\n",
" # Clean up the temporary file\n",
" os.remove(temp_file_path)\n",
"\n",
"print(f\"Done parsing all sightings. Enhanced data written to {output_file_path}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba1d42fd-fae8-4a94-ab30-43e6101b3765",
"metadata": {
"tags": []
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "234299fe-3f14-475f-aa9e-5bd412b52ca9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}