-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FEAT: Export compressed GTFS schedule to SQLite db (#388)
Adds the ability to export compressed GTFS schedule data to an SQLite db file, along with schedule S3 sync/upload logic. For each year partition folder in the compressed GTFS archives, one gzipped SQLite db file will be produced that contains a table for each GTFS schedule file that has been compressed. Asana Task: https://app.asana.com/0/1205827492903547/1207450430015372
- Loading branch information
Showing
8 changed files
with
206 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
"""Script entry point: runs the compressed-GTFS export pipeline."""
from lamp_py.ingestion.compress_gtfs.gtfs_to_parquet import gtfs_to_parquet

# Only run the export when executed directly, not on import.
if __name__ == "__main__":
    gtfs_to_parquet()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import os | ||
import sqlite3 | ||
|
||
import pyarrow | ||
import pyarrow.dataset as pd | ||
|
||
from lamp_py.runtime_utils.process_logger import ProcessLogger | ||
from lamp_py.ingestion.utils import gzip_file | ||
|
||
|
||
def sqlite_type(pq_type: str) -> str:
    """
    Map a pyarrow Field type string to an SQLite column type.

    Integer and boolean arrow types map to INTEGER, floating point
    types map to REAL, and anything else is stored as TEXT.
    """
    type_map = (
        ("int", "INTEGER"),
        ("bool", "INTEGER"),
        ("float", "REAL"),
        ("double", "REAL"),
    )
    for marker, sql_type in type_map:
        if marker in pq_type:
            return sql_type
    return "TEXT"
|
||
|
||
def sqlite_table_query(table_name: str, schema: pyarrow.Schema) -> str:
    """
    Build a CREATE TABLE statement for an SQLite table from a pyarrow schema.

    Every field in the schema becomes one column; the SQLite column type
    is derived from the pyarrow field type via sqlite_type(). The table
    is only created if it does not already exist.
    """
    logger = ProcessLogger("sqlite_create_table")
    logger.log_start()
    column_defs = ",".join(
        f"{field.name} {sqlite_type(str(field.type))}" for field in schema
    )
    query = f"""
    CREATE TABLE 
    IF NOT EXISTS 
    {table_name} 
    (
        {column_defs}
    );
    """
    logger.log_complete()
    return query
|
||
|
||
def pq_folder_to_sqlite(year_path: str) -> None:
    """
    Load all parquet files from year_path folder into one SQLITE3 db file.

    Creates `GTFS_ARCHIVE.db` in year_path (removing any previous copy so
    the archive is rebuilt from scratch), with one table per parquet file
    named after the file stem, then gzips the db file via gzip_file().

    Failures are logged through ProcessLogger.log_failure and swallowed —
    the caller sees no exception; presumably the surrounding pipeline
    inspects the logs (TODO confirm).
    """
    logger = ProcessLogger("pq_to_sqlite", year_path=year_path)
    logger.log_start()

    db_path = os.path.join(year_path, "GTFS_ARCHIVE.db")
    if os.path.exists(db_path):
        os.remove(db_path)
    try:
        # Single connection for the whole archive; try/finally guarantees
        # it is closed even when a batch insert raises. (Previously a new
        # connection was opened per file and leaked on the error path.)
        conn = sqlite3.connect(db_path)
        try:
            # sorted() makes table-creation order deterministic across OSes
            for file in sorted(os.listdir(year_path)):
                # endswith, not substring: skip names like "x.parquet.bak"
                if not file.endswith(".parquet"):
                    continue
                logger.add_metadata(current_file=file)

                ds = pd.dataset(os.path.join(year_path, file))

                table = file.replace(".parquet", "")
                # named-parameter INSERT; batch.to_pylist() yields one
                # {column: value} dict per row, matching the :col markers
                columns = [f":{col}" for col in ds.schema.names]
                insert_query = (
                    f"INSERT INTO {table} VALUES({','.join(columns)});"
                )

                # each `with conn` commits on success / rolls back on error
                with conn:
                    conn.execute(sqlite_table_query(table, ds.schema))
                with conn:
                    for batch in ds.to_batches(batch_size=250_000):
                        conn.executemany(insert_query, batch.to_pylist())
        finally:
            conn.close()

        gzip_file(db_path)

        logger.log_complete()
    except Exception as exception:
        logger.log_failure(exception)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.