FIX: GTFS Compressed Schedule Upload (#447)
The S3 sync operation for GTFS compressed parquet files was calling the "GTFSArchive.parquet_path" method to create S3 object upload paths. That method automatically appends ".parquet" to any file name passed to it, but the GTFS_ARCHIVE.db.gz file is not supposed to have ".parquet" appended.

This change stops using the "GTFSArchive.parquet_path" method and falls back to a simple path join to create the S3 object upload path. This matches the behavior of the current PROD environment, which has not seen these errors.
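
A minimal sketch of the two path-construction approaches (the bucket prefix and the body of the stand-in helper are assumptions for illustration; only the ".parquet"-appending behavior comes from the description above, and the real method returns an object whose .s3_uri attribute holds the path):

import os

# Hypothetical values, for illustration only.
s3_uri = "s3://example-bucket/lamp/gtfs_archive"
year = "2024"

def parquet_path_sketch(year: str, file: str) -> str:
    # Simplified stand-in for GTFSArchive.parquet_path: per the description
    # above, it always appends ".parquet" to the file name it is given.
    return os.path.join(s3_uri, year, f"{file}.parquet")

# Old upload path: the sqlite archive picks up a spurious ".parquet" suffix.
print(parquet_path_sketch(year, "GTFS_ARCHIVE.db.gz"))
# s3://example-bucket/lamp/gtfs_archive/2024/GTFS_ARCHIVE.db.gz.parquet

# New upload path: a plain join keeps the file name as-is.
print(os.path.join(s3_uri, year, "GTFS_ARCHIVE.db.gz"))
# s3://example-bucket/lamp/gtfs_archive/2024/GTFS_ARCHIVE.db.gz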
rymarczy authored Oct 2, 2024
1 parent f0a4652 commit 42bb38b
Showing 1 changed file with 7 additions and 8 deletions.
15 changes: 7 additions & 8 deletions src/lamp_py/ingestion/compress_gtfs/gtfs_to_parquet.py
@@ -305,12 +305,10 @@ def gtfs_to_parquet() -> None:

     # compress each schedule in feed
     for schedule in feed.rows(named=True):
-        schedule_url = schedule["archive_url"]
-        schedule_pub_dt = schedule["published_dt"]
         schedule_details = ScheduleDetails(
-            schedule_url,
-            schedule_pub_dt,
-            gtfs_tmp_folder,
+            file_location=schedule["archive_url"],
+            published_dt=schedule["published_dt"],
+            tmp_folder=gtfs_tmp_folder,
         )
         compress_gtfs_schedule(schedule_details)

@@ -319,8 +317,9 @@ def gtfs_to_parquet() -> None:
         year_path = os.path.join(gtfs_tmp_folder, year)
         pq_folder_to_sqlite(year_path)
         for file in os.listdir(year_path):
-            local_path = os.path.join(year_path, file)
-            upload_path = compressed_gtfs.parquet_path(year, file).s3_uri
-            upload_file(local_path, upload_path)
+            upload_file(
+                file_name=os.path.join(year_path, file),
+                object_path=os.path.join(compressed_gtfs.s3_uri, year, file),
+            )

     logger.log_complete()
