-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CHORE: Clean Up Parquet File Writing #189
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -199,7 +199,7 @@ def run_hyper(self) -> None: | |
) | ||
process_log.log_start() | ||
|
||
for retry_count in range(max_retries): | ||
for retry_count in range(max_retries + 1): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Allows for the actual number of retries specified by `max_retries` (the previous `range(max_retries)` loop ran one attempt short). |
||
try: | ||
process_log.add_metadata(retry_count=retry_count) | ||
# get datasource from Tableau to check "updated_at" datetime | ||
|
@@ -256,7 +256,7 @@ def run_hyper(self) -> None: | |
break | ||
|
||
except Exception as exception: | ||
if retry_count == max_retries - 1: | ||
if retry_count == max_retries: | ||
process_log.log_failure(exception=exception) | ||
|
||
def run_parquet(self, db_manager: DatabaseManager) -> None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,10 +40,13 @@ def create_parquet(self, db_manager: DatabaseManager) -> None: | |
if os.path.exists(self.local_parquet_path): | ||
os.remove(self.local_parquet_path) | ||
|
||
db_batch_size = 1024 * 1024 / 2 | ||
|
||
db_manager.write_to_parquet( | ||
select_query=sa.text(self.create_query), | ||
write_path=self.local_parquet_path, | ||
schema=self.parquet_schema, | ||
batch_size=db_batch_size, | ||
Comment on lines
+43
to
+49
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on ECS Health dashboard, memory utilization appeared to peak somewhere between 70-90% on our ECS with 16GB of memory. A reduced batch size should lower peak memory usage during parquet file writes. |
||
) | ||
|
||
def update_parquet(self, db_manager: DatabaseManager) -> bool: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,7 +26,6 @@ def __init__(self) -> None: | |
" date(vt.service_date::text) as service_date" | ||
" , TIMEZONE('UTC', TO_TIMESTAMP(extract(epoch FROM date(vt.service_date::text)) + vt.start_time)) as start_datetime" | ||
" , TIMEZONE('UTC', TO_TIMESTAMP(extract(epoch FROM date(vt.service_date::text)) + vt.static_start_time)) as static_start_datetime" | ||
" , ve.pm_trip_id" | ||
" , ve.stop_sequence" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think `pm_trip_id` needs to be included in the exported parquet file (it is an internal identifier). |
||
" , ve.canonical_stop_sequence" | ||
" , prev_ve.canonical_stop_sequence as previous_canonical_stop_sequence" | ||
|
@@ -36,11 +35,8 @@ def __init__(self) -> None: | |
" , prev_ve.stop_id as previous_stop_id" | ||
" , ve.parent_station" | ||
" , prev_ve.parent_station as previous_parent_station" | ||
# " , ve.vp_move_timestamp as previous_stop_departure_timestamp" | ||
" , TIMEZONE('America/New_York', TO_TIMESTAMP(ve.vp_move_timestamp)) as previous_stop_departure_datetime" | ||
# " , COALESCE(ve.vp_stop_timestamp, ve.tu_stop_timestamp) as stop_arrival_timestamp" | ||
" , TIMEZONE('America/New_York', TO_TIMESTAMP(COALESCE(ve.vp_stop_timestamp, ve.tu_stop_timestamp))) as stop_arrival_datetime" | ||
# " , COALESCE(ve.vp_stop_timestamp, ve.tu_stop_timestamp) + ve.dwell_time_seconds as stop_departure_timestamp" | ||
" , TIMEZONE('America/New_York', TO_TIMESTAMP(COALESCE(ve.vp_stop_timestamp, ve.tu_stop_timestamp) + ve.dwell_time_seconds)) as stop_departure_datetime" | ||
" , (ve.vp_move_timestamp - extract(epoch FROM date(vt.service_date::text)))::int as previous_stop_departure_sec" | ||
" , (ve.vp_move_timestamp - extract(epoch FROM date(vt.service_date::text)) + ve.travel_time_seconds)::int as stop_arrival_sec" | ||
|
@@ -59,13 +55,12 @@ def __init__(self) -> None: | |
" , vt.static_trip_id_guess" | ||
" , vt.static_start_time" | ||
" , vt.static_stop_count" | ||
" , vt.first_last_station_match" | ||
" , vt.first_last_station_match as exact_static_trip_match" | ||
Comment on lines
-62
to
+58
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
" , vt.static_version_key" | ||
" , ve.travel_time_seconds" | ||
" , ve.dwell_time_seconds" | ||
" , ve.headway_trunk_seconds" | ||
" , ve.headway_branch_seconds" | ||
" , ve.updated_on " | ||
"FROM " | ||
" vehicle_events ve " | ||
"LEFT JOIN " | ||
|
@@ -95,7 +90,6 @@ def parquet_schema(self) -> pyarrow.schema: | |
("service_date", pyarrow.date32()), | ||
("start_datetime", pyarrow.timestamp("us")), | ||
("static_start_datetime", pyarrow.timestamp("us")), | ||
("pm_trip_id", pyarrow.int64()), | ||
("stop_sequence", pyarrow.int16()), | ||
("canonical_stop_sequence", pyarrow.int16()), | ||
("previous_canonical_stop_sequence", pyarrow.int16()), | ||
|
@@ -105,11 +99,8 @@ def parquet_schema(self) -> pyarrow.schema: | |
("previous_stop_id", pyarrow.string()), | ||
("parent_station", pyarrow.string()), | ||
("previous_parent_station", pyarrow.string()), | ||
# ("previous_stop_departure_timestamp", pyarrow.int64()), | ||
("previous_stop_departure_datetime", pyarrow.timestamp("us")), | ||
# ("stop_arrival_timestamp", pyarrow.int64()), | ||
("stop_arrival_datetime", pyarrow.timestamp("us")), | ||
# ("stop_departure_timestamp", pyarrow.int64()), | ||
("stop_departure_datetime", pyarrow.timestamp("us")), | ||
("previous_stop_departure_sec", pyarrow.int64()), | ||
("stop_arrival_sec", pyarrow.int64()), | ||
|
@@ -128,29 +119,35 @@ def parquet_schema(self) -> pyarrow.schema: | |
("static_trip_id_guess", pyarrow.string()), | ||
("static_start_time", pyarrow.int64()), | ||
("static_stop_count", pyarrow.int64()), | ||
("first_last_station_match", pyarrow.bool_()), | ||
("exact_static_trip_match", pyarrow.bool_()), | ||
("static_version_key", pyarrow.int64()), | ||
("travel_time_seconds", pyarrow.int32()), | ||
("dwell_time_seconds", pyarrow.int32()), | ||
("headway_trunk_seconds", pyarrow.int32()), | ||
("headway_branch_seconds", pyarrow.int32()), | ||
("updated_on", pyarrow.timestamp("us")), | ||
] | ||
) | ||
|
||
def create_parquet(self, db_manager: DatabaseManager) -> None: | ||
create_query = self.table_query % "" | ||
|
||
# this is a fairly wide dataset, so dial back the batch size | ||
# to limit memory usage | ||
db_batch_size = 1024 * 1024 / 2 | ||
|
||
Comment on lines
+134
to
+137
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should limit memory utilization during initial file creation events. |
||
if os.path.exists(self.local_parquet_path): | ||
os.remove(self.local_parquet_path) | ||
|
||
db_manager.write_to_parquet( | ||
select_query=sa.text(create_query), | ||
write_path=self.local_parquet_path, | ||
schema=self.parquet_schema, | ||
batch_size=db_batch_size, | ||
) | ||
|
||
def update_parquet(self, db_manager: DatabaseManager) -> bool: | ||
dataset_batch_size = 1024 * 1024 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is the update double the create size? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Sorry, actually this is the "dataset" batch size. My intuition is that these batch sizes are much less memory intensive than the large DB query batches. And I believe there is a file size reduction by selecting the largest possible dataset batch size. |
||
|
||
download_file( | ||
object_path=self.remote_parquet_path, | ||
file_name=self.local_parquet_path, | ||
|
@@ -174,7 +171,7 @@ def update_parquet(self, db_manager: DatabaseManager) -> bool: | |
# update downloaded parquet file with filtered service_date | ||
old_filter = pc.field("service_date") < max_start_date | ||
old_batches = pd.dataset(self.local_parquet_path).to_batches( | ||
filter=old_filter, batch_size=1024 * 1024 | ||
filter=old_filter, batch_size=dataset_batch_size | ||
) | ||
filter_path = "/tmp/filter_local.parquet" | ||
with pq.ParquetWriter( | ||
|
@@ -193,7 +190,7 @@ def update_parquet(self, db_manager: DatabaseManager) -> bool: | |
combine_batches = pd.dataset( | ||
joined_dataset, | ||
schema=self.parquet_schema, | ||
).to_batches(batch_size=1024 * 1024) | ||
).to_batches(batch_size=dataset_batch_size) | ||
|
||
with pq.ParquetWriter( | ||
combine_parquet_path, schema=self.parquet_schema | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missed this on the change from Python 3.9 to 3.10.