-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Update pipeline to use trip_id
as unique key
#160
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -271,15 +271,15 @@ def upgrade() -> None: | |
sa.Column("service_date", sa.Integer(), nullable=False), | ||
sa.Column("direction_id", sa.Boolean(), nullable=False), | ||
sa.Column("route_id", sa.String(length=60), nullable=False), | ||
sa.Column("start_time", sa.Integer(), nullable=False), | ||
sa.Column("start_time", sa.Integer(), nullable=True), | ||
sa.Column("vehicle_id", sa.String(length=60), nullable=False), | ||
sa.Column("stop_sequence", sa.SmallInteger(), nullable=True), | ||
sa.Column("stop_id", sa.String(length=60), nullable=False), | ||
sa.Column("parent_station", sa.String(length=60), nullable=False), | ||
sa.Column("vp_move_timestamp", sa.Integer(), nullable=True), | ||
sa.Column("vp_stop_timestamp", sa.Integer(), nullable=True), | ||
sa.Column("tu_stop_timestamp", sa.Integer(), nullable=True), | ||
sa.Column("trip_id", sa.String(length=128), nullable=True), | ||
sa.Column("trip_id", sa.String(length=128), nullable=False), | ||
sa.Column("vehicle_label", sa.String(length=128), nullable=True), | ||
sa.Column("vehicle_consist", sa.String(), nullable=True), | ||
sa.Column("static_version_key", sa.Integer(), nullable=False), | ||
|
@@ -342,12 +342,12 @@ def upgrade() -> None: | |
sa.Column("service_date", sa.Integer(), nullable=False), | ||
sa.Column("route_id", sa.String(length=60), nullable=False), | ||
sa.Column("direction_id", sa.Boolean(), nullable=False), | ||
sa.Column("start_time", sa.Integer(), nullable=False), | ||
sa.Column("start_time", sa.Integer(), nullable=True), | ||
sa.Column("vehicle_id", sa.String(length=60), nullable=False), | ||
sa.Column("branch_route_id", sa.String(length=60), nullable=True), | ||
sa.Column("trunk_route_id", sa.String(length=60), nullable=True), | ||
sa.Column("stop_count", sa.SmallInteger(), nullable=True), | ||
sa.Column("trip_id", sa.String(length=128), nullable=True), | ||
sa.Column("trip_id", sa.String(length=128), nullable=False), | ||
sa.Column("vehicle_label", sa.String(length=128), nullable=True), | ||
sa.Column("vehicle_consist", sa.String(), nullable=True), | ||
sa.Column("direction", sa.String(length=30), nullable=True), | ||
|
@@ -371,9 +371,7 @@ def upgrade() -> None: | |
sa.UniqueConstraint( | ||
"service_date", | ||
"route_id", | ||
"direction_id", | ||
"start_time", | ||
"vehicle_id", | ||
"trip_id", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion(non-blocking): not sure it matters for you all, but a route is an attribute of a trip, so that's already unique: you don't need both items. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have examples in our test set where the same I'm not sure how common this is, but it does occur, and I believe the unique trip designations we are looking for should include There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment has been added to the function call to describe this behavior There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you have details about when you've seen that, it would be helpful: I don't think that should be happening (at least not within a single start_date). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Our flat file test data is a random sample from May 8th of this year. Two trip ids exhibit this behavior (ADDED-1581518542, ADDED-1581518549). CSV file with the GTFS-RT Vehicle Position data is attached. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, something like this makes sense for ADDED trips (although probably that reflects a bug in RTR). I'm still not sure why |
||
name="vehicle_trips_unique_trip", | ||
), | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -237,11 +237,7 @@ def trips_for_metrics_subquery( | |
sa.func.lead(rt_trips_sub.c.vp_move_timestamp) | ||
.over( | ||
partition_by=rt_trips_sub.c.vehicle_id, | ||
order_by=sa.func.coalesce( | ||
rt_trips_sub.c.vp_move_timestamp, | ||
rt_trips_sub.c.vp_stop_timestamp, | ||
rt_trips_sub.c.tu_stop_timestamp, | ||
), | ||
order_by=rt_trips_sub.c.vp_move_timestamp, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. was this change necessary as well if we're running the VACUUM? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change wasn't necessary, but looking at the field that's being calculated, and how it's used. I don't believe the coalesce is needed. I think this business logic was a little bit of a hold over from before Ops Analytics decided they wanted all headways as departure to departure calculations. |
||
) | ||
.label("next_station_move"), | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,6 +74,13 @@ def load_new_trip_data(db_manager: DatabaseManager) -> None: | |
|
||
This guarantees that all events will have | ||
matching trips data in the "vehicle_trips" table | ||
|
||
This INSERT/UPDATE logic will load distinct trip information from the last | ||
recorded trip-stop event of a trip. The information in the last trip-stop | ||
event is assumed to be more accurate than information from the first | ||
trip-stop event because some values can carry over from the last trip of the | ||
vehicle. The `TempEventCompare.stop_sequence.desc()` `order_by` call is | ||
responsible for this behavior. | ||
""" | ||
process_logger = ProcessLogger("l1_trips.load_new_trips") | ||
process_logger.log_start() | ||
|
@@ -93,15 +100,13 @@ def load_new_trip_data(db_manager: DatabaseManager) -> None: | |
.distinct( | ||
TempEventCompare.service_date, | ||
TempEventCompare.route_id, | ||
TempEventCompare.direction_id, | ||
TempEventCompare.start_time, | ||
TempEventCompare.vehicle_id, | ||
TempEventCompare.trip_id, | ||
) | ||
.order_by( | ||
TempEventCompare.service_date, | ||
TempEventCompare.route_id, | ||
TempEventCompare.direction_id, | ||
TempEventCompare.start_time, | ||
TempEventCompare.trip_id, | ||
TempEventCompare.stop_sequence.desc(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why add this one into the ordering? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The purpose of this query is to collect additional information fields ( vehicle_label, vehicle_consist) for unique trips that will be used for UPDATE/INSERT operations into our vehicle_trips table. For this first stop-event in a trip, this information is frequently carried over from the last trip for the vehicle, so this query is collecting the latest stop-event values from these information fields to UPDATE/INSERT into our vehicle_trips table. That is why the desc() ORDER is used, it would probably be helpful to add a comment to this effect above these calls. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment has been added to the function call to describe this behavior There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question: is this backwards? stop sequences start with low numbers and go up. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The purpose of this query is to collect additional information fields ( For this first stop-event in a trip, this information is frequently carried over from the last trip for the vehicle, so this query is collecting the latest stop-event values from these information fields to UPDATE/INSERT into our That is why the |
||
) | ||
) | ||
|
||
|
@@ -124,9 +129,7 @@ def load_new_trip_data(db_manager: DatabaseManager) -> None: | |
index_elements=[ | ||
VehicleTrips.service_date, | ||
VehicleTrips.route_id, | ||
VehicleTrips.direction_id, | ||
VehicleTrips.start_time, | ||
VehicleTrips.vehicle_id, | ||
VehicleTrips.trip_id, | ||
], | ||
) | ||
) | ||
|
@@ -146,15 +149,13 @@ def load_new_trip_data(db_manager: DatabaseManager) -> None: | |
.distinct( | ||
TempEventCompare.service_date, | ||
TempEventCompare.route_id, | ||
TempEventCompare.direction_id, | ||
TempEventCompare.start_time, | ||
TempEventCompare.vehicle_id, | ||
TempEventCompare.trip_id, | ||
) | ||
.order_by( | ||
TempEventCompare.service_date, | ||
TempEventCompare.route_id, | ||
TempEventCompare.direction_id, | ||
TempEventCompare.start_time, | ||
TempEventCompare.trip_id, | ||
TempEventCompare.stop_sequence.desc(), | ||
) | ||
.where( | ||
sa.or_( | ||
|
@@ -175,9 +176,7 @@ def load_new_trip_data(db_manager: DatabaseManager) -> None: | |
.where( | ||
VehicleTrips.service_date == distinct_update_query.c.service_date, | ||
VehicleTrips.route_id == distinct_update_query.c.route_id, | ||
VehicleTrips.direction_id == distinct_update_query.c.direction_id, | ||
VehicleTrips.start_time == distinct_update_query.c.start_time, | ||
VehicleTrips.vehicle_id == distinct_update_query.c.vehicle_id, | ||
VehicleTrips.trip_id == distinct_update_query.c.trip_id, | ||
) | ||
) | ||
db_manager.execute(trip_update_query) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we need to keep
route_id
in here?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have examples in our test set where the same trip_id continues across more than one Green Line route_id for the same start_date.
I'm not sure how common this is, but it does occur, and I believe the unique trip designations we are looking for should include route_id as a guarantee.