Commit 0c76a7d

FIX: Tableau writer schema mismatch with Bus schema (#486)
Primary fix: the bus_schema defined in the Tableau writer must be in the same order as the polars schema that the Bus_Performance_Manager outputs in bus_vehicle_events.

Changes:
- Re-align the order of the Tableau schema to match the bus schema
- Add a select statement to ensure the read-in Table is in the correct schema order (a sketch of this failure mode follows below)
- Refactor the Tableau analysis conversions out into a new method for easy testing and verification
- Add new analysis scripts that help check for issues in Tableau/parquet writing
- Update the debugging info in this area
1 parent d3e58f1 commit 0c76a7d
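
The core failure mode: pyarrow's ParquetWriter requires every table it writes to match the schema the writer was opened with, column order included. Below is a minimal sketch of the breakage and of the select() fix; the column names and file name are toy values, not taken from the commit:

import polars as pl
import pyarrow
import pyarrow.parquet as pq

schema = pyarrow.schema([("a", pyarrow.int64()), ("b", pyarrow.large_string())])

# toy batch whose columns arrive in the wrong order
batch = pyarrow.table(
    {"b": ["x"], "a": [1]},
    schema=pyarrow.schema([("b", pyarrow.large_string()), ("a", pyarrow.int64())]),
)

with pq.ParquetWriter("demo.parquet", schema=schema) as writer:
    # writer.write_table(batch) would raise here, because the batch schema
    # differs from the writer schema in column order alone
    reordered = pl.from_arrow(batch).select(schema.names)  # reorder to match
    writer.write_table(reordered.to_arrow())  # polars strings map back to large_string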

File tree

6 files changed: +171, -40 lines


analysis/check_bus_tableau.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
#!/usr/bin/env python
from lamp_py.runtime_utils.remote_files import bus_events
import pyarrow
import pyarrow.parquet as pq
import pyarrow.dataset as pd
from pyarrow.fs import S3FileSystem
from lamp_py.aws.s3 import file_list_from_s3
import polars as pl

from lamp_py.tableau.conversions.convert_bus_performance_data import apply_bus_analysis_conversions

########################################################################
# NOTE: ensure .env PUBLIC_ARCHIVE_BUCKET is pointed to the right bucket
########################################################################

# this schema and the order of this schema SHOULD match what comes out
# of the polars version out of bus_performance_manager.
bus_schema = pyarrow.schema(
    [
        ("service_date", pyarrow.date32()),  # change to date type
        ("route_id", pyarrow.large_string()),
        ("trip_id", pyarrow.large_string()),
        ("start_time", pyarrow.int64()),
        ("start_dt", pyarrow.timestamp("us")),
        ("stop_count", pyarrow.uint32()),
        ("direction_id", pyarrow.int8()),
        ("stop_id", pyarrow.large_string()),
        ("stop_sequence", pyarrow.int64()),
        ("vehicle_id", pyarrow.large_string()),
        ("vehicle_label", pyarrow.large_string()),
        ("gtfs_travel_to_dt", pyarrow.timestamp("us")),
        ("tm_stop_sequence", pyarrow.int64()),
        ("tm_scheduled_time_dt", pyarrow.timestamp("us")),
        ("tm_actual_arrival_dt", pyarrow.timestamp("us")),
        ("tm_actual_departure_dt", pyarrow.timestamp("us")),
        ("tm_scheduled_time_sam", pyarrow.int64()),
        ("tm_actual_arrival_time_sam", pyarrow.int64()),
        ("tm_actual_departure_time_sam", pyarrow.int64()),
        ("plan_trip_id", pyarrow.large_string()),
        ("exact_plan_trip_match", pyarrow.bool_()),
        ("block_id", pyarrow.large_string()),
        ("service_id", pyarrow.large_string()),
        ("route_pattern_id", pyarrow.large_string()),
        ("route_pattern_typicality", pyarrow.int64()),
        ("direction", pyarrow.large_string()),
        ("direction_destination", pyarrow.large_string()),
        ("plan_stop_count", pyarrow.uint32()),
        ("plan_start_time", pyarrow.int64()),
        ("plan_start_dt", pyarrow.timestamp("us")),
        ("stop_name", pyarrow.large_string()),
        ("plan_travel_time_seconds", pyarrow.int64()),
        ("plan_route_direction_headway_seconds", pyarrow.int64()),
        ("plan_direction_destination_headway_seconds", pyarrow.int64()),
        ("stop_arrival_dt", pyarrow.timestamp("us")),
        ("stop_departure_dt", pyarrow.timestamp("us")),
        ("gtfs_travel_to_seconds", pyarrow.int64()),
        ("stop_arrival_seconds", pyarrow.int64()),
        ("stop_departure_seconds", pyarrow.int64()),
        ("travel_time_seconds", pyarrow.int64()),
        ("dwell_time_seconds", pyarrow.int64()),
        ("route_direction_headway_seconds", pyarrow.int64()),
        ("direction_destination_headway_seconds", pyarrow.int64()),
    ]
)
s3_uris = file_list_from_s3(bucket_name=bus_events.bucket, file_prefix=bus_events.prefix)
ds_paths = [s.replace("s3://", "") for s in s3_uris]

ds_paths = ds_paths[-5:]

ds = pd.dataset(
    ds_paths,
    format="parquet",
    filesystem=S3FileSystem(),
)

with pq.ParquetWriter("test.parquet", schema=bus_schema) as writer:
    for batch in ds.to_batches(batch_size=500_000):
        try:
            # this select() is here to make sure the order of the polars_df
            # schema is the same as the bus_schema above.
            # order of schema matters to the ParquetWriter

            # if the bus_schema above is in the same order as the batch
            # schema, then the select will do nothing - as expected
            polars_df = pl.from_arrow(batch).select(bus_schema.names)  # type: ignore[union-attr]

            if not isinstance(polars_df, pl.DataFrame):
                raise TypeError(f"Expected a Polars DataFrame or Series, but got {type(polars_df)}")

            writer.write_table(apply_bus_analysis_conversions(polars_df))
        except Exception as exception:
            print(exception)
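
One quick follow-up check, not part of the commit, is to read back the schema of the file the script just wrote and confirm it still matches the expected schema:

import pyarrow.parquet as pq

written_schema = pq.read_schema("test.parquet")
assert written_schema.equals(bus_schema), "output schema drifted from bus_schema"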
src/lamp_py/tableau/conversions/convert_bus_performance_data.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
import polars as pl
from pyarrow import Table


def apply_bus_analysis_conversions(polars_df: pl.DataFrame) -> Table:
    """
    Function to apply final conversions to lamp data before outputting for tableau consumption
    """
    # Convert datetime to Eastern Time
    polars_df = polars_df.with_columns(
        pl.col("stop_arrival_dt").dt.convert_time_zone(time_zone="US/Eastern").dt.replace_time_zone(None),
        pl.col("stop_departure_dt").dt.convert_time_zone(time_zone="US/Eastern").dt.replace_time_zone(None),
        pl.col("gtfs_travel_to_dt").dt.convert_time_zone(time_zone="US/Eastern").dt.replace_time_zone(None),
    )

    # Convert seconds columns to be aligned with Eastern Time
    polars_df = polars_df.with_columns(
        (pl.col("gtfs_travel_to_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d"))
        .dt.total_seconds()
        .alias("gtfs_travel_to_seconds"),
        (pl.col("stop_arrival_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d"))
        .dt.total_seconds()
        .alias("stop_arrival_seconds"),
        (pl.col("stop_departure_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d"))
        .dt.total_seconds()
        .alias("stop_departure_seconds"),
    )

    polars_df = polars_df.with_columns(pl.col("service_date").str.strptime(pl.Date, "%Y%m%d", strict=False))

    return polars_df.to_arrow()
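
To see what apply_bus_analysis_conversions produces, here is a hypothetical one-row frame pushed through it; the column values are invented, and the naive input datetimes are treated as UTC by polars when converting to US/Eastern:

from datetime import datetime

import polars as pl

from lamp_py.tableau.conversions.convert_bus_performance_data import apply_bus_analysis_conversions

df = pl.DataFrame(
    {
        "service_date": ["20240102"],
        "gtfs_travel_to_dt": [datetime(2024, 1, 2, 15, 0)],
        "stop_arrival_dt": [datetime(2024, 1, 2, 15, 5)],
        "stop_departure_dt": [datetime(2024, 1, 2, 15, 6)],
    }
)

table = apply_bus_analysis_conversions(df)
# datetimes come back as Eastern wall-clock times (10:00, 10:05, 10:06 EST),
# and the *_seconds columns hold seconds after midnight of the service date
print(table)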

src/lamp_py/tableau/hyper.py

Lines changed: 12 additions & 9 deletions
@@ -282,29 +282,32 @@ def run_parquet(self, db_manager: Optional[DatabaseManager]) -> None:
         )
         remote_schema_match = self.parquet_schema.equals(remote_schema)
         remote_version_match = self.remote_version_match()
+        process_log.add_metadata(
+            stage="check_schema",
+            remote_schema_match=remote_schema_match,
+            remote_version_match=remote_version_match,
+        )

         if remote_schema_match is False or remote_version_match is False:
             # create new parquet if no remote parquet found or
             # remote schema does not match expected local schema
-            run_action = "create"
             upload_parquet = True
+            process_log.add_metadata(stage="create_parquet")
             self.create_parquet(db_manager)
+
         else:
-            run_action = "update"
+            process_log.add_metadata(stage="update_parquet")
             upload_parquet = self.update_parquet(db_manager)

         parquet_file_size_mb = 0.0
         if os.path.exists(self.local_parquet_path):
             parquet_file_size_mb = os.path.getsize(self.local_parquet_path) / (1024 * 1024)

-        process_log.add_metadata(
-            remote_schema_match=remote_schema_match,
-            run_action=run_action,
-            upload_parquet=upload_parquet,
-            parquet_file_size_mb=f"{parquet_file_size_mb:.2f}",
-        )
-
         if upload_parquet:
+            process_log.add_metadata(
+                stage="upload_parquet",
+                parquet_file_size_mb=f"{parquet_file_size_mb:.2f}",
+            )
             upload_file(
                 file_name=self.local_parquet_path,
                 object_path=self.remote_parquet_path,

src/lamp_py/tableau/jobs/bus_performance.py

Lines changed: 20 additions & 31 deletions
@@ -1,7 +1,6 @@
 from typing import Optional
 from datetime import datetime
 from datetime import timezone
-
 import pyarrow
 import pyarrow.parquet as pq
 import pyarrow.dataset as pd
@@ -10,13 +9,18 @@
 import polars as pl

 from lamp_py.tableau.hyper import HyperJob
+from lamp_py.tableau.conversions.convert_bus_performance_data import apply_bus_analysis_conversions
+
 from lamp_py.runtime_utils.remote_files import bus_events
 from lamp_py.runtime_utils.remote_files import tableau_bus_all
 from lamp_py.runtime_utils.remote_files import tableau_bus_recent
 from lamp_py.aws.s3 import file_list_from_s3
 from lamp_py.aws.s3 import file_list_from_s3_with_details
 from lamp_py.aws.s3 import object_exists

+# this schema and the order of this schema SHOULD match what comes out
+# of the polars version from bus_performance_manager.
+# see select() comment below..
 bus_schema = pyarrow.schema(
     [
         ("service_date", pyarrow.date32()),  # change to date type
@@ -32,6 +36,12 @@
         ("vehicle_label", pyarrow.large_string()),
         ("gtfs_travel_to_dt", pyarrow.timestamp("us")),
         ("tm_stop_sequence", pyarrow.int64()),
+        ("tm_scheduled_time_dt", pyarrow.timestamp("us")),
+        ("tm_actual_arrival_dt", pyarrow.timestamp("us")),
+        ("tm_actual_departure_dt", pyarrow.timestamp("us")),
+        ("tm_scheduled_time_sam", pyarrow.int64()),
+        ("tm_actual_arrival_time_sam", pyarrow.int64()),
+        ("tm_actual_departure_time_sam", pyarrow.int64()),
         ("plan_trip_id", pyarrow.large_string()),
         ("exact_plan_trip_match", pyarrow.bool_()),
         ("block_id", pyarrow.large_string()),
@@ -56,12 +66,6 @@
         ("dwell_time_seconds", pyarrow.int64()),
         ("route_direction_headway_seconds", pyarrow.int64()),
         ("direction_destination_headway_seconds", pyarrow.int64()),
-        ("tm_scheduled_time_dt", pyarrow.timestamp("us")),
-        ("tm_actual_arrival_dt", pyarrow.timestamp("us")),
-        ("tm_actual_departure_dt", pyarrow.timestamp("us")),
-        ("tm_scheduled_time_sam", pyarrow.int64()),
-        ("tm_actual_arrival_time_sam", pyarrow.int64()),
-        ("tm_actual_departure_time_sam", pyarrow.int64()),
     ]
 )

@@ -84,34 +88,19 @@ def create_bus_parquet(job: HyperJob, num_files: Optional[int]) -> None:

     with pq.ParquetWriter(job.local_parquet_path, schema=job.parquet_schema) as writer:
         for batch in ds.to_batches(batch_size=500_000):
-            polars_df = pl.from_arrow(batch)
+            # this select() is here to make sure the order of the polars_df
+            # schema is the same as the bus_schema above.
+            # order of schema matters to the ParquetWriter
+
+            # if the bus_schema above is in the same order as the batch
+            # schema, then the select will do nothing - as expected
+
+            polars_df = pl.from_arrow(batch).select(bus_schema.names)  # type: ignore[union-attr]

             if not isinstance(polars_df, pl.DataFrame):
                 raise TypeError(f"Expected a Polars DataFrame or Series, but got {type(polars_df)}")

-            # Convert datetime to Eastern Time
-            polars_df = polars_df.with_columns(
-                pl.col("stop_arrival_dt").dt.convert_time_zone(time_zone="US/Eastern").dt.replace_time_zone(None),
-                pl.col("stop_departure_dt").dt.convert_time_zone(time_zone="US/Eastern").dt.replace_time_zone(None),
-                pl.col("gtfs_travel_to_dt").dt.convert_time_zone(time_zone="US/Eastern").dt.replace_time_zone(None),
-            )
-
-            # Convert seconds columns to be aligned with Eastern Time
-            polars_df = polars_df.with_columns(
-                (pl.col("gtfs_travel_to_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d"))
-                .dt.total_seconds()
-                .alias("gtfs_travel_to_seconds"),
-                (pl.col("stop_arrival_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d"))
-                .dt.total_seconds()
-                .alias("stop_arrival_seconds"),
-                (pl.col("stop_departure_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d"))
-                .dt.total_seconds()
-                .alias("stop_departure_seconds"),
-            )
-
-            polars_df = polars_df.with_columns(pl.col("service_date").str.strptime(pl.Date, "%Y%m%d", strict=False))
-
-            writer.write_table(polars_df.to_arrow())
+            writer.write_table(apply_bus_analysis_conversions(polars_df))


 class HyperBusPerformanceAll(HyperJob):
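
The new comment is worth confirming: when the batch columns already match bus_schema's order, the select() leaves the frame untouched, so the reorder is effectively free until the order has actually drifted. A tiny demonstration with made-up columns, not from the commit:

import polars as pl

df = pl.DataFrame({"a": [1], "b": ["x"]})
assert df.select(["a", "b"]).equals(df)             # matching order: no-op
assert df.select(["b", "a"]).columns == ["b", "a"]  # drifted order: reordered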
tests/bus_performance_manager/test_bus_convert_for_tableau.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#!/usr/bin/env python

import polars as pl

from lamp_py.tableau.conversions.convert_bus_performance_data import apply_bus_analysis_conversions


# poetry run pytest -s tests/bus_performance_manager/test_bus_convert_for_tableau.py
def test_apply_bus_analysis_conversions() -> None:
    """
    Test extracted conversions for tableau user view
    """
    df = pl.read_parquet("tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet")
    table = apply_bus_analysis_conversions(polars_df=df)
    print(df)
    print(table)
tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet

Binary file not shown.
