PolicyEngine · vahid-ahmadi · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/changelog.d/431.md b/changelog.d/431.md
@@ -0,0 +1 @@
+- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset.
diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py
@@ -46,6 +46,12 @@ def _materialize_base_year_dataset(dataset, frs_release, uprate_dataset):
     return uprate_dataset(dataset, frs_release.base_year)
 
 
+def _busprobe(ds, stage):  # TEMP PROBE — remove after diagnosing bus_fare drop
+    present = "bus_fare_spending" in ds.household.columns  # False -> n logged as -1
+    n = int((ds.household["bus_fare_spending"] > 0).sum()) if present else -1
+    logging.info("BUSPROBE %-34s present=%s nonzero=%s", stage, present, n)
+
+
 def main():
     """Create enhanced FRS dataset with rich progress tracking."""
     try:
@@ -158,31 +164,38 @@ def main():
             # uses num_vehicles as a predictor for fuel spending
             update_dataset("Impute wealth", "processing")
             frs = impute_wealth(frs)
+            _busprobe(frs, "after_wealth")  # TEMP PROBE
             frs = uprate_property_by_region(frs)
             update_dataset("Impute wealth", "completed")
 
             update_dataset("Impute consumption", "processing")
             frs = impute_consumption(frs)
+            _busprobe(frs, "after_consumption")  # TEMP PROBE
             update_dataset("Impute consumption", "completed")
 
             update_dataset("Impute VAT", "processing")
             frs = impute_vat(frs)
+            _busprobe(frs, "after_vat")  # TEMP PROBE
             update_dataset("Impute VAT", "completed")
 
             update_dataset("Impute public service usage", "processing")
             frs = impute_services(frs)
+            _busprobe(frs, "after_services")  # TEMP PROBE
             update_dataset("Impute public service usage", "completed")
 
             update_dataset("Impute income", "processing")
             frs = impute_income(frs)
+            _busprobe(frs, "after_income")  # TEMP PROBE
             update_dataset("Impute income", "completed")
 
             update_dataset("Impute capital gains", "processing")
             frs = impute_capital_gains(frs)
+            _busprobe(frs, "after_capital_gains")  # TEMP PROBE
             update_dataset("Impute capital gains", "completed")
 
             update_dataset("Impute salary sacrifice", "processing")
             frs = impute_salary_sacrifice(frs)
+            _busprobe(frs, "after_salary_sacrifice")  # TEMP PROBE
             update_dataset("Impute salary sacrifice", "completed")
 
             update_dataset("Impute student loan plan", "processing")
@@ -199,6 +212,7 @@ def main():
             )
 
             frs = clone_and_assign(frs, n_clones=oa_clones)
+            _busprobe(frs, "after_clone")  # TEMP PROBE
             update_dataset("Clone and assign OA geography", "completed")
 
             if align_to_base_year:
@@ -280,6 +294,7 @@ def main():
             update_dataset("Calibrate local authority weights", "completed")
 
             frs_calibrated = frs_calibrated_constituencies
+            _busprobe(frs_calibrated, "after_calibrate")  # TEMP PROBE
             if materialize_base_year:
                 update_dataset(materialize_step, "processing")
                 frs_calibrated = _materialize_base_year_dataset(
@@ -309,6 +324,7 @@ def main():
             update_dataset("Calibrate fuel litres", "completed")
 
             update_dataset("Save final dataset", "processing")
+            _busprobe(frs_calibrated, "before_save")  # TEMP PROBE
             strip_internal_disability_reported_amounts(frs_calibrated).save(
                 STORAGE_FOLDER / frs_release.enhanced_dataset_file
             )

diff --git a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py
@@ -0,0 +1,36 @@
+"""End-to-end regression test: bus_fare_spending must survive the full build.
+
+`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column
+(test_lcfs_consumption_ingestion), but nothing checks that it survives the
+QRF train/predict and enhanced-dataset assembly/save into the published
+dataset. It currently does not (see issue #430) — every other consumption
+output lands, but bus_fare_spending is dropped somewhere downstream.
+
+This test is marked xfail (non-strict) so it is mergeable and documents the
+known gap. Once the pipeline is fixed it will XPASS (reported, not failed);
+remove the marker then so the test becomes a hard assertion.
+"""
+
+import pytest
+
+
+@pytest.mark.xfail(
+    reason=(
+        "bus_fare_spending is imputed but dropped downstream of "
+        "generate_lcfs_table before reaching the enhanced dataset (issue #430). "
+        "Remove this marker once the dataset carries the column."
+    ),
+    strict=False,  # an unexpected pass is reported as XPASS, not as a failure
+)
+def test_enhanced_dataset_contains_bus_fare_spending(baseline):
+    assert "bus_fare_spending" in baseline.input_variables, (
+        "bus_fare_spending is not present in the enhanced dataset."
+    )
+    total = baseline.calculate(
+        "bus_fare_spending", map_to="household", period=2025
+    ).sum()  # one aggregate figure over the household-level values
+    # UK household bus/coach fare spend is ~£2.7bn, so £1bn is a loose floor that
+    # still rejects an all-zero column slipping through as 'present'.
+    assert total > 1e9, (
+        f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn"
+    )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset.