From 4d5b62b74ad4d6202205641ebf63131fb3039cdf Mon Sep 17 00:00:00 2001
From: Vahid Ahmadi <va.vahidahmadi@gmail.com>
Date: Wed, 17 Jun 2026 16:43:58 +0100
Subject: [PATCH 1/3] Add end-to-end regression test for bus_fare_spending in
 dataset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

generate_lcfs_table is unit-tested to compute bus_fare_spending, but nothing
checks that it survives the QRF predict + enhanced-dataset assembly/save into
the published dataset — and it currently doesn't (issue #430): every other
consumption output lands, but bus_fare_spending is dropped downstream.

Add an end-to-end test asserting the enhanced dataset carries a populated
bus_fare_spending column. Marked xfail (non-strict) so it is mergeable and
documents the gap; it will XPASS once the pipeline is fixed, at which point
the marker should be removed and the test kept as a hard assertion.

Refs #430.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../test_bus_fare_spending_in_dataset.py      | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py

diff --git a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py
new file mode 100644
index 00000000..b7a7c0d2
--- /dev/null
+++ b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py
@@ -0,0 +1,36 @@
+"""End-to-end regression test: bus_fare_spending must survive the full build.
+
+`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column
+(test_lcfs_consumption_ingestion), but nothing checks that it survives the
+QRF train/predict and enhanced-dataset assembly/save into the published
+dataset. It currently does not (see issue #430) — every other consumption
+output lands, but bus_fare_spending is dropped somewhere downstream.
+
+This test is marked xfail so it is mergeable and documents the known gap; it
+will XPASS once the pipeline is fixed, prompting removal of the marker and
+conversion to a hard assertion.
+"""
+
+import pytest
+
+
+@pytest.mark.xfail(
+    reason=(
+        "bus_fare_spending is imputed but dropped downstream of "
+        "generate_lcfs_table before reaching the enhanced dataset (issue #430). "
+        "Remove this marker once the dataset carries the column."
+    ),
+    strict=False,  # XPASS is reported but does not fail the run
+)
+def test_enhanced_dataset_contains_bus_fare_spending(baseline):
+    assert "bus_fare_spending" in baseline.input_variables, (
+        "bus_fare_spending is not present in the enhanced dataset."
+    )
+    total = baseline.calculate(
+        "bus_fare_spending", map_to="household", period=2025
+    ).sum()
+    # UK household bus/coach fare spend is ~£2.7bn, so the £1bn floor below
+    # rejects an all-zero (or near-empty) column slipping through as 'present'.
+    assert total > 1e9, (
+        f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn"
+    )

From 4493f097ee379d0a761d184f1e1039a67e8a9789 Mon Sep 17 00:00:00 2001
From: Vahid Ahmadi <va.vahidahmadi@gmail.com>
Date: Wed, 17 Jun 2026 16:44:23 +0100
Subject: [PATCH 2/3] Add changelog entry for bus_fare_spending dataset
 regression test (#431)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 changelog.d/431.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/431.md

diff --git a/changelog.d/431.md b/changelog.d/431.md
new file mode 100644
index 00000000..ad3fcb58
--- /dev/null
+++ b/changelog.d/431.md
@@ -0,0 +1 @@
+- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset.

From 8fb9d9686949ecacd50753398cf631bc08fe61a4 Mon Sep 17 00:00:00 2001
From: Vahid Ahmadi <va.vahidahmadi@gmail.com>
Date: Wed, 17 Jun 2026 17:32:43 +0100
Subject: [PATCH 3/3] TEMP: probe bus_fare_spending presence after each build
 step (diagnostic, to revert)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 policyengine_uk_data/datasets/create_datasets.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py
index 391896db..691892ba 100644
--- a/policyengine_uk_data/datasets/create_datasets.py
+++ b/policyengine_uk_data/datasets/create_datasets.py
@@ -46,6 +46,12 @@ def _materialize_base_year_dataset(dataset, frs_release, uprate_dataset):
     return uprate_dataset(dataset, frs_release.base_year)
 
 
+def _busprobe(ds, stage):  # TEMP PROBE — remove after diagnosing bus_fare drop
+    present = "bus_fare_spending" in ds.household.columns  # n = -1 when absent
+    n = int((ds.household["bus_fare_spending"] > 0).sum()) if present else -1
+    logging.info("BUSPROBE %-34s present=%s nonzero=%s", stage, present, n)
+
+
 def main():
     """Create enhanced FRS dataset with rich progress tracking."""
     try:
@@ -158,31 +164,38 @@ def main():
             # uses num_vehicles as a predictor for fuel spending
             update_dataset("Impute wealth", "processing")
             frs = impute_wealth(frs)
+            _busprobe(frs, "after_wealth")  # TEMP PROBE
             frs = uprate_property_by_region(frs)
             update_dataset("Impute wealth", "completed")
 
             update_dataset("Impute consumption", "processing")
             frs = impute_consumption(frs)
+            _busprobe(frs, "after_consumption")  # TEMP PROBE
             update_dataset("Impute consumption", "completed")
 
             update_dataset("Impute VAT", "processing")
             frs = impute_vat(frs)
+            _busprobe(frs, "after_vat")  # TEMP PROBE
             update_dataset("Impute VAT", "completed")
 
             update_dataset("Impute public service usage", "processing")
             frs = impute_services(frs)
+            _busprobe(frs, "after_services")  # TEMP PROBE
             update_dataset("Impute public service usage", "completed")
 
             update_dataset("Impute income", "processing")
             frs = impute_income(frs)
+            _busprobe(frs, "after_income")  # TEMP PROBE
             update_dataset("Impute income", "completed")
 
             update_dataset("Impute capital gains", "processing")
             frs = impute_capital_gains(frs)
+            _busprobe(frs, "after_capital_gains")  # TEMP PROBE
             update_dataset("Impute capital gains", "completed")
 
             update_dataset("Impute salary sacrifice", "processing")
             frs = impute_salary_sacrifice(frs)
+            _busprobe(frs, "after_salary_sacrifice")  # TEMP PROBE
             update_dataset("Impute salary sacrifice", "completed")
 
             update_dataset("Impute student loan plan", "processing")
@@ -199,6 +212,7 @@ def main():
             )
 
             frs = clone_and_assign(frs, n_clones=oa_clones)
+            _busprobe(frs, "after_clone")  # TEMP PROBE
             update_dataset("Clone and assign OA geography", "completed")
 
             if align_to_base_year:
@@ -280,6 +294,7 @@ def main():
             update_dataset("Calibrate local authority weights", "completed")
 
             frs_calibrated = frs_calibrated_constituencies
+            _busprobe(frs_calibrated, "after_calibrate")  # TEMP PROBE
             if materialize_base_year:
                 update_dataset(materialize_step, "processing")
                 frs_calibrated = _materialize_base_year_dataset(
@@ -309,6 +324,7 @@ def main():
             update_dataset("Calibrate fuel litres", "completed")
 
             update_dataset("Save final dataset", "processing")
+            _busprobe(frs_calibrated, "before_save")  # TEMP PROBE
             strip_internal_disability_reported_amounts(frs_calibrated).save(
                 STORAGE_FOLDER / frs_release.enhanced_dataset_file
             )