From 4d5b62b74ad4d6202205641ebf63131fb3039cdf Mon Sep 17 00:00:00 2001
From: Vahid Ahmadi
Date: Wed, 17 Jun 2026 16:43:58 +0100
Subject: [PATCH 1/3] Add end-to-end regression test for bus_fare_spending in dataset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

generate_lcfs_table is unit-tested to compute bus_fare_spending, but nothing
checked it survives the QRF predict + enhanced-dataset assembly/save into the
published dataset — and it currently doesn't (issue #430): every other
consumption output lands, bus_fare_spending is dropped downstream.

Add an end-to-end test asserting the enhanced dataset carries a populated
bus_fare_spending column. Marked xfail so it is mergeable and documents the
gap; it will XPASS once the pipeline is fixed.

Refs #430.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .../test_bus_fare_spending_in_dataset.py | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py

diff --git a/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py
new file mode 100644
index 00000000..b7a7c0d2
--- /dev/null
+++ b/policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py
@@ -0,0 +1,36 @@
+"""End-to-end regression test: bus_fare_spending must survive the full build.
+
+`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column
+(test_lcfs_consumption_ingestion), but nothing checks that it survives the
+QRF train/predict and enhanced-dataset assembly/save into the published
+dataset. It currently does not (see issue #430) — every other consumption
+output lands, but bus_fare_spending is dropped somewhere downstream.
+
+This test is marked xfail so it is mergeable and documents the known gap; it
+will XPASS once the pipeline is fixed, prompting removal of the marker and
+conversion to a hard assertion.
+"""
+
+import pytest
+
+
+@pytest.mark.xfail(
+    reason=(
+        "bus_fare_spending is imputed but dropped downstream of "
+        "generate_lcfs_table before reaching the enhanced dataset (issue #430). "
+        "Remove this marker once the dataset carries the column."
+    ),
+    strict=False,
+)
+def test_enhanced_dataset_contains_bus_fare_spending(baseline):
+    assert "bus_fare_spending" in baseline.input_variables, (
+        "bus_fare_spending is not present in the enhanced dataset."
+    )
+    total = baseline.calculate(
+        "bus_fare_spending", map_to="household", period=2025
+    ).sum()
+    # UK household bus/coach fare spend is ~£2.7bn; guard against an all-zero
+    # column slipping through as 'present'.
+    assert total > 1e9, (
+        f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn"
+    )
From 4493f097ee379d0a761d184f1e1039a67e8a9789 Mon Sep 17 00:00:00 2001
From: Vahid Ahmadi
Date: Wed, 17 Jun 2026 16:44:23 +0100
Subject: [PATCH 2/3] Add changelog entry for bus_fare_spending dataset regression test (#431)

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 changelog.d/431.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/431.md

diff --git a/changelog.d/431.md b/changelog.d/431.md
new file mode 100644
index 00000000..ad3fcb58
--- /dev/null
+++ b/changelog.d/431.md
@@ -0,0 +1 @@
+- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset.
From 8fb9d9686949ecacd50753398cf631bc08fe61a4 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 17 Jun 2026 17:32:43 +0100 Subject: [PATCH 3/3] TEMP: probe bus_fare_spending presence after each build step (diagnostic, to revert) Co-Authored-By: Claude Opus 4.8 (1M context) --- policyengine_uk_data/datasets/create_datasets.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 391896db..691892ba 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -46,6 +46,12 @@ def _materialize_base_year_dataset(dataset, frs_release, uprate_dataset): return uprate_dataset(dataset, frs_release.base_year) +def _busprobe(ds, stage): # TEMP PROBE — remove after diagnosing bus_fare drop + present = "bus_fare_spending" in ds.household.columns + n = int((ds.household["bus_fare_spending"] > 0).sum()) if present else -1 + logging.info("BUSPROBE %-34s present=%s nonzero=%s", stage, present, n) + + def main(): """Create enhanced FRS dataset with rich progress tracking.""" try: @@ -158,31 +164,38 @@ def main(): # uses num_vehicles as a predictor for fuel spending update_dataset("Impute wealth", "processing") frs = impute_wealth(frs) + _busprobe(frs, "after_wealth") # TEMP PROBE frs = uprate_property_by_region(frs) update_dataset("Impute wealth", "completed") update_dataset("Impute consumption", "processing") frs = impute_consumption(frs) + _busprobe(frs, "after_consumption") # TEMP PROBE update_dataset("Impute consumption", "completed") update_dataset("Impute VAT", "processing") frs = impute_vat(frs) + _busprobe(frs, "after_vat") # TEMP PROBE update_dataset("Impute VAT", "completed") update_dataset("Impute public service usage", "processing") frs = impute_services(frs) + _busprobe(frs, "after_services") # TEMP PROBE update_dataset("Impute public service usage", "completed") update_dataset("Impute 
income", "processing") frs = impute_income(frs) + _busprobe(frs, "after_income") # TEMP PROBE update_dataset("Impute income", "completed") update_dataset("Impute capital gains", "processing") frs = impute_capital_gains(frs) + _busprobe(frs, "after_capital_gains") # TEMP PROBE update_dataset("Impute capital gains", "completed") update_dataset("Impute salary sacrifice", "processing") frs = impute_salary_sacrifice(frs) + _busprobe(frs, "after_salary_sacrifice") # TEMP PROBE update_dataset("Impute salary sacrifice", "completed") update_dataset("Impute student loan plan", "processing") @@ -199,6 +212,7 @@ def main(): ) frs = clone_and_assign(frs, n_clones=oa_clones) + _busprobe(frs, "after_clone") # TEMP PROBE update_dataset("Clone and assign OA geography", "completed") if align_to_base_year: @@ -280,6 +294,7 @@ def main(): update_dataset("Calibrate local authority weights", "completed") frs_calibrated = frs_calibrated_constituencies + _busprobe(frs_calibrated, "after_calibrate") # TEMP PROBE if materialize_base_year: update_dataset(materialize_step, "processing") frs_calibrated = _materialize_base_year_dataset( @@ -309,6 +324,7 @@ def main(): update_dataset("Calibrate fuel litres", "completed") update_dataset("Save final dataset", "processing") + _busprobe(frs_calibrated, "before_save") # TEMP PROBE strip_internal_disability_reported_amounts(frs_calibrated).save( STORAGE_FOLDER / frs_release.enhanced_dataset_file )