Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/431.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Add an end-to-end regression test asserting the enhanced dataset contains a populated `bus_fare_spending` column (xfail until the downstream build drop in #430 is fixed), covering the gap between the unit-tested `generate_lcfs_table` and the published dataset.
16 changes: 16 additions & 0 deletions policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ def _materialize_base_year_dataset(dataset, frs_release, uprate_dataset):
return uprate_dataset(dataset, frs_release.base_year)


def _busprobe(ds, stage):  # TEMP PROBE — remove after diagnosing bus_fare drop
    """Log whether the household table still has ``bus_fare_spending``.

    Emits one INFO record through the ``logging`` module's root-level
    ``info`` function, tagged ``BUSPROBE`` so the probe lines can be
    grepped out of a build log.

    Args:
        ds: Object exposing a ``household`` table with a ``columns``
            attribute and column lookup by name.
        stage: Label for the pipeline stage being probed; it is
            left-aligned and padded to 34 characters in the log line.
    """
    households = ds.household
    column_found = "bus_fare_spending" in households.columns
    if column_found:
        # Counts strictly positive entries, although the log field is
        # labelled "nonzero".
        positive_rows = int((households["bus_fare_spending"] > 0).sum())
    else:
        # -1 marks "column absent", distinct from "present but all <= 0".
        positive_rows = -1
    logging.info(
        "BUSPROBE %-34s present=%s nonzero=%s",
        stage,
        column_found,
        positive_rows,
    )


def main():
"""Create enhanced FRS dataset with rich progress tracking."""
try:
Expand Down Expand Up @@ -158,31 +164,38 @@ def main():
# uses num_vehicles as a predictor for fuel spending
update_dataset("Impute wealth", "processing")
frs = impute_wealth(frs)
_busprobe(frs, "after_wealth") # TEMP PROBE
frs = uprate_property_by_region(frs)
update_dataset("Impute wealth", "completed")

update_dataset("Impute consumption", "processing")
frs = impute_consumption(frs)
_busprobe(frs, "after_consumption") # TEMP PROBE
update_dataset("Impute consumption", "completed")

update_dataset("Impute VAT", "processing")
frs = impute_vat(frs)
_busprobe(frs, "after_vat") # TEMP PROBE
update_dataset("Impute VAT", "completed")

update_dataset("Impute public service usage", "processing")
frs = impute_services(frs)
_busprobe(frs, "after_services") # TEMP PROBE
update_dataset("Impute public service usage", "completed")

update_dataset("Impute income", "processing")
frs = impute_income(frs)
_busprobe(frs, "after_income") # TEMP PROBE
update_dataset("Impute income", "completed")

update_dataset("Impute capital gains", "processing")
frs = impute_capital_gains(frs)
_busprobe(frs, "after_capital_gains") # TEMP PROBE
update_dataset("Impute capital gains", "completed")

update_dataset("Impute salary sacrifice", "processing")
frs = impute_salary_sacrifice(frs)
_busprobe(frs, "after_salary_sacrifice") # TEMP PROBE
update_dataset("Impute salary sacrifice", "completed")

update_dataset("Impute student loan plan", "processing")
Expand All @@ -199,6 +212,7 @@ def main():
)

frs = clone_and_assign(frs, n_clones=oa_clones)
_busprobe(frs, "after_clone") # TEMP PROBE
update_dataset("Clone and assign OA geography", "completed")

if align_to_base_year:
Expand Down Expand Up @@ -280,6 +294,7 @@ def main():
update_dataset("Calibrate local authority weights", "completed")

frs_calibrated = frs_calibrated_constituencies
_busprobe(frs_calibrated, "after_calibrate") # TEMP PROBE
if materialize_base_year:
update_dataset(materialize_step, "processing")
frs_calibrated = _materialize_base_year_dataset(
Expand Down Expand Up @@ -309,6 +324,7 @@ def main():
update_dataset("Calibrate fuel litres", "completed")

update_dataset("Save final dataset", "processing")
_busprobe(frs_calibrated, "before_save") # TEMP PROBE
strip_internal_disability_reported_amounts(frs_calibrated).save(
STORAGE_FOLDER / frs_release.enhanced_dataset_file
)
Expand Down
36 changes: 36 additions & 0 deletions policyengine_uk_data/tests/test_bus_fare_spending_in_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""End-to-end regression test: bus_fare_spending must survive the full build.

`generate_lcfs_table` is unit-tested to compute the bus_fare_spending column
(test_lcfs_consumption_ingestion), but nothing checks that it survives the
QRF train/predict and enhanced-dataset assembly/save into the published
dataset. It currently does not (see issue #430) — every other consumption
output lands, but bus_fare_spending is dropped somewhere downstream.

This test is marked xfail (non-strict) so it is mergeable and documents the
known gap. Once the pipeline is fixed it will be reported as XPASS rather than
failing the run; at that point remove the marker so the test becomes a hard
assertion.
"""

import pytest


@pytest.mark.xfail(
    reason=(
        "bus_fare_spending is imputed but dropped downstream of "
        "generate_lcfs_table before reaching the enhanced dataset (issue #430). "
        "Remove this marker once the dataset carries the column."
    ),
    strict=False,
)
def test_enhanced_dataset_contains_bus_fare_spending(baseline):
    """Check the enhanced dataset carries a populated bus_fare_spending.

    Two conditions are asserted against the ``baseline`` fixture: the
    variable is listed among its input variables, and its household-level
    sum for period 2025 exceeds 1e9.
    """
    variable = "bus_fare_spending"

    column_present = variable in baseline.input_variables
    assert column_present, (
        "bus_fare_spending is not present in the enhanced dataset."
    )

    household_values = baseline.calculate(
        variable, map_to="household", period=2025
    )
    total = household_values.sum()

    # Aggregate UK household bus/coach fare spending is roughly £2.7bn, so a
    # £1bn floor catches a column that exists but is entirely (or nearly)
    # zero.
    assert total > 1e9, (
        f"bus_fare_spending present but implausibly small: £{total / 1e9:.2f}bn"
    )
Loading