diff --git a/changelog.d/428.md b/changelog.d/428.md new file mode 100644 index 00000000..ba337597 --- /dev/null +++ b/changelog.d/428.md @@ -0,0 +1,2 @@ +- Impute household `bus_fare_spending` (COICOP 7.3.2 bus & coach fares) from the detailed LCFS codes as a new output of the consumption QRF, providing the passenger fare households pay (distinct from the ETB-based `bus_subsidy_spending`) as a building block for modelling bus fare reforms. Re-enables the `bus_subsidy_spending` smoke-test target (GBP 2.5bn) and records the `bus_fare_spending` target (GBP 3.4bn, DfT Annual Bus Statistics year ending March 2025) to enable once a dataset built with the imputation is published. Refs #427. +- Retry OBR detailed-forecast-table downloads with exponential backoff on transient HTTP errors (429/5xx) and connection failures, so an occasional OBR rate-limit no longer drops the OBR target set and reds the build. diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index bf8ad42f..3742bb04 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -146,6 +146,18 @@ "p537": "domestic_energy_consumption", # aggregate kept for backward compat } +# LCFS detailed COICOP codes for bus & coach fares (passenger transport by +# road). There is no single P-code for bus fares alone — P607 +# (transport_consumption) bundles vehicle purchase, running costs, fuel, air and +# rail — so bus_fare_spending is summed from the detailed codes. Excludes rail +# (c731xx), air, combined tickets and taxis (which the LCFS codes separately). +# Confirmed present in the LCFS 2021/22 and 2023/24 (current release) dvhh +# files; re-confirm whenever CURRENT_LCFS_RELEASE is bumped, as detailed +# sub-codes can change between vintages (these resolve directly against the +# current release at build time, so a renamed/removed code fails loudly rather +# than silently zeroing). +BUS_FARE_LCFS_CODES = ["c73212", "c73213", "c73214"] + PREDICTOR_VARIABLES = [ "is_adult", "is_child", @@ -174,6 +186,7 @@ "miscellaneous_consumption", "petrol_spending", "diesel_spending", + "bus_fare_spending", # COICOP 7.3.2 bus & coach fares (see BUS_FARE_LCFS_CODES) "domestic_energy_consumption", # aggregate; backward compat with price cap subsidy "electricity_consumption", "gas_consumption", @@ -585,11 +598,20 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) household = household.rename(columns=CONSUMPTION_VARIABLE_RENAMES) + # Bus & coach fares (COICOP 7.3.2), summed from the detailed LCFS codes. + # Recorded household-level only — LCFS has no person-level fare field — so + # this is the household total; allocating to individuals (e.g. for an + # age-targeted fare reform) requires an external age-usage profile (NTS). + household["bus_fare_spending"] = sum( + household[code] for code in BUS_FARE_LCFS_CODES + ) + # Annualise weekly LCFS values. Use the same WEEKS_IN_YEAR constant # (365.25 / 7 ≈ 52.1786) as `datasets/frs.py` rather than a bare `* 52`, # which underestimates annual totals by ~0.34% and skews VAT / energy # imputation targets against FRS income. annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [ + "bus_fare_spending", "hbai_household_net_income", "household_gross_income", "electricity_consumption", diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py index 82b270be..9f2f31f7 100644 --- a/policyengine_uk_data/targets/sources/obr.py +++ b/policyengine_uk_data/targets/sources/obr.py @@ -11,6 +11,7 @@ import io import logging +import time from functools import lru_cache import openpyxl @@ -37,12 +38,46 @@ } -@lru_cache(maxsize=1) +# OBR occasionally rate-limits CI runners (HTTP 429) or returns transient 5xx +# errors. Retry with exponential backoff so a single throttled response does not +# drop the whole OBR target set and red an unrelated build. +_DOWNLOAD_MAX_ATTEMPTS = 4 +_DOWNLOAD_RETRY_STATUSES = {429, 500, 502, 503, 504} + + +@lru_cache(maxsize=2) def _download_workbook(url: str) -> openpyxl.Workbook: - """Download an xlsx from OBR and return an openpyxl workbook.""" - r = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=60) - r.raise_for_status() - return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False) + """Download an xlsx from OBR and return an openpyxl workbook. + + Retries transient HTTP errors (429/5xx) and connection failures with + exponential backoff, honouring a numeric Retry-After header when present. + """ + last_error: Exception | None = None + for attempt in range(_DOWNLOAD_MAX_ATTEMPTS): + wait = 2**attempt + try: + r = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=60) + except requests.RequestException as e: + last_error = e # connection/timeout — retryable + else: + if r.status_code not in _DOWNLOAD_RETRY_STATUSES: + r.raise_for_status() + return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False) + last_error = requests.HTTPError( + f"{r.status_code} for url: {url}", response=r + ) + retry_after = r.headers.get("Retry-After", "") + if retry_after.isdigit(): + wait = int(retry_after) + if attempt < _DOWNLOAD_MAX_ATTEMPTS - 1: + logger.warning( + "OBR download %s failed (%s); retrying in %ss", + url, + last_error, + wait, + ) + time.sleep(wait) + raise last_error def _read_row_values(ws, row_num: int, col_letters: list[str]) -> dict[int, float]: diff --git a/policyengine_uk_data/tests/test_aggregates.py b/policyengine_uk_data/tests/test_aggregates.py index 868bfe94..6a63c2be 100644 --- a/policyengine_uk_data/tests/test_aggregates.py +++ b/policyengine_uk_data/tests/test_aggregates.py @@ -6,7 +6,15 @@ # ORR/GOV.UK rail finance statistics report GBP 21.6bn of government # support to the rail industry in 2024-25. "rail_subsidy_spending": 21.6e9, - # "bus_subsidy_spending": 2.5e9, + # Approximate public support for local bus services; kept as a loose + # smoke-test target because source coverage and dataset coverage differ. + "bus_subsidy_spending": 2.5e9, + # DfT Annual Bus Statistics (year ending March 2025) report GBP 3.4bn + # passenger fare receipts for local bus services in England. The LCFS input + # is UK household bus/coach fare spending, so this is an order-of-magnitude + # target. Enable once a dataset built with the bus_fare_spending imputation + # is published — the column is absent from the currently-released dataset. + # "bus_fare_spending": 3.4e9, } diff --git a/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py b/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py index 3641061c..f421f44b 100644 --- a/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py +++ b/policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py @@ -30,6 +30,9 @@ def add_has_fuel(household): **{f"p{code}": [1.0] for code in range(601, 613)}, "c72211": [5.0], "c72212": [6.0], + "c73212": [2.0], + "c73213": [3.0], + "c73214": [1.0], } ) person = pd.DataFrame( @@ -50,6 +53,8 @@ def add_has_fuel(household): assert result["accommodation_type"].iloc[0] == "HOUSE_SEMI_DETACHED" assert result["employment_income"].iloc[0] == 300.0 * WEEKS_IN_YEAR assert result["household_weight"].iloc[0] == 500 + # Bus fare = sum of the COICOP 7.3.2 codes (2 + 3 + 1), annualised. + assert result["bus_fare_spending"].iloc[0] == 6.0 * WEEKS_IN_YEAR assert ( result["domestic_energy_consumption"].iloc[0] == result["electricity_consumption"].iloc[0] + result["gas_consumption"].iloc[0]