Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changelog.d/428.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- Impute household `bus_fare_spending` (COICOP 7.3.2 bus & coach fares) from the detailed LCFS codes as a new output of the consumption QRF, providing the passenger fare households pay (distinct from the ETB-based `bus_subsidy_spending`) as a building block for modelling bus fare reforms. Re-enables the `bus_subsidy_spending` smoke-test target (GBP 2.5bn) and records the `bus_fare_spending` target (GBP 3.4bn, DfT Annual Bus Statistics year ending March 2025), to be enabled once a dataset built with the imputation is published. Refs #427.
- Retry OBR detailed-forecast-table downloads with exponential backoff on transient HTTP errors (429/5xx) and connection failures, so an occasional OBR rate-limit no longer drops the OBR target set and fails the build.
22 changes: 22 additions & 0 deletions policyengine_uk_data/datasets/imputations/consumption.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,18 @@
"p537": "domestic_energy_consumption", # aggregate kept for backward compat
}

# LCFS detailed COICOP codes for bus & coach fares (passenger transport by
# road). There is no single P-code for bus fares alone — P607
# (transport_consumption) bundles vehicle purchase, running costs, fuel, air and
# rail — so bus_fare_spending is summed from the detailed codes. Excludes rail
# (c731xx), air, combined tickets and taxis (which the LCFS codes separately).
# Confirmed present in the LCFS 2021/22 and 2023/24 (current release) dvhh
# files; re-confirm whenever CURRENT_LCFS_RELEASE is bumped, as detailed
# sub-codes can change between vintages (these resolve directly against the
# current release at build time, so a renamed/removed code fails loudly rather
# than silently zeroing).
BUS_FARE_LCFS_CODES = ["c73212", "c73213", "c73214"]

PREDICTOR_VARIABLES = [
"is_adult",
"is_child",
Expand Down Expand Up @@ -174,6 +186,7 @@
"miscellaneous_consumption",
"petrol_spending",
"diesel_spending",
"bus_fare_spending", # COICOP 7.3.2 bus & coach fares (see BUS_FARE_LCFS_CODES)
"domestic_energy_consumption", # aggregate; backward compat with price cap subsidy
"electricity_consumption",
"gas_consumption",
Expand Down Expand Up @@ -585,11 +598,20 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame)

household = household.rename(columns=CONSUMPTION_VARIABLE_RENAMES)

# Bus & coach fares (COICOP 7.3.2), summed from the detailed LCFS codes.
# Recorded household-level only — LCFS has no person-level fare field — so
# this is the household total; allocating to individuals (e.g. for an
# age-targeted fare reform) requires an external age-usage profile (NTS).
household["bus_fare_spending"] = sum(
household[code] for code in BUS_FARE_LCFS_CODES
)

# Annualise weekly LCFS values. Use the same WEEKS_IN_YEAR constant
# (365.25 / 7 ≈ 52.1786) as `datasets/frs.py` rather than a bare `* 52`,
# which underestimates annual totals by ~0.34% and skews VAT / energy
# imputation targets against FRS income.
annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [
"bus_fare_spending",
"hbai_household_net_income",
"household_gross_income",
"electricity_consumption",
Expand Down
45 changes: 40 additions & 5 deletions policyengine_uk_data/targets/sources/obr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import io
import logging
import time
from functools import lru_cache

import openpyxl
Expand All @@ -37,12 +38,46 @@
}


@lru_cache(maxsize=1)
# OBR occasionally rate-limits CI runners (HTTP 429) or returns transient 5xx
# errors. Retry with exponential backoff so a single throttled response does not
# drop the whole OBR target set and fail an unrelated build.
_DOWNLOAD_MAX_ATTEMPTS = 4
_DOWNLOAD_RETRY_STATUSES = {429, 500, 502, 503, 504}


# Upper bound (seconds) on a server-supplied Retry-After delay, so a very
# large value cannot stall the build for an unbounded time.
_DOWNLOAD_MAX_RETRY_AFTER = 60


@lru_cache(maxsize=2)
def _download_workbook(url: str) -> openpyxl.Workbook:
    """Download an xlsx from OBR and return an openpyxl workbook.

    Retries transient HTTP errors (429/5xx) and request failures with
    exponential backoff (1s, 2s, 4s, ...). A numeric Retry-After header on a
    retryable response overrides the backoff delay, capped at
    ``_DOWNLOAD_MAX_RETRY_AFTER`` seconds.

    Args:
        url: Address of the workbook to fetch.

    Returns:
        The workbook parsed from the response body, loaded with
        ``data_only=False``.

    Raises:
        requests.HTTPError: The response carried a non-retryable error status
            (raised immediately), or a retryable status persisted through the
            final attempt.
        requests.RequestException: The request itself failed on the final
            attempt.
    """
    last_error: Exception | None = None
    for attempt in range(_DOWNLOAD_MAX_ATTEMPTS):
        # Default delay before the next attempt; may be replaced below by the
        # server's Retry-After hint.
        wait = 2**attempt
        try:
            r = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=60)
        except requests.RequestException as e:
            last_error = e  # connection/timeout — retryable
        else:
            if r.status_code not in _DOWNLOAD_RETRY_STATUSES:
                # Success, or an error status that retrying will not fix:
                # surface the latter straight away.
                r.raise_for_status()
                return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False)
            last_error = requests.HTTPError(
                f"{r.status_code} for url: {url}", response=r
            )
            retry_after = r.headers.get("Retry-After", "").strip()
            # isascii() guards against non-ASCII "digits" (e.g. superscripts)
            # that pass isdigit() but make int() raise ValueError. HTTP-date
            # values are ignored and fall back to the exponential delay.
            if retry_after.isascii() and retry_after.isdigit():
                wait = min(int(retry_after), _DOWNLOAD_MAX_RETRY_AFTER)
        # No point sleeping after the final attempt; fall through and raise.
        if attempt < _DOWNLOAD_MAX_ATTEMPTS - 1:
            logger.warning(
                "OBR download %s failed (%s); retrying in %ss",
                url,
                last_error,
                wait,
            )
            time.sleep(wait)
    raise last_error


def _read_row_values(ws, row_num: int, col_letters: list[str]) -> dict[int, float]:
Expand Down
10 changes: 9 additions & 1 deletion policyengine_uk_data/tests/test_aggregates.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,15 @@
# ORR/GOV.UK rail finance statistics report GBP 21.6bn of government
# support to the rail industry in 2024-25.
"rail_subsidy_spending": 21.6e9,
# "bus_subsidy_spending": 2.5e9,
# Approximate public support for local bus services; kept as a loose
# smoke-test target because source coverage and dataset coverage differ.
"bus_subsidy_spending": 2.5e9,
# DfT Annual Bus Statistics (year ending March 2025) report GBP 3.4bn
# passenger fare receipts for local bus services in England. The LCFS input
# is UK household bus/coach fare spending, so this is an order-of-magnitude
# target. Enable once a dataset built with the bus_fare_spending imputation
# is published — the column is absent from the currently-released dataset.
# "bus_fare_spending": 3.4e9,
}


Expand Down
5 changes: 5 additions & 0 deletions policyengine_uk_data/tests/test_lcfs_consumption_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ def add_has_fuel(household):
**{f"p{code}": [1.0] for code in range(601, 613)},
"c72211": [5.0],
"c72212": [6.0],
"c73212": [2.0],
"c73213": [3.0],
"c73214": [1.0],
}
)
person = pd.DataFrame(
Expand All @@ -50,6 +53,8 @@ def add_has_fuel(household):
assert result["accommodation_type"].iloc[0] == "HOUSE_SEMI_DETACHED"
assert result["employment_income"].iloc[0] == 300.0 * WEEKS_IN_YEAR
assert result["household_weight"].iloc[0] == 500
# Bus fare = sum of the COICOP 7.3.2 codes (2 + 3 + 1), annualised.
assert result["bus_fare_spending"].iloc[0] == 6.0 * WEEKS_IN_YEAR
assert (
result["domestic_energy_consumption"].iloc[0]
== result["electricity_consumption"].iloc[0] + result["gas_consumption"].iloc[0]
Expand Down