diff --git a/packages/populace-build/src/populace/build/us/__init__.py b/packages/populace-build/src/populace/build/us/__init__.py index 87395e3..3a90bbc 100644 --- a/packages/populace-build/src/populace/build/us/__init__.py +++ b/packages/populace-build/src/populace/build/us/__init__.py @@ -52,6 +52,15 @@ SimpleTaxExpenditureReform, compile_us_fiscal_target_registry, ) +from populace.build.us.reform_validation import ( + REFORM_VALIDATION_SCHEMA_VERSION, + ReformValidationSpec, + in_sample_reform_specs, + load_default_reform_specs, + out_of_sample_reform_specs, + reform_validation_payload, + write_reform_validation, +) from populace.build.us.source_coverage import ( LEDGER_US_SOURCE_COVERAGE_CONTRACT_COMMIT, US_SOURCE_COVERAGE, @@ -67,6 +76,8 @@ __all__ = [ "BuildConfig", "SimpleTaxExpenditureReform", + "ReformValidationSpec", + "REFORM_VALIDATION_SCHEMA_VERSION", "LEDGER_US_SOURCE_COVERAGE_CONTRACT_COMMIT", "US_DONORS", "US_FISCAL_MACRO_REALISM_BANDS", @@ -92,8 +103,13 @@ "US_STATE_INCOME_TAX_TARGET_REFERENCES", "compile_us_fiscal_target_registry", "hard_target_package_aliases", + "in_sample_reform_specs", + "load_default_reform_specs", + "out_of_sample_reform_specs", + "reform_validation_payload", "source_gap_family_ids", "us_plan", + "write_reform_validation", "us_source_coverage_diagnostics", "us_source_coverage_gate", "write_us_source_coverage_diagnostics", diff --git a/packages/populace-build/src/populace/build/us/obbba_reforms.json b/packages/populace-build/src/populace/build/us/obbba_reforms.json new file mode 100644 index 0000000..a8b6390 --- /dev/null +++ b/packages/populace-build/src/populace/build/us/obbba_reforms.json @@ -0,0 +1,54 @@ +{ + "_comment": "Out-of-sample reform-validation set: provisions of the 2025 One Big Beautiful Bill Act (OBBBA) that policyengine-us models in its baseline. Each reform is encoded as a COUNTERFACTUAL REVERT (the provision is already in the baseline, so we turn it off); the provision's budget effect is therefore baseline - reform (effect_direction='baseline_minus_reform'), which is sign-comparable to the JCT enactment score. We include only genuinely new provisions whose JCT line is NOT bundled with TCJA extension, so the parameter revert captures the whole provision. JCT figures are the per-fiscal-year lines from JCX-35-25 (July 1 2025), conventional/static, in dollars. We compare a single simulated year (period) to JCT's same-year line. Provisions excluded for now: SALT cap (JCT line bundles cap-extension + raise; sign ambiguous), CTC / standard deduction (JCT lines bundle TCJA extension + enhancement; a clean revert isn't isolable), senior bonus deduction (no standalone JCX-35-25 line), Trump accounts (not modeled in policyengine-us), estate exemption (rarely fires in microdata).", + "schema_version": 1, + "jct_document": { + "id": "JCX-35-25", + "title": "Estimated Revenue Effects Relative to the Present Law Baseline of the Tax Provisions in Title VII-Finance of the substitute legislation as passed by the Senate (FY2025 budget reconciliation)", + "published": "2025-07-01", + "url": "https://www.jct.gov/publications/2025/jcx-35-25/" + }, + "reforms": [ + { + "id": "obbba_no_tax_on_tips", + "name": "OBBBA — No tax on tips (tip income deduction)", + "category": "OBBBA", + "description": "Deduction for qualified tip income (sunsets 12/31/2028). Validated by reverting the tip-income deduction cap to 0 over the provision window and comparing the simulated FY2026 income-tax revenue loss to the JCT FY2026 line.", + "period": 2026, + "budget_measure": "income_tax", + "effect_direction": "baseline_minus_reform", + "parameter_changes": { + "gov.irs.deductions.tip_income.cap": { "2025-01-01.2028-12-31": 0 } + }, + "jct": { + "score": -10121000000, + "score_type": "conventional", + "window": "FY2026", + "source": "JCX-35-25, Ch.2 line 1 (FY2026)", + "source_url": "https://www.jct.gov/publications/2025/jcx-35-25/" + } + }, + { + "id": "obbba_no_tax_on_overtime", + "name": "OBBBA — No tax on overtime (overtime income deduction)", + "category": "OBBBA", + "description": "Deduction for qualified overtime premium income (sunsets 12/31/2028). Validated by reverting the overtime-income deduction cap to 0 over the provision window and comparing the simulated FY2026 income-tax revenue loss to the JCT FY2026 line.", + "period": 2026, + "budget_measure": "income_tax", + "effect_direction": "baseline_minus_reform", + "parameter_changes": { + "gov.irs.deductions.overtime_income.cap.JOINT": { "2025-01-01.2028-12-31": 0 }, + "gov.irs.deductions.overtime_income.cap.SINGLE": { "2025-01-01.2028-12-31": 0 }, + "gov.irs.deductions.overtime_income.cap.HEAD_OF_HOUSEHOLD": { "2025-01-01.2028-12-31": 0 }, + "gov.irs.deductions.overtime_income.cap.SURVIVING_SPOUSE": { "2025-01-01.2028-12-31": 0 }, + "gov.irs.deductions.overtime_income.cap.SEPARATE": { "2025-01-01.2028-12-31": 0 } + }, + "jct": { + "score": -32806000000, + "score_type": "conventional", + "window": "FY2026", + "source": "JCX-35-25, Ch.2 line 2 (FY2026)", + "source_url": "https://www.jct.gov/publications/2025/jcx-35-25/" + } + } + ] +} diff --git a/packages/populace-build/src/populace/build/us/reform_validation.py b/packages/populace-build/src/populace/build/us/reform_validation.py new file mode 100644 index 0000000..34b0b61 --- /dev/null +++ b/packages/populace-build/src/populace/build/us/reform_validation.py @@ -0,0 +1,395 @@ +"""Reform validation: score policy reforms on the calibrated dataset and +compare the budget effect to the authority's (JCT's) official score. + +Where ``calibration_diagnostics.json`` reports how well the calibrated weights +reproduce their *calibration targets*, ``reform_validation.json`` reports a +downstream property the calibration did not directly optimize: how closely the +dataset reproduces the *budget effects of scored policy reforms*. Two kinds of +reform are validated, and each row is labelled so a consumer can tell them +apart: + +* **in-sample** — the JCT tax-expenditure reforms that are themselves + calibration targets (``US_JCT_TAX_EXPENDITURE_REFORMS``). The dataset was + tuned to hit these, so agreement is expected; the row is published for + completeness and provenance, flagged ``in_sample=True``. +* **out-of-sample** — reforms the calibration never saw (e.g. provisions of + the 2025 One Big Beautiful Bill Act), curated in ``obbba_reforms.json`` with + their JCT scores. These are the genuine test of dataset fidelity. + +The simulation is isolated behind an injected ``simulate`` callable so the +payload assembly is unit-testable without policyengine-us; the default factory +builds a ``Microsimulation`` over the freshly written release H5. +""" + +from __future__ import annotations + +import json +import math +from collections.abc import Callable, Iterable, Sequence +from dataclasses import dataclass +from importlib.resources import files +from pathlib import Path +from typing import Any + +from populace.build.us.fiscal_targets import ( + US_JCT_TAX_EXPENDITURE_REFORMS, + SimpleTaxExpenditureReform, +) + +__all__ = [ + "REFORM_VALIDATION_SCHEMA_VERSION", + "ReformValidationSpec", + "in_sample_reform_specs", + "out_of_sample_reform_specs", + "tax_expenditure_reform_specs", + "load_default_reform_specs", + "reform_validation_payload", + "write_reform_validation", +] + +#: Schema version of reform_validation.json. The calibration-diagnostics +#: dashboard keys its reader on it; bump with any shape change. +REFORM_VALIDATION_SCHEMA_VERSION = 1 + +#: The budget effect of a reform is the weighted-sum change of this variable +#: between the reform and baseline simulations, unless a spec overrides it. For +#: income-tax provisions this is the simulated federal income-tax revenue change +#: — the quantity JCT scores (− = revenue loss / cost; + = revenue raised). +DEFAULT_BUDGET_MEASURE = "income_tax" + + +@dataclass(frozen=True) +class ReformValidationSpec: + """One reform to score on the dataset and compare to its JCT figure. + + Exactly one of ``neutralized_variable`` (an in-sample tax-expenditure + neutralization) or ``parameter_changes`` (an out-of-sample + ``Reform.from_dict`` payload) defines the reform. + """ + + id: str + name: str + category: str + in_sample: bool + period: int + # The published figure to compare against. None for provisions neither JCT + # nor Treasury score (e.g. the regular standard deduction, which both treat + # as baseline) — those rows publish the simulated magnitude with no error. + jct_score: float | None + jct_window: str + jct_source: str + jct_source_url: str + jct_score_type: str = "conventional" + budget_measure: str = DEFAULT_BUDGET_MEASURE + description: str = "" + neutralized_variable: str | None = None + parameter_changes: dict[str, Any] | None = None + # How the budget effect is signed relative to the simulations. JCT scores a + # *tax expenditure* as the revenue raised by repeal (reform − baseline, the + # neutralize convention), but scores an *enacted provision* as the effect of + # enacting it. OBBBA is already in the policyengine-us baseline, so its + # provisions are validated by a counterfactual *revert* reform — there the + # provision's effect is baseline − reform (JCT enactment sign). + effect_direction: str = "reform_minus_baseline" + + def __post_init__(self) -> None: + if not self.id: + raise ValueError("ReformValidationSpec.id is required.") + has_neutralize = bool(self.neutralized_variable) + has_params = bool(self.parameter_changes) + if has_neutralize == has_params: + raise ValueError( + f"{self.id}: provide exactly one of neutralized_variable or " + "parameter_changes." + ) + if self.effect_direction not in {"reform_minus_baseline", "baseline_minus_reform"}: + raise ValueError( + f"{self.id}: effect_direction must be 'reform_minus_baseline' or " + "'baseline_minus_reform'." + ) + + def build_reform(self) -> Any: + """Construct the policyengine reform object for this spec. + + Imports are lazy so the module (and its unit tests) load without + policyengine-us installed. + """ + if self.neutralized_variable: + from policyengine_core.reforms import Reform + + variable = self.neutralized_variable + + class _Neutralize(Reform): + def apply(self) -> None: + self.neutralize_variable(variable) + + return _Neutralize + from policyengine_core.reforms import Reform + + return Reform.from_dict(self.parameter_changes, country_id="us") + + +def _finite(value: float) -> float | None: + value = float(value) + return value if math.isfinite(value) else None + + +def in_sample_reform_specs( + reforms: Iterable[SimpleTaxExpenditureReform] = US_JCT_TAX_EXPENDITURE_REFORMS, + *, + period: int, +) -> tuple[ReformValidationSpec, ...]: + """The JCT tax-expenditure calibration targets as validation specs.""" + specs: list[ReformValidationSpec] = [] + for reform in reforms: + specs.append( + ReformValidationSpec( + id=reform.target_name, + name=reform.target_name, + category="JCT tax expenditure", + in_sample=True, + period=int(period), + # The JCT figure for an in-sample reform is the calibration + # target's own value, supplied at payload time via + # in_sample_targets — it lives in the ledger now, not on the + # reform object. + jct_score=None, + jct_window="annual", + jct_source=reform.source or "JCT tax-expenditure (calibration target)", + jct_source_url="", + budget_measure=reform.output_variable or DEFAULT_BUDGET_MEASURE, + neutralized_variable=reform.neutralized_variable, + ) + ) + return tuple(specs) + + +def _obbba_config_path() -> Path: + return Path(str(files(__package__).joinpath("obbba_reforms.json"))) + + +def out_of_sample_reform_specs( + path: Path | None = None, + *, + period: int, +) -> tuple[ReformValidationSpec, ...]: + """Curated out-of-sample reforms (OBBBA provisions) from JSON config.""" + config_path = path or _obbba_config_path() + if not config_path.exists(): + return () + payload = json.loads(config_path.read_text()) + specs: list[ReformValidationSpec] = [] + for raw in payload.get("reforms", ()): + jct = raw.get("jct", {}) + specs.append( + ReformValidationSpec( + id=raw["id"], + name=raw["name"], + category=raw.get("category", "OBBBA"), + in_sample=False, + period=int(raw.get("period", period)), + jct_score=float(jct["score"]), + jct_window=str(jct.get("window", "")), + jct_source=str(jct.get("source", "")), + jct_source_url=str(jct.get("source_url", "")), + jct_score_type=str(jct.get("score_type", "conventional")), + budget_measure=str(raw.get("budget_measure", DEFAULT_BUDGET_MEASURE)), + description=str(raw.get("description", "")), + parameter_changes=raw["parameter_changes"], + # OBBBA provisions are baked into the baseline, so the config + # encodes a revert; the provision's effect is baseline − reform. + effect_direction=str(raw.get("effect_direction", "baseline_minus_reform")), + ) + ) + return tuple(specs) + + +def _tax_expenditure_config_path() -> Path: + return Path(str(files(__package__).joinpath("tax_expenditure_reforms.json"))) + + +def tax_expenditure_reform_specs( + path: Path | None = None, + *, + period: int, +) -> tuple[ReformValidationSpec, ...]: + """Big-provision tax-expenditure reforms from JSON config. + + Each is a ``neutralize_variable`` reform (repeal the provision) whose + simulated revenue change is compared to a published tax-expenditure figure + (JCT where it scores the provision, Treasury otherwise — recorded per row). + The ``in_sample`` flag reflects whether the dataset is calibrated to that + provision (e.g. EITC is, the standard deduction is not), so a consumer can + weight the result accordingly. + """ + config_path = path or _tax_expenditure_config_path() + if not config_path.exists(): + return () + payload = json.loads(config_path.read_text()) + specs: list[ReformValidationSpec] = [] + for raw in payload.get("reforms", ()): + bench = raw.get("benchmark", {}) + specs.append( + ReformValidationSpec( + id=raw["id"], + name=raw["name"], + category=raw.get("category", "Tax expenditure"), + in_sample=bool(raw.get("in_sample", False)), + period=int(raw.get("period", period)), + jct_score=(float(bench["score"]) if bench.get("score") is not None else None), + jct_window=str(bench.get("window", "")), + jct_source=str(bench.get("source", "")), + jct_source_url=str(bench.get("source_url", "")), + jct_score_type=str(bench.get("score_type", "tax_expenditure")), + budget_measure=str(raw.get("budget_measure", DEFAULT_BUDGET_MEASURE)), + description=str(raw.get("description", "")), + neutralized_variable=raw["neutralized_variable"], + # Neutralizing the provision raises tax by the expenditure amount + # (positive), matching the positive published figure. + effect_direction="reform_minus_baseline", + ) + ) + return tuple(specs) + + +def load_default_reform_specs( + *, + period: int, + obbba_path: Path | None = None, + tax_expenditure_path: Path | None = None, +) -> tuple[ReformValidationSpec, ...]: + """In-sample JCT tax expenditures + out-of-sample OBBBA provisions + the + big-provision tax-expenditure reforms (CTC/EITC/CDCC/standard/itemized).""" + return ( + *in_sample_reform_specs(period=period), + *out_of_sample_reform_specs(obbba_path, period=period), + *tax_expenditure_reform_specs(tax_expenditure_path, period=period), + ) + + +# A simulate(reform_or_None) -> object with .calculate(measure, period).sum(). +SimulateFn = Callable[[Any], Any] + + +def _weighted_total(simulation: Any, measure: str, period: int) -> float: + """Weighted population total of ``measure`` (MicroSeries .sum() is + weight-aware in policyengine-us).""" + return float(simulation.calculate(measure, period).sum()) + + +def default_simulate_factory(dataset_path: Path) -> SimulateFn: + """Build a simulate() that runs a Microsimulation over the release H5.""" + + def simulate(reform: Any) -> Any: + from policyengine_us import Microsimulation + from policyengine_us.data import USSingleYearDataset + + dataset = USSingleYearDataset(file_path=str(dataset_path)) + if reform is None: + return Microsimulation(dataset=dataset) + return Microsimulation(dataset=dataset, reform=reform) + + return simulate + + +def reform_validation_payload( + specs: Sequence[ReformValidationSpec], + *, + period: int, + simulate: SimulateFn | None = None, + in_sample_estimates: dict[str, float] | None = None, + in_sample_targets: dict[str, float] | None = None, + release_id: str | None = None, +) -> dict[str, Any]: + """Score each reform on the dataset and render the JSON-stable payload. + + In-sample reforms are JCT tax-expenditure *calibration targets*: their + populace budget effect is the calibrated ``final_estimate`` the calibration + already produced — passed in via ``in_sample_estimates`` (keyed by spec id), + so no extra simulation is run for them. Out-of-sample reforms (OBBBA + provisions) are simulated: a baseline is built once and each reform's budget + effect is the weighted-sum change of its budget measure (reform − baseline). + + ``simulate`` is required only if any out-of-sample spec is present (or an + in-sample estimate is missing); when absent, those rows publish a null + budget effect rather than failing the build. The shape matches the + calibration-diagnostics dashboard's reform_validation reader. + """ + estimates = in_sample_estimates or {} + targets = in_sample_targets or {} + baseline: Any = None + baseline_totals: dict[tuple[int, str], float] = {} + + def baseline_total(measure: str, at_period: int) -> float: + nonlocal baseline + if baseline is None: + baseline = simulate(None) # type: ignore[misc] + key = (at_period, measure) + if key not in baseline_totals: + baseline_totals[key] = _weighted_total(baseline, measure, at_period) + return baseline_totals[key] + + def simulated_effect(spec: ReformValidationSpec) -> tuple[float | None, float | None, float | None]: + if simulate is None: + return None, None, None + base = baseline_total(spec.budget_measure, spec.period) + reform_total = _weighted_total(simulate(spec.build_reform()), spec.budget_measure, spec.period) + raw = reform_total - base + # A counterfactual revert measures the provision as baseline − reform. + effect = raw if spec.effect_direction == "reform_minus_baseline" else -raw + return effect, base, reform_total + + rows: list[dict[str, Any]] = [] + for spec in specs: + if spec.in_sample and spec.id in estimates: + effect: float | None = float(estimates[spec.id]) + base_total: float | None = None + reform_total: float | None = None + else: + effect, base_total, reform_total = simulated_effect(spec) + # In-sample reforms get their JCT figure from the calibration target. + effective_jct = spec.jct_score + if effective_jct is None and spec.in_sample and spec.id in targets: + effective_jct = targets[spec.id] + rows.append( + { + "id": spec.id, + "name": spec.name, + "category": spec.category, + "in_sample": spec.in_sample, + "period": spec.period, + "description": spec.description or None, + "jct": { + "score": None if effective_jct is None else _finite(effective_jct), + "score_type": spec.jct_score_type, + "window": spec.jct_window or None, + "source": spec.jct_source or None, + "source_url": spec.jct_source_url or None, + }, + "populace": { + "budget_effect": None if effect is None else _finite(effect), + "period": spec.period, + "window": spec.jct_window or None, + "measure": spec.budget_measure, + "baseline_total": None if base_total is None else _finite(base_total), + "reform_total": None if reform_total is None else _finite(reform_total), + }, + } + ) + + payload: dict[str, Any] = { + "schema_version": REFORM_VALIDATION_SCHEMA_VERSION, + "baseline_period": int(period), + "scoring_window": "see per-reform jct.window", + "reforms": rows, + } + if release_id is not None: + payload["release_id"] = release_id + return payload + + +def write_reform_validation(payload: dict[str, Any], path: Path | str) -> Path: + """Write the reform-validation payload as ``reform_validation.json``.""" + path = Path(path) + path.write_text(json.dumps(payload, indent=1, allow_nan=False)) + return path diff --git a/packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json b/packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json new file mode 100644 index 0000000..eb5bda1 --- /dev/null +++ b/packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json @@ -0,0 +1,91 @@ +{ + "_comment": "Big-provision tax-expenditure validation. Each reform repeals a major provision (neutralize_variable) on the calibrated dataset; the simulated revenue change (income_tax delta, which captures the credit's refundable portion too) is compared to the published tax-expenditure figure. JCT where it scores the provision (Estimates of Federal Tax Expenditures FY2024-2028, JCX-48-24, Dec 11 2024, Table 1, FY2024, $); Treasury (Tax Expenditures FY2025, OTA) where JCT bundles it. Figures are FY2024 single-year totals incl. the outlay/refundable portion. in_sample flags whether the dataset is calibrated to the provision: EITC is (SOI EITC-by-AGI targets), CTC partly (SOI CTC amounts), CDCC/standard/itemized are not. The individual itemized deductions (SALT, mortgage, charitable, medical, QBI) are already validated in-sample via US_JCT_TAX_EXPENDITURE_REFORMS, so they are not duplicated here. Standard deduction and 'all itemized combined' have NO published tax-expenditure figure (both JCT and Treasury treat the standard deduction as baseline, and neither publishes a combined itemized total), so they carry no benchmark and publish the repeal magnitude only.", + "schema_version": 1, + "reforms": [ + { + "id": "te_ctc", + "name": "Child Tax Credit (incl. ODC)", + "category": "Tax expenditure", + "in_sample": false, + "period": 2024, + "neutralized_variable": "ctc", + "budget_measure": "income_tax", + "description": "Repeal the Child Tax Credit and Credit for Other Dependents. JCT line is 'Credit for children and other dependents'.", + "benchmark": { + "score": 173800000000, + "score_type": "tax_expenditure", + "window": "FY2024", + "source": "JCT JCX-48-24 Table 1 (rev 127.1B + outlay 46.7B)", + "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/" + } + }, + { + "id": "te_eitc", + "name": "Earned Income Tax Credit", + "category": "Tax expenditure", + "in_sample": true, + "period": 2024, + "neutralized_variable": "eitc", + "budget_measure": "income_tax", + "description": "Repeal the EITC. Mostly refundable, so the bulk is the outlay portion. The dataset is calibrated to SOI EITC-by-AGI targets, so this is an in-sample check.", + "benchmark": { + "score": 124200000000, + "score_type": "tax_expenditure", + "window": "FY2024", + "source": "JCT JCX-48-24 Table 1 (rev 66.9B + outlay 57.3B)", + "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/" + } + }, + { + "id": "te_cdcc", + "name": "Child & Dependent Care Credit", + "category": "Tax expenditure", + "in_sample": false, + "period": 2024, + "neutralized_variable": "cdcc", + "budget_measure": "income_tax", + "description": "Repeal the CDCC. JCT bundles it with the employer-provided-childcare exclusion, so the clean CDCC-only benchmark is Treasury's.", + "benchmark": { + "score": 3690000000, + "score_type": "tax_expenditure", + "window": "FY2024", + "source": "Treasury Tax Expenditures FY2025, Table 1 item 120", + "source_url": "https://home.treasury.gov/policy-issues/tax-policy/tax-expenditures" + } + }, + { + "id": "te_standard_deduction", + "name": "Standard deduction (no official benchmark)", + "category": "Tax expenditure", + "in_sample": false, + "period": 2024, + "neutralized_variable": "standard_deduction", + "budget_measure": "income_tax", + "description": "Repeal the standard deduction. Both JCT and Treasury treat the regular standard deduction as part of the normal-tax baseline, so there is no published tax-expenditure figure; shown as a repeal magnitude only.", + "benchmark": { + "score": null, + "score_type": "none", + "window": "FY2024", + "source": "Not scored — standard deduction is baseline in both JCT and Treasury", + "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/" + } + }, + { + "id": "te_itemized_total", + "name": "All itemized deductions (no combined benchmark)", + "category": "Tax expenditure", + "in_sample": false, + "period": 2024, + "neutralized_variable": "itemized_taxable_income_deductions", + "budget_measure": "income_tax", + "description": "Repeal all itemized deductions at once. Neither JCT nor Treasury publishes a single combined itemized-deduction tax-expenditure figure (only components: SALT, mortgage, charitable, medical, …), so this carries no benchmark; the individual components are validated in-sample. Shown as a repeal magnitude only.", + "benchmark": { + "score": null, + "score_type": "none", + "window": "FY2024", + "source": "Not published as a combined figure — JCT/Treasury score itemized components individually", + "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/" + } + } + ] +} diff --git a/packages/populace-build/tests/test_reform_validation.py b/packages/populace-build/tests/test_reform_validation.py new file mode 100644 index 0000000..1423232 --- /dev/null +++ b/packages/populace-build/tests/test_reform_validation.py @@ -0,0 +1,195 @@ +"""Reform-validation payload assembly, isolated from policyengine-us. + +The simulation is injected, so these tests exercise the budget-effect math and +the in-sample/out-of-sample split without running a Microsimulation. +""" + +from __future__ import annotations + +import json + +import pytest + +from populace.build.us.reform_validation import ( + REFORM_VALIDATION_SCHEMA_VERSION, + ReformValidationSpec, + in_sample_reform_specs, + out_of_sample_reform_specs, + reform_validation_payload, + tax_expenditure_reform_specs, + write_reform_validation, +) + + +class _FakeSeries: + def __init__(self, total: float) -> None: + self._total = total + + def sum(self) -> float: + return self._total + + +class _FakeSim: + """A sim whose weighted total for a measure shifts by a per-reform delta.""" + + def __init__(self, totals: dict[str, float]) -> None: + self._totals = totals + + def calculate(self, measure: str, period): # noqa: ARG002 + return _FakeSeries(self._totals[measure]) + + +def _oos_spec(score: float) -> ReformValidationSpec: + return ReformValidationSpec( + id="obbba_salt", + name="OBBBA — SALT cap to $40k", + category="OBBBA", + in_sample=False, + period=2024, + jct_score=score, + jct_window="FY2025-2034", + jct_source="JCX-00-25", + jct_source_url="https://www.jct.gov/", + parameter_changes={"gov.example.cap": {"2025-01-01.2034-12-31": 40000}}, + ) + + +def test_spec_requires_exactly_one_reform_definition(): + with pytest.raises(ValueError): + ReformValidationSpec( + id="x", name="x", category="c", in_sample=False, period=2024, + jct_score=1.0, jct_window="", jct_source="", jct_source_url="", + ) + with pytest.raises(ValueError): + ReformValidationSpec( + id="x", name="x", category="c", in_sample=False, period=2024, + jct_score=1.0, jct_window="", jct_source="", jct_source_url="", + neutralized_variable="v", parameter_changes={"a": 1}, + ) + + +def test_in_sample_uses_calibration_estimate_no_simulation(): + specs = ( + ReformValidationSpec( + id="nation/jct/mortgage", name="Mortgage interest deduction", + category="JCT tax expenditure", in_sample=True, period=2024, + jct_score=30e9, jct_window="annual", jct_source="JCT", jct_source_url="", + neutralized_variable="mortgage_interest_deduction", + ), + ) + + def simulate(_reform): # pragma: no cover - must not be called + raise AssertionError("in-sample rows must not simulate") + + payload = reform_validation_payload( + specs, + period=2024, + simulate=simulate, + in_sample_estimates={"nation/jct/mortgage": 28e9}, + ) + row = payload["reforms"][0] + assert row["in_sample"] is True + assert row["populace"]["budget_effect"] == pytest.approx(28e9) + assert row["jct"]["score"] == pytest.approx(30e9) + + +def test_out_of_sample_budget_effect_is_reform_minus_baseline(monkeypatch): + # baseline income_tax total 2.0e12; under the reform it rises by 50e9. + spec = _oos_spec(score=-60e9) + monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM") + + def simulate(reform): + total = 2.0e12 + 50e9 if reform == "REFORM" else 2.0e12 + return _FakeSim({"income_tax": total}) + + payload = reform_validation_payload([spec], period=2024, simulate=simulate) + row = payload["reforms"][0] + assert row["in_sample"] is False + assert row["populace"]["budget_effect"] == pytest.approx(50e9) + assert row["populace"]["baseline_total"] == pytest.approx(2.0e12) + assert row["populace"]["reform_total"] == pytest.approx(2.05e12) + assert row["jct"]["score"] == pytest.approx(-60e9) + + +def test_counterfactual_revert_flips_sign(monkeypatch): + # A revert reform: baseline (provision on) income tax 2.0e12; reverting the + # provision (reform) raises it to 2.033e12. The provision's effect is + # baseline − reform = −33e9 (a cost), matching the JCT enactment sign. + spec = _oos_spec(score=-33e9) + object.__setattr__(spec, "effect_direction", "baseline_minus_reform") + monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM") + + def simulate(reform): + total = 2.033e12 if reform is not None else 2.0e12 + return _FakeSim({"income_tax": total}) + + payload = reform_validation_payload([spec], period=2024, simulate=simulate) + assert payload["reforms"][0]["populace"]["budget_effect"] == pytest.approx(-33e9) + + +def test_shipped_obbba_config_is_out_of_sample_counterfactual(): + specs = out_of_sample_reform_specs(period=2026) + assert {s.id for s in specs} >= {"obbba_no_tax_on_tips", "obbba_no_tax_on_overtime"} + for spec in specs: + assert spec.effect_direction == "baseline_minus_reform" + assert spec.period == 2026 + assert spec.jct_score < 0 # OBBBA provisions are costs + assert spec.jct_source.startswith("JCX-35-25") + + +def test_shipped_tax_expenditure_specs_neutralize_big_provisions(): + specs = tax_expenditure_reform_specs(period=2024) + by_id = {s.id for s in specs} + assert {"te_ctc", "te_eitc", "te_cdcc", "te_standard_deduction", "te_itemized_total"} <= by_id + for spec in specs: + assert spec.neutralized_variable # all are repeals + assert spec.effect_direction == "reform_minus_baseline" # neutralize raises tax + eitc = next(s for s in specs if s.id == "te_eitc") + assert eitc.in_sample is True # calibrated to SOI EITC targets + std = next(s for s in specs if s.id == "te_standard_deduction") + assert std.jct_score is None # baseline in both JCT and Treasury — no benchmark + + +def test_null_benchmark_row_publishes_magnitude_only(monkeypatch): + spec = ReformValidationSpec( + id="te_std", name="Standard deduction", category="Tax expenditure", + in_sample=False, period=2024, jct_score=None, jct_window="FY2024", + jct_source="not scored", jct_source_url="", neutralized_variable="standard_deduction", + ) + monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM") + + def simulate(reform): + return _FakeSim({"income_tax": 2.28e12 if reform is not None else 2.0e12}) + + payload = reform_validation_payload([spec], period=2024, simulate=simulate) + row = payload["reforms"][0] + assert row["jct"]["score"] is None + assert row["populace"]["budget_effect"] == pytest.approx(280e9) # repeal magnitude + + +def test_out_of_sample_null_when_no_simulate(): + payload = reform_validation_payload([_oos_spec(-1.0)], period=2024, simulate=None) + assert payload["reforms"][0]["populace"]["budget_effect"] is None + assert payload["schema_version"] == REFORM_VALIDATION_SCHEMA_VERSION + + +def test_in_sample_specs_built_from_jct_reforms(): + specs = in_sample_reform_specs(period=2024) + assert specs, "expected at least one JCT tax-expenditure reform" + assert all(s.in_sample for s in specs) + assert all(s.neutralized_variable for s in specs) + + +def test_out_of_sample_specs_load_from_default_config(): + # The shipped OBBBA config (if present) must parse into valid specs. + specs = out_of_sample_reform_specs(period=2024) + for spec in specs: + assert spec.in_sample is False + assert spec.parameter_changes + assert spec.jct_source + + +def test_write_round_trips(tmp_path): + payload = reform_validation_payload([_oos_spec(-1.0)], period=2024, simulate=None) + path = write_reform_validation(payload, tmp_path / "reform_validation.json") + assert json.loads(path.read_text())["reforms"][0]["id"] == "obbba_salt" diff --git a/tools/build_us_fiscal_refresh_release.py b/tools/build_us_fiscal_refresh_release.py index f91c29d..7d5b140 100644 --- a/tools/build_us_fiscal_refresh_release.py +++ b/tools/build_us_fiscal_refresh_release.py @@ -35,6 +35,12 @@ us_source_coverage_diagnostics, write_us_source_coverage_diagnostics, ) +from populace.build.us.reform_validation import ( + default_simulate_factory, + load_default_reform_specs, + reform_validation_payload, + write_reform_validation, +) from populace.calibrate import TargetRegistry, calibrate from populace.calibrate.diagnostics import ( diagnostics_payload, @@ -192,6 +198,21 @@ def _parse_args() -> argparse.Namespace: "diagnostics instead." ), ) + parser.add_argument( + "--skip-reform-validation", + action="store_true", + help="Do not emit reform_validation.json for this release.", + ) + parser.add_argument( + "--skip-out-of-sample-reforms", + action="store_true", + help=( + "Emit reform_validation.json with the in-sample JCT tax-expenditure " + "rows only (from the calibration fit), skipping the out-of-sample " + "OBBBA simulations. Faster; useful when policyengine-us microsim runs " + "are not wanted in the build." + ), + ) return parser.parse_args() @@ -985,6 +1006,64 @@ def _artifact_entry(path: str, sha: str, *, kind: str, revision: str) -> dict[st } +def _in_sample_estimates(result) -> dict[str, float]: + """Calibrated final estimate per JCT target, keyed by target name. + + The in-sample reform validation rows reuse the calibration's own fit (the + JCT tax-expenditure targets *are* calibration targets), so no extra + simulation is run for them. + """ + estimates: dict[str, float] = {} + for diagnostic, target in zip(result.diagnostics, result.problem.targets, strict=True): + value = diagnostic.final_estimate + if value is not None and math.isfinite(float(value)): + estimates[target.name] = float(value) + return estimates + + +def _in_sample_targets(result) -> dict[str, float]: + """Calibration target value (the JCT figure) per target, keyed by name. + + In-sample reforms are JCT tax-expenditure calibration targets, so their JCT + figure is the target's own value the calibration fit against. + """ + targets: dict[str, float] = {} + for diagnostic, target in zip(result.diagnostics, result.problem.targets, strict=True): + value = diagnostic.target + if value is not None and math.isfinite(float(value)): + targets[target.name] = float(value) + return targets + + +def _write_reform_validation( + *, + release_dir: Path, + dataset_path: Path, + result, + registry: TargetRegistry, + release_id: str, + simulate_out_of_sample: bool, +) -> None: + """Emit reform_validation.json: populace budget effects vs JCT scores. + + In-sample JCT tax-expenditure reforms come straight from the calibration + fit; out-of-sample OBBBA provisions are simulated on the freshly written + release H5 (skipped if ``simulate_out_of_sample`` is False, e.g. for a fast + diagnostics-only build). + """ + specs = load_default_reform_specs(period=PERIOD) + simulate = default_simulate_factory(dataset_path) if simulate_out_of_sample else None + payload = reform_validation_payload( + specs, + period=PERIOD, + simulate=simulate, + in_sample_estimates=_in_sample_estimates(result), + in_sample_targets=_in_sample_targets(result), + release_id=release_id, + ) + write_reform_validation(payload, release_dir / "reform_validation.json") + + def _build_manifests( *, release_id: str, @@ -1100,6 +1179,18 @@ def _build_manifests( kind="diagnostics", revision=release_id, ), + **( + { + "reform_validation": _artifact_entry( + "reform_validation.json", + _sha256(release_dir / "reform_validation.json"), + kind="diagnostics", + revision=release_id, + ) + } + if (release_dir / "reform_validation.json").exists() + else {} + ), }, } (release_dir / "release_manifest.json").write_text( @@ -1254,6 +1345,16 @@ def main() -> None: }, ) + if not args.skip_reform_validation: + _write_reform_validation( + release_dir=release_dir, + dataset_path=dataset_path, + result=result, + registry=registry, + release_id=release_id, + simulate_out_of_sample=not args.skip_out_of_sample_reforms, + ) + active_aliases = DIRECT_ACTIVE_ALIASES coverage = us_source_coverage_diagnostics( active_target_aliases=active_aliases,