From 4a36dfba0a97cf5e57dd204e3dcb5ca4f9e0d790 Mon Sep 17 00:00:00 2001
From: Pavel Makarchuk <pavel@policyengine.org>
Date: Tue, 16 Jun 2026 00:34:43 -0400
Subject: [PATCH 1/4] Emit reform_validation.json: dataset budget effects vs
 JCT scores
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a per-release reform-validation artifact, the downstream counterpart to
calibration_diagnostics.json: where calibration measures fit to its targets,
this measures how closely the calibrated dataset reproduces the budget effects
of JCT-scored reforms. The calibration-diagnostics dashboard consumes it.

Two labelled kinds of reform:
- in-sample: the JCT tax-expenditure reforms that are themselves calibration
  targets (US_JCT_TAX_EXPENDITURE_REFORMS). Their populace estimate is the
  calibration's own final_estimate — no extra simulation — flagged
  in_sample=True so a consumer knows agreement is expected.
- out-of-sample: OBBBA provisions the calibration never saw (obbba_reforms.json:
  no-tax-on-tips and no-tax-on-overtime, with their per-FY JCX-35-25 scores).
  OBBBA is baked into the policyengine-us baseline, so each is encoded as a
  counterfactual revert and the provision effect is baseline - reform
  (sign-comparable to the JCT enactment score), simulated for period 2026 and
  compared to the JCT FY2026 line.

- packages/populace-build/.../reform_validation.py: ReformValidationSpec, the
  in-sample/out-of-sample spec builders, reform_validation_payload (microsim
  isolated behind an injected simulate() for testing), write_reform_validation.
- obbba_reforms.json: curated out-of-sample set; excludes provisions whose JCT
  line bundles TCJA extension (SALT/CTC/standard deduction), lacks a standalone
  line (senior deduction), or isn't modeled (Trump accounts) — documented inline.
- build_us_fiscal_refresh_release.py: writes reform_validation.json after the
  release H5, adds it to the release manifest; --skip-reform-validation and
  --skip-out-of-sample-reforms flags.
- 9 unit tests (sign conventions, in-sample-from-calibration, config loading),
  fake-sim isolated so they need no policyengine-us. ruff clean.

Out-of-sample budget effects populate when a release build runs the OBBBA
microsims; the artifact is otherwise the in-sample rows plus null estimates.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../src/populace/build/us/__init__.py         |  16 +
 .../src/populace/build/us/obbba_reforms.json  |  54 +++
 .../populace/build/us/reform_validation.py    | 330 ++++++++++++++++++
 .../tests/test_reform_validation.py           | 163 +++++++++
 tools/build_us_fiscal_refresh_release.py      |  86 +++++
 5 files changed, 649 insertions(+)
 create mode 100644 packages/populace-build/src/populace/build/us/obbba_reforms.json
 create mode 100644 packages/populace-build/src/populace/build/us/reform_validation.py
 create mode 100644 packages/populace-build/tests/test_reform_validation.py

diff --git a/packages/populace-build/src/populace/build/us/__init__.py b/packages/populace-build/src/populace/build/us/__init__.py
index 87395e3..3a90bbc 100644
--- a/packages/populace-build/src/populace/build/us/__init__.py
+++ b/packages/populace-build/src/populace/build/us/__init__.py
@@ -52,6 +52,15 @@
     SimpleTaxExpenditureReform,
     compile_us_fiscal_target_registry,
 )
+from populace.build.us.reform_validation import (
+    REFORM_VALIDATION_SCHEMA_VERSION,
+    ReformValidationSpec,
+    in_sample_reform_specs,
+    load_default_reform_specs,
+    out_of_sample_reform_specs,
+    reform_validation_payload,
+    write_reform_validation,
+)
 from populace.build.us.source_coverage import (
     LEDGER_US_SOURCE_COVERAGE_CONTRACT_COMMIT,
     US_SOURCE_COVERAGE,
@@ -67,6 +76,8 @@
 __all__ = [
     "BuildConfig",
     "SimpleTaxExpenditureReform",
+    "ReformValidationSpec",
+    "REFORM_VALIDATION_SCHEMA_VERSION",
     "LEDGER_US_SOURCE_COVERAGE_CONTRACT_COMMIT",
     "US_DONORS",
     "US_FISCAL_MACRO_REALISM_BANDS",
@@ -92,8 +103,13 @@
     "US_STATE_INCOME_TAX_TARGET_REFERENCES",
     "compile_us_fiscal_target_registry",
     "hard_target_package_aliases",
+    "in_sample_reform_specs",
+    "load_default_reform_specs",
+    "out_of_sample_reform_specs",
+    "reform_validation_payload",
     "source_gap_family_ids",
     "us_plan",
+    "write_reform_validation",
     "us_source_coverage_diagnostics",
     "us_source_coverage_gate",
     "write_us_source_coverage_diagnostics",
diff --git a/packages/populace-build/src/populace/build/us/obbba_reforms.json b/packages/populace-build/src/populace/build/us/obbba_reforms.json
new file mode 100644
index 0000000..a8b6390
--- /dev/null
+++ b/packages/populace-build/src/populace/build/us/obbba_reforms.json
@@ -0,0 +1,54 @@
+{
+  "_comment": "Out-of-sample reform-validation set: provisions of the 2025 One Big Beautiful Bill Act (OBBBA) that policyengine-us models in its baseline. Each reform is encoded as a COUNTERFACTUAL REVERT (the provision is already in the baseline, so we turn it off); the provision's budget effect is therefore baseline - reform (effect_direction='baseline_minus_reform'), which is sign-comparable to the JCT enactment score. We include only genuinely new provisions whose JCT line is NOT bundled with TCJA extension, so the parameter revert captures the whole provision. JCT figures are the per-fiscal-year lines from JCX-35-25 (July 1 2025), conventional/static, in dollars. We compare a single simulated year (period) to JCT's same-year line. Provisions excluded for now: SALT cap (JCT line bundles cap-extension + raise; sign ambiguous), CTC / standard deduction (JCT lines bundle TCJA extension + enhancement; a clean revert isn't isolable), senior bonus deduction (no standalone JCX-35-25 line), Trump accounts (not modeled in policyengine-us), estate exemption (rarely fires in microdata).",
+  "schema_version": 1,
+  "jct_document": {
+    "id": "JCX-35-25",
+    "title": "Estimated Revenue Effects Relative to the Present Law Baseline of the Tax Provisions in Title VII-Finance of the substitute legislation as passed by the Senate (FY2025 budget reconciliation)",
+    "published": "2025-07-01",
+    "url": "https://www.jct.gov/publications/2025/jcx-35-25/"
+  },
+  "reforms": [
+    {
+      "id": "obbba_no_tax_on_tips",
+      "name": "OBBBA — No tax on tips (tip income deduction)",
+      "category": "OBBBA",
+      "description": "Deduction for qualified tip income (sunsets 12/31/2028). Validated by reverting the tip-income deduction cap to 0 over the provision window and comparing the simulated FY2026 income-tax revenue loss to the JCT FY2026 line.",
+      "period": 2026,
+      "budget_measure": "income_tax",
+      "effect_direction": "baseline_minus_reform",
+      "parameter_changes": {
+        "gov.irs.deductions.tip_income.cap": { "2025-01-01.2028-12-31": 0 }
+      },
+      "jct": {
+        "score": -10121000000,
+        "score_type": "conventional",
+        "window": "FY2026",
+        "source": "JCX-35-25, Ch.2 line 1 (FY2026)",
+        "source_url": "https://www.jct.gov/publications/2025/jcx-35-25/"
+      }
+    },
+    {
+      "id": "obbba_no_tax_on_overtime",
+      "name": "OBBBA — No tax on overtime (overtime income deduction)",
+      "category": "OBBBA",
+      "description": "Deduction for qualified overtime premium income (sunsets 12/31/2028). Validated by reverting the overtime-income deduction cap to 0 over the provision window and comparing the simulated FY2026 income-tax revenue loss to the JCT FY2026 line.",
+      "period": 2026,
+      "budget_measure": "income_tax",
+      "effect_direction": "baseline_minus_reform",
+      "parameter_changes": {
+        "gov.irs.deductions.overtime_income.cap.JOINT": { "2025-01-01.2028-12-31": 0 },
+        "gov.irs.deductions.overtime_income.cap.SINGLE": { "2025-01-01.2028-12-31": 0 },
+        "gov.irs.deductions.overtime_income.cap.HEAD_OF_HOUSEHOLD": { "2025-01-01.2028-12-31": 0 },
+        "gov.irs.deductions.overtime_income.cap.SURVIVING_SPOUSE": { "2025-01-01.2028-12-31": 0 },
+        "gov.irs.deductions.overtime_income.cap.SEPARATE": { "2025-01-01.2028-12-31": 0 }
+      },
+      "jct": {
+        "score": -32806000000,
+        "score_type": "conventional",
+        "window": "FY2026",
+        "source": "JCX-35-25, Ch.2 line 2 (FY2026)",
+        "source_url": "https://www.jct.gov/publications/2025/jcx-35-25/"
+      }
+    }
+  ]
+}
diff --git a/packages/populace-build/src/populace/build/us/reform_validation.py b/packages/populace-build/src/populace/build/us/reform_validation.py
new file mode 100644
index 0000000..70142a5
--- /dev/null
+++ b/packages/populace-build/src/populace/build/us/reform_validation.py
@@ -0,0 +1,330 @@
+"""Reform validation: score policy reforms on the calibrated dataset and
+compare the budget effect to the authority's (JCT's) official score.
+
+Where ``calibration_diagnostics.json`` reports how well the calibrated weights
+reproduce their *calibration targets*, ``reform_validation.json`` reports a
+downstream property the calibration did not directly optimize: how closely the
+dataset reproduces the *budget effects of scored policy reforms*. Two kinds of
+reform are validated, and each row is labelled so a consumer can tell them
+apart:
+
+* **in-sample** — the JCT tax-expenditure reforms that are themselves
+  calibration targets (``US_JCT_TAX_EXPENDITURE_REFORMS``). The dataset was
+  tuned to hit these, so agreement is expected; the row is published for
+  completeness and provenance, flagged ``in_sample=True``.
+* **out-of-sample** — reforms the calibration never saw (e.g. provisions of
+  the 2025 One Big Beautiful Bill Act), curated in ``obbba_reforms.json`` with
+  their JCT scores. These are the genuine test of dataset fidelity.
+
+The simulation is isolated behind an injected ``simulate`` callable so the
+payload assembly is unit-testable without policyengine-us; the default factory
+builds a ``Microsimulation`` over the freshly written release H5.
+"""
+
+from __future__ import annotations
+
+import json
+import math
+from collections.abc import Callable, Iterable, Sequence
+from dataclasses import dataclass
+from importlib.resources import files
+from pathlib import Path
+from typing import Any
+
+from populace.build.us.fiscal_targets import (
+    US_JCT_TAX_EXPENDITURE_REFORMS,
+    SimpleTaxExpenditureReform,
+)
+
+__all__ = [
+    "REFORM_VALIDATION_SCHEMA_VERSION",
+    "ReformValidationSpec",
+    "in_sample_reform_specs",
+    "out_of_sample_reform_specs",
+    "load_default_reform_specs",
+    "reform_validation_payload",
+    "write_reform_validation",
+]
+
+#: Schema version of reform_validation.json. The calibration-diagnostics
+#: dashboard keys its reader on it; bump with any shape change.
+REFORM_VALIDATION_SCHEMA_VERSION = 1
+
+#: The budget effect of a reform is the weighted-sum change of this variable
+#: between the reform and baseline simulations, unless a spec overrides it. For
+#: income-tax provisions this is the simulated federal income-tax revenue change
+#: — the quantity JCT scores (− = revenue loss / cost; + = revenue raised).
+DEFAULT_BUDGET_MEASURE = "income_tax"
+
+
+@dataclass(frozen=True)
+class ReformValidationSpec:
+    """One reform to score on the dataset and compare to its JCT figure.
+
+    Exactly one of ``neutralized_variable`` (an in-sample tax-expenditure
+    neutralization) or ``parameter_changes`` (an out-of-sample
+    ``Reform.from_dict`` payload) defines the reform.
+    """
+
+    id: str
+    name: str
+    category: str
+    in_sample: bool
+    period: int
+    jct_score: float
+    jct_window: str
+    jct_source: str
+    jct_source_url: str
+    jct_score_type: str = "conventional"
+    budget_measure: str = DEFAULT_BUDGET_MEASURE
+    description: str = ""
+    neutralized_variable: str | None = None
+    parameter_changes: dict[str, Any] | None = None
+    # How the budget effect is signed relative to the simulations. JCT scores a
+    # *tax expenditure* as the revenue raised by repeal (reform − baseline, the
+    # neutralize convention), but scores an *enacted provision* as the effect of
+    # enacting it. OBBBA is already in the policyengine-us baseline, so its
+    # provisions are validated by a counterfactual *revert* reform — there the
+    # provision's effect is baseline − reform (JCT enactment sign).
+    effect_direction: str = "reform_minus_baseline"
+
+    def __post_init__(self) -> None:
+        if not self.id:
+            raise ValueError("ReformValidationSpec.id is required.")
+        has_neutralize = bool(self.neutralized_variable)
+        has_params = bool(self.parameter_changes)
+        if has_neutralize == has_params:
+            raise ValueError(
+                f"{self.id}: provide exactly one of neutralized_variable or "
+                "parameter_changes."
+            )
+        if self.effect_direction not in {"reform_minus_baseline", "baseline_minus_reform"}:
+            raise ValueError(
+                f"{self.id}: effect_direction must be 'reform_minus_baseline' or "
+                "'baseline_minus_reform'."
+            )
+
+    def build_reform(self) -> Any:
+        """Construct the policyengine reform object for this spec.
+
+        Imports are lazy so the module (and its unit tests) load without
+        policyengine-us installed.
+        """
+        if self.neutralized_variable:
+            from policyengine_core.reforms import Reform
+
+            variable = self.neutralized_variable
+
+            class _Neutralize(Reform):
+                def apply(self) -> None:
+                    self.neutralize_variable(variable)
+
+            return _Neutralize
+        from policyengine_core.reforms import Reform
+
+        return Reform.from_dict(self.parameter_changes, country_id="us")
+
+
+def _finite(value: float) -> float | None:
+    value = float(value)
+    return value if math.isfinite(value) else None
+
+
+def in_sample_reform_specs(
+    reforms: Iterable[SimpleTaxExpenditureReform] = US_JCT_TAX_EXPENDITURE_REFORMS,
+    *,
+    period: int,
+) -> tuple[ReformValidationSpec, ...]:
+    """The JCT tax-expenditure calibration targets as validation specs."""
+    specs: list[ReformValidationSpec] = []
+    for reform in reforms:
+        specs.append(
+            ReformValidationSpec(
+                id=reform.target_name,
+                name=reform.target_name,
+                category="JCT tax expenditure",
+                in_sample=True,
+                period=int(period),
+                jct_score=float(reform.value),
+                jct_window="annual",
+                jct_source=reform.source,
+                jct_source_url="",
+                budget_measure=reform.output_variable or DEFAULT_BUDGET_MEASURE,
+                neutralized_variable=reform.neutralized_variable,
+            )
+        )
+    return tuple(specs)
+
+
+def _obbba_config_path() -> Path:
+    return Path(str(files(__package__).joinpath("obbba_reforms.json")))
+
+
+def out_of_sample_reform_specs(
+    path: Path | None = None,
+    *,
+    period: int,
+) -> tuple[ReformValidationSpec, ...]:
+    """Load curated out-of-sample reforms (OBBBA) from the UTF-8 JSON config; ``()`` if absent."""
+    config_path = path or _obbba_config_path()
+    if not config_path.exists():
+        return ()
+    payload = json.loads(config_path.read_text(encoding="utf-8"))
+    specs: list[ReformValidationSpec] = []
+    for raw in payload.get("reforms", ()):
+        jct = raw.get("jct", {})
+        specs.append(
+            ReformValidationSpec(
+                id=raw["id"],
+                name=raw["name"],
+                category=raw.get("category", "OBBBA"),
+                in_sample=False,
+                period=int(raw.get("period", period)),
+                jct_score=float(jct["score"]),
+                jct_window=str(jct.get("window", "")),
+                jct_source=str(jct.get("source", "")),
+                jct_source_url=str(jct.get("source_url", "")),
+                jct_score_type=str(jct.get("score_type", "conventional")),
+                budget_measure=str(raw.get("budget_measure", DEFAULT_BUDGET_MEASURE)),
+                description=str(raw.get("description", "")),
+                parameter_changes=raw["parameter_changes"],
+                # OBBBA provisions are baked into the baseline, so the config
+                # encodes a revert; the provision's effect is baseline − reform.
+                effect_direction=str(raw.get("effect_direction", "baseline_minus_reform")),
+            )
+        )
+    return tuple(specs)
+
+
+def load_default_reform_specs(
+    *,
+    period: int,
+    obbba_path: Path | None = None,
+) -> tuple[ReformValidationSpec, ...]:
+    """In-sample JCT tax expenditures + out-of-sample OBBBA provisions."""
+    return (
+        *in_sample_reform_specs(period=period),
+        *out_of_sample_reform_specs(obbba_path, period=period),
+    )
+
+
+# A simulate(reform_or_None) -> object with .calculate(measure, period).sum().
+SimulateFn = Callable[[Any], Any]
+
+
+def _weighted_total(simulation: Any, measure: str, period: int) -> float:
+    """Weighted population total of ``measure`` (MicroSeries .sum() is
+    weight-aware in policyengine-us)."""
+    return float(simulation.calculate(measure, period).sum())
+
+
+def default_simulate_factory(dataset_path: Path) -> SimulateFn:
+    """Build a simulate() that runs a Microsimulation over the release H5."""
+
+    def simulate(reform: Any) -> Any:
+        from policyengine_us import Microsimulation
+        from policyengine_us.data import USSingleYearDataset
+
+        dataset = USSingleYearDataset(file_path=str(dataset_path))
+        if reform is None:
+            return Microsimulation(dataset=dataset)
+        return Microsimulation(dataset=dataset, reform=reform)
+
+    return simulate
+
+
+def reform_validation_payload(
+    specs: Sequence[ReformValidationSpec],
+    *,
+    period: int,
+    simulate: SimulateFn | None = None,
+    in_sample_estimates: dict[str, float] | None = None,
+    release_id: str | None = None,
+) -> dict[str, Any]:
+    """Score each reform on the dataset and render the JSON-stable payload.
+
+    In-sample reforms are JCT tax-expenditure *calibration targets*: their
+    populace budget effect is the calibrated ``final_estimate`` the calibration
+    already produced — passed in via ``in_sample_estimates`` (keyed by spec id),
+    so no extra simulation is run for them. Out-of-sample reforms (OBBBA
+    provisions) are simulated: a baseline is built once and each reform's budget
+    effect is the weighted-sum change of its budget measure, signed by ``effect_direction``.
+
+    ``simulate`` is required only if any out-of-sample spec is present (or an
+    in-sample estimate is missing); when absent, those rows publish a null
+    budget effect rather than failing the build. The shape matches the
+    calibration-diagnostics dashboard's reform_validation reader.
+    """
+    estimates = in_sample_estimates or {}
+    baseline: Any = None
+    baseline_totals: dict[tuple[int, str], float] = {}
+
+    def baseline_total(measure: str, at_period: int) -> float:
+        nonlocal baseline
+        if baseline is None:
+            baseline = simulate(None)  # type: ignore[misc]
+        key = (at_period, measure)
+        if key not in baseline_totals:
+            baseline_totals[key] = _weighted_total(baseline, measure, at_period)
+        return baseline_totals[key]
+
+    def simulated_effect(spec: ReformValidationSpec) -> tuple[float | None, float | None, float | None]:
+        if simulate is None:
+            return None, None, None
+        base = baseline_total(spec.budget_measure, spec.period)
+        reform_total = _weighted_total(simulate(spec.build_reform()), spec.budget_measure, spec.period)
+        raw = reform_total - base
+        # A counterfactual revert measures the provision as baseline − reform.
+        effect = raw if spec.effect_direction == "reform_minus_baseline" else -raw
+        return effect, base, reform_total
+
+    rows: list[dict[str, Any]] = []
+    for spec in specs:
+        if spec.in_sample and spec.id in estimates:
+            effect: float | None = float(estimates[spec.id])
+            base_total: float | None = None
+            reform_total: float | None = None
+        else:
+            effect, base_total, reform_total = simulated_effect(spec)
+        rows.append(
+            {
+                "id": spec.id,
+                "name": spec.name,
+                "category": spec.category,
+                "in_sample": spec.in_sample,
+                "period": spec.period,
+                "description": spec.description or None,
+                "jct": {
+                    "score": _finite(spec.jct_score),
+                    "score_type": spec.jct_score_type,
+                    "window": spec.jct_window or None,
+                    "source": spec.jct_source or None,
+                    "source_url": spec.jct_source_url or None,
+                },
+                "populace": {
+                    "budget_effect": None if effect is None else _finite(effect),
+                    "period": spec.period,
+                    "window": spec.jct_window or None,
+                    "measure": spec.budget_measure,
+                    "baseline_total": None if base_total is None else _finite(base_total),
+                    "reform_total": None if reform_total is None else _finite(reform_total),
+                },
+            }
+        )
+
+    payload: dict[str, Any] = {
+        "schema_version": REFORM_VALIDATION_SCHEMA_VERSION,
+        "baseline_period": int(period),
+        "scoring_window": "see per-reform jct.window",
+        "reforms": rows,
+    }
+    if release_id is not None:
+        payload["release_id"] = release_id
+    return payload
+
+
+def write_reform_validation(payload: dict[str, Any], path: Path | str) -> Path:
+    """Write the reform-validation payload as ``reform_validation.json``."""
+    path = Path(path)
+    path.write_text(json.dumps(payload, indent=1, allow_nan=False))
+    return path
diff --git a/packages/populace-build/tests/test_reform_validation.py b/packages/populace-build/tests/test_reform_validation.py
new file mode 100644
index 0000000..3e2e477
--- /dev/null
+++ b/packages/populace-build/tests/test_reform_validation.py
@@ -0,0 +1,163 @@
+"""Reform-validation payload assembly, isolated from policyengine-us.
+
+The simulation is injected, so these tests exercise the budget-effect math and
+the in-sample/out-of-sample split without running a Microsimulation.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from populace.build.us.reform_validation import (
+    REFORM_VALIDATION_SCHEMA_VERSION,
+    ReformValidationSpec,
+    in_sample_reform_specs,
+    out_of_sample_reform_specs,
+    reform_validation_payload,
+    write_reform_validation,
+)
+
+
+class _FakeSeries:
+    def __init__(self, total: float) -> None:
+        self._total = total
+
+    def sum(self) -> float:
+        return self._total
+
+
+class _FakeSim:
+    """A sim that returns a fixed weighted total per measure, whatever the period."""
+
+    def __init__(self, totals: dict[str, float]) -> None:
+        self._totals = totals
+
+    def calculate(self, measure: str, period):  # noqa: ARG002
+        return _FakeSeries(self._totals[measure])
+
+
+def _oos_spec(score: float) -> ReformValidationSpec:
+    return ReformValidationSpec(
+        id="obbba_salt",
+        name="OBBBA — SALT cap to $40k",
+        category="OBBBA",
+        in_sample=False,
+        period=2024,
+        jct_score=score,
+        jct_window="FY2025-2034",
+        jct_source="JCX-00-25",
+        jct_source_url="https://www.jct.gov/",
+        parameter_changes={"gov.example.cap": {"2025-01-01.2034-12-31": 40000}},
+    )
+
+
+def test_spec_requires_exactly_one_reform_definition():
+    with pytest.raises(ValueError):
+        ReformValidationSpec(
+            id="x", name="x", category="c", in_sample=False, period=2024,
+            jct_score=1.0, jct_window="", jct_source="", jct_source_url="",
+        )
+    with pytest.raises(ValueError):
+        ReformValidationSpec(
+            id="x", name="x", category="c", in_sample=False, period=2024,
+            jct_score=1.0, jct_window="", jct_source="", jct_source_url="",
+            neutralized_variable="v", parameter_changes={"a": 1},
+        )
+
+
+def test_in_sample_uses_calibration_estimate_no_simulation():
+    specs = (
+        ReformValidationSpec(
+            id="nation/jct/mortgage", name="Mortgage interest deduction",
+            category="JCT tax expenditure", in_sample=True, period=2024,
+            jct_score=30e9, jct_window="annual", jct_source="JCT", jct_source_url="",
+            neutralized_variable="mortgage_interest_deduction",
+        ),
+    )
+
+    def simulate(_reform):  # pragma: no cover - must not be called
+        raise AssertionError("in-sample rows must not simulate")
+
+    payload = reform_validation_payload(
+        specs,
+        period=2024,
+        simulate=simulate,
+        in_sample_estimates={"nation/jct/mortgage": 28e9},
+    )
+    row = payload["reforms"][0]
+    assert row["in_sample"] is True
+    assert row["populace"]["budget_effect"] == pytest.approx(28e9)
+    assert row["jct"]["score"] == pytest.approx(30e9)
+
+
+def test_out_of_sample_budget_effect_is_reform_minus_baseline(monkeypatch):
+    # baseline income_tax total 2.0e12; under the reform it rises by 50e9.
+    spec = _oos_spec(score=-60e9)
+    monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM")
+
+    def simulate(reform):
+        total = 2.0e12 + 50e9 if reform == "REFORM" else 2.0e12
+        return _FakeSim({"income_tax": total})
+
+    payload = reform_validation_payload([spec], period=2024, simulate=simulate)
+    row = payload["reforms"][0]
+    assert row["in_sample"] is False
+    assert row["populace"]["budget_effect"] == pytest.approx(50e9)
+    assert row["populace"]["baseline_total"] == pytest.approx(2.0e12)
+    assert row["populace"]["reform_total"] == pytest.approx(2.05e12)
+    assert row["jct"]["score"] == pytest.approx(-60e9)
+
+
+def test_counterfactual_revert_flips_sign(monkeypatch):
+    # Revert reform: baseline (provision on) income tax 2.0e12; reverting raises it
+    # to 2.033e12, so the effect is baseline − reform = −33e9 (a cost, JCT sign).
+    spec = _oos_spec(score=-33e9)
+    object.__setattr__(spec, "effect_direction", "baseline_minus_reform")
+    monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM")
+
+    def simulate(reform):
+        total = 2.033e12 if reform is not None else 2.0e12
+        return _FakeSim({"income_tax": total})
+
+    payload = reform_validation_payload([spec], period=2024, simulate=simulate)
+    assert payload["reforms"][0]["populace"]["budget_effect"] == pytest.approx(-33e9)
+
+
+def test_shipped_obbba_config_is_out_of_sample_counterfactual():
+    specs = out_of_sample_reform_specs(period=2026)
+    assert {s.id for s in specs} >= {"obbba_no_tax_on_tips", "obbba_no_tax_on_overtime"}
+    for spec in specs:
+        assert spec.effect_direction == "baseline_minus_reform"
+        assert spec.period == 2026
+        assert spec.jct_score < 0  # OBBBA provisions are costs
+        assert spec.jct_source.startswith("JCX-35-25")
+
+
+def test_out_of_sample_null_when_no_simulate():
+    payload = reform_validation_payload([_oos_spec(-1.0)], period=2024, simulate=None)
+    assert payload["reforms"][0]["populace"]["budget_effect"] is None
+    assert payload["schema_version"] == REFORM_VALIDATION_SCHEMA_VERSION
+
+
+def test_in_sample_specs_built_from_jct_reforms():
+    specs = in_sample_reform_specs(period=2024)
+    assert specs, "expected at least one JCT tax-expenditure reform"
+    assert all(s.in_sample for s in specs)
+    assert all(s.neutralized_variable for s in specs)
+
+
+def test_out_of_sample_specs_load_from_default_config():
+    # The shipped OBBBA config (if present) must parse into valid specs.
+    specs = out_of_sample_reform_specs(period=2024)
+    for spec in specs:
+        assert spec.in_sample is False
+        assert spec.parameter_changes
+        assert spec.jct_source
+
+
+def test_write_round_trips(tmp_path):
+    payload = reform_validation_payload([_oos_spec(-1.0)], period=2024, simulate=None)
+    path = write_reform_validation(payload, tmp_path / "reform_validation.json")
+    assert json.loads(path.read_text())["reforms"][0]["id"] == "obbba_salt"
diff --git a/tools/build_us_fiscal_refresh_release.py b/tools/build_us_fiscal_refresh_release.py
index f91c29d..389c9e5 100644
--- a/tools/build_us_fiscal_refresh_release.py
+++ b/tools/build_us_fiscal_refresh_release.py
@@ -35,6 +35,12 @@
     us_source_coverage_diagnostics,
     write_us_source_coverage_diagnostics,
 )
+from populace.build.us.reform_validation import (
+    default_simulate_factory,
+    load_default_reform_specs,
+    reform_validation_payload,
+    write_reform_validation,
+)
 from populace.calibrate import TargetRegistry, calibrate
 from populace.calibrate.diagnostics import (
     diagnostics_payload,
@@ -192,6 +198,21 @@ def _parse_args() -> argparse.Namespace:
             "diagnostics instead."
         ),
     )
+    parser.add_argument(
+        "--skip-reform-validation",
+        action="store_true",
+        help="Do not emit reform_validation.json for this release.",
+    )
+    parser.add_argument(
+        "--skip-out-of-sample-reforms",
+        action="store_true",
+        help=(
+            "Emit reform_validation.json with the in-sample JCT tax-expenditure "
+            "rows only (from the calibration fit), skipping the out-of-sample "
+            "OBBBA simulations. Faster; useful when policyengine-us microsim runs "
+            "are not wanted in the build."
+        ),
+    )
     return parser.parse_args()
 
 
@@ -985,6 +1006,49 @@ def _artifact_entry(path: str, sha: str, *, kind: str, revision: str) -> dict[st
     }
 
 
+def _in_sample_estimates(result) -> dict[str, float]:
+    """Calibrated final estimate per calibration target, keyed by target name.
+
+    The in-sample reform validation rows reuse the calibration's own fit (the
+    JCT tax-expenditure targets *are* calibration targets), so no extra
+    simulation is run for them.
+    """
+    estimates: dict[str, float] = {}
+    for diagnostic, target in zip(result.diagnostics, result.problem.targets, strict=True):
+        value = diagnostic.final_estimate
+        if value is not None and math.isfinite(float(value)):
+            estimates[target.name] = float(value)
+    return estimates
+
+
+def _write_reform_validation(
+    *,
+    release_dir: Path,
+    dataset_path: Path,
+    result,
+    registry: TargetRegistry,
+    release_id: str,
+    simulate_out_of_sample: bool,
+) -> None:
+    """Emit reform_validation.json: populace budget effects vs JCT scores.
+
+    In-sample JCT tax-expenditure reforms come straight from the calibration
+    fit; out-of-sample OBBBA provisions are simulated on the freshly written
+    release H5 (skipped if ``simulate_out_of_sample`` is False, e.g. for a fast
+    diagnostics-only build).
+    """
+    specs = load_default_reform_specs(period=PERIOD)
+    simulate = default_simulate_factory(dataset_path) if simulate_out_of_sample else None
+    payload = reform_validation_payload(
+        specs,
+        period=PERIOD,
+        simulate=simulate,
+        in_sample_estimates=_in_sample_estimates(result),
+        release_id=release_id,
+    )
+    write_reform_validation(payload, release_dir / "reform_validation.json")
+
+
 def _build_manifests(
     *,
     release_id: str,
@@ -1100,6 +1164,18 @@ def _build_manifests(
                 kind="diagnostics",
                 revision=release_id,
             ),
+            **(
+                {
+                    "reform_validation": _artifact_entry(
+                        "reform_validation.json",
+                        _sha256(release_dir / "reform_validation.json"),
+                        kind="diagnostics",
+                        revision=release_id,
+                    )
+                }
+                if (release_dir / "reform_validation.json").exists()
+                else {}
+            ),
         },
     }
     (release_dir / "release_manifest.json").write_text(
@@ -1254,6 +1330,16 @@ def main() -> None:
         },
     )
 
+    if not args.skip_reform_validation:
+        _write_reform_validation(
+            release_dir=release_dir,
+            dataset_path=dataset_path,
+            result=result,
+            registry=registry,
+            release_id=release_id,
+            simulate_out_of_sample=not args.skip_out_of_sample_reforms,
+        )
+
     active_aliases = DIRECT_ACTIVE_ALIASES
     coverage = us_source_coverage_diagnostics(
         active_target_aliases=active_aliases,

From c6d231333929d9700b40fc9b5dd5e273ce0f4220 Mon Sep 17 00:00:00 2001
From: Pavel Makarchuk <pavel@policyengine.org>
Date: Tue, 16 Jun 2026 11:11:31 -0400
Subject: [PATCH 2/4] Add big-provision tax-expenditure reforms
 (CTC/EITC/CDCC/standard/itemized)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends reform validation beyond OBBBA to the major tax provisions, each a
neutralize_variable repeal whose simulated revenue change is compared to a
published tax-expenditure figure:

- CTC vs JCT $173.8B (JCX-48-24), EITC vs JCT $124.2B, CDCC vs Treasury $3.69B
  (JCT bundles CDCC with the employer-childcare exclusion).
- Standard deduction and all-itemized-combined carry NO benchmark: both JCT and
  Treasury treat the standard deduction as baseline, and neither publishes a
  combined itemized total — so these publish the repeal magnitude only
  (jct_score is now Optional to support that).
- in_sample flags calibration status honestly: EITC is in-sample (SOI EITC
  targets); CTC is only partly calibrated (SOI CTC amounts), so it is flagged
  out-of-sample along with CDCC/standard/itemized. The individual itemized
  deductions (SALT/mortgage/charitable/medical/QBI) are already validated
  in-sample, so they aren't duplicated here.

Verified on the released populace_us_2024.h5 (FY2024): CTC $114.9B vs $173.8B
(-34%), EITC $96.2B vs $124.2B (-23%), CDCC $3.08B vs $3.69B (-17%), standard
$261.6B, all-itemized $88.1B — populace under-captures the big refundable
credits, a real validation signal.

tax_expenditure_reforms.json config + tax_expenditure_reform_specs loader,
wired into load_default_reform_specs; 2 new tests. ruff clean.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../populace/build/us/reform_validation.py    | 61 ++++++++++++-
 .../build/us/tax_expenditure_reforms.json     | 91 +++++++++++++++++++
 .../tests/test_reform_validation.py           | 30 ++++++
 3 files changed, 179 insertions(+), 3 deletions(-)
 create mode 100644 packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json

diff --git a/packages/populace-build/src/populace/build/us/reform_validation.py b/packages/populace-build/src/populace/build/us/reform_validation.py
index 70142a5..189fcc8 100644
--- a/packages/populace-build/src/populace/build/us/reform_validation.py
+++ b/packages/populace-build/src/populace/build/us/reform_validation.py
@@ -41,6 +41,7 @@
     "ReformValidationSpec",
     "in_sample_reform_specs",
     "out_of_sample_reform_specs",
+    "tax_expenditure_reform_specs",
     "load_default_reform_specs",
     "reform_validation_payload",
     "write_reform_validation",
@@ -71,7 +72,10 @@ class ReformValidationSpec:
     category: str
     in_sample: bool
     period: int
-    jct_score: float
+    # The published figure to compare against. None for provisions neither JCT
+    # nor Treasury score (e.g. the regular standard deduction, which both treat
+    # as baseline) — those rows publish the simulated magnitude with no error.
+    jct_score: float | None
     jct_window: str
     jct_source: str
     jct_source_url: str
@@ -196,15 +200,66 @@ def out_of_sample_reform_specs(
     return tuple(specs)
 
 
+def _tax_expenditure_config_path() -> Path:
+    return Path(str(files(__package__).joinpath("tax_expenditure_reforms.json")))
+
+
+def tax_expenditure_reform_specs(
+    path: Path | None = None,
+    *,
+    period: int,
+) -> tuple[ReformValidationSpec, ...]:
+    """Load the big-provision tax-expenditure reforms from the JSON config.
+
+    Each row is a ``neutralize_variable`` repeal scored reform-minus-baseline.
+    ``benchmark.score`` may be null (no published figure) -> ``jct_score=None``;
+    a missing or null ``benchmark`` is treated as empty. The config (``path``
+    or the packaged default) is read as UTF-8 because the shipped file holds
+    non-ASCII text; a nonexistent file yields ``()``. ``in_sample`` records
+    whether the dataset is calibrated to the provision (EITC is).
+    """
+    config_path = path or _tax_expenditure_config_path()
+    if not config_path.exists():
+        return ()
+    payload = json.loads(config_path.read_text(encoding="utf-8"))
+    specs: list[ReformValidationSpec] = []
+    for raw in payload.get("reforms", ()):
+        bench = raw.get("benchmark") or {}
+        specs.append(
+            ReformValidationSpec(
+                id=raw["id"],
+                name=raw["name"],
+                category=raw.get("category", "Tax expenditure"),
+                in_sample=bool(raw.get("in_sample", False)),
+                period=int(raw.get("period", period)),
+                jct_score=(float(bench["score"]) if bench.get("score") is not None else None),
+                jct_window=str(bench.get("window", "")),
+                jct_source=str(bench.get("source", "")),
+                jct_source_url=str(bench.get("source_url", "")),
+                jct_score_type=str(bench.get("score_type", "tax_expenditure")),
+                budget_measure=str(raw.get("budget_measure", DEFAULT_BUDGET_MEASURE)),
+                description=str(raw.get("description", "")),
+                neutralized_variable=raw["neutralized_variable"],
+                # Neutralizing the provision raises tax by the expenditure amount
+                # (positive), matching the positive published figure.
+                effect_direction="reform_minus_baseline",
+            )
+        )
+    return tuple(specs)
+
+
 def load_default_reform_specs(
     *,
     period: int,
     obbba_path: Path | None = None,
+    tax_expenditure_path: Path | None = None,
 ) -> tuple[ReformValidationSpec, ...]:
-    """In-sample JCT tax expenditures + out-of-sample OBBBA provisions."""
+    """In-sample JCT tax expenditures + out-of-sample OBBBA provisions + the
+    big-provision tax-expenditure reforms (CTC/EITC/CDCC/standard/itemized)."""
     return (
         *in_sample_reform_specs(period=period),
         *out_of_sample_reform_specs(obbba_path, period=period),
+        *tax_expenditure_reform_specs(tax_expenditure_path, period=period),
     )
 
 
@@ -295,7 +350,7 @@ def simulated_effect(spec: ReformValidationSpec) -> tuple[float | None, float |
                 "period": spec.period,
                 "description": spec.description or None,
                 "jct": {
-                    "score": _finite(spec.jct_score),
+                    "score": None if spec.jct_score is None else _finite(spec.jct_score),
                     "score_type": spec.jct_score_type,
                     "window": spec.jct_window or None,
                     "source": spec.jct_source or None,
diff --git a/packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json b/packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json
new file mode 100644
index 0000000..eb5bda1
--- /dev/null
+++ b/packages/populace-build/src/populace/build/us/tax_expenditure_reforms.json
@@ -0,0 +1,91 @@
+{
+  "_comment": "Big-provision tax-expenditure validation. Each reform repeals a major provision (neutralize_variable) on the calibrated dataset; the simulated revenue change (income_tax delta, which captures the credit's refundable portion too) is compared to the published tax-expenditure figure. JCT where it scores the provision (Estimates of Federal Tax Expenditures FY2024-2028, JCX-48-24, Dec 11 2024, Table 1, FY2024, $); Treasury (Tax Expenditures FY2025, OTA) where JCT bundles it. Figures are FY2024 single-year totals incl. the outlay/refundable portion. in_sample flags whether the dataset is calibrated to the provision: EITC is (SOI EITC-by-AGI targets), CTC partly (SOI CTC amounts), CDCC/standard/itemized are not. The individual itemized deductions (SALT, mortgage, charitable, medical, QBI) are already validated in-sample via US_JCT_TAX_EXPENDITURE_REFORMS, so they are not duplicated here. Standard deduction and 'all itemized combined' have NO published tax-expenditure figure (both JCT and Treasury treat the standard deduction as baseline, and neither publishes a combined itemized total), so they carry no benchmark and publish the repeal magnitude only.",
+  "schema_version": 1,
+  "reforms": [
+    {
+      "id": "te_ctc",
+      "name": "Child Tax Credit (incl. ODC)",
+      "category": "Tax expenditure",
+      "in_sample": false,
+      "period": 2024,
+      "neutralized_variable": "ctc",
+      "budget_measure": "income_tax",
+      "description": "Repeal the Child Tax Credit and Credit for Other Dependents. JCT line is 'Credit for children and other dependents'.",
+      "benchmark": {
+        "score": 173800000000,
+        "score_type": "tax_expenditure",
+        "window": "FY2024",
+        "source": "JCT JCX-48-24 Table 1 (rev 127.1B + outlay 46.7B)",
+        "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/"
+      }
+    },
+    {
+      "id": "te_eitc",
+      "name": "Earned Income Tax Credit",
+      "category": "Tax expenditure",
+      "in_sample": true,
+      "period": 2024,
+      "neutralized_variable": "eitc",
+      "budget_measure": "income_tax",
+      "description": "Repeal the EITC. Mostly refundable, so the bulk is the outlay portion. The dataset is calibrated to SOI EITC-by-AGI targets, so this is an in-sample check.",
+      "benchmark": {
+        "score": 124200000000,
+        "score_type": "tax_expenditure",
+        "window": "FY2024",
+        "source": "JCT JCX-48-24 Table 1 (rev 66.9B + outlay 57.3B)",
+        "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/"
+      }
+    },
+    {
+      "id": "te_cdcc",
+      "name": "Child & Dependent Care Credit",
+      "category": "Tax expenditure",
+      "in_sample": false,
+      "period": 2024,
+      "neutralized_variable": "cdcc",
+      "budget_measure": "income_tax",
+      "description": "Repeal the CDCC. JCT bundles it with the employer-provided-childcare exclusion, so the clean CDCC-only benchmark is Treasury's.",
+      "benchmark": {
+        "score": 3690000000,
+        "score_type": "tax_expenditure",
+        "window": "FY2024",
+        "source": "Treasury Tax Expenditures FY2025, Table 1 item 120",
+        "source_url": "https://home.treasury.gov/policy-issues/tax-policy/tax-expenditures"
+      }
+    },
+    {
+      "id": "te_standard_deduction",
+      "name": "Standard deduction (no official benchmark)",
+      "category": "Tax expenditure",
+      "in_sample": false,
+      "period": 2024,
+      "neutralized_variable": "standard_deduction",
+      "budget_measure": "income_tax",
+      "description": "Repeal the standard deduction. Both JCT and Treasury treat the regular standard deduction as part of the normal-tax baseline, so there is no published tax-expenditure figure; shown as a repeal magnitude only.",
+      "benchmark": {
+        "score": null,
+        "score_type": "none",
+        "window": "FY2024",
+        "source": "Not scored — standard deduction is baseline in both JCT and Treasury",
+        "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/"
+      }
+    },
+    {
+      "id": "te_itemized_total",
+      "name": "All itemized deductions (no combined benchmark)",
+      "category": "Tax expenditure",
+      "in_sample": false,
+      "period": 2024,
+      "neutralized_variable": "itemized_taxable_income_deductions",
+      "budget_measure": "income_tax",
+      "description": "Repeal all itemized deductions at once. Neither JCT nor Treasury publishes a single combined itemized-deduction tax-expenditure figure (only components: SALT, mortgage, charitable, medical, …), so this carries no benchmark; the individual components are validated in-sample. Shown as a repeal magnitude only.",
+      "benchmark": {
+        "score": null,
+        "score_type": "none",
+        "window": "FY2024",
+        "source": "Not published as a combined figure — JCT/Treasury score itemized components individually",
+        "source_url": "https://www.jct.gov/publications/2024/jcx-48-24/"
+      }
+    }
+  ]
+}
diff --git a/packages/populace-build/tests/test_reform_validation.py b/packages/populace-build/tests/test_reform_validation.py
index 3e2e477..ba76b49 100644
--- a/packages/populace-build/tests/test_reform_validation.py
+++ b/packages/populace-build/tests/test_reform_validation.py
@@ -16,6 +16,7 @@
     in_sample_reform_specs,
     out_of_sample_reform_specs,
     reform_validation_payload,
+    tax_expenditure_reform_specs,
     write_reform_validation,
 )
 
@@ -135,6 +136,35 @@ def test_shipped_obbba_config_is_out_of_sample_counterfactual():
         assert spec.jct_source.startswith("JCX-35-25")
 
 
+def test_shipped_tax_expenditure_specs_neutralize_big_provisions():
+    specs = tax_expenditure_reform_specs(period=2024)
+    by_id = {s.id for s in specs}
+    assert {"te_ctc", "te_eitc", "te_cdcc", "te_standard_deduction", "te_itemized_total"} <= by_id
+    for spec in specs:
+        assert spec.neutralized_variable  # all are repeals
+        assert spec.effect_direction == "reform_minus_baseline"  # neutralize raises tax
+    eitc = next(s for s in specs if s.id == "te_eitc")
+    assert eitc.in_sample is True  # calibrated to SOI EITC targets
+    std = next(s for s in specs if s.id == "te_standard_deduction")
+    assert std.jct_score is None  # baseline in both JCT and Treasury — no benchmark
+
+
+def test_null_benchmark_row_publishes_magnitude_only():
+    spec = ReformValidationSpec(
+        id="te_std", name="Standard deduction", category="Tax expenditure",
+        in_sample=False, period=2024, jct_score=None, jct_window="FY2024",
+        jct_source="not scored", jct_source_url="", neutralized_variable="standard_deduction",
+    )
+
+    def simulate(reform):
+        return _FakeSim({"income_tax": 2.28e12 if reform is not None else 2.0e12})
+
+    payload = reform_validation_payload([spec], period=2024, simulate=simulate)
+    row = payload["reforms"][0]
+    assert row["jct"]["score"] is None
+    assert row["populace"]["budget_effect"] == pytest.approx(280e9)  # repeal magnitude
+
+
 def test_out_of_sample_null_when_no_simulate():
     payload = reform_validation_payload([_oos_spec(-1.0)], period=2024, simulate=None)
     assert payload["reforms"][0]["populace"]["budget_effect"] is None

From 1f17db717df7d20a1d324f59311c0d36cc16bcc4 Mon Sep 17 00:00:00 2001
From: Pavel Makarchuk <pavel@policyengine.org>
Date: Tue, 16 Jun 2026 12:38:44 -0400
Subject: [PATCH 3/4] Adapt reform validation to ledger-backed targets (#65)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#65 reworked SimpleTaxExpenditureReform — the JCT dollar figure (.value) now
lives in the ledger target, not on the reform object. in_sample_reform_specs no
longer reads reform.value; instead reform_validation_payload takes
in_sample_targets (the calibration target value per id), and the builder
supplies it from the calibration result. So an in-sample reform's JCT score is
the target it was calibrated to, and its populace estimate is the calibrated
final_estimate — both straight from the calibration diagnostics.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../src/populace/build/us/reform_validation.py   | 16 +++++++++++++---
 tools/build_us_fiscal_refresh_release.py         | 15 +++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/packages/populace-build/src/populace/build/us/reform_validation.py b/packages/populace-build/src/populace/build/us/reform_validation.py
index 189fcc8..34b0b61 100644
--- a/packages/populace-build/src/populace/build/us/reform_validation.py
+++ b/packages/populace-build/src/populace/build/us/reform_validation.py
@@ -149,9 +149,13 @@ def in_sample_reform_specs(
                 category="JCT tax expenditure",
                 in_sample=True,
                 period=int(period),
-                jct_score=float(reform.value),
+                # The JCT figure for an in-sample reform is the calibration
+                # target's own value, supplied at payload time via
+                # in_sample_targets — it lives in the ledger now, not on the
+                # reform object.
+                jct_score=None,
                 jct_window="annual",
-                jct_source=reform.source,
+                jct_source=reform.source or "JCT tax-expenditure (calibration target)",
                 jct_source_url="",
                 budget_measure=reform.output_variable or DEFAULT_BUDGET_MEASURE,
                 neutralized_variable=reform.neutralized_variable,
@@ -294,6 +298,7 @@ def reform_validation_payload(
     period: int,
     simulate: SimulateFn | None = None,
     in_sample_estimates: dict[str, float] | None = None,
+    in_sample_targets: dict[str, float] | None = None,
     release_id: str | None = None,
 ) -> dict[str, Any]:
     """Score each reform on the dataset and render the JSON-stable payload.
@@ -311,6 +316,7 @@ def reform_validation_payload(
     calibration-diagnostics dashboard's reform_validation reader.
     """
     estimates = in_sample_estimates or {}
+    targets = in_sample_targets or {}
     baseline: Any = None
     baseline_totals: dict[tuple[int, str], float] = {}
 
@@ -341,6 +347,10 @@ def simulated_effect(spec: ReformValidationSpec) -> tuple[float | None, float |
             reform_total: float | None = None
         else:
             effect, base_total, reform_total = simulated_effect(spec)
+        # In-sample reforms get their JCT figure from the calibration target.
+        effective_jct = spec.jct_score
+        if effective_jct is None and spec.in_sample and spec.id in targets:
+            effective_jct = targets[spec.id]
         rows.append(
             {
                 "id": spec.id,
@@ -350,7 +360,7 @@ def simulated_effect(spec: ReformValidationSpec) -> tuple[float | None, float |
                 "period": spec.period,
                 "description": spec.description or None,
                 "jct": {
-                    "score": None if spec.jct_score is None else _finite(spec.jct_score),
+                    "score": None if effective_jct is None else _finite(effective_jct),
                     "score_type": spec.jct_score_type,
                     "window": spec.jct_window or None,
                     "source": spec.jct_source or None,
diff --git a/tools/build_us_fiscal_refresh_release.py b/tools/build_us_fiscal_refresh_release.py
index 389c9e5..7d5b140 100644
--- a/tools/build_us_fiscal_refresh_release.py
+++ b/tools/build_us_fiscal_refresh_release.py
@@ -1021,6 +1021,20 @@ def _in_sample_estimates(result) -> dict[str, float]:
     return estimates
 
 
+def _in_sample_targets(result) -> dict[str, float]:
+    """Calibration target value per calibration target, keyed by target name.
+
+    Reads ``diagnostic.target`` for each positionally paired target, keeping
+    only finite, non-None values; in-sample reforms use it as their JCT figure.
+    """
+    targets: dict[str, float] = {}
+    for diagnostic, target in zip(result.diagnostics, result.problem.targets, strict=True):
+        value = diagnostic.target
+        if value is not None and math.isfinite(float(value)):
+            targets[target.name] = float(value)
+    return targets
+
+
 def _write_reform_validation(
     *,
     release_dir: Path,
@@ -1044,6 +1058,7 @@ def _write_reform_validation(
         period=PERIOD,
         simulate=simulate,
         in_sample_estimates=_in_sample_estimates(result),
+        in_sample_targets=_in_sample_targets(result),
         release_id=release_id,
     )
     write_reform_validation(payload, release_dir / "reform_validation.json")

From 1fbba09b6021577ec1b0237ed9f790b910b6a32f Mon Sep 17 00:00:00 2001
From: Pavel Makarchuk <pavel@policyengine.org>
Date: Tue, 16 Jun 2026 12:44:19 -0400
Subject: [PATCH 4/4] Keep reform-validation tests isolated from
 policyengine_core

The counterfactual-revert and null-benchmark tests called the real
build_reform(), whose lazy policyengine_core import is absent in the
populace-build CI env. Monkeypatch build_reform to a sentinel like the
sibling budget-effect test, so the suite stays simulation-injected.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/populace-build/tests/test_reform_validation.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/populace-build/tests/test_reform_validation.py b/packages/populace-build/tests/test_reform_validation.py
index ba76b49..1423232 100644
--- a/packages/populace-build/tests/test_reform_validation.py
+++ b/packages/populace-build/tests/test_reform_validation.py
@@ -111,12 +111,13 @@ def simulate(reform):
     assert row["jct"]["score"] == pytest.approx(-60e9)
 
 
-def test_counterfactual_revert_flips_sign():
+def test_counterfactual_revert_flips_sign(monkeypatch):
     # A revert reform: baseline (provision on) income tax 2.0e12; reverting the
     # provision (reform) raises it to 2.033e12. The provision's effect is
     # baseline − reform = −33e9 (a cost), matching the JCT enactment sign.
     spec = _oos_spec(score=-33e9)
     object.__setattr__(spec, "effect_direction", "baseline_minus_reform")
+    monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM")
 
     def simulate(reform):
         total = 2.033e12 if reform is not None else 2.0e12
@@ -149,12 +150,13 @@ def test_shipped_tax_expenditure_specs_neutralize_big_provisions():
     assert std.jct_score is None  # baseline in both JCT and Treasury — no benchmark
 
 
-def test_null_benchmark_row_publishes_magnitude_only():
+def test_null_benchmark_row_publishes_magnitude_only(monkeypatch):
     spec = ReformValidationSpec(
         id="te_std", name="Standard deduction", category="Tax expenditure",
         in_sample=False, period=2024, jct_score=None, jct_window="FY2024",
         jct_source="not scored", jct_source_url="", neutralized_variable="standard_deduction",
     )
+    monkeypatch.setattr(spec.__class__, "build_reform", lambda self: "REFORM")
 
     def simulate(reform):
         return _FakeSim({"income_tax": 2.28e12 if reform is not None else 2.0e12})