Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions packages/populace-data/src/populace/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
there is no kernel in its dependency closure to gate against.
"""

from populace.data.contract import (
RELEASE_MANIFEST_SCHEMA_VERSION,
REQUIRED_RELEASE_FILES,
ReleaseContractError,
validate_release_dir,
)
from populace.data.loader import (
available,
download,
Expand All @@ -39,6 +45,10 @@
"DatasetSpec",
"REGISTRY",
"register",
"RELEASE_MANIFEST_SCHEMA_VERSION",
"REQUIRED_RELEASE_FILES",
"ReleaseContractError",
"validate_release_dir",
]

__version__ = "0.1.0"
196 changes: 196 additions & 0 deletions packages/populace-data/src/populace/data/contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""The release artifact contract: what a published release MUST contain.

The releases already on the Hub disagree with each other — one carries no
``build_manifest.json`` at all, and two different ``release_manifest.json``
schemas coexist (an unversioned early shape next to ``schema_version: 1``).
A consumer iterating ``releases/`` therefore cannot trust the listing, and
every consumer ends up re-implementing its own defensive filter. The charter
makes "stage manifests are load-bearing" a binding process rule; the release
directory is the most public manifest of all, so its contract lives here,
with the producer — not in every consumer.

:func:`validate_release_dir` is the single gate: it checks a local release
directory against the contract and raises :class:`ReleaseContractError`
naming **every** failure at once (a publisher should see the full repair
list, not play whack-a-mole one failure per run). Publishing code calls it
before any byte reaches the Hub.
"""

from __future__ import annotations

import json
from collections.abc import Mapping
from pathlib import Path

# Public names of this module.
__all__ = [
    "RELEASE_MANIFEST_SCHEMA_VERSION",
    "REQUIRED_RELEASE_FILES",
    "ReleaseContractError",
    "validate_release_dir",
]

#: The release-manifest schema this library reads and writes. Bump it with the
#: schema, and keep :func:`validate_release_dir` rejecting drift loudly — the
#: unversioned 1abddeb-era manifest is exactly the silence this guards against.
RELEASE_MANIFEST_SCHEMA_VERSION = 1

#: Files a release directory must contain to count as published.
#: :func:`validate_release_dir` reports each missing file as a contract
#: failure, so a release lacking any of them is rejected before publishing,
#: by design.
REQUIRED_RELEASE_FILES = (
    "build_manifest.json",
    "release_manifest.json",
    "sound_ecps_replacement_comparison.json",
)


class ReleaseContractError(ValueError):
    """Raised when a release directory breaks the release contract.

    The exception message lists every violation as a bullet under a header
    naming the directory and the failure count.

    Attributes:
        failures: Every contract violation found, each a self-contained
            human-readable sentence naming the file and field at fault.
    """

    def __init__(self, release_dir: Path, failures: list[str]) -> None:
        # Copy, so later mutation of the caller's list cannot alter the error.
        self.failures = [*failures]
        header = (
            f"Release directory {release_dir} violates the release contract "
            f"({len(self.failures)} failure(s)):"
        )
        bullets = [f" - {item}" for item in self.failures]
        super().__init__(f"{header}\n" + "\n".join(bullets))


def _load_json(path: Path, failures: list[str]) -> Mapping | None:
try:
loaded = json.loads(path.read_text())
except json.JSONDecodeError as exc:
failures.append(f"{path.name} is not valid JSON: {exc}.")
return None
if not isinstance(loaded, Mapping):
failures.append(
f"{path.name} must be a JSON object, got {type(loaded).__name__}."
)
return None
return loaded


def _check_build_manifest(
manifest: Mapping, release_id: str, failures: list[str]
) -> None:
build_id = manifest.get("build_id")
if not build_id:
failures.append("build_manifest.json is missing 'build_id'.")
elif build_id != release_id:
failures.append(
f"build_manifest.json 'build_id' is {build_id!r} but the release "
f"directory is named {release_id!r}; the directory name IS the "
f"build id."
)
dataset = manifest.get("dataset")
if not isinstance(dataset, Mapping):
failures.append("build_manifest.json is missing the 'dataset' object.")
else:
for key in ("filename", "sha256"):
if not dataset.get(key):
failures.append(
f"build_manifest.json 'dataset' is missing {key!r}."
)
if not isinstance(manifest.get("gates"), Mapping):
failures.append(
"build_manifest.json is missing the 'gates' object (the "
"acceptance-gate verdicts are the point of the manifest)."
)


def _check_release_manifest(
    manifest: Mapping, release_id: str, failures: list[str]
) -> None:
    """Append every ``release_manifest.json`` contract violation to *failures*.

    Checks, in order: ``schema_version`` is present and is exactly
    :data:`RELEASE_MANIFEST_SCHEMA_VERSION`; ``build.build_id`` is present
    and equals the release directory name; and ``artifacts`` is a non-empty
    mapping whose entries are objects carrying non-empty ``path``,
    ``repo_id`` and ``sha256``.

    Args:
        manifest: The parsed release manifest.
        release_id: The release directory name the manifest must agree with.
        failures: The running list of contract violations; mutated in place.
    """
    schema_version = manifest.get("schema_version")
    # ``bool`` is an ``int`` subclass and ``True == 1``, so a JSON ``true``
    # would otherwise be accepted as schema version 1. Reject it explicitly.
    version_is_bool = isinstance(schema_version, bool)
    if schema_version is None:
        failures.append(
            "release_manifest.json has no 'schema_version'; unversioned "
            "manifests (the 1abddeb-era shape) are not publishable."
        )
    elif version_is_bool or schema_version != RELEASE_MANIFEST_SCHEMA_VERSION:
        failures.append(
            f"release_manifest.json 'schema_version' is {schema_version!r}; "
            f"this library publishes version "
            f"{RELEASE_MANIFEST_SCHEMA_VERSION}."
        )

    build = manifest.get("build")
    if not isinstance(build, Mapping) or not build.get("build_id"):
        failures.append(
            "release_manifest.json is missing 'build.build_id'."
        )
    elif build["build_id"] != release_id:
        failures.append(
            f"release_manifest.json 'build.build_id' is "
            f"{build['build_id']!r} but the release directory is named "
            f"{release_id!r}."
        )

    artifacts = manifest.get("artifacts")
    if not isinstance(artifacts, Mapping) or not artifacts:
        failures.append(
            "release_manifest.json must declare a non-empty 'artifacts' "
            "mapping."
        )
        return

    for key, entry in artifacts.items():
        if not isinstance(entry, Mapping):
            failures.append(
                f"release_manifest.json artifact {key!r} must be an "
                f"object."
            )
            continue
        # Every artifact must say where it lives and what its bytes hash to.
        for field in ("path", "repo_id", "sha256"):
            if not entry.get(field):
                failures.append(
                    f"release_manifest.json artifact {key!r} is missing "
                    f"{field!r}."
                )


def validate_release_dir(release_dir: Path | str) -> None:
    """Validate a local release directory against the release contract.

    The directory's name is taken as the build id. Every file named in
    :data:`REQUIRED_RELEASE_FILES` must exist, and both manifests (when
    present) must parse and agree with the directory about the build id.
    All violations are collected and raised together.

    Args:
        release_dir: The local ``releases/<build_id>`` directory about to be
            published.

    Raises:
        ReleaseContractError: Naming every violation found — missing files,
            unparseable or unversioned manifests, schema drift, and build-id
            mismatches between the manifests and the directory name. Raised
            immediately, with a single failure, when ``release_dir`` is not
            a directory.
    """
    root = Path(release_dir)
    if not root.is_dir():
        raise ReleaseContractError(root, [f"{root} is not a directory."])

    build_id = root.name
    problems: list[str] = [
        f"required file {name!r} is missing."
        for name in REQUIRED_RELEASE_FILES
        if not (root / name).is_file()
    ]

    # Manifests are content-checked only when present; absence was already
    # recorded by the required-files pass above.
    manifest_checks = (
        ("build_manifest.json", _check_build_manifest),
        ("release_manifest.json", _check_release_manifest),
    )
    for name, check in manifest_checks:
        candidate = root / name
        if not candidate.is_file():
            continue
        parsed = _load_json(candidate, problems)
        if parsed is not None:
            check(parsed, build_id, problems)

    if problems:
        raise ReleaseContractError(root, problems)
159 changes: 159 additions & 0 deletions packages/populace-data/tests/test_contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""The release contract: every published release looks the same, loudly.

These are behavioral tests against the failure modes already observed on the
Hub: a release with no build manifest at all (1abddeb), and two coexisting
release-manifest schemas (an unversioned early shape next to
``schema_version: 1``). A valid release passes silently; every broken release
fails with each violation named.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from populace.data import (
RELEASE_MANIFEST_SCHEMA_VERSION,
REQUIRED_RELEASE_FILES,
ReleaseContractError,
validate_release_dir,
)

RELEASE_ID = "populace-us-2024-9f1260b-20260611"


def _build_manifest(release_id: str = RELEASE_ID) -> dict:
    """Return a contract-valid build manifest payload for *release_id*."""
    dataset = {"filename": "populace_us_2024.h5", "sha256": "dc75c0"}
    calibration = {
        "filename": "populace_us_2024_calibration.npz",
        "sha256": "a3da2f",
    }
    return dict(
        build_id=release_id,
        builder="populace",
        dataset=dataset,
        calibration=calibration,
        gates={"parity_gaps": 0},
        score_vs_enhanced_cps={"per_target_wins": {}},
    )


def _release_manifest(release_id: str = RELEASE_ID) -> dict:
    """Return a contract-valid release manifest payload for *release_id*."""
    microdata_artifact = dict(
        kind="microdata",
        path="populace_us_2024.h5",
        repo_id="policyengine/populace-us",
        sha256="dc75c0",
    )
    return dict(
        schema_version=RELEASE_MANIFEST_SCHEMA_VERSION,
        data_package={"name": "populace-data", "version": "0.1.0"},
        build={"build_id": release_id},
        artifacts={"populace_us_2024": microdata_artifact},
    )


@pytest.fixture
def release_dir(tmp_path: Path) -> Path:
    """Build a complete, contract-valid release directory under *tmp_path*."""
    root = tmp_path / "releases" / RELEASE_ID
    root.mkdir(parents=True)
    payloads = {
        "build_manifest.json": _build_manifest(),
        "release_manifest.json": _release_manifest(),
        "sound_ecps_replacement_comparison.json": {
            "schema_version": 1,
            "target_diagnostics": {},
        },
    }
    for filename, payload in payloads.items():
        (root / filename).write_text(json.dumps(payload))
    return root


def test_a_complete_release_passes(release_dir: Path) -> None:
    """The untouched fixture directory validates without raising."""
    validate_release_dir(release_dir)


@pytest.mark.parametrize("filename", REQUIRED_RELEASE_FILES)
def test_each_required_file_is_named_when_missing(
    release_dir: Path, filename: str
) -> None:
    """Deleting any one required file yields an error that names that file."""
    removed = release_dir / filename
    removed.unlink()
    with pytest.raises(ReleaseContractError, match=filename):
        validate_release_dir(release_dir)


def test_the_1abddeb_shape_is_rejected(release_dir: Path) -> None:
    """The regression: a release with only an unversioned release manifest."""
    for absent in (
        "build_manifest.json",
        "sound_ecps_replacement_comparison.json",
    ):
        (release_dir / absent).unlink()

    # The early manifest shape: no schema_version, no build block.
    legacy_manifest = {
        "release_id": RELEASE_ID,
        "country_id": "us",
        "artifacts": {},
        "validation": {},
    }
    (release_dir / "release_manifest.json").write_text(
        json.dumps(legacy_manifest)
    )

    with pytest.raises(ReleaseContractError) as excinfo:
        validate_release_dir(release_dir)

    report = "\n".join(excinfo.value.failures)
    for expected in ("build_manifest.json", "schema_version"):
        assert expected in report


def test_schema_drift_is_rejected_by_version(release_dir: Path) -> None:
    """A manifest one schema version ahead of the library is rejected."""
    drifted = _release_manifest()
    drifted["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION + 1
    target = release_dir / "release_manifest.json"
    target.write_text(json.dumps(drifted))
    with pytest.raises(ReleaseContractError, match="schema_version"):
        validate_release_dir(release_dir)


def test_build_id_mismatch_names_both_ids(release_dir: Path) -> None:
    """A build manifest for another build is rejected, naming the stray id."""
    foreign = _build_manifest("populace-us-2024-other-20260101")
    target = release_dir / "build_manifest.json"
    target.write_text(json.dumps(foreign))
    with pytest.raises(ReleaseContractError, match="populace-us-2024-other"):
        validate_release_dir(release_dir)


def test_release_manifest_build_id_must_match_directory(
    release_dir: Path,
) -> None:
    """A release manifest whose build id differs from the directory fails."""
    foreign = _release_manifest("populace-us-2024-other-20260101")
    target = release_dir / "release_manifest.json"
    target.write_text(json.dumps(foreign))
    with pytest.raises(ReleaseContractError, match="build.build_id"):
        validate_release_dir(release_dir)


def test_artifact_entries_must_carry_provenance(release_dir: Path) -> None:
    """An artifact entry stripped of its sha256 is reported by field name."""
    payload = _release_manifest()
    del payload["artifacts"]["populace_us_2024"]["sha256"]
    target = release_dir / "release_manifest.json"
    target.write_text(json.dumps(payload))
    with pytest.raises(ReleaseContractError, match="sha256"):
        validate_release_dir(release_dir)


def test_unparseable_manifest_is_a_named_failure(release_dir: Path) -> None:
    """A build manifest that is not JSON surfaces as a contract failure."""
    broken = release_dir / "build_manifest.json"
    broken.write_text("{not json")
    with pytest.raises(ReleaseContractError, match="not valid JSON"):
        validate_release_dir(release_dir)


def test_all_failures_reported_at_once(release_dir: Path) -> None:
    """A publisher sees the full repair list, not one failure per run."""
    # Three independent violations: a missing required file, an unversioned
    # manifest, and an empty artifacts mapping.
    (release_dir / "sound_ecps_replacement_comparison.json").unlink()
    payload = _release_manifest()
    payload.pop("schema_version")
    payload["artifacts"] = {}
    target = release_dir / "release_manifest.json"
    target.write_text(json.dumps(payload))

    with pytest.raises(ReleaseContractError) as excinfo:
        validate_release_dir(release_dir)

    reported = excinfo.value.failures
    assert len(reported) >= 3


def test_a_missing_directory_is_a_contract_error(tmp_path: Path) -> None:
    """A path that does not exist fails as a contract error, not an OS error."""
    absent = tmp_path / "releases" / "nope"
    with pytest.raises(ReleaseContractError, match="is not a directory"):
        validate_release_dir(absent)
Loading