diff --git a/packages/populace-data/src/populace/data/__init__.py b/packages/populace-data/src/populace/data/__init__.py index ed94607..3422288 100644 --- a/packages/populace-data/src/populace/data/__init__.py +++ b/packages/populace-data/src/populace/data/__init__.py @@ -21,6 +21,12 @@ there is no kernel in its dependency closure to gate against. """ +from populace.data.contract import ( + RELEASE_MANIFEST_SCHEMA_VERSION, + REQUIRED_RELEASE_FILES, + ReleaseContractError, + validate_release_dir, +) from populace.data.loader import ( available, download, @@ -39,6 +45,10 @@ "DatasetSpec", "REGISTRY", "register", + "RELEASE_MANIFEST_SCHEMA_VERSION", + "REQUIRED_RELEASE_FILES", + "ReleaseContractError", + "validate_release_dir", ] __version__ = "0.1.0" diff --git a/packages/populace-data/src/populace/data/contract.py b/packages/populace-data/src/populace/data/contract.py new file mode 100644 index 0000000..b11f837 --- /dev/null +++ b/packages/populace-data/src/populace/data/contract.py @@ -0,0 +1,196 @@ +"""The release artifact contract: what a published release MUST contain. + +The releases already on the Hub disagree with each other — one carries no +``build_manifest.json`` at all, and two different ``release_manifest.json`` +schemas coexist (an unversioned early shape next to ``schema_version: 1``). +A consumer iterating ``releases/`` therefore cannot trust the listing, and +every consumer ends up re-implementing its own defensive filter. The charter +makes "stage manifests are load-bearing" a binding process rule; the release +directory is the most public manifest of all, so its contract lives here, +with the producer — not in every consumer. + +:func:`validate_release_dir` is the single gate: it checks a local release +directory against the contract and raises :class:`ReleaseContractError` +naming **every** failure at once (a publisher should see the full repair +list, not play whack-a-mole one failure per run). Publishing code calls it +before any byte reaches the Hub. +""" + +from __future__ import annotations + +import json +from collections.abc import Mapping +from pathlib import Path + +__all__ = [ + "RELEASE_MANIFEST_SCHEMA_VERSION", + "REQUIRED_RELEASE_FILES", + "ReleaseContractError", + "validate_release_dir", +] + +#: The release-manifest schema this library reads and writes. Bump it with the +#: schema, and keep :func:`validate_release_dir` rejecting drift loudly — the +#: unversioned 1abddeb-era manifest is exactly the silence this guards against. +RELEASE_MANIFEST_SCHEMA_VERSION = 1 + +#: Files a release directory must contain to count as published. A release +#: missing any of these is invisible to :func:`validate_release_dir`-respecting +#: publishers, by design. +REQUIRED_RELEASE_FILES = ( + "build_manifest.json", + "release_manifest.json", + "sound_ecps_replacement_comparison.json", +) + + +class ReleaseContractError(ValueError): + """A release directory violates the release contract. + + Attributes: + failures: Every contract violation found, each a self-contained + human-readable sentence naming the file and field at fault. + """ + + def __init__(self, release_dir: Path, failures: list[str]) -> None: + self.failures = list(failures) + bullet_list = "\n".join(f" - {failure}" for failure in self.failures) + super().__init__( + f"Release directory {release_dir} violates the release contract " + f"({len(self.failures)} failure(s)):\n{bullet_list}" + ) + + +def _load_json(path: Path, failures: list[str]) -> Mapping | None: + try: + loaded = json.loads(path.read_text()) + except json.JSONDecodeError as exc: + failures.append(f"{path.name} is not valid JSON: {exc}.") + return None + if not isinstance(loaded, Mapping): + failures.append( + f"{path.name} must be a JSON object, got {type(loaded).__name__}." + ) + return None + return loaded + + +def _check_build_manifest( + manifest: Mapping, release_id: str, failures: list[str] +) -> None: + build_id = manifest.get("build_id") + if not build_id: + failures.append("build_manifest.json is missing 'build_id'.") + elif build_id != release_id: + failures.append( + f"build_manifest.json 'build_id' is {build_id!r} but the release " + f"directory is named {release_id!r}; the directory name IS the " + f"build id." + ) + dataset = manifest.get("dataset") + if not isinstance(dataset, Mapping): + failures.append("build_manifest.json is missing the 'dataset' object.") + else: + for key in ("filename", "sha256"): + if not dataset.get(key): + failures.append( + f"build_manifest.json 'dataset' is missing {key!r}." + ) + if not isinstance(manifest.get("gates"), Mapping): + failures.append( + "build_manifest.json is missing the 'gates' object (the " + "acceptance-gate verdicts are the point of the manifest)." + ) + + +def _check_release_manifest( + manifest: Mapping, release_id: str, failures: list[str] +) -> None: + schema_version = manifest.get("schema_version") + if schema_version is None: + failures.append( + "release_manifest.json has no 'schema_version'; unversioned " + "manifests (the 1abddeb-era shape) are not publishable." + ) + elif schema_version != RELEASE_MANIFEST_SCHEMA_VERSION: + failures.append( + f"release_manifest.json 'schema_version' is {schema_version!r}; " + f"this library publishes version " + f"{RELEASE_MANIFEST_SCHEMA_VERSION}." + ) + build = manifest.get("build") + if not isinstance(build, Mapping) or not build.get("build_id"): + failures.append( + "release_manifest.json is missing 'build.build_id'." + ) + elif build["build_id"] != release_id: + failures.append( + f"release_manifest.json 'build.build_id' is " + f"{build['build_id']!r} but the release directory is named " + f"{release_id!r}." + ) + artifacts = manifest.get("artifacts") + if not isinstance(artifacts, Mapping) or not artifacts: + failures.append( + "release_manifest.json must declare a non-empty 'artifacts' " + "mapping." + ) + else: + for key, entry in artifacts.items(): + if not isinstance(entry, Mapping): + failures.append( + f"release_manifest.json artifact {key!r} must be an " + f"object." + ) + continue + for field in ("path", "repo_id", "sha256"): + if not entry.get(field): + failures.append( + f"release_manifest.json artifact {key!r} is missing " + f"{field!r}." + ) + + +def validate_release_dir(release_dir: Path | str) -> None: + """Check a local release directory against the release contract. + + The directory name is the build id (``populace-us-2024--``); + its files are what :data:`REQUIRED_RELEASE_FILES` names; and both + manifests must agree with the directory about which build this is. + + Args: + release_dir: The local ``releases/`` directory about to be + published. + + Raises: + ReleaseContractError: Naming every violation found — missing files, + unparseable or unversioned manifests, schema drift, and build-id + mismatches between the manifests and the directory name. + """ + release_dir = Path(release_dir) + release_id = release_dir.name + failures: list[str] = [] + + if not release_dir.is_dir(): + raise ReleaseContractError( + release_dir, [f"{release_dir} is not a directory."] + ) + + for filename in REQUIRED_RELEASE_FILES: + if not (release_dir / filename).is_file(): + failures.append(f"required file {filename!r} is missing.") + + build_manifest_path = release_dir / "build_manifest.json" + if build_manifest_path.is_file(): + manifest = _load_json(build_manifest_path, failures) + if manifest is not None: + _check_build_manifest(manifest, release_id, failures) + + release_manifest_path = release_dir / "release_manifest.json" + if release_manifest_path.is_file(): + manifest = _load_json(release_manifest_path, failures) + if manifest is not None: + _check_release_manifest(manifest, release_id, failures) + + if failures: + raise ReleaseContractError(release_dir, failures) diff --git a/packages/populace-data/tests/test_contract.py b/packages/populace-data/tests/test_contract.py new file mode 100644 index 0000000..133d725 --- /dev/null +++ b/packages/populace-data/tests/test_contract.py @@ -0,0 +1,159 @@ +"""The release contract: every published release looks the same, loudly. + +These are behavioral tests against the failure modes already observed on the +Hub: a release with no build manifest at all (1abddeb), and two coexisting +release-manifest schemas (an unversioned early shape next to +``schema_version: 1``). A valid release passes silently; every broken release +fails with each violation named. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from populace.data import ( + RELEASE_MANIFEST_SCHEMA_VERSION, + REQUIRED_RELEASE_FILES, + ReleaseContractError, + validate_release_dir, +) + +RELEASE_ID = "populace-us-2024-9f1260b-20260611" + + +def _build_manifest(release_id: str = RELEASE_ID) -> dict: + return { + "build_id": release_id, + "builder": "populace", + "dataset": {"filename": "populace_us_2024.h5", "sha256": "dc75c0"}, + "calibration": { + "filename": "populace_us_2024_calibration.npz", + "sha256": "a3da2f", + }, + "gates": {"parity_gaps": 0}, + "score_vs_enhanced_cps": {"per_target_wins": {}}, + } + + +def _release_manifest(release_id: str = RELEASE_ID) -> dict: + return { + "schema_version": RELEASE_MANIFEST_SCHEMA_VERSION, + "data_package": {"name": "populace-data", "version": "0.1.0"}, + "build": {"build_id": release_id}, + "artifacts": { + "populace_us_2024": { + "kind": "microdata", + "path": "populace_us_2024.h5", + "repo_id": "policyengine/populace-us", + "sha256": "dc75c0", + } + }, + } + + +@pytest.fixture +def release_dir(tmp_path: Path) -> Path: + """A complete, contract-valid release directory.""" + directory = tmp_path / "releases" / RELEASE_ID + directory.mkdir(parents=True) + (directory / "build_manifest.json").write_text(json.dumps(_build_manifest())) + (directory / "release_manifest.json").write_text( + json.dumps(_release_manifest()) + ) + (directory / "sound_ecps_replacement_comparison.json").write_text( + json.dumps({"schema_version": 1, "target_diagnostics": {}}) + ) + return directory + + +def test_a_complete_release_passes(release_dir: Path) -> None: + validate_release_dir(release_dir) + + +@pytest.mark.parametrize("filename", REQUIRED_RELEASE_FILES) +def test_each_required_file_is_named_when_missing( + release_dir: Path, filename: str +) -> None: + (release_dir / filename).unlink() + with pytest.raises(ReleaseContractError, match=filename): + validate_release_dir(release_dir) + + +def test_the_1abddeb_shape_is_rejected(release_dir: Path) -> None: + """The regression: a release with only an unversioned release manifest.""" + (release_dir / "build_manifest.json").unlink() + (release_dir / "sound_ecps_replacement_comparison.json").unlink() + (release_dir / "release_manifest.json").write_text( + json.dumps( + { + "release_id": RELEASE_ID, + "country_id": "us", + "artifacts": {}, + "validation": {}, + } + ) + ) + with pytest.raises(ReleaseContractError) as excinfo: + validate_release_dir(release_dir) + failures = "\n".join(excinfo.value.failures) + assert "build_manifest.json" in failures + assert "schema_version" in failures + + +def test_schema_drift_is_rejected_by_version(release_dir: Path) -> None: + manifest = _release_manifest() + manifest["schema_version"] = RELEASE_MANIFEST_SCHEMA_VERSION + 1 + (release_dir / "release_manifest.json").write_text(json.dumps(manifest)) + with pytest.raises(ReleaseContractError, match="schema_version"): + validate_release_dir(release_dir) + + +def test_build_id_mismatch_names_both_ids(release_dir: Path) -> None: + (release_dir / "build_manifest.json").write_text( + json.dumps(_build_manifest("populace-us-2024-other-20260101")) + ) + with pytest.raises(ReleaseContractError, match="populace-us-2024-other"): + validate_release_dir(release_dir) + + +def test_release_manifest_build_id_must_match_directory( + release_dir: Path, +) -> None: + manifest = _release_manifest("populace-us-2024-other-20260101") + (release_dir / "release_manifest.json").write_text(json.dumps(manifest)) + with pytest.raises(ReleaseContractError, match="build.build_id"): + validate_release_dir(release_dir) + + +def test_artifact_entries_must_carry_provenance(release_dir: Path) -> None: + manifest = _release_manifest() + manifest["artifacts"]["populace_us_2024"].pop("sha256") + (release_dir / "release_manifest.json").write_text(json.dumps(manifest)) + with pytest.raises(ReleaseContractError, match="sha256"): + validate_release_dir(release_dir) + + +def test_unparseable_manifest_is_a_named_failure(release_dir: Path) -> None: + (release_dir / "build_manifest.json").write_text("{not json") + with pytest.raises(ReleaseContractError, match="not valid JSON"): + validate_release_dir(release_dir) + + +def test_all_failures_reported_at_once(release_dir: Path) -> None: + """A publisher sees the full repair list, not one failure per run.""" + (release_dir / "sound_ecps_replacement_comparison.json").unlink() + manifest = _release_manifest() + del manifest["schema_version"] + manifest["artifacts"] = {} + (release_dir / "release_manifest.json").write_text(json.dumps(manifest)) + with pytest.raises(ReleaseContractError) as excinfo: + validate_release_dir(release_dir) + assert len(excinfo.value.failures) >= 3 + + +def test_a_missing_directory_is_a_contract_error(tmp_path: Path) -> None: + with pytest.raises(ReleaseContractError, match="is not a directory"): + validate_release_dir(tmp_path / "releases" / "nope")