diff --git a/packages/populace-data/src/populace/data/__init__.py b/packages/populace-data/src/populace/data/__init__.py index 3422288..855ab7d 100644 --- a/packages/populace-data/src/populace/data/__init__.py +++ b/packages/populace-data/src/populace/data/__init__.py @@ -35,6 +35,14 @@ resolve, ) from populace.data.registry import REGISTRY, DatasetSpec, register +from populace.data.release import ( + LATEST_POINTER_PATH, + LATEST_POINTER_SCHEMA_VERSION, + LatestPointer, + latest_pointer_payload, + latest_release, + publish_release, +) __all__ = [ "load", @@ -49,6 +57,12 @@ "REQUIRED_RELEASE_FILES", "ReleaseContractError", "validate_release_dir", + "LATEST_POINTER_PATH", + "LATEST_POINTER_SCHEMA_VERSION", + "LatestPointer", + "latest_pointer_payload", + "latest_release", + "publish_release", ] __version__ = "0.1.0" diff --git a/packages/populace-data/src/populace/data/release.py b/packages/populace-data/src/populace/data/release.py new file mode 100644 index 0000000..4aa4b30 --- /dev/null +++ b/packages/populace-data/src/populace/data/release.py @@ -0,0 +1,212 @@ +"""Publish a release and point ``latest.json`` at it. + +The Hub repo publishes builds under ``releases/<build_id>/``, but nothing +identified which release is *current*: a consumer had to list the tree and +guess, and because build ids end in a date (``populace-us-2024-9f1260b- +20260611``), two builds published the same day have no defined ordering. +``latest.json`` at the repo root is that missing pointer — a tiny manifest +naming the current release and the path of each of its contract files. + +Two sides of the pointer live here: + +- :func:`publish_release` is the producer: it validates the local release + directory against the :mod:`release contract <populace.data.contract>` + (a release that fails the contract refuses to publish), uploads its + files, and uploads ``latest.json`` **last** — so a reader never sees the + pointer before the release it points at.
@dataclass(frozen=True)
class LatestPointer:
    """Immutable, typed view of a parsed ``latest.json`` pointer.

    It answers "which release is current, and where are its files?".

    Attributes:
        release_id: Build id of the current release, i.e. the name of its
            directory under ``releases/`` in the dataset repo.
        updated_at: ISO-8601 UTC timestamp recording when the pointer was
            written.
        paths: Maps each contract file's stem (``"build_manifest"``,
            ``"release_manifest"``,
            ``"sound_ecps_replacement_comparison"``) to that file's
            repo-relative path.
    """

    release_id: str
    updated_at: str
    paths: dict[str, str]
def publish_release(
    release_dir: Path | str,
    repo_id: str,
    *,
    api=None,
    extra_files: tuple[str, ...] = (),
    updated_at: str | None = None,
) -> dict:
    """Upload a release directory and point ``latest.json`` at it.

    The order is the guarantee: the release contract is validated first (an
    invalid release never reaches the Hub), every file to be uploaded is
    checked for existence next (so a missing extra file cannot leave a
    half-uploaded release behind), the release files are then uploaded, and
    the pointer goes up **last**, so a consumer that reads ``latest.json``
    always finds the files it names.

    Args:
        release_dir: Local release directory; its final path component is
            used as the release id and as the directory name under
            ``releases/`` in the repo.
        repo_id: Hub dataset repo, e.g. ``"policyengine/populace-us"``.
        api: A ``huggingface_hub.HfApi``-shaped object (anything with
            ``upload_file(path_or_fileobj=, path_in_repo=, repo_id=,
            repo_type=)``); constructed lazily when omitted.
        extra_files: Additional filenames in ``release_dir`` to upload
            beyond the contract files (e.g. a diagnostics artifact).
            Names already covered by the contract, and repeated names, are
            uploaded only once. A single bare string is treated as one
            filename.
        updated_at: Pointer timestamp; defaults to now (UTC).

    Returns:
        The ``latest.json`` payload that was published.

    Raises:
        ReleaseContractError: If the release directory violates the
            contract. Nothing is uploaded in that case.
        FileNotFoundError: If an ``extra_files`` entry does not exist.
            Nothing is uploaded in that case either.
    """
    release_dir = Path(release_dir)
    validate_release_dir(release_dir)
    release_id = release_dir.name

    # A bare string would otherwise be iterated character by character.
    if isinstance(extra_files, str):
        extra_files = (extra_files,)

    # Contract files first, then the extras; dict.fromkeys keeps first-seen
    # order while dropping any repeated name so no file is uploaded twice.
    filenames = list(dict.fromkeys([*REQUIRED_RELEASE_FILES, *extra_files]))

    # Check everything up front: failing in the middle of the upload loop
    # would leave a partially published release on the Hub.
    for filename in filenames:
        if not (release_dir / filename).is_file():
            raise FileNotFoundError(
                f"extra release file {filename!r} not found in {release_dir}."
            )

    if api is None:
        api = _hf_api()

    for filename in filenames:
        api.upload_file(
            path_or_fileobj=str(release_dir / filename),
            path_in_repo=f"releases/{release_id}/{filename}",
            repo_id=repo_id,
            repo_type="dataset",
        )

    # The pointer is written only after every file it names is in place.
    payload = latest_pointer_payload(release_id, updated_at=updated_at)
    api.upload_file(
        path_or_fileobj=json.dumps(payload, indent=1).encode(),
        path_in_repo=LATEST_POINTER_PATH,
        repo_id=repo_id,
        repo_type="dataset",
    )
    return payload
+ ) + return LatestPointer( + release_id=str(release_id), + updated_at=str(payload.get("updated_at", "")), + paths={str(k): str(v) for k, v in (payload.get("paths") or {}).items()}, + ) diff --git a/packages/populace-data/tests/test_release.py b/packages/populace-data/tests/test_release.py new file mode 100644 index 0000000..3dda64e --- /dev/null +++ b/packages/populace-data/tests/test_release.py @@ -0,0 +1,191 @@ +"""Publishing behavior: contract-gated uploads and a last-written pointer. + +The fake Hub client records every upload in order, so the suite asserts the +real guarantees — an invalid release uploads nothing, and ``latest.json`` +lands strictly after the files it points at — rather than implementation +details. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from populace.data import ReleaseContractError +from populace.data.contract import REQUIRED_RELEASE_FILES +from populace.data.release import ( + LATEST_POINTER_PATH, + LATEST_POINTER_SCHEMA_VERSION, + latest_pointer_payload, + latest_release, + publish_release, +) + +RELEASE_ID = "populace-us-2024-9f1260b-20260611" + + +class FakeHub: + """Records uploads in order; serves downloads from what was uploaded.""" + + def __init__(self) -> None: + self.uploads: list[tuple[str, bytes]] = [] + + def upload_file( + self, *, path_or_fileobj, path_in_repo, repo_id, repo_type + ) -> None: + assert repo_type == "dataset" + assert repo_id == "policyengine/populace-us" + if isinstance(path_or_fileobj, bytes): + content = path_or_fileobj + else: + content = Path(path_or_fileobj).read_bytes() + self.uploads.append((path_in_repo, content)) + + def hf_hub_download(self, *, repo_id, filename, repo_type) -> str: + assert repo_type == "dataset" + for path_in_repo, content in reversed(self.uploads): + if path_in_repo == filename: + local = self._download_dir / filename + local.parent.mkdir(parents=True, exist_ok=True) + local.write_bytes(content) + return str(local) + 
raise FileNotFoundError(filename) + + +@pytest.fixture +def hub(tmp_path: Path) -> FakeHub: + fake = FakeHub() + fake._download_dir = tmp_path / "hub-cache" + return fake + + +@pytest.fixture +def release_dir(tmp_path: Path) -> Path: + directory = tmp_path / "releases" / RELEASE_ID + directory.mkdir(parents=True) + (directory / "build_manifest.json").write_text( + json.dumps( + { + "build_id": RELEASE_ID, + "dataset": {"filename": "populace_us_2024.h5", "sha256": "dc"}, + "gates": {"parity_gaps": 0}, + } + ) + ) + (directory / "release_manifest.json").write_text( + json.dumps( + { + "schema_version": 1, + "build": {"build_id": RELEASE_ID}, + "artifacts": { + "populace_us_2024": { + "path": "populace_us_2024.h5", + "repo_id": "policyengine/populace-us", + "sha256": "dc", + } + }, + } + ) + ) + (directory / "sound_ecps_replacement_comparison.json").write_text("{}") + return directory + + +def test_pointer_payload_names_every_contract_file() -> None: + payload = latest_pointer_payload(RELEASE_ID, updated_at="2026-06-11T13:53:15+00:00") + assert payload["schema_version"] == LATEST_POINTER_SCHEMA_VERSION + assert payload["release_id"] == RELEASE_ID + assert set(payload["paths"]) == { + name.removesuffix(".json") for name in REQUIRED_RELEASE_FILES + } + assert ( + payload["paths"]["build_manifest"] + == f"releases/{RELEASE_ID}/build_manifest.json" + ) + + +def test_publish_uploads_pointer_last(hub: FakeHub, release_dir: Path) -> None: + publish_release( + release_dir, + "policyengine/populace-us", + api=hub, + updated_at="2026-06-11T13:53:15+00:00", + ) + uploaded_paths = [path for path, _ in hub.uploads] + assert uploaded_paths[-1] == LATEST_POINTER_PATH + for filename in REQUIRED_RELEASE_FILES: + assert f"releases/{RELEASE_ID}/{filename}" in uploaded_paths[:-1] + + +def test_invalid_release_uploads_nothing(hub: FakeHub, release_dir: Path) -> None: + (release_dir / "build_manifest.json").unlink() + with pytest.raises(ReleaseContractError): + publish_release(release_dir, 
"policyengine/populace-us", api=hub) + assert hub.uploads == [] + + +def test_extra_files_ride_along_before_the_pointer( + hub: FakeHub, release_dir: Path +) -> None: + (release_dir / "calibration_diagnostics.json").write_text("{}") + publish_release( + release_dir, + "policyengine/populace-us", + api=hub, + extra_files=("calibration_diagnostics.json",), + ) + uploaded_paths = [path for path, _ in hub.uploads] + extra = f"releases/{RELEASE_ID}/calibration_diagnostics.json" + assert extra in uploaded_paths + assert uploaded_paths.index(extra) < uploaded_paths.index(LATEST_POINTER_PATH) + + +def test_missing_extra_file_fails_loudly(hub: FakeHub, release_dir: Path) -> None: + with pytest.raises(FileNotFoundError, match="calibration_diagnostics"): + publish_release( + release_dir, + "policyengine/populace-us", + api=hub, + extra_files=("calibration_diagnostics.json",), + ) + + +def test_publish_then_resolve_round_trips( + hub: FakeHub, release_dir: Path +) -> None: + published = publish_release( + release_dir, + "policyengine/populace-us", + api=hub, + updated_at="2026-06-11T13:53:15+00:00", + ) + pointer = latest_release("policyengine/populace-us", api=hub) + assert pointer.release_id == RELEASE_ID + assert pointer.updated_at == "2026-06-11T13:53:15+00:00" + assert pointer.paths == published["paths"] + + +def test_future_pointer_schema_is_refused(hub: FakeHub) -> None: + hub.uploads.append( + ( + LATEST_POINTER_PATH, + json.dumps( + {"schema_version": LATEST_POINTER_SCHEMA_VERSION + 1} + ).encode(), + ) + ) + with pytest.raises(ValueError, match="Upgrade populace-data"): + latest_release("policyengine/populace-us", api=hub) + + +def test_pointer_without_release_id_is_refused(hub: FakeHub) -> None: + hub.uploads.append( + ( + LATEST_POINTER_PATH, + json.dumps({"schema_version": LATEST_POINTER_SCHEMA_VERSION}).encode(), + ) + ) + with pytest.raises(ValueError, match="release_id"): + latest_release("policyengine/populace-us", api=hub)