Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/populace-us-20260616.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Certify the Populace-US data release `populace-us-2024-a912aea-76666318a202-20260616T175345Z` and keep data-only release refreshes aligned with release-scoped diagnostics artifacts.
48 changes: 30 additions & 18 deletions src/policyengine/data/release_manifests/us.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,66 @@
"bundle_id": "us-4.17.4",
"certification": {
"built_with_model_version": "1.729.0",
"certified_by": "policyengine.py certification",
"certified_by": "populace-data release manifest",
"certified_for_model_version": "1.729.0",
"compatibility_basis": "built_with_model_package",
"data_build_id": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z"
"compatibility_basis": "exact_build_model_version",
"data_build_id": "populace-us-2024-a912aea-76666318a202-20260616T175345Z"
},
"certified_data_artifact": {
"build_id": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"build_id": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"data_package": {
"name": "populace-data",
"version": "0.1.0"
},
"dataset": "populace_us_2024",
"sha256": "a912aea0ca0c27bb5225516bf859d4937cad0579a2f3971d7306086bc61cbfd7",
"uri": "hf://policyengine/populace-us/populace_us_2024.h5@populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z"
"sha256": "9d87c7ff370be524e73aaf68d151b00846eefcae4b00a63760102e2c6f285f92",
"uri": "hf://policyengine/populace-us/populace_us_2024.h5@c4e2fd454ddce0e1889ab77abff178a7bdd72b18"
},
"country_id": "us",
"data_package": {
"name": "populace-data",
"release_manifest_path": "releases/populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z/release_manifest.json",
"release_manifest_revision": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"release_manifest_path": "releases/populace-us-2024-a912aea-76666318a202-20260616T175345Z/release_manifest.json",
"release_manifest_revision": "c4e2fd454ddce0e1889ab77abff178a7bdd72b18",
"repo_id": "policyengine/populace-us",
"repo_type": "dataset",
"version": "0.1.0"
},
"datasets": {
"calibration_diagnostics": {
"path": "releases/populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z/calibration_diagnostics.json",
"path": "releases/populace-us-2024-a912aea-76666318a202-20260616T175345Z/calibration_diagnostics.json",
"repo_id": "policyengine/populace-us",
"revision": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"sha256": "4c363816bce3decabb392c6d61002420fd60de85beec30022c3d81684765763b"
"revision": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"sha256": "154a1b217211d92c50e0fb84750888920cf8a63afcf9437efa85e484a7d501c9"
},
"populace_us_2024": {
"path": "populace_us_2024.h5",
"repo_id": "policyengine/populace-us",
"revision": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"sha256": "a912aea0ca0c27bb5225516bf859d4937cad0579a2f3971d7306086bc61cbfd7"
"revision": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"sha256": "9d87c7ff370be524e73aaf68d151b00846eefcae4b00a63760102e2c6f285f92"
},
"populace_us_2024_calibration": {
"path": "populace_us_2024_calibration.npz",
"repo_id": "policyengine/populace-us",
"revision": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"sha256": "f701d36341c5d6c39862acef14db074027bdcd289d3523571020c94892f33ccb"
"revision": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"sha256": "0679dd35dbb198164beee6d56626af2b2fb57d6a3b6ea6511daf908e66296175"
},
"us_source_coverage": {
"path": "releases/populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z/us_source_coverage.json",
"path": "releases/populace-us-2024-a912aea-76666318a202-20260616T175345Z/us_source_coverage.json",
"repo_id": "policyengine/populace-us",
"revision": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"sha256": "66c3749e39d324ae892b3482341468b6205d06e69140c19b7c5136f69e0cb0b3"
"revision": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"sha256": "233a87ccc1c1eb8ed95321b7ebe586cd483e4e5af37686e182803f3b88edc76d"
},
"reform_validation": {
"path": "releases/populace-us-2024-a912aea-76666318a202-20260616T175345Z/reform_validation.json",
"revision": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"repo_id": "policyengine/populace-us",
"sha256": "266851a23595eb832fdc3a88453fd60dfc12cd258e557b4ae7fd92b19eeb4f9e"
},
"demographics": {
"path": "releases/populace-us-2024-a912aea-76666318a202-20260616T175345Z/demographics.json",
"revision": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"repo_id": "policyengine/populace-us",
"sha256": "073b9edf5a0594ab9e24c8aaae2a981a09ae510869a3f6c6cec3bb5500914897"
}
},
"default_dataset": "populace_us_2024",
Expand Down
29 changes: 13 additions & 16 deletions src/policyengine/data/release_manifests/us.trace.tro.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"schema:name": "PolicyEngine",
"schema:url": "https://policyengine.org"
},
"schema:dateCreated": "2026-06-16T13:15:11.179501+00:00",
"schema:dateCreated": "2026-06-16T18:48:19.511357+00:00",
"schema:description": "TRACE TRO for certified runtime bundle us-4.17.4 covering the bundle manifest, the certified dataset artifact, the country model wheel, and the country data release manifest when it is available.",
"schema:name": "policyengine us certified bundle TRO",
"trov:createdWith": {
Expand Down Expand Up @@ -45,15 +45,15 @@
"trov:hasArtifact": {
"@id": "composition/1/artifact/data_release_manifest"
},
"trov:hasLocation": "https://huggingface.co/datasets/policyengine/populace-us/resolve/populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z/releases/populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z/release_manifest.json"
"trov:hasLocation": "https://huggingface.co/datasets/policyengine/populace-us/resolve/c4e2fd454ddce0e1889ab77abff178a7bdd72b18/releases/populace-us-2024-a912aea-76666318a202-20260616T175345Z/release_manifest.json"
},
{
"@id": "arrangement/1/location/dataset",
"@type": "trov:ArtifactLocation",
"trov:hasArtifact": {
"@id": "composition/1/artifact/dataset"
},
"trov:hasLocation": "https://huggingface.co/policyengine/populace-us/resolve/populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z/populace_us_2024.h5"
"trov:hasLocation": "https://huggingface.co/datasets/policyengine/populace-us/resolve/c4e2fd454ddce0e1889ab77abff178a7bdd72b18/populace_us_2024.h5"
},
{
"@id": "arrangement/1/location/model_wheel",
Expand All @@ -75,21 +75,21 @@
"@type": "trov:ResearchArtifact",
"schema:name": "policyengine.py bundle manifest for us",
"trov:mimeType": "application/json",
"trov:sha256": "e9f9542464d4a73257b0a926463676122ac860ec5e217393d5c043bbc3185444"
"trov:sha256": "b7546cfb871cebc6a4ded1c5f6e1c5075e018577ccef92f8b9a32ed7faf0756e"
},
{
"@id": "composition/1/artifact/data_release_manifest",
"@type": "trov:ResearchArtifact",
"schema:name": "populace-data release manifest 0.1.0",
"trov:mimeType": "application/json",
"trov:sha256": "d2506e739a20ab4cdb8a709a93fb9c766e36566028e36b946e5f2f7d6714c061"
"trov:sha256": "b50925148419218dec45800baa1c3c683b966a622a8cb281e49410b65c92d099"
},
{
"@id": "composition/1/artifact/dataset",
"@type": "trov:ResearchArtifact",
"schema:name": "populace_us_2024",
"trov:mimeType": "application/x-hdf5",
"trov:sha256": "a912aea0ca0c27bb5225516bf859d4937cad0579a2f3971d7306086bc61cbfd7"
"trov:sha256": "9d87c7ff370be524e73aaf68d151b00846eefcae4b00a63760102e2c6f285f92"
},
{
"@id": "composition/1/artifact/model_wheel",
Expand All @@ -102,26 +102,23 @@
"trov:hasFingerprint": {
"@id": "composition/1/fingerprint",
"@type": "trov:CompositionFingerprint",
"trov:sha256": "9ec19a4d5c7074985ba476ebf0f7dff1eb4edb096a7e69857fa53f3c05bed2b5"
"trov:sha256": "575393c273989d893e6f1534b730f17896f652fcdb54f7f2057421794d0dd31b"
}
},
"trov:hasPerformance": {
"@id": "trp/1",
"@type": "trov:TransparentResearchPerformance",
"pe:builtWithModelVersion": "1.729.0",
"pe:certifiedBy": "policyengine.py certification",
"pe:certifiedBy": "populace-data release manifest",
"pe:certifiedForModelVersion": "1.729.0",
"pe:ciGitRef": "refs/heads/main",
"pe:ciGitSha": "e27453d8e0532d2394896570b7a1383f96930c69",
"pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/27622227038",
"pe:compatibilityBasis": "built_with_model_package",
"pe:dataBuildId": "populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z",
"pe:emittedIn": "github-actions",
"rdfs:comment": "Certification of build populace-us-2024-f32c2e5-0c38bc47db89-20260616T124451Z for policyengine-us 1.729.0.",
"pe:compatibilityBasis": "exact_build_model_version",
"pe:dataBuildId": "populace-us-2024-a912aea-76666318a202-20260616T175345Z",
"pe:emittedIn": "local",
"rdfs:comment": "Certification of build populace-us-2024-a912aea-76666318a202-20260616T175345Z for policyengine-us 1.729.0.",
"trov:accessedArrangement": {
"@id": "arrangement/1"
},
"trov:startedAtTime": "2026-06-16T13:15:11.179501+00:00",
"trov:startedAtTime": "2026-06-16T18:48:19.511357+00:00",
"trov:wasConductedBy": {
"@id": "trs"
}
Expand Down
114 changes: 90 additions & 24 deletions src/policyengine/provenance/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import hashlib
import json
import os
import posixpath
import re
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -258,6 +259,35 @@ def _metadata_sidecar_path(path: str) -> str:
return f"{path}.metadata.json"


def _release_scoped_artifact_path(
artifact: dict,
*,
release_manifest_path: str | None,
data_repo_id: str,
) -> str:
"""Return the dereferenceable country-manifest path for a release artifact.

Populace release manifests describe diagnostics relative to the release
directory, while the HF files are published under ``releases/{id}/``.
Runtime manifests must store the dereferenceable path.
"""
path = artifact.get("path", "")
if not path:
return path
if not release_manifest_path:
return path
release_dir = posixpath.dirname(release_manifest_path)
if (
release_dir
and release_dir != "."
and artifact.get("kind") == "diagnostics"
and artifact.get("repo_id") == data_repo_id
and not path.startswith(f"{release_dir}/")
):
return f"{release_dir}/{path}"
return path


def _specifier_matches(*, version: str, specifier: str) -> bool:
try:
return Version(version) in SpecifierSet(specifier)
Expand Down Expand Up @@ -324,6 +354,9 @@ def _release_manifest_compatibility_basis(
def _refresh_dataset_path_references_from_data_release(
manifest_json: dict,
release_manifest_json: dict,
*,
release_manifest_path: str | None = None,
data_repo_id: str,
) -> None:
"""Refresh bundled dataset hash pins from a data release manifest.

Expand All @@ -332,58 +365,89 @@ def _refresh_dataset_path_references_from_data_release(
``datasets``; notably the US long-term bundle stores one entry per year with
both H5 and metadata-sidecar hashes.
"""
for path_reference in manifest_json.get("datasets", {}).values():
path = path_reference.get("path")
if not path:
continue
if path_reference.get("revision"):
continue
artifact = _release_artifact_by_path(release_manifest_json, path)
if artifact is None:
if "sha256" in path_reference or "metadata_sha256" in path_reference:
raise ValueError(
"Data release manifest is missing dataset artifact "
f"for existing pinned path {path!r}; refusing to leave "
"stale dataset hash pins in place."
)
continue
if artifact.get("path"):
path_reference["path"] = artifact["path"]
path = artifact["path"]
datasets = manifest_json.setdefault("datasets", {})
release_artifacts = release_manifest_json.get("artifacts", {})

def update_reference_from_artifact(path_reference: dict, artifact: dict) -> None:
raw_path = artifact.get("path")
if raw_path:
path_reference["path"] = _release_scoped_artifact_path(
artifact,
release_manifest_path=release_manifest_path,
data_repo_id=data_repo_id,
)
if artifact.get("revision"):
path_reference["revision"] = artifact["revision"]
if artifact.get("repo_id"):
path_reference["repo_id"] = artifact["repo_id"]
if artifact.get("repo_type"):
path_reference["repo_type"] = artifact["repo_type"]

dataset_sha256 = artifact.get("sha256")
if dataset_sha256:
path_reference["sha256"] = dataset_sha256
elif "sha256" in path_reference:
raise ValueError(
"Data release manifest dataset artifact lacks sha256 "
f"for existing pinned path {path!r}; refusing to leave "
f"for existing pinned path {raw_path!r}; refusing to leave "
"stale dataset hash pin in place."
)

if not raw_path:
return
metadata_artifact = _release_artifact_by_path(
release_manifest_json,
_metadata_sidecar_path(path),
_metadata_sidecar_path(raw_path),
)
had_metadata_pin = "metadata_sha256" in path_reference
if metadata_artifact is None:
if had_metadata_pin:
raise ValueError(
"Data release manifest is missing metadata sidecar artifact "
f"for {path!r}; refusing to drop existing metadata hash pin."
f"for {raw_path!r}; refusing to drop existing metadata hash pin."
)
path_reference.pop("metadata_sha256", None)
continue
return
metadata_sha256 = metadata_artifact.get("sha256")
if not metadata_sha256:
if had_metadata_pin:
raise ValueError(
"Data release manifest metadata sidecar artifact lacks sha256 "
f"for {path!r}; refusing to drop existing metadata hash pin."
f"for {raw_path!r}; refusing to drop existing metadata hash pin."
)
path_reference.pop("metadata_sha256", None)
continue
return
path_reference["metadata_sha256"] = metadata_sha256

for name, path_reference in datasets.items():
named_artifact = release_artifacts.get(name)
if named_artifact is not None:
update_reference_from_artifact(path_reference, named_artifact)
continue

path = path_reference.get("path")
if not path:
continue
if path_reference.get("revision"):
continue
artifact = _release_artifact_by_path(release_manifest_json, path)
if artifact is None:
if "sha256" in path_reference or "metadata_sha256" in path_reference:
raise ValueError(
"Data release manifest is missing dataset artifact "
f"for existing pinned path {path!r}; refusing to leave "
"stale dataset hash pins in place."
)
continue
update_reference_from_artifact(path_reference, artifact)

for name, artifact in release_artifacts.items():
if name in datasets:
continue
path_reference: dict = {}
update_reference_from_artifact(path_reference, artifact)
datasets[name] = path_reference


# ---------------------------------------------------------------------------
# Refresh result
Expand Down Expand Up @@ -657,6 +721,8 @@ def refresh_release_bundle(
_refresh_dataset_path_references_from_data_release(
manifest_json,
release_manifest_json,
release_manifest_path=new_release_manifest_path,
data_repo_id=repo_id,
)

manifest_path.write_text(
Expand Down
11 changes: 9 additions & 2 deletions src/policyengine/provenance/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,16 @@ def build_hf_uri(repo_id: str, path_in_repo: str, revision: str) -> str:
return f"hf://{repo_id}/{path_in_repo}@{revision}"


def https_dataset_uri(repo_id: str, path_in_repo: str, revision: str) -> str:
def https_dataset_uri(
repo_id: str,
path_in_repo: str,
revision: str,
*,
repo_type: str = "model",
) -> str:
"""Return a dereferenceable HTTPS URI for a Hugging Face dataset artifact."""
return f"https://huggingface.co/{repo_id}/resolve/{revision}/{path_in_repo}"
prefix = "datasets/" if repo_type == "dataset" else ""
return f"https://huggingface.co/{prefix}{repo_id}/resolve/{revision}/{path_in_repo}"


def _artifact_revision(data_package: "DataPackageVersion") -> str:
Expand Down
8 changes: 6 additions & 2 deletions src/policyengine/provenance/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def _artifact_mime_type(path_or_uri: str) -> Optional[str]:
return _MIME_TYPES.get(suffix)


def _dataset_location_from_uri(uri: str) -> str:
def _dataset_location_from_uri(uri: str, *, repo_type: str = "model") -> str:
if not uri.startswith("hf://"):
return uri

Expand All @@ -85,6 +85,7 @@ def _dataset_location_from_uri(uri: str) -> str:
repo_id=repo_id,
path_in_repo=parts[2],
revision=revision,
repo_type=repo_type,
)


Expand Down Expand Up @@ -355,7 +356,10 @@ def build_trace_tro_from_release_bundle(
if data_release_manifest is not None
else None
)
dataset_location = _dataset_location_from_uri(certified_artifact.uri)
dataset_location = _dataset_location_from_uri(
certified_artifact.uri,
repo_type=country_manifest.data_package.repo_type,
)

bundle_manifest_hash = hashlib.sha256(
canonical_json_bytes(country_manifest.model_dump(mode="json"))
Expand Down
Loading
Loading