Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/415.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
`managed_microsimulation` and `resolve_managed_dataset_reference` now accept a local filesystem path to a dataset when `allow_unmanaged=True`. Local build-and-score pipelines can run their own Stage-output artifacts (H5 files that are not part of any release manifest) through the managed wrapper, instead of constructing country-package simulations directly. Passing a local path without `allow_unmanaged=True` raises an actionable error rather than the generic "Unknown dataset" message.
26 changes: 23 additions & 3 deletions src/policyengine/provenance/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,9 +467,12 @@ def resolve_managed_dataset_reference(

- omit `dataset` to use the certified default dataset for the bundle
- pass a logical dataset name present in the bundled/data-release manifests
- pass a local filesystem path with `allow_unmanaged=True` to run on a
build artifact the caller owns (e.g. a downstream pipeline's Stage-output
H5 that is not part of any release manifest)

Direct URLs or raw Hugging Face references are treated as unmanaged unless
`allow_unmanaged=True` is set explicitly.
Direct URLs, raw Hugging Face references, and local filesystem paths are
treated as unmanaged unless `allow_unmanaged=True` is set explicitly.
"""

manifest = get_release_manifest(country_id)
Expand All @@ -488,7 +491,24 @@ def resolve_managed_dataset_reference(
"bypass bundle enforcement."
)

return resolve_dataset_reference(country_id, dataset)
try:
return resolve_dataset_reference(country_id, dataset)
except ValueError:
# Not a managed dataset name. A local build artifact (e.g. a
# Stage-output H5 from a downstream pipeline) is unmanaged but
# legitimate; accept it under the same explicit opt-in as remote URIs
# so local build-and-score workflows need not bypass this wrapper.
candidate = Path(dataset).expanduser()
if candidate.is_file():
if allow_unmanaged:
return str(candidate)
raise ValueError(
f"Dataset {dataset!r} is a local file outside the "
"policyengine.py release bundle. Pass `allow_unmanaged=True` to "
"run a simulation on a local build artifact; its model-version "
"pairing is then the caller's responsibility."
) from None
raise


def resolve_local_managed_dataset_source(
Expand Down
29 changes: 29 additions & 0 deletions tests/test_release_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,35 @@ def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self):
== dataset
)

def test__given_local_path__then_managed_resolution_requires_opt_in(self, tmp_path):
    """A local dataset file is rejected unless the caller opts in.

    Creates an empty file under pytest's temporary directory and resolves
    it without `allow_unmanaged=True`; the call must raise a `ValueError`
    whose message identifies the path as a local file outside the bundle.
    """
    artifact = tmp_path / "2026.h5"
    artifact.write_bytes(b"")

    # Capture the error first so the "nothing was raised" case and the
    # message check are reported as two distinct failures.
    caught = None
    try:
        resolve_managed_dataset_reference("us", str(artifact))
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected local dataset path to require opt-in")
    assert "local file outside the policyengine.py release bundle" in str(caught)

def test__given_local_path_with_opt_in__then_resolves_to_that_path(self, tmp_path):
    """With `allow_unmanaged=True`, a local dataset file resolves to its own path."""
    artifact = tmp_path / "2026.h5"
    artifact.write_bytes(b"")
    expected = str(artifact)

    resolved = resolve_managed_dataset_reference(
        "us",
        expected,
        allow_unmanaged=True,
    )

    assert resolved == expected

def test__given_unknown_dataset_name__then_raises_unknown_dataset(self):
    """A name that is neither a managed dataset nor a local file is rejected.

    The `ValueError` must carry the "Unknown dataset" message rather than
    the local-file opt-in message.
    """
    caught = None
    try:
        resolve_managed_dataset_reference("us", "not_a_real_dataset_name")
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected unknown dataset name to be rejected")
    assert "Unknown dataset" in str(caught)

def test__given_versioned_dataset_url__then_logical_name_drops_version(self):
dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0"

Expand Down
Loading