diff --git a/changelog.d/415.added.md b/changelog.d/415.added.md new file mode 100644 index 00000000..ba0680e8 --- /dev/null +++ b/changelog.d/415.added.md @@ -0,0 +1 @@ +`managed_microsimulation` and `resolve_managed_dataset_reference` now accept a local filesystem path to a dataset when `allow_unmanaged=True`. Local build-and-score pipelines can run their own Stage-output artifacts (H5 files that are not part of any release manifest) through the managed wrapper, instead of constructing country-package simulations directly. Passing a local path without `allow_unmanaged=True` raises an actionable error rather than the generic "Unknown dataset" message. diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index 3d44a11e..065c4e85 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -467,9 +467,12 @@ def resolve_managed_dataset_reference( - omit `dataset` to use the certified default dataset for the bundle - pass a logical dataset name present in the bundled/data-release manifests + - pass a local filesystem path with `allow_unmanaged=True` to run on a + build artifact the caller owns (e.g. a downstream pipeline's Stage-output + H5 that is not part of any release manifest) - Direct URLs or raw Hugging Face references are treated as unmanaged unless - `allow_unmanaged=True` is set explicitly. + Direct URLs, raw Hugging Face references, and local filesystem paths are + treated as unmanaged unless `allow_unmanaged=True` is set explicitly. """ manifest = get_release_manifest(country_id) @@ -488,7 +491,24 @@ def resolve_managed_dataset_reference( "bypass bundle enforcement." ) - return resolve_dataset_reference(country_id, dataset) + try: + return resolve_dataset_reference(country_id, dataset) + except ValueError: + # Not a managed dataset name. A local build artifact (e.g. a + # Stage-output H5 from a downstream pipeline) is unmanaged but + # legitimate; accept it under the same explicit opt-in as remote URIs + # so local build-and-score workflows need not bypass this wrapper. + candidate = Path(dataset).expanduser() + if candidate.is_file(): + if allow_unmanaged: + return str(candidate) + raise ValueError( + f"Dataset {dataset!r} is a local file outside the " + "policyengine.py release bundle. Pass `allow_unmanaged=True` to " + "run a simulation on a local build artifact; its model-version " + "pairing is then the caller's responsibility." + ) from None + raise def resolve_local_managed_dataset_source( diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index e9de59f4..d025a0c1 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -212,6 +212,35 @@ def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self): == dataset ) + def test__given_local_path__then_managed_resolution_requires_opt_in(self, tmp_path): + local_dataset = tmp_path / "2026.h5" + local_dataset.write_bytes(b"") + + try: + resolve_managed_dataset_reference("us", str(local_dataset)) + except ValueError as error: + assert "local file outside the policyengine.py release bundle" in str(error) + else: + raise AssertionError("Expected local dataset path to require opt-in") + + def test__given_local_path_with_opt_in__then_resolves_to_that_path(self, tmp_path): + local_dataset = tmp_path / "2026.h5" + local_dataset.write_bytes(b"") + + assert resolve_managed_dataset_reference( + "us", + str(local_dataset), + allow_unmanaged=True, + ) == str(local_dataset) + + def test__given_unknown_dataset_name__then_raises_unknown_dataset(self): + try: + resolve_managed_dataset_reference("us", "not_a_real_dataset_name") + except ValueError as error: + assert "Unknown dataset" in str(error) + else: + raise AssertionError("Expected unknown dataset name to be rejected") + def test__given_versioned_dataset_url__then_logical_name_drops_version(self): dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0"