diff --git a/.gitignore b/.gitignore index e9e814be967..d0c9b065baf 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ venv/** oryx-build-commands.txt **/*.csv.gz +!policyengine_us/data/county_fips_2020.csv.gz **/*.pkl # Claude Code temporary files diff --git a/changelog.d/8650.fixed.md b/changelog.d/8650.fixed.md new file mode 100644 index 00000000000..01c6df7ff60 --- /dev/null +++ b/changelog.d/8650.fixed.md @@ -0,0 +1 @@ +Bundle the county FIPS dataset so county lookups do not require live Hugging Face access. diff --git a/policyengine_us/data/county_fips_2020.csv.gz b/policyengine_us/data/county_fips_2020.csv.gz new file mode 100644 index 00000000000..b8cd43c62ff Binary files /dev/null and b/policyengine_us/data/county_fips_2020.csv.gz differ diff --git a/policyengine_us/tests/utilities/test_load_county_fips_dataset.py b/policyengine_us/tests/utilities/test_load_county_fips_dataset.py index 4b4620f0204..102f18b7f05 100644 --- a/policyengine_us/tests/utilities/test_load_county_fips_dataset.py +++ b/policyengine_us/tests/utilities/test_load_county_fips_dataset.py @@ -1,11 +1,14 @@ -from policyengine_core.tools.hugging_face import download_huggingface_dataset +import gzip +from pathlib import Path + +import pandas as pd +import pytest + +from policyengine_us.tools.geography import county_helpers from policyengine_us.tools.geography.county_helpers import ( + COUNTY_FIPS_DATASET_FILENAME, load_county_fips_dataset, ) -from pathlib import Path -import pytest -import pandas as pd -import gzip @pytest.fixture @@ -33,114 +36,57 @@ def mock_dataset_file(tmp_fips_dir) -> Path: ) # Save as gzipped CSV - test_file_path = tmp_fips_dir / "county_fips_2020.csv.gz" + test_file_path = tmp_fips_dir / COUNTY_FIPS_DATASET_FILENAME with gzip.open(test_file_path, "wb") as f: test_data.to_csv(f, index=False, encoding="utf-8") return test_file_path -def mock_download_huggingface_dataset_success(filepath): - def _mock(*args, **kwargs): - return filepath - - return _mock - - -def mock_download_huggingface_dataset_failure(filepath): - def _mock(*args, **kwargs): - raise Exception("Download failed") - - return _mock - - -class TestCountyFIPSDatasetFile: - """ - Test that the county FIPS dataset file exists and downloads properly. - """ - - HUGGINGFACE_REPO = "policyengine/policyengine-us-data" - COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz" - - def test_when_downloading_county_fips__download_is_successful(self, tmp_fips_dir): - download_huggingface_dataset( - repo=self.HUGGINGFACE_REPO, - repo_filename=self.COUNTY_FIPS_DATASET_FILENAME, - version=None, - local_dir=tmp_fips_dir, - ) - - TMP_FILE = tmp_fips_dir / self.COUNTY_FIPS_DATASET_FILENAME - assert TMP_FILE.is_file() - - def test_when_downloading_and_parsing_county_fips__result_is_correct( - self, tmp_fips_dir - ): - download_huggingface_dataset( - repo=self.HUGGINGFACE_REPO, - repo_filename=self.COUNTY_FIPS_DATASET_FILENAME, - version=None, - local_dir=tmp_fips_dir, - ) - - TMP_FILE = tmp_fips_dir / self.COUNTY_FIPS_DATASET_FILENAME - - df = pd.read_csv( - TMP_FILE, - compression="gzip", - dtype={"county_fips": str}, - encoding="utf-8", - nrows=5, # Just read a few rows - ) - - assert "county_fips" in df.columns - assert len(df) > 0 - - # Check FIPS codes are properly preserved as strings - assert all(isinstance(fips, str) for fips in df["county_fips"]) - - class TestLoadCountyFIPSDataset: """ Test that the load_county_fips_dataset function works correctly. """ - def test_when_func_is_run__correctly__returns_dataframe( + def test_when_local_data_file_exists__returns_local_dataframe( self, mock_dataset_file, monkeypatch ): """ - Test that the load_county_fips_dataset function returns a DataFrame with the correct columns. + Test that the load_county_fips_dataset function reads a local data file when present. """ - # Apply the mock monkeypatch.setattr( - "policyengine_us.tools.geography.county_helpers.download_huggingface_dataset", - mock_download_huggingface_dataset_success(mock_dataset_file), + county_helpers, + "DATA_FOLDER", + mock_dataset_file.parent, ) result = load_county_fips_dataset() - # Verify the result is a pandas DataFrame with expected structure assert isinstance(result, pd.DataFrame) assert len(result) == 3 assert ( "01001" in result.values ) # Check that FIPS codes are preserved as strings - def test_when_func_is_run__download_fails__raises_exception( - self, mock_dataset_file, monkeypatch + def test_when_local_data_file_is_missing__returns_packaged_dataframe( + self, tmp_fips_dir, monkeypatch ): """ - Test that the load_county_fips_dataset function raises an exception when download fails. + Test that the packaged dataset is used when no local data file exists. """ - # Apply the mock monkeypatch.setattr( - "policyengine_us.tools.geography.county_helpers.download_huggingface_dataset", - mock_download_huggingface_dataset_failure(mock_dataset_file), + county_helpers, + "DATA_FOLDER", + tmp_fips_dir, ) - with pytest.raises(Exception) as excinfo: - load_county_fips_dataset() + result = load_county_fips_dataset() - assert "Error downloading" in str(excinfo.value) + assert isinstance(result, pd.DataFrame) + assert {"county_fips", "county_name", "state"}.issubset(result.columns) + assert len(result) > 3_000 + assert "01001" in result["county_fips"].values + assert "06037" in result["county_fips"].values + assert all(isinstance(fips, str) for fips in result["county_fips"]) diff --git a/policyengine_us/tools/geography/county_helpers.py b/policyengine_us/tools/geography/county_helpers.py index 543709a0581..9efa740b4e5 100644 --- a/policyengine_us/tools/geography/county_helpers.py +++ b/policyengine_us/tools/geography/county_helpers.py @@ -1,46 +1,44 @@ -import pandas as pd +from importlib import resources +from pathlib import Path + import numpy as np +import pandas as pd + from policyengine_us.variables.household.demographic.geographic.county.county_enum import ( County, ) -from pathlib import Path -from policyengine_core.tools.hugging_face import download_huggingface_dataset -def load_county_fips_dataset() -> pd.DataFrame: - """ - Download the county FIPS dataset from Hugging Face and load it into a pandas DataFrame. - If the dataset already exists in the 'data' folder and is the most recent version, this - function will just load that into a pandas DataFrame. - """ +DATA_FOLDER = Path("data") +COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz" +COUNTY_FIPS_PACKAGE = "policyengine_us.data" - DATA_FOLDER = Path("data") - HUGGINGFACE_REPO = "policyengine/policyengine-us-data" - COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz" - try: - COUNTY_FIPS_RAW = download_huggingface_dataset( - repo=HUGGINGFACE_REPO, - repo_filename=COUNTY_FIPS_DATASET_FILENAME, - version=None, - local_dir=DATA_FOLDER, - ) +def _read_county_fips_dataset(dataset_file) -> pd.DataFrame: + return pd.read_csv( + dataset_file, + compression="gzip", + dtype={"county_fips": str}, + encoding="utf-8", + ) - # Read raw data into pandas dataframe; county FIPS MUST be defined as string, - # else pandas reads as int and drops leading zeros - COUNTY_FIPS_DATASET = pd.read_csv( - COUNTY_FIPS_RAW, - compression="gzip", - dtype={"county_fips": str}, - encoding="utf-8", - ) - return COUNTY_FIPS_DATASET +def load_county_fips_dataset() -> pd.DataFrame: + """ + Load the county FIPS dataset into a pandas DataFrame. + If the dataset exists in the 'data' folder, load that local copy. Otherwise, + use the packaged fallback so runtime county lookup does not require network access. + """ - except Exception as e: - raise Exception( - f"Error downloading {COUNTY_FIPS_DATASET_FILENAME} from {HUGGINGFACE_REPO}: {e}" - ) + local_dataset = DATA_FOLDER / COUNTY_FIPS_DATASET_FILENAME + if local_dataset.is_file(): + return _read_county_fips_dataset(local_dataset) + + package_dataset = resources.files(COUNTY_FIPS_PACKAGE).joinpath( + COUNTY_FIPS_DATASET_FILENAME + ) + with package_dataset.open("rb") as dataset_file: + return _read_county_fips_dataset(dataset_file) def map_county_string_to_enum(