Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ venv/**
oryx-build-commands.txt

**/*.csv.gz
!policyengine_us/data/county_fips_2020.csv.gz
**/*.pkl

# Claude Code temporary files
Expand Down
1 change: 1 addition & 0 deletions changelog.d/8650.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Bundle the county FIPS dataset so county lookups do not require live Hugging Face access.
Binary file added policyengine_us/data/county_fips_2020.csv.gz
Binary file not shown.
108 changes: 27 additions & 81 deletions policyengine_us/tests/utilities/test_load_county_fips_dataset.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
from policyengine_core.tools.hugging_face import download_huggingface_dataset
import gzip
from pathlib import Path

import pandas as pd
import pytest

from policyengine_us.tools.geography import county_helpers
from policyengine_us.tools.geography.county_helpers import (
COUNTY_FIPS_DATASET_FILENAME,
load_county_fips_dataset,
)
from pathlib import Path
import pytest
import pandas as pd
import gzip


@pytest.fixture
Expand Down Expand Up @@ -33,114 +36,57 @@ def mock_dataset_file(tmp_fips_dir) -> Path:
)

# Save as gzipped CSV
test_file_path = tmp_fips_dir / "county_fips_2020.csv.gz"
test_file_path = tmp_fips_dir / COUNTY_FIPS_DATASET_FILENAME
with gzip.open(test_file_path, "wb") as f:
test_data.to_csv(f, index=False, encoding="utf-8")

return test_file_path


def mock_download_huggingface_dataset_success(filepath):
def _mock(*args, **kwargs):
return filepath

return _mock


def mock_download_huggingface_dataset_failure(filepath):
def _mock(*args, **kwargs):
raise Exception("Download failed")

return _mock


class TestCountyFIPSDatasetFile:
"""
Test that the county FIPS dataset file exists and downloads properly.
"""

HUGGINGFACE_REPO = "policyengine/policyengine-us-data"
COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz"

def test_when_downloading_county_fips__download_is_successful(self, tmp_fips_dir):
download_huggingface_dataset(
repo=self.HUGGINGFACE_REPO,
repo_filename=self.COUNTY_FIPS_DATASET_FILENAME,
version=None,
local_dir=tmp_fips_dir,
)

TMP_FILE = tmp_fips_dir / self.COUNTY_FIPS_DATASET_FILENAME
assert TMP_FILE.is_file()

def test_when_downloading_and_parsing_county_fips__result_is_correct(
self, tmp_fips_dir
):
download_huggingface_dataset(
repo=self.HUGGINGFACE_REPO,
repo_filename=self.COUNTY_FIPS_DATASET_FILENAME,
version=None,
local_dir=tmp_fips_dir,
)

TMP_FILE = tmp_fips_dir / self.COUNTY_FIPS_DATASET_FILENAME

df = pd.read_csv(
TMP_FILE,
compression="gzip",
dtype={"county_fips": str},
encoding="utf-8",
nrows=5, # Just read a few rows
)

assert "county_fips" in df.columns
assert len(df) > 0

# Check FIPS codes are properly preserved as strings
assert all(isinstance(fips, str) for fips in df["county_fips"])


class TestLoadCountyFIPSDataset:
"""
Test that the load_county_fips_dataset function works correctly.
"""

def test_when_func_is_run__correctly__returns_dataframe(
def test_when_local_data_file_exists__returns_local_dataframe(
self, mock_dataset_file, monkeypatch
):
"""
Test that the load_county_fips_dataset function returns a DataFrame with the correct columns.
Test that the load_county_fips_dataset function reads a local data file when present.
"""

# Apply the mock
monkeypatch.setattr(
"policyengine_us.tools.geography.county_helpers.download_huggingface_dataset",
mock_download_huggingface_dataset_success(mock_dataset_file),
county_helpers,
"DATA_FOLDER",
mock_dataset_file.parent,
)

result = load_county_fips_dataset()

# Verify the result is a pandas DataFrame with expected structure
assert isinstance(result, pd.DataFrame)
assert len(result) == 3
assert (
"01001" in result.values
) # Check that FIPS codes are preserved as strings

def test_when_func_is_run__download_fails__raises_exception(
self, mock_dataset_file, monkeypatch
def test_when_local_data_file_is_missing__returns_packaged_dataframe(
self, tmp_fips_dir, monkeypatch
):
"""
Test that the load_county_fips_dataset function raises an exception when download fails.
Test that the packaged dataset is used when no local data file exists.
"""

# Apply the mock
monkeypatch.setattr(
"policyengine_us.tools.geography.county_helpers.download_huggingface_dataset",
mock_download_huggingface_dataset_failure(mock_dataset_file),
county_helpers,
"DATA_FOLDER",
tmp_fips_dir,
)

with pytest.raises(Exception) as excinfo:
load_county_fips_dataset()
result = load_county_fips_dataset()

assert "Error downloading" in str(excinfo.value)
assert isinstance(result, pd.DataFrame)
assert {"county_fips", "county_name", "state"}.issubset(result.columns)
assert len(result) > 3_000
assert "01001" in result["county_fips"].values
assert "06037" in result["county_fips"].values
assert all(isinstance(fips, str) for fips in result["county_fips"])
62 changes: 30 additions & 32 deletions policyengine_us/tools/geography/county_helpers.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,44 @@
import pandas as pd
from importlib import resources
from pathlib import Path

import numpy as np
import pandas as pd

from policyengine_us.variables.household.demographic.geographic.county.county_enum import (
County,
)
from pathlib import Path
from policyengine_core.tools.hugging_face import download_huggingface_dataset


def load_county_fips_dataset() -> pd.DataFrame:
"""
Download the county FIPS dataset from Hugging Face and load it into a pandas DataFrame.
If the dataset already exists in the 'data' folder and is the most recent version, this
function will just load that into a pandas DataFrame.
"""
DATA_FOLDER = Path("data")
COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz"
COUNTY_FIPS_PACKAGE = "policyengine_us.data"

DATA_FOLDER = Path("data")
HUGGINGFACE_REPO = "policyengine/policyengine-us-data"
COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz"

try:
COUNTY_FIPS_RAW = download_huggingface_dataset(
repo=HUGGINGFACE_REPO,
repo_filename=COUNTY_FIPS_DATASET_FILENAME,
version=None,
local_dir=DATA_FOLDER,
)
def _read_county_fips_dataset(dataset_file) -> pd.DataFrame:
return pd.read_csv(
dataset_file,
compression="gzip",
dtype={"county_fips": str},
encoding="utf-8",
)

# Read raw data into pandas dataframe; county FIPS MUST be defined as string,
# else pandas reads as int and drops leading zeros
COUNTY_FIPS_DATASET = pd.read_csv(
COUNTY_FIPS_RAW,
compression="gzip",
dtype={"county_fips": str},
encoding="utf-8",
)

return COUNTY_FIPS_DATASET
def load_county_fips_dataset() -> pd.DataFrame:
"""
Load the county FIPS dataset into a pandas DataFrame.
If the dataset exists in the 'data' folder, load that local copy. Otherwise,
use the packaged fallback so runtime county lookup does not require network access.
"""

except Exception as e:
raise Exception(
f"Error downloading {COUNTY_FIPS_DATASET_FILENAME} from {HUGGINGFACE_REPO}: {e}"
)
local_dataset = DATA_FOLDER / COUNTY_FIPS_DATASET_FILENAME
if local_dataset.is_file():
return _read_county_fips_dataset(local_dataset)

package_dataset = resources.files(COUNTY_FIPS_PACKAGE).joinpath(
COUNTY_FIPS_DATASET_FILENAME
)
with package_dataset.open("rb") as dataset_file:
return _read_county_fips_dataset(dataset_file)


def map_county_string_to_enum(
Expand Down
Loading