diff --git a/changelog.d/dataset-input-contract.added.md b/changelog.d/dataset-input-contract.added.md
new file mode 100644
index 00000000000..8dd0390f6b1
--- /dev/null
+++ b/changelog.d/dataset-input-contract.added.md
@@ -0,0 +1 @@
+Added dataset input contract helpers for data-generation packages.
diff --git a/policyengine_us/data/__init__.py b/policyengine_us/data/__init__.py
index 472ac5e1de7..fb3afce5aef 100644
--- a/policyengine_us/data/__init__.py
+++ b/policyengine_us/data/__init__.py
@@ -1,2 +1,12 @@
 from .dataset_schema import USSingleYearDataset, USMultiYearDataset
 from .economic_assumptions import extend_single_year_dataset, get_parameter_last_year
+from .dataset_input_contract import (
+    DatasetInputKind,
+    DatasetInputMetadata,
+    dataset_input_metadata,
+    dataset_input_variables,
+    get_dataset_input_metadata,
+    is_dataset_exportable_variable,
+    is_dataset_input_variable,
+    is_formula_owned_variable,
+)
diff --git a/policyengine_us/data/dataset_input_contract.py b/policyengine_us/data/dataset_input_contract.py
new file mode 100644
index 00000000000..d3fed3093aa
--- /dev/null
+++ b/policyengine_us/data/dataset_input_contract.py
@@ -0,0 +1,260 @@
+from __future__ import annotations
+
+import functools
+from dataclasses import dataclass
+from typing import Literal
+
+
+DatasetInputKind = Literal[
+    "stochastic_status",
+    "medical_status",
+    "geographic_status",
+    "identifier_status",
+    "income_override",
+    "deprecated_alias",
+]
+
+
+@dataclass(frozen=True)
+class DatasetInputMetadata:
+    """Metadata for variables datasets may intentionally provide."""
+
+    variable: str
+    kind: DatasetInputKind
+    rationale: str
+
+
+_DATASET_INPUT_METADATA: dict[str, DatasetInputMetadata] = {
+    "takes_up_aca_if_eligible": DatasetInputMetadata(
+        variable="takes_up_aca_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model ACA take-up among eligible tax units.",
+    ),
+    "takes_up_basic_health_program_if_eligible": DatasetInputMetadata(
+        variable="takes_up_basic_health_program_if_eligible",
+        kind="stochastic_status",
+        rationale=(
+            "Dataset builders may model Basic Health Program take-up among "
+            "eligible people."
+        ),
+    ),
+    "takes_up_chip_if_eligible": DatasetInputMetadata(
+        variable="takes_up_chip_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model CHIP take-up among eligible people.",
+    ),
+    "takes_up_dc_ptc": DatasetInputMetadata(
+        variable="takes_up_dc_ptc",
+        kind="stochastic_status",
+        rationale=(
+            "Dataset builders may model DC property tax credit take-up among "
+            "eligible tax units."
+        ),
+    ),
+    "takes_up_early_head_start_if_eligible": DatasetInputMetadata(
+        variable="takes_up_early_head_start_if_eligible",
+        kind="stochastic_status",
+        rationale=(
+            "Dataset builders may model Early Head Start take-up among eligible people."
+        ),
+    ),
+    "takes_up_eitc": DatasetInputMetadata(
+        variable="takes_up_eitc",
+        kind="stochastic_status",
+        rationale="Dataset builders may model EITC take-up among eligible tax units.",
+    ),
+    "takes_up_head_start_if_eligible": DatasetInputMetadata(
+        variable="takes_up_head_start_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model Head Start take-up among eligible people.",
+    ),
+    "takes_up_housing_assistance_if_eligible": DatasetInputMetadata(
+        variable="takes_up_housing_assistance_if_eligible",
+        kind="stochastic_status",
+        rationale=(
+            "Dataset builders may model housing assistance take-up among "
+            "eligible SPM units."
+        ),
+    ),
+    "takes_up_medicaid_if_eligible": DatasetInputMetadata(
+        variable="takes_up_medicaid_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model Medicaid take-up among eligible people.",
+    ),
+    "takes_up_medicare_if_eligible": DatasetInputMetadata(
+        variable="takes_up_medicare_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model Medicare take-up among eligible people.",
+    ),
+    "takes_up_snap_if_eligible": DatasetInputMetadata(
+        variable="takes_up_snap_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model SNAP take-up among eligible SPM units.",
+    ),
+    "takes_up_ssi_if_eligible": DatasetInputMetadata(
+        variable="takes_up_ssi_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model SSI take-up among eligible people.",
+    ),
+    "takes_up_tanf_if_eligible": DatasetInputMetadata(
+        variable="takes_up_tanf_if_eligible",
+        kind="stochastic_status",
+        rationale="Dataset builders may model TANF take-up among eligible SPM units.",
+    ),
+    "would_claim_wic": DatasetInputMetadata(
+        variable="would_claim_wic",
+        kind="stochastic_status",
+        rationale="Dataset builders may model WIC claiming among eligible people.",
+    ),
+    "is_wic_at_nutritional_risk": DatasetInputMetadata(
+        variable="is_wic_at_nutritional_risk",
+        kind="medical_status",
+        rationale=(
+            "Dataset builders may model WIC nutritional-risk status; the model "
+            "uses the input directly."
+        ),
+    ),
+    "meets_ssi_disability_criteria": DatasetInputMetadata(
+        variable="meets_ssi_disability_criteria",
+        kind="medical_status",
+        rationale=(
+            "Dataset builders may provide the SSI medical-disability criterion "
+            "separately from broad disability flags."
+        ),
+    ),
+    "has_tin": DatasetInputMetadata(
+        variable="has_tin",
+        kind="identifier_status",
+        rationale=(
+            "Dataset builders may provide taxpayer identification status; the "
+            "fallback formula defaults to True when no data are supplied."
+        ),
+    ),
+    "has_itin": DatasetInputMetadata(
+        variable="has_itin",
+        kind="deprecated_alias",
+        rationale=(
+            "Deprecated alias accepted during migration from has_itin to has_tin."
+        ),
+    ),
+    "in_nyc": DatasetInputMetadata(
+        variable="in_nyc",
+        kind="geographic_status",
+        rationale=(
+            "Dataset builders may provide NYC residency directly when county "
+            "geography is unavailable or deliberately suppressed."
+        ),
+    ),
+    "fsla_overtime_premium": DatasetInputMetadata(
+        variable="fsla_overtime_premium",
+        kind="income_override",
+        rationale=(
+            "Dataset builders may provide measured or imputed FLSA overtime "
+            "premium income instead of relying on weekly-hours approximations."
+        ),
+    ),
+}
+
+
+def dataset_input_metadata() -> dict[str, DatasetInputMetadata]:
+    """Return metadata for variables datasets may intentionally provide.
+
+    The result is a shallow copy, so mutating it leaves the registry intact.
+    """
+    return dict(_DATASET_INPUT_METADATA)
+
+
+def dataset_input_variables(
+    *,
+    kind: DatasetInputKind | None = None,
+) -> frozenset[str]:
+    """Return variables that are explicit dataset inputs under the US model.
+
+    When ``kind`` is given, only variables registered with that kind are
+    returned; otherwise every registered variable name is returned.
+    """
+    if kind is None:
+        return frozenset(_DATASET_INPUT_METADATA)
+    return frozenset(
+        name
+        for name, metadata in _DATASET_INPUT_METADATA.items()
+        if metadata.kind == kind
+    )
+
+
+def get_dataset_input_metadata(
+    variable_name: str,
+) -> DatasetInputMetadata | None:
+    """Return dataset-input metadata for a variable, if explicitly defined."""
+    return _DATASET_INPUT_METADATA.get(variable_name)
+
+
+def is_dataset_input_variable(variable_name: str) -> bool:
+    """Return whether a variable is an explicit dataset input."""
+    return variable_name in _DATASET_INPUT_METADATA
+
+
+def variable_has_formula(variable) -> bool:
+    """Return whether a variable is computed by formula/adds/subtracts logic."""
+    return any(
+        bool(getattr(variable, attribute, None))
+        for attribute in ("formulas", "adds", "subtracts")
+    )
+
+
+@functools.lru_cache(maxsize=1)
+def _default_system():
+    """Return a shared default US tax-benefit system.
+
+    Helpers called without ``system`` share this one instance instead of
+    constructing a new system for every lookup.
+    """
+    # Imported inside the function rather than at module level; presumably
+    # this avoids a circular import while the package loads (TODO confirm).
+    from policyengine_us import CountryTaxBenefitSystem
+
+    return CountryTaxBenefitSystem()
+
+
+def _get_variable(variable_name: str, system=None):
+    """Return the named variable from ``system``.
+
+    Falls back to the shared default system when ``system`` is ``None`` and
+    raises ``KeyError`` when the variable is not defined in the system.
+    """
+    if system is None:
+        system = _default_system()
+
+    variable = system.variables.get(variable_name)
+    if variable is None:
+        raise KeyError(f"Unknown variable: {variable_name}")
+    return variable
+
+
+def is_formula_owned_variable(variable_name: str, *, system=None) -> bool:
+    """Return whether datasets should normally leave a variable to formulas.
+
+    Ordinary input variables are not formula-owned. Formula-backed variables
+    listed in ``dataset_input_metadata`` are deliberate dataset overrides and
+    therefore also return ``False`` here.
+
+    Raises ``KeyError`` if ``variable_name`` is not defined in ``system``.
+    """
+    variable = _get_variable(variable_name, system)
+    return variable_has_formula(variable) and not is_dataset_input_variable(
+        variable_name
+    )
+
+
+def is_dataset_exportable_variable(variable_name: str, *, system=None) -> bool:
+    """Return whether a dataset may export the variable as an input column.
+
+    This helper is intended for data-generation packages. It treats ordinary
+    model input variables as exportable and also allows the explicit override
+    variables documented in ``dataset_input_metadata``. Formula-owned outputs
+    should be calculated by PolicyEngine-US rather than persisted in datasets.
+
+    Raises ``KeyError`` if ``variable_name`` is not defined in ``system``.
+    """
+    variable = _get_variable(variable_name, system)
+    return variable.is_input_variable() or is_dataset_input_variable(variable_name)
diff --git a/policyengine_us/tests/core/test_dataset_input_contract.py b/policyengine_us/tests/core/test_dataset_input_contract.py
new file mode 100644
index 00000000000..98f95e956ce
--- /dev/null
+++ b/policyengine_us/tests/core/test_dataset_input_contract.py
@@ -0,0 +1,79 @@
+import pytest
+
+from policyengine_us import CountryTaxBenefitSystem
+from policyengine_us.data import (
+    dataset_input_metadata,
+    dataset_input_variables,
+    get_dataset_input_metadata,
+    is_dataset_exportable_variable,
+    is_dataset_input_variable,
+    is_formula_owned_variable,
+)
+
+
+def test_dataset_input_contract_marks_stochastic_status_inputs():
+    expected = {
+        "takes_up_aca_if_eligible",
+        "takes_up_dc_ptc",
+        "takes_up_eitc",
+        "takes_up_head_start_if_eligible",
+        "takes_up_early_head_start_if_eligible",
+        "takes_up_medicaid_if_eligible",
+        "takes_up_snap_if_eligible",
+        "takes_up_tanf_if_eligible",
+        "would_claim_wic",
+    }
+
+    assert expected <= dataset_input_variables(kind="stochastic_status")
+
+
+def test_dataset_input_contract_marks_known_formula_overrides():
+    system = CountryTaxBenefitSystem()
+
+    for variable in ("has_tin", "has_itin", "in_nyc", "fsla_overtime_premium"):
+        assert is_dataset_input_variable(variable)
+        assert is_dataset_exportable_variable(variable, system=system)
+        assert not is_formula_owned_variable(variable, system=system)
+
+
+def test_dataset_input_contract_marks_medical_inputs():
+    metadata = get_dataset_input_metadata("meets_ssi_disability_criteria")
+
+    assert metadata is not None
+    assert metadata.kind == "medical_status"
+    assert "SSI" in metadata.rationale
+    assert "is_wic_at_nutritional_risk" in dataset_input_variables(
+        kind="medical_status"
+    )
+
+
+def test_formula_owned_helper_rejects_computed_outputs():
+    system = CountryTaxBenefitSystem()
+
+    assert is_formula_owned_variable("wic", system=system)
+    assert not is_dataset_exportable_variable("wic", system=system)
+
+
+def test_dataset_input_contract_is_consistent_with_model_variables():
+    system = CountryTaxBenefitSystem()
+    metadata = dataset_input_metadata()
+
+    missing = sorted(set(metadata) - set(system.variables))
+    assert missing == []
+
+    undocumented_defaults = {
+        name
+        for name, variable in system.variables.items()
+        if name.startswith(("takes_up_", "would_claim_"))
+        and getattr(variable, "default_value", None) is True
+        and name not in metadata
+    }
+    assert undocumented_defaults == set()
+
+
+def test_dataset_contract_helpers_raise_for_unknown_variables():
+    with pytest.raises(KeyError):
+        is_formula_owned_variable("not_a_variable")
+
+    with pytest.raises(KeyError):
+        is_dataset_exportable_variable("not_a_variable")
diff --git a/policyengine_us/variables/gov/usda/wic/wic_category.py b/policyengine_us/variables/gov/usda/wic/wic_category.py
index 0953e4ee38d..0e66ee5278a 100644
--- a/policyengine_us/variables/gov/usda/wic/wic_category.py
+++ b/policyengine_us/variables/gov/usda/wic/wic_category.py
@@ -25,7 +25,7 @@ def formula(person, period, parameters):
         mother = person("is_mother", period)
         breastfeeding = person("is_breastfeeding", period)
         age = person("age", period)
-        # Categorize mothers based on the minimum age of children in the SPM unit.
+        # Categorize mothers based on the minimum age of children in the family.
         min_age_family = person.family.min(age)
         return select(
             [