diff --git a/packages/populace-fit/README.md b/packages/populace-fit/README.md new file mode 100644 index 0000000..5063905 --- /dev/null +++ b/packages/populace-fit/README.md @@ -0,0 +1,52 @@ +# populace-fit + +The conditional-models operator of the [populace](../../DESIGN.md) stack — +imported as `populace.fit`. It fits conditional distributions `P(y | x)` over a +`populace.frame.Frame` and draws from them. + +## Weight-aware by construction + +Every fit reads the Frame's typed weights. There is no unweighted default: a +fit that ignores weights cannot be expressed except by passing `weights="none"` +explicitly, and the function says why that is the only escape hatch. `weights` +selects which typed weight vector to use — by default the **design** weights of +the entity that owns the predictors and targets. + +This closes the 2026-06 microimpute landmine, where a silently-ignored +`weight_col` reproduced a high-income regime's mass at the wrong scale. Here the +weights are materialized into the fit by **weighted bootstrap**: training rows +are importance-resampled by weight before each forest is grown, so leaf +distributions — and every value drawn from them — reflect the weighted +population, not the unweighted sample. + +## The canonical model + +`QRF` (alias `RegimeGatedQRF`) is a regime-gated, sequentially-chained +quantile-regression-forest imputer: + +- **Regime gates.** Each numeric target's sign support (negative / zero / + positive) is detected structurally (unweighted) from the training data. A + zero-inflated target gets a zero-vs-nonzero gate so its zero mass is + preserved exactly; a sign-mixed target gets a gate per sign so draws never + interpolate across a zero crossing. +- **Chaining.** Targets are imputed sequentially; each conditions on the + predictors plus the targets already drawn, so the joint structure across + targets is preserved. 
+- **Draws.** A random quantile is sampled per row (seeded) and the forest is + queried at it, so the draws sample the weighted conditional. + +```python +from populace.fit import fit + +fitted = fit(frame, predictors=["age", "is_male"], targets=["capital_gains"]) +draws = fitted.predict(frame) # one column per target + +# Unweighted is opt-in and explicit: +fitted_unweighted = fit(frame, predictors, targets, weights="none") +``` + +## Dependencies + +The heavy dependencies (`scikit-learn`, `quantile-forest`) live here, never in +`populace-frame`: an analyst doing imputation installs this shard; an analyst +doing only calibration never pulls them. diff --git a/packages/populace-fit/pyproject.toml b/packages/populace-fit/pyproject.toml new file mode 100644 index 0000000..4a9cf9f --- /dev/null +++ b/packages/populace-fit/pyproject.toml @@ -0,0 +1,34 @@ +[project] +name = "populace-fit" +version = "0.1.0" +description = "The populace conditional-models operator: weight-aware conditional models over the Frame — the canonical regime-gated, sequentially-chained, weighted-bootstrap quantile-regression-forest imputer" +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + # Pin the kernel to its 0.x minor series (the import-time compat gate in + # __init__.py asserts the same floor): the constellation must resolve, not + # fail only at import. + "populace-frame>=0.1,<0.2", + # scikit-learn 1.9 removed sklearn.tree._tree.DTYPE, which quantile-forest + # imports; cap below it until quantile-forest tracks the 1.9 tree ABI + # (upstream fix: zillow/quantile-forest#152). 
+ "scikit-learn>=1.5,<1.9", + "quantile-forest>=1.3", + "numpy>=2", + "pandas>=2.3", +] + +[tool.uv.sources] +populace-frame = { workspace = true } + +[dependency-groups] +dev = [ + "pytest>=8", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/populace"] diff --git a/packages/populace-fit/src/populace/fit/__init__.py b/packages/populace-fit/src/populace/fit/__init__.py new file mode 100644 index 0000000..ae6fb16 --- /dev/null +++ b/packages/populace-fit/src/populace/fit/__init__.py @@ -0,0 +1,135 @@ +"""populace.fit: the conditional-models operator of the populace stack. + +Fits conditional distributions ``P(targets | predictors)`` over a +:class:`~populace.frame.Frame` and draws from them. The operator is +**weight-aware by construction**: a fit reads the frame's typed weights, and the +only way to fit unweighted is to pass ``weights="none"`` explicitly +(:mod:`populace.fit.model`). The canonical model is a regime-gated, chained, +weighted-bootstrap quantile-regression-forest imputer (:mod:`populace.fit.qrf`). + +Importing this shard asserts compatibility with the installed +:mod:`populace.frame` kernel — the constellation mechanism from DESIGN.md: a +shard pins ``populace-frame`` in its metadata *and* checks the kernel series at +import, so a resolver that ignores ``[tool.uv.sources]`` cannot silently +assemble an incompatible pair. +""" + +from populace.frame import __version__ as _frame_version + +#: The populace-frame ``(major, minor)`` series this shard is built against. The +#: kernel is pre-1.0, so during the 0.x line compatibility is pinned at the +#: *minor* level (0.x and 0.y may differ incompatibly); from 1.0 on only the +#: major is pinned. Kept in lockstep with the ``populace-frame>=...`` floor in +#: ``pyproject.toml``. 
+_REQUIRED_FRAME_SERIES = (0, 1) + + +def _assert_frame_compatible(version: str, required: tuple[int, int]) -> None: + """Raise unless the installed populace-frame is the expected series. + + Args: + version: The installed ``populace.frame.__version__``. + required: The ``(major, minor)`` series this shard requires. The minor + is enforced only while the major is ``0`` (the pre-1.0 convention + that 0.x minors may break compatibility); from major ``1`` on, only + the major must match. + + Raises: + ImportError: If the installed kernel is outside the required series. The + message names both versions and the fix. + """ + parts = version.split(".") + try: + installed = (int(parts[0]), int(parts[1])) + except (IndexError, ValueError): # pragma: no cover - defensive + raise ImportError( + f"populace-fit cannot parse populace-frame version {version!r}; " + f"expected a {required[0]}.{required[1]}.x kernel." + ) from None + + if required[0] == 0: + compatible = installed == required + expected = f"{required[0]}.{required[1]}.x" + else: + compatible = installed[0] == required[0] + expected = f"{required[0]}.x" + + if not compatible: + raise ImportError( + f"populace-fit requires populace-frame {expected}, but " + f"{version} is installed. Install the matching constellation " + "(the workspace releases the shards in lockstep): upgrade or pin " + f"populace-frame to {expected}." + ) + + +_assert_frame_compatible(_frame_version, _REQUIRED_FRAME_SERIES) + +from populace.fit.model import ( # noqa: E402 - after the compatibility gate + DESIGN_WEIGHTS, + NO_WEIGHTS, + ConditionalModel, + FittedModel, + WeightSpec, +) +from populace.fit.qrf import ( # noqa: E402 - after the compatibility gate + DEFAULT_N_ESTIMATORS, + DEFAULT_ZERO_ATOL, + FittedRegimeGatedQRF, + Regime, + RegimeGatedQRF, +) + +__version__ = "0.1.0" + +#: The canonical conditional model, under its short public name. 
+QRF = RegimeGatedQRF + + +def fit( + frame, + predictors: list[str], + targets: list[str], + *, + weights: WeightSpec = DESIGN_WEIGHTS, + **model_kwargs, +) -> FittedModel: + """Fit the canonical conditional model over ``frame``. + + Convenience constructor: builds a :class:`RegimeGatedQRF` with + ``model_kwargs`` and fits it. For a different model, instantiate it directly + and call its :meth:`~populace.fit.model.ConditionalModel.fit`. + + Args: + frame: The :class:`~populace.frame.Frame` to fit on. + predictors: Conditioning variable names (one entity). + targets: Variable names to learn the conditional of (same entity). + weights: Which typed weight vector to weight the fit by; defaults to the + owning entity's ``"design"`` weights. ``"none"`` fits unweighted — + the only way to do so. + **model_kwargs: Forwarded to :class:`RegimeGatedQRF` (e.g. + ``n_estimators``, ``zero_atol``, ``seed``). + + Returns: + A :class:`~populace.fit.model.FittedModel`. + """ + return RegimeGatedQRF(**model_kwargs).fit( + frame, predictors, targets, weights=weights + ) + + +__all__ = [ + "ConditionalModel", + "FittedModel", + "WeightSpec", + "DESIGN_WEIGHTS", + "NO_WEIGHTS", + "QRF", + "RegimeGatedQRF", + "FittedRegimeGatedQRF", + "Regime", + "DEFAULT_N_ESTIMATORS", + "DEFAULT_ZERO_ATOL", + "fit", + "__version__", +] diff --git a/packages/populace-fit/src/populace/fit/model.py b/packages/populace-fit/src/populace/fit/model.py new file mode 100644 index 0000000..6108795 --- /dev/null +++ b/packages/populace-fit/src/populace/fit/model.py @@ -0,0 +1,278 @@ +"""The conditional-model protocol and the weight-resolution it enforces. + +A conditional model fits ``P(y | x)`` over a :class:`~populace.frame.Frame` and +draws from it. 
The defining property of this operator — the one the 2026-06 +microimpute landmine violated — is that fitting is **weight-aware by +construction**: the weights come from the Frame's typed weight vectors, never +from a raw array a caller might forget to pass, and an unweighted fit is +impossible to request without writing ``weights="none"`` and meaning it. + +:func:`resolve_fit_weights` is the single authority for that rule. Every model +in :mod:`populace.fit` routes its weight handling through it, so "no silent +unweighted default" is enforced in one place rather than re-litigated per model. +""" + +from __future__ import annotations + +from typing import Protocol, runtime_checkable + +import numpy as np +import pandas as pd + +from populace.frame import Frame, WeightKind, assert_kind_transition + +__all__ = [ + "ConditionalModel", + "FittedModel", + "WeightSpec", + "DESIGN_WEIGHTS", + "NO_WEIGHTS", + "resolve_fit_weights", + "predictors_targets_entity", +] + +#: ``weights="design"`` — fit on the owning entity's design weights. The +#: default: a populace fit is weighted unless the caller opts out. +DESIGN_WEIGHTS = "design" + +#: ``weights="none"`` — the *only* way to fit unweighted. Explicit by design: +#: an unweighted fit is a deliberate statistical choice, not a silent fallback. +NO_WEIGHTS = "none" + +#: The weight specification accepted by a fit. A :class:`WeightKind` (or its +#: string value ``"design" | "importance" | "calibrated"``) selects which typed +#: weight vector of the owning entity to use; ``"none"`` fits unweighted. +WeightSpec = "WeightKind | str" + +# Map the accepted string spellings to the kernel's WeightKind. ``"none"`` is +# handled separately (it is the absence of a kind, not a kind). +_KIND_BY_NAME: dict[str, WeightKind] = {kind.value: kind for kind in WeightKind} + + +@runtime_checkable +class FittedModel(Protocol): + """A fitted conditional model that can draw imputed targets. 
+ + A fitted model holds the conditional distribution learned at fit time and + draws from it on demand. Draws are stochastic: each :meth:`predict` call + samples fresh values from the conditional, advancing the model's seeded + random state so repeated calls give independent draws (while a freshly + fitted model with the same seed reproduces the same first draw). + """ + + def predict(self, frame_or_df: Frame | pd.DataFrame) -> pd.DataFrame: + """Draw imputed target values for the rows of ``frame_or_df``. + + Args: + frame_or_df: The rows to impute for. A :class:`~populace.frame.Frame` + supplies the predictor columns from the entity that owns them; a + :class:`pandas.DataFrame` is used directly (its columns must + cover the predictors). Row count and index are preserved. + + Returns: + A :class:`pandas.DataFrame` with one column per target, indexed to + match the input rows. Values are a single stochastic draw from the + fitted conditional. + """ + ... + + +@runtime_checkable +class ConditionalModel(Protocol): + """A model that fits a conditional distribution over a Frame. + + The protocol is intentionally minimal so trajectory and sequence models can + slot in behind it later (DESIGN.md, "populace-fit: conditional models"). + The single non-negotiable is the weight contract: ``weights`` selects a + typed weight vector of the entity that owns the predictors and targets, and + defaults to that entity's design weights. ``weights="none"`` is the only + way to fit unweighted. + """ + + def fit( + self, + frame: Frame, + predictors: list[str], + targets: list[str], + *, + weights: WeightSpec = DESIGN_WEIGHTS, + ) -> FittedModel: + """Fit the conditional distribution ``P(targets | predictors)``. + + Args: + frame: The frame to fit on. Predictors and targets must all live on + the same entity (resolved from the frame's column ownership). + predictors: Conditioning variable names. + targets: Variable names to learn the conditional of. 
+ weights: Which typed weight vector of the owning entity to weight + the fit by. A :class:`~populace.frame.WeightKind` or its string + value selects the kind; the default ``"design"`` uses that + entity's design weights. ``"none"`` fits unweighted — the only + way to do so, and a deliberate choice. + + Returns: + A :class:`FittedModel`. + """ + ... + + +def _duplicates(names: list[str]) -> list[str]: + """Return the sorted set of names that appear more than once.""" + seen: set[str] = set() + dups: set[str] = set() + for name in names: + (dups if name in seen else seen).add(name) + return sorted(dups) + + +def predictors_targets_entity( + frame: Frame, predictors: list[str], targets: list[str] +) -> str: + """Resolve the single entity that owns every predictor and target. + + Conditional fitting is a one-entity operation: a row of the fit is a row of + one entity, and the weights that weight the fit are that entity's. Mixing + entities (e.g. a household predictor and a person target) would silently + cross-join rows at different grains, so it is refused here rather than + producing a misaligned fit. Broadcast a group column onto persons first + (``Frame.broadcast``) to fit across grains deliberately. + + Args: + frame: The frame whose column ownership resolves the entity. + predictors: Predictor variable names (at least one). + targets: Target variable names (at least one). + + Returns: + The name of the entity that owns all of ``predictors`` and ``targets``. + + Raises: + ValueError: If ``predictors`` or ``targets`` is empty or has duplicates, a + name is both predictor and target, a name is not a column on any entity + table, or the names span more than one entity. Messages name the culprits. 
+ """ + if not predictors: + raise ValueError("fit requires at least one predictor.") + if not targets: + raise ValueError("fit requires at least one target.") + predictor_dups = _duplicates(predictors) + if predictor_dups: + raise ValueError(f"Duplicate predictors: {predictor_dups}.") + target_dups = _duplicates(targets) + if target_dups: + raise ValueError(f"Duplicate targets: {target_dups}.") + overlap = sorted(set(predictors) & set(targets)) + if overlap: + raise ValueError( + f"Columns are both predictor and target: {overlap}; a target " + "cannot condition on itself." + ) + owners: dict[str, str] = {} + for column in (*predictors, *targets): + owners[column] = frame.column_entity(column) # raises naming the column + entities = set(owners.values()) + if len(entities) != 1: + by_entity: dict[str, list[str]] = {} + for column, entity in owners.items(): + by_entity.setdefault(entity, []).append(column) + described = "; ".join( + f"{entity}: {sorted(columns)}" for entity, columns in sorted(by_entity.items()) + ) + raise ValueError( + "Predictors and targets must all live on one entity, but they span " + f"{sorted(entities)} ({described}). Broadcast a column onto the " + "person entity (Frame.broadcast) to fit across grains deliberately." + ) + return entities.pop() + + +def resolve_fit_weights( + frame: Frame, + entity: str, + weights: WeightSpec, +) -> np.ndarray | None: + """Resolve a ``weights`` spec to the per-row vector a fit trains on. + + This is the enforcement point for the operator's defining rule: a populace + fit is weighted unless the caller writes ``weights="none"``. The vector + returned is positionally aligned with ``entity``'s table — the grain every + predictor and target shares — so a model can hand it straight to its + weighted bootstrap. 
+ + Weights are resolved through :meth:`~populace.frame.Frame.resolve_weights`, + not only an entity's *own* stored vector: a person-level fit on a + household-weighted frame reads the household weights broadcast onto persons, + carrying the household's kind. The kind discipline is unchanged — the + requested kind must match the *resolved* (possibly inherited) kind — so a + fit can never silently weight by a kind the caller did not ask for. + + Args: + frame: The frame carrying the typed weights. + entity: The entity that owns the predictors and targets (its effective + weights are the ones that weight the fit). + weights: ``"none"`` to fit unweighted (the only unweighted path), or a + :class:`~populace.frame.WeightKind` / its string value selecting + which typed weight vector of ``entity`` to use. + + Returns: + ``None`` when ``weights="none"`` (the model fits unweighted), otherwise + a float64 array of length ``frame.n(entity)``. + + Raises: + TypeError: If ``weights`` is neither a string nor a + :class:`~populace.frame.WeightKind`. + ValueError: If the spec is an unknown string, or the requested kind is + not the kind of ``entity``'s resolved weights. The message names the + valid specs / the resolved kind and an actionable fix. + """ + if isinstance(weights, WeightKind): + requested = weights + elif isinstance(weights, str): + if weights == NO_WEIGHTS: + return None + requested = _KIND_BY_NAME.get(weights) + if requested is None: + valid = [NO_WEIGHTS, *_KIND_BY_NAME] + raise ValueError( + f"Unknown weights spec {weights!r}; expected one of {valid}. " + "An unweighted fit must be requested explicitly with " + f"weights={NO_WEIGHTS!r}." + ) + else: + raise TypeError( + "weights must be a WeightKind or one of the strings " + f"{[NO_WEIGHTS, *_KIND_BY_NAME]}, got {type(weights).__name__}. " + f"To fit unweighted, pass weights={NO_WEIGHTS!r} explicitly." 
+ ) + + # Resolve through the frame's effective weights, not only the entity's own + # stored vector: a person-level fit on a household-weighted frame inherits + # the household weights (and their kind). Raises naming the entity / the + # ambiguity if the weights cannot be resolved. + resolved = frame.resolve_weights(entity) + if resolved.kind is not requested: + try: + # Kinds only move forward (design -> importance -> calibrated). + # If the requested kind is reachable from the resolved one, telling + # the caller to advance the frame's weights is actionable. + assert_kind_transition(resolved.kind, requested) + except ValueError: + # The requested kind ranks *below* the resolved kind, so advancing + # is impossible (calibrated weights never revert to design). The + # only actionable fix is to request the kind the frame actually + # carries. + raise ValueError( + f"Requested {requested.value!r} weights for entity {entity!r}, " + f"but its resolved weights are {resolved.kind.value!r}. Weight " + "kinds only move forward " + "(design -> importance -> calibrated), so the frame cannot be " + f"reverted to {requested.value!r}; pass " + f"weights={resolved.kind.value!r} to fit on the weights the " + "frame carries." + ) from None + raise ValueError( + f"Requested {requested.value!r} weights for entity {entity!r}, but " + f"its resolved weights are {resolved.kind.value!r}. Either pass " + f"weights={resolved.kind.value!r}, or advance the frame's weights " + f"to {requested.value!r} first." + ) + return np.asarray(resolved.values, dtype=np.float64) diff --git a/packages/populace-fit/src/populace/fit/qrf.py b/packages/populace-fit/src/populace/fit/qrf.py new file mode 100644 index 0000000..0cf05be --- /dev/null +++ b/packages/populace-fit/src/populace/fit/qrf.py @@ -0,0 +1,714 @@ +"""The canonical conditional model: a regime-gated, chained, weighted QRF. 
+ +This is the from-scratch successor to microimpute's regime-gated QRF imputer, +reimplemented against the :class:`~populace.frame.Frame`. Three ideas combine: + +**Weighted bootstrap (forests only).** ``quantile_forest`` (and random forests +generally) cannot honor a ``sample_weight`` in their *predictive* distribution: +a fully-grown leaf holds one training row, so weighting impurity does not move +the value a draw reads out, and the backend uses ``sample_weight`` only as a +zero-weight filter on leaf membership. So weights are materialized *into the +data*: before each forest is grown, training rows are drawn with replacement +with probability proportional to weight (:func:`_weighted_bootstrap`). The leaf +distributions then reflect the weighted population. This is the microimpute#196 +fix — the mechanism that makes a weighted fit actually shift the draws. + +**Regime gates.** A numeric target's sign support — which of +{negative, zero, positive} appear in training — defines its *regime*. A single +regressor over a zero-inflated or sign-mixed target either loses a tail (the +"fit on ``y > 0``" pattern drops the negatives) or interpolates across the zero +crossing (predicting values in the empty gap between the negative and positive +clusters). So a classifier gates each row into its sign class, and a separate +forest models the magnitude within each nonzero sign. Regime detection is +**structural**: it reads the unweighted support, because which signs *exist* is +a fact about the variable, not about the population's weighting. The gate is +weighted *directly* by ``sample_weight`` — which the histogram classifier +honors exactly — **not** by the forests' bootstrap: an n-of-n weighted +resample would delete a vanishingly rare sign class outright (a positive row at +weight 1 among thousands of zeros at weight 50 is drawn with probability ~4e-5), +collapsing the gate to a single class that can never draw the missing sign. 
+ +**Chaining.** Targets are imputed sequentially; each conditions on the +predictors plus the targets already drawn, so the joint structure across +targets survives (sequential / chained-equations imputation). + +Draws sample the weighted conditional: a quantile ``q ~ Uniform(0, 1)`` is drawn +per row from the model's seeded RNG and the forest is queried at it, so over +rows the draws reproduce the (weighted) conditional distribution, not a point +estimate. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np +import pandas as pd +from quantile_forest import RandomForestQuantileRegressor +from sklearn.ensemble import HistGradientBoostingClassifier + +from populace.fit.model import ( + DESIGN_WEIGHTS, + WeightSpec, + predictors_targets_entity, + resolve_fit_weights, +) +from populace.frame import Frame + +__all__ = [ + "RegimeGatedQRF", + "FittedRegimeGatedQRF", + "Regime", + "DEFAULT_N_ESTIMATORS", + "DEFAULT_ZERO_ATOL", +] + +#: Default number of trees per forest. Enough for stable leaf distributions on +#: the pool sizes this operator targets, small enough for fast CI fits. +DEFAULT_N_ESTIMATORS = 100 + +#: Absolute tolerance for "equals zero" in regime detection. A magnitude at or +#: below this counts as a structural zero (the gate's zero class). +DEFAULT_ZERO_ATOL = 1e-6 + + +class Regime: + """Sign-support regime labels for a numeric target. + + The label records which sign classes appear in the (unweighted) training + support and therefore which gate + forests the target needs. Exposed as + constants so callers can match on a fitted model's regimes without magic + strings. + """ + + #: Negative, zero, and positive all present: a three-way sign gate plus a + #: positive-magnitude and a negative-magnitude forest. + THREE_SIGN = "three_sign" + #: Zero and positive only: a zero-vs-positive gate plus a positive forest. 
+ ZERO_INFLATED_POSITIVE = "zero_inflated_positive" + #: Zero and negative only: a zero-vs-negative gate plus a negative forest. + ZERO_INFLATED_NEGATIVE = "zero_inflated_negative" + #: Both signs, no zeros: a sign gate plus a forest per sign. + SIGN_ONLY = "sign_only" + #: Strictly positive: one forest, no gate. + POSITIVE_ONLY = "positive_only" + #: Strictly negative: one forest, no gate. + NEGATIVE_ONLY = "negative_only" + #: Constant zero in training: every draw is exactly zero, no model. + DEGENERATE_ZERO = "degenerate_zero" + + +def detect_regime(y: np.ndarray, *, zero_atol: float) -> str: + """Classify a target's training support into a :class:`Regime`. + + A sign class counts as present when at least one training value falls in it. + Detection is unweighted on purpose: the *existence* of a sign is structural, + a property of the variable, not of the population the weights describe. + + Args: + y: Training target values. + zero_atol: Magnitudes at or below this are zeros. + + Returns: + One of the :class:`Regime` label constants. + """ + if y.size == 0: + return Regime.DEGENERATE_ZERO + has_zero = bool((np.abs(y) <= zero_atol).any()) + has_pos = bool((y > zero_atol).any()) + has_neg = bool((y < -zero_atol).any()) + + if has_pos and has_neg and has_zero: + return Regime.THREE_SIGN + if has_pos and has_neg: + return Regime.SIGN_ONLY + if has_pos and has_zero: + return Regime.ZERO_INFLATED_POSITIVE + if has_neg and has_zero: + return Regime.ZERO_INFLATED_NEGATIVE + if has_pos: + return Regime.POSITIVE_ONLY + if has_neg: + return Regime.NEGATIVE_ONLY + return Regime.DEGENERATE_ZERO + + +def _validate_targets_finite(table: pd.DataFrame, targets: list[str]) -> None: + """Raise unless every target column is entirely finite. 
+ + A NaN target is the silent corruption this guards against: the sign labels + (``y > zero_atol`` / ``y < -zero_atol``) are both ``False`` for NaN, so a + NaN row is relabeled to the *zero* class — a missing value masquerading as a + structural zero, NaN-blind. The model has no notion of missingness, so the + only sound contract is to require finite targets and refuse otherwise, + naming the offending column and its NaN count so the caller can find it. + + Predictors are not checked here: a forest can split around NaN features and + a missing predictor is not silently miscoded the way a missing target is. + + Args: + table: The entity table the fit reads targets from. + targets: Target column names. + + Raises: + ValueError: If any target column contains non-finite values. The message + names the first offending column and its NaN/inf count. + """ + for target in targets: + values = table[target].to_numpy(dtype=np.float64) + non_finite = int((~np.isfinite(values)).sum()) + if non_finite: + raise ValueError( + f"Target column {target!r} contains {non_finite} non-finite " + f"value(s) (NaN/inf) out of {len(values)}. A NaN target would be " + "silently relabeled to the zero class (the sign labels are " + "NaN-blind); fit requires finite targets. Drop or impute the " + f"missing {target!r} values before fitting." + ) + + +def _weighted_bootstrap( + x: np.ndarray, + y: np.ndarray, + weights: np.ndarray | None, + rng: np.random.Generator, +) -> tuple[np.ndarray, np.ndarray]: + """Materialize weights by importance-resampling the training rows. + + Draws ``len(x)`` rows with replacement with probability proportional to + weight, so the resampled data carries the weighted distribution. With + ``weights=None`` the data is returned unchanged (the explicit unweighted + path). This is the operative half of the microimpute#196 fix: it is what + makes leaf distributions — and the values drawn from them — weighted. + + Args: + x: Feature matrix, one row per training record. 
+ y: Target vector aligned with ``x``. + weights: Per-row weights (non-negative, not all zero), or ``None`` to + return the data unchanged. + rng: Seeded generator the resample draws from. + + Returns: + ``(x_resampled, y_resampled)`` with the same row count as the input. + """ + if weights is None: + return x, y + total = float(weights.sum()) + probabilities = weights / total + selected = rng.choice(len(x), size=len(x), replace=True, p=probabilities) + return x[selected], y[selected] + + +def _make_gate(seed: int) -> HistGradientBoostingClassifier: + """Build the sign-gate classifier. + + A histogram gradient-boosted classifier; on the zero-inflated PolicyEngine + targets this calibrates the zero/nonzero probability better than a small + random forest, which matters because the gate's probability *is* the share + of draws that come out zero. + """ + return HistGradientBoostingClassifier(random_state=seed) + + +def _interp_rows( + quantiles: np.ndarray, grid: np.ndarray, predictions: np.ndarray +) -> np.ndarray: + """Per-row linear interpolation of grid predictions at per-row quantiles. + + Equivalent to ``[np.interp(q[i], grid, predictions[i]) for i]`` but + vectorized: ``grid`` (the quantile knots) is shared across rows, so one + ``searchsorted`` locates every row's bracket at once and the interpolation + is a single weighted blend. Quantiles outside ``grid`` clamp to the end + values (the ``np.interp`` convention), so ``q`` at/near 0 or 1 reads the + observed conditional min/max. + + Args: + quantiles: One quantile per row, shape ``(m,)``. + grid: Ascending quantile knots the forest was queried at, shape + ``(g,)``. + predictions: Predicted values, shape ``(m, g)``, row-aligned with + ``quantiles`` and column-aligned with ``grid``. + + Returns: + One interpolated value per row, shape ``(m,)``. 
+ """ + upper = np.searchsorted(grid, quantiles, side="left") + upper = np.clip(upper, 1, len(grid) - 1) + lower = upper - 1 + grid_lo = grid[lower] + grid_hi = grid[upper] + span = grid_hi - grid_lo + weight = np.where(span > 0, (quantiles - grid_lo) / span, 0.0) + weight = np.clip(weight, 0.0, 1.0) # clamp q outside the grid to the ends + rows = np.arange(len(quantiles)) + values_lo = predictions[rows, lower] + values_hi = predictions[rows, upper] + return values_lo + weight * (values_hi - values_lo) + + +@dataclass(frozen=True) +class _Forest: + """A fitted quantile forest plus the feature columns it was fit on.""" + + model: RandomForestQuantileRegressor + columns: tuple[str, ...] + + def draw(self, frame: pd.DataFrame, quantiles: np.ndarray) -> np.ndarray: + """Draw one value per row at that row's quantile. + + The forest is queried on a shared fine grid of quantiles, then each + row's value is read out by **linearly interpolating** its predicted + grid values at its exact quantile — not by snapping to the nearest grid + point. Snapping quantizes every draw to one of the grid's quantiles, + which flattens the conditional and biases tail draws toward the + grid-bracket interior; interpolation reads the true per-row quantile. + + The grid includes points adjacent to 0 and 1 (see :data:`_QUANTILE_GRID`), + so the observed conditional min and max are drawable: ``q=1`` is the + observed maximum, not extrapolation, and a draw with quantile near 1 + must be able to reach it. + + The predict is chunked over rows (:data:`_PREDICT_CHUNK_ROWS`) so the + ``(n_rows x n_grid)`` prediction matrix never has to materialize whole — + at 3M+ rows that matrix alone would be tens of GB. + + Args: + frame: Feature rows (must carry the fitted columns). + quantiles: One quantile in ``[0, 1]`` per row. + + Returns: + One drawn value per row, positionally aligned with ``frame``. 
+ """ + features = frame.loc[:, list(self.columns)].to_numpy(dtype=np.float64) + quantiles = np.asarray(quantiles, dtype=np.float64) + grid = _QUANTILE_GRID + n = len(features) + out = np.empty(n, dtype=np.float64) + for start in range(0, n, _PREDICT_CHUNK_ROWS): + stop = min(start + _PREDICT_CHUNK_ROWS, n) + block = features[start:stop] + predictions = np.asarray( + self.model.predict(block, quantiles=list(grid)) + ).reshape(len(block), len(grid)) + out[start:stop] = _interp_rows(quantiles[start:stop], grid, predictions) + return out + + +#: Fine symmetric quantile grid used to read per-row draws. The interior is an +#: evenly spaced grid over ``(0, 1)``; points adjacent to 0 and 1 are prepended +#: and appended so the *observed* conditional extremes are drawable. The maximum +#: is the ``q=1`` order statistic (and the minimum the ``q=0`` one), which a +#: forest can return exactly — it is reading an observed value, not +#: extrapolating past it — so excluding the endpoints (as the nearest-snap grid +#: did) needlessly truncates the tails. ``_interp_rows`` then maps any per-row +#: quantile, including ones at or beyond the grid ends, onto these values. +_GRID_EPS = 1e-6 +_QUANTILE_GRID = np.concatenate( + [ + [_GRID_EPS], + np.linspace(1.0 / 202.0, 1.0 - 1.0 / 202.0, 201), + [1.0 - _GRID_EPS], + ] +) + +#: Row-batch size for the draw predict. Bounds the ``(rows x grid)`` matrix so a +#: 3M+ row draw streams in fixed-memory blocks instead of allocating the whole +#: matrix at once. +_PREDICT_CHUNK_ROWS = 50_000 + + +def _fit_forest( + x: np.ndarray, + y: np.ndarray, + columns: tuple[str, ...], + weights: np.ndarray | None, + *, + seed: int, + n_estimators: int, + max_samples_leaf: int | float | None, + rng: np.random.Generator, +) -> _Forest: + """Weighted-bootstrap the rows, then grow a quantile forest on them. 
+ + ``max_samples_leaf`` is passed through to the forest: the quantile-forest + default of ``1`` keeps only one sample per leaf, which thins each row's + conditional to ~``n_estimators`` atoms and undershoots tail mass; ``None`` + keeps every leaf sample, so the conditional reflects the full leaf + population. + """ + x_fit, y_fit = _weighted_bootstrap(x, y, weights, rng) + model = RandomForestQuantileRegressor( + n_estimators=n_estimators, + max_samples_leaf=max_samples_leaf, + random_state=seed, + ) + model.fit(x_fit, y_fit) + return _Forest(model=model, columns=columns) + + +@dataclass(frozen=True) +class _TargetModel: + """The fitted pipeline for one numeric target: its regime, gate, forests. + + Attributes: + regime: The detected :class:`Regime` label. + columns: The predictor columns this target was fit on (predictors plus + the targets chained before it). + gate: The sign-gate classifier, or ``None`` for ungated regimes. + positive: The positive-magnitude forest, or ``None``. + negative: The negative-magnitude forest, or ``None``. + """ + + regime: str + columns: tuple[str, ...] + gate: HistGradientBoostingClassifier | None + positive: _Forest | None + negative: _Forest | None + + +class RegimeGatedQRF: + """The canonical :class:`~populace.fit.model.ConditionalModel`. + + Fits a regime-gated, sequentially-chained quantile-regression-forest model + of ``P(targets | predictors)`` over a :class:`~populace.frame.Frame`, with + the frame's typed weights materialized by weighted bootstrap. See the module + docstring for the three mechanisms (weighted bootstrap, regime gates, + chaining). + + Args: + n_estimators: Trees per forest. + zero_atol: Magnitudes at or below this count as zeros in regime + detection. + max_samples_leaf: Samples retained per forest leaf for the conditional. 
+ ``None`` (the default here) keeps **all** leaf samples, so the + per-row conditional reflects the full leaf population; the + quantile-forest default of ``1`` keeps only one sample per leaf, + thinning each row's conditional to ~``n_estimators`` atoms and + undershooting tail mass (roughly halving the share above a high + threshold). Pass an int/float to cap the leaf sample, matching the + quantile-forest semantics. + seed: Base random seed. Controls the weighted-bootstrap resample, the + forest randomness, and the per-row draw quantiles, so a fixed seed + makes a freshly fitted model's first draw reproducible. + """ + + def __init__( + self, + *, + n_estimators: int = DEFAULT_N_ESTIMATORS, + zero_atol: float = DEFAULT_ZERO_ATOL, + max_samples_leaf: int | float | None = None, + seed: int = 0, + ) -> None: + self.n_estimators = int(n_estimators) + self.zero_atol = float(zero_atol) + self.max_samples_leaf = max_samples_leaf + self.seed = int(seed) + + def fit( + self, + frame: Frame, + predictors: list[str], + targets: list[str], + *, + weights: WeightSpec = DESIGN_WEIGHTS, + ) -> FittedRegimeGatedQRF: + """Fit the conditional model. See + :meth:`~populace.fit.model.ConditionalModel.fit`. + + Resolves the single entity owning the predictors and targets, reads its + typed weights per ``weights`` (``"none"`` is the only unweighted path), + detects each target's regime structurally, and grows the gated, + weighted-bootstrap forests in chain order. + + Raises: + ValueError: If predictors/targets are empty, span more than one + entity, name unknown columns, request a weight kind the + entity's resolved weights are not, or a target column contains + non-finite (NaN/inf) values. Messages name the culprits. 
+ """ + predictors = list(predictors) + targets = list(targets) + entity = predictors_targets_entity(frame, predictors, targets) + weight_values = resolve_fit_weights(frame, entity, weights) + table = frame.table(entity) + _validate_targets_finite(table, targets) + + # Split the model seed into two independent streams: one drives the + # fit (bootstrap resample, forest randomness, gate random_state), the + # other drives the predict-time draw quantiles. Seeding both from the + # same seed (as before) made the draw uniforms bit-identical to the + # gate's bootstrap-selection uniforms — the draws were not independent + # of the fit's resampling. SeedSequence.spawn keeps both reproducible + # from the one model seed, so determinism is preserved. + fit_seed, draw_seed = np.random.SeedSequence(self.seed).spawn(2) + rng = np.random.default_rng(fit_seed) + + target_models: dict[str, _TargetModel] = {} + for position, target in enumerate(targets): + chained = (*predictors, *targets[:position]) + y = table[target].to_numpy(dtype=np.float64) + features = table.loc[:, list(chained)].to_numpy(dtype=np.float64) + target_models[target] = self._fit_target( + features=features, + y=y, + columns=chained, + weights=weight_values, + rng=rng, + ) + + return FittedRegimeGatedQRF( + entity=entity, + predictors=predictors, + targets=targets, + target_models=target_models, + zero_atol=self.zero_atol, + draw_seed=draw_seed, + ) + + def _fit_target( + self, + *, + features: np.ndarray, + y: np.ndarray, + columns: tuple[str, ...], + weights: np.ndarray | None, + rng: np.random.Generator, + ) -> _TargetModel: + """Fit the gate and per-sign forests for one numeric target.""" + regime = detect_regime(y, zero_atol=self.zero_atol) + + def forest(mask: np.ndarray) -> _Forest: + sub_weights = None if weights is None else weights[mask] + return _fit_forest( + features[mask], + y[mask], + columns, + sub_weights, + seed=int(rng.integers(0, 2**31 - 1)), + n_estimators=self.n_estimators, + 
max_samples_leaf=self.max_samples_leaf, + rng=rng, + ) + + if regime == Regime.DEGENERATE_ZERO: + return _TargetModel(regime, columns, None, None, None) + + if regime in (Regime.POSITIVE_ONLY, Regime.NEGATIVE_ONLY): + single = forest(np.ones(len(y), dtype=bool)) + return _TargetModel( + regime, + columns, + None, + single if regime == Regime.POSITIVE_ONLY else None, + single if regime == Regime.NEGATIVE_ONLY else None, + ) + + # Gated regimes: a sign label per row. The gate is weighted directly by + # sample_weight (not by bootstrap), so every sign class survives even + # when one is vanishingly rare under the weights. + labels = self._sign_labels(y) + gate = self._fit_gate(features, labels, weights, rng) + pos_mask = y > self.zero_atol + neg_mask = y < -self.zero_atol + positive = forest(pos_mask) if pos_mask.any() else None + negative = forest(neg_mask) if neg_mask.any() else None + return _TargetModel(regime, columns, gate, positive, negative) + + def _sign_labels(self, y: np.ndarray) -> np.ndarray: + """Per-row sign code: ``-1`` negative, ``0`` zero, ``1`` positive.""" + labels = np.zeros(len(y), dtype=int) + labels[y > self.zero_atol] = 1 + labels[y < -self.zero_atol] = -1 + return labels + + def _fit_gate( + self, + features: np.ndarray, + labels: np.ndarray, + weights: np.ndarray | None, + rng: np.random.Generator, + ) -> HistGradientBoostingClassifier: + """Fit the sign-gate classifier, weighting it directly by sample_weight. + + Unlike the forests, the gate is *not* weighted by bootstrap. + ``HistGradientBoostingClassifier`` honors ``sample_weight`` exactly, so + passing the weights directly weights the gate's class probabilities + without resampling. 
An n-of-n weighted bootstrap would instead delete + rare low-weight classes entirely — a single positive row at weight 1 + among thousands of zeros at weight 50 is drawn with probability ~4e-5, + so the resampled labels routinely contain only the zero class and the + gate could never draw the positive sign (the reproduced gate bug). With + ``sample_weight`` every training row is present, so every sign class the + data contains survives into ``classes_``. + + A guard then enforces internal consistency: every sign class present in + the (unweighted) training labels must appear in the fitted gate's + ``classes_``. If sklearn ever dropped a class, drawing it at probability + zero would silently lose that sign, so we raise instead. + + Args: + features: The chained predictor matrix for this target's rows. + labels: Per-row sign codes (``-1`` / ``0`` / ``1``). + weights: Per-row weights, or ``None`` for an unweighted gate. + rng: Seeded generator supplying the gate's ``random_state`` (kept + for fit reproducibility even though no resample is drawn). + + Returns: + The fitted sign-gate classifier. + + Raises: + ValueError: If a sign class present in ``labels`` is absent from the + fitted gate's ``classes_`` (internal inconsistency). + """ + gate = _make_gate(int(rng.integers(0, 2**31 - 1))) + gate.fit(features, labels, sample_weight=weights) + training_classes = set(np.unique(labels).tolist()) + fitted_classes = set(np.asarray(gate.classes_).tolist()) + missing = sorted(training_classes - fitted_classes) + if missing: + raise ValueError( + "Sign gate dropped class(es) present in training: " + f"{missing} are in the training sign labels " + f"{sorted(training_classes)} but absent from the fitted gate's " + f"classes_ {sorted(fitted_classes)}. Drawing a missing class at " + "probability zero would silently lose that sign; refusing to " + "fit an inconsistent gate." + ) + return gate + + +class FittedRegimeGatedQRF: + """A fitted :class:`RegimeGatedQRF`, ready to draw. 
+ + Holds the per-target gated forests and draws by, for each target in chain + order: gating each row to a sign class, drawing a magnitude from that + class's forest at a per-row quantile, and carrying the drawn column forward + as a predictor for later targets. The draw RNG is seeded from an + independent ``SeedSequence`` child of the model seed (separate from the + fit's resampling stream), so a freshly fitted model reproduces its first + :meth:`predict`, while successive calls on one fitted model advance the + state and give independent draws. + + Attributes: + entity: The entity the predictors and targets live on. + predictors: The conditioning columns. + targets: The drawn columns, in chain order. + """ + + def __init__( + self, + *, + entity: str, + predictors: list[str], + targets: list[str], + target_models: dict[str, _TargetModel], + zero_atol: float, + draw_seed: np.random.SeedSequence, + ) -> None: + self.entity = entity + self.predictors = list(predictors) + self.targets = list(targets) + self._target_models = target_models + self._zero_atol = zero_atol + self._rng = np.random.default_rng(draw_seed) + + def regimes(self) -> dict[str, str]: + """The detected :class:`Regime` label per target.""" + return {name: model.regime for name, model in self._target_models.items()} + + def predict(self, frame_or_df: Frame | pd.DataFrame) -> pd.DataFrame: + """Draw imputed targets. See + :meth:`~populace.fit.model.FittedModel.predict`. + + Raises: + ValueError: If a required predictor column is absent from the input. + """ + features = self._predictor_frame(frame_or_df) + out = pd.DataFrame(index=features.index) + # Accumulate drawn targets so each later target can condition on them + # (chained-equations imputation), mirroring the fit-time chain order. 
+ augmented = features.copy() + for target in self.targets: + drawn = self._draw_target(augmented, self._target_models[target]) + out[target] = drawn + augmented[target] = np.asarray(drawn, dtype=np.float64) + return out + + def _predictor_frame(self, frame_or_df: Frame | pd.DataFrame) -> pd.DataFrame: + """Extract the predictor columns from a Frame or DataFrame input.""" + if isinstance(frame_or_df, Frame): + table = frame_or_df.table(self.entity) + else: + table = frame_or_df + missing = [c for c in self.predictors if c not in table.columns] + if missing: + kind = "frame" if isinstance(frame_or_df, Frame) else "DataFrame" + raise ValueError( + f"predict input ({kind}) is missing predictor column(s) " + f"{missing}; the model conditions on {self.predictors}." + ) + return table.loc[:, self.predictors].copy() + + def _draw_target( + self, features: pd.DataFrame, model: _TargetModel + ) -> np.ndarray: + """Draw one value per row for a single target via its regime pipeline.""" + n = len(features) + if model.regime == Regime.DEGENERATE_ZERO: + return np.zeros(n, dtype=np.float64) + + quantiles = self._rng.random(n) + if model.regime == Regime.POSITIVE_ONLY: + return model.positive.draw(features, quantiles) + if model.regime == Regime.NEGATIVE_ONLY: + return model.negative.draw(features, quantiles) + + signs = self._gate_draw(model.gate, features, model.columns) + values = np.zeros(n, dtype=np.float64) + pos_mask = signs == 1 + neg_mask = signs == -1 + if pos_mask.any() and model.positive is not None: + values[pos_mask] = model.positive.draw( + features.loc[pos_mask], quantiles[pos_mask] + ) + if neg_mask.any() and model.negative is not None: + values[neg_mask] = model.negative.draw( + features.loc[neg_mask], quantiles[neg_mask] + ) + return values + + def _gate_draw( + self, + gate: HistGradientBoostingClassifier, + features: pd.DataFrame, + columns: tuple[str, ...], + ) -> np.ndarray: + """Stochastically assign each row a sign code from the gate's proba. 
+ + Draws the row's sign class from the categorical the gate predicts (not + the argmax), so the share of rows assigned to each sign reproduces the + gate's probabilities — that is what preserves the zero mass of a + zero-inflated target rather than collapsing it to the modal class. + + Args: + gate: The fitted sign-gate classifier. + features: The chained predictor frame for this target's rows. + columns: The exact column order the gate was fit on. + + Returns: + One sign code (``-1`` / ``0`` / ``1``) per row. + """ + x = features.loc[:, list(columns)].to_numpy(dtype=np.float64) + proba = np.asarray(gate.predict_proba(x)) + cumulative = np.cumsum(proba, axis=1) + u = self._rng.random(len(x)) + chosen = (cumulative >= u[:, None]).argmax(axis=1) + return np.asarray(gate.classes_)[chosen] + + def __repr__(self) -> str: + regimes = ", ".join(f"{k}:{v}" for k, v in self.regimes().items()) + return ( + f"FittedRegimeGatedQRF(entity={self.entity!r}, " + f"predictors={self.predictors}, regimes[{regimes}])" + ) diff --git a/packages/populace-fit/tests/conftest.py b/packages/populace-fit/tests/conftest.py new file mode 100644 index 0000000..6032778 --- /dev/null +++ b/packages/populace-fit/tests/conftest.py @@ -0,0 +1,240 @@ +"""Shared fixtures for the populace-fit suite. + +The builders here construct small, seeded frames whose *weighted* conditional +distribution differs sharply from the unweighted one — the microimpute#196 +shape — so the behavioral contracts can assert that weighting actually moves the +draws. Everything is sized for CI speed (n=5000, small forests). +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from populace.frame import EntitySchema, Frame, WeightKind, Weights + +#: Person-only schema (person + a one-person-per household, so person weights +#: are unambiguous and every predictor/target lives on the person entity). 
def _person_household_frame(
    columns: dict[str, np.ndarray],
    weights: np.ndarray,
    *,
    kind: WeightKind = WeightKind.DESIGN,
) -> Frame:
    """Build a frame with exactly one person in each household.

    The row count is taken from ``weights``. Person ``i`` belongs to household
    ``i``, and the weight vector is attached to the person entity.

    Args:
        columns: Person-level columns (each length ``n``).
        weights: Person weight vector (length ``n``).
        kind: Weight kind to tag the vector with.

    Returns:
        The assembled :class:`~populace.frame.Frame`.
    """
    n_rows = len(weights)

    def row_ids() -> np.ndarray:
        # A fresh id array per column, so no two columns share a buffer.
        return np.arange(n_rows, dtype="int64")

    person_data: dict[str, np.ndarray] = {
        "person_id": row_ids(),
        "person_household_id": row_ids(),
    }
    person_data.update(columns)
    tables = {
        "person": pd.DataFrame(person_data),
        "household": pd.DataFrame({"household_id": row_ids()}),
    }
    person_weights = Weights(values=weights, kind=kind)
    return Frame(tables, PERSON_SCHEMA, {"person": person_weights})
About 20% of rows are a "low-weight, huge-value" + regime: their target is ~15x the bulk and they carry a small design weight, + so the *unweighted* mean is dominated by them while the *weighted* mean is + not. A fit that honors weights must reproduce the weighted mean; one that + ignores them reproduces the unweighted mean. The two truths differ by well + over 3x, so the contract can separate them cleanly. + + Crucially, the predictors (``age``, ``is_male``) are drawn *independently* + of the regime, so they do not reveal which rows are the huge-value ones. + That is exactly the #196 condition: the weight correlates with the target in + the part of its variance the predictors do **not** explain, so honoring the + weight is the only way to recover the weighted conditional. (Were the regime + identifiable from the predictors, each leaf would be pure and reweighting it + would not move its value — there would be no weighting effect to test.) + + Returns: + A callable ``make(seed=0, n=CONTRACT_N, low_fraction=0.2)`` returning + ``(frame, target_values, weight_values)``. + """ + + def make( + seed: int = 0, + n: int = CONTRACT_N, + low_fraction: float = 0.2, + ) -> tuple[Frame, np.ndarray, np.ndarray]: + rng = np.random.default_rng(seed) + # Predictors are independent of the regime: they carry no signal about + # which rows are the huge-value, low-weight ones. + age = rng.integers(18, 75, n).astype(float) + is_male = rng.integers(0, 2, n).astype(float) + low_weight = rng.random(n) < low_fraction + # Huge values (~15x the bulk) for the low-weight rows; the wide gap + # keeps the unweighted/weighted separation above 3x even after the mild + # shrinkage a forest applies at the top of the support. 
@pytest.fixture
def rare_positive_frame():
    """Factory: a target positive on a few low-weight rows, zero elsewhere.

    Exactly ``n_positive`` randomly chosen rows get a positive target and
    weight 1; every other row gets target 0 and weight 50. The positive class
    is therefore rare in the sample and rarer still under the weights, which
    is the shape that makes a weight-resampled gate lose the positive class
    while a gate weighted by ``sample_weight`` keeps it.

    ``signal`` is shifted upward by 2.5 on the positive rows, so the gate has
    something to learn; ``noise`` is independent of the target.

    Returns:
        A callable ``make(seed=0, n=CONTRACT_N, n_positive=10)`` returning
        ``(frame, target_values, weight_values)``.
    """

    def make(
        seed: int = 0,
        n: int = CONTRACT_N,
        n_positive: int = 10,
    ) -> tuple[Frame, np.ndarray, np.ndarray]:
        rng = np.random.default_rng(seed)
        chosen_rows = rng.choice(n, size=n_positive, replace=False)
        is_positive = np.isin(np.arange(n), chosen_rows)
        # The generator is consumed in a fixed order (signal, noise,
        # magnitudes) so a given seed always yields the same frame.
        raw_signal = rng.normal(0.0, 1.0, n)
        signal = np.where(is_positive, raw_signal + 2.5, raw_signal)
        noise = rng.normal(0.0, 1.0, n)
        magnitudes = np.abs(rng.normal(100_000.0, 10_000.0, n))
        target = np.zeros(n, dtype=float)
        target[is_positive] = magnitudes[is_positive]
        weights = np.full(n, 50.0)
        weights[is_positive] = 1.0
        person_columns = {"signal": signal, "noise": noise, "target": target}
        frame = _person_household_frame(person_columns, weights)
        return frame, target, weights

    return make
+ """ + + def make(seed: int = 0, n: int = CONTRACT_N) -> tuple[Frame, float]: + rng = np.random.default_rng(seed) + x = rng.normal(0.0, 1.0, n) + first = 100.0 + 5.0 * x + rng.normal(0.0, 20.0, n) + # second tracks first, with noise the predictor x cannot account for. + second = 3.0 * first + rng.normal(0.0, 30.0, n) + rho = float(np.corrcoef(first, second)[0, 1]) + frame = _person_household_frame( + {"x": x, "first": first, "second": second}, np.full(n, 10.0) + ) + return frame, rho + + return make diff --git a/packages/populace-fit/tests/test_compat.py b/packages/populace-fit/tests/test_compat.py new file mode 100644 index 0000000..a25f023 --- /dev/null +++ b/packages/populace-fit/tests/test_compat.py @@ -0,0 +1,68 @@ +"""The constellation mechanism: the kernel-compatibility gate at import. + +DESIGN.md requires each shard to assert kernel compatibility at import — a cheap +``frame.__version__`` check — so a resolver that ignores ``[tool.uv.sources]`` +cannot silently assemble an incompatible pair. These tests exercise the gate +directly (the real import already ran the gate successfully, or the suite would +not have loaded). +""" + +from __future__ import annotations + +import pytest + +import populace.fit +from populace.fit import __version__ as fit_version +from populace.fit import _assert_frame_compatible + + +def test_namespace_has_no_top_level_init() -> None: + """populace stays a PEP 420 namespace; the fit shard ships no __init__ for it. + + A shard clobbering ``populace/__init__.py`` would break side-by-side install + of populace-frame and populace-calibrate. (The fit *subpackage* has its own + ``__init__``; the *namespace* must not.) 
def test_compat_gate_rejects_a_too_old_or_too_new_kernel() -> None:
    """Kernels on either side of the required 0.1 series raise ImportError.

    Checked against a shard requiring ``(0, 1)``: ``0.0.9`` is too old and
    ``0.2.0`` is too new. The expected patterns cover both the required series
    and the installed version appearing in the message.
    """
    refused = (
        ("0.0.9", "requires populace-frame 0.1.x"),
        ("0.2.0", "0.2.0 is installed"),
    )
    for installed, expected in refused:
        with pytest.raises(ImportError, match=expected):
            _assert_frame_compatible(installed, (0, 1))
def test_none_is_the_only_unweighted_path_typo_raises(
    weight_correlated_frame,
) -> None:
    """An unrecognized weights string raises instead of fitting unweighted.

    This is the core safety property. A near-miss such as ``"non"``, or a
    plausible synonym such as ``"unweighted"`` or ``"off"``, must raise a
    ``ValueError`` rather than degrade into an unweighted fit. The last case
    also checks that the message names the explicit escape hatch,
    ``weights='none'``.
    """
    frame, _, _ = weight_correlated_frame(seed=0, n=200)
    bad_specs = (
        ("non", "Unknown weights spec"),
        ("unweighted", "Unknown weights spec"),
        # The error names the explicit escape hatch.
        ("off", "weights='none'"),
    )
    for spec, pattern in bad_specs:
        with pytest.raises(ValueError, match=pattern):
            fit(frame, ["age", "is_male"], ["target"], weights=spec)
def _two_entity_frame() -> Frame:
    """Build a three-person, two-household frame weighted on the household.

    ``age`` lives on the person table and ``state_income`` on the household
    table, so the two columns sit at different grains.
    """
    tables = {
        "person": pd.DataFrame(
            {
                "person_id": [0, 1, 2],
                "person_household_id": [1, 1, 2],
                "age": [40.0, 8.0, 33.0],
            }
        ),
        "household": pd.DataFrame(
            {"household_id": [1, 2], "state_income": [50.0, 70.0]}
        ),
    }
    household_weights = Weights(
        values=np.array([10.0, 20.0]), kind=WeightKind.DESIGN
    )
    return Frame(
        tables,
        EntitySchema(group_entities=("household",)),
        {"household": household_weights},
    )
def _cps_shape_frame(
    *, kind: WeightKind = WeightKind.DESIGN, n_households: int = 400
) -> Frame:
    """Build a frame whose only weights sit on the household entity.

    Each household has two persons. The person table carries ``age`` and
    ``income`` but no weights of its own; household ``h`` has weight
    ``50 + h``, tagged with ``kind``. The data is generated from a fixed seed,
    so repeated calls with the same arguments give the same frame.
    """
    rng = np.random.default_rng(0)
    persons_per_household = 2
    n_people = n_households * persons_per_household
    # Generator order matters for reproducibility: ages first, then incomes.
    age = rng.integers(18, 75, n_people).astype(float)
    income = np.clip(rng.normal(40_000.0, 10_000.0, n_people), 0.0, None)
    membership = np.repeat(
        np.arange(n_households, dtype="int64"), persons_per_household
    )
    tables = {
        "person": pd.DataFrame(
            {
                "person_id": np.arange(n_people, dtype="int64"),
                "person_household_id": membership,
                "age": age,
                "income": income,
            }
        ),
        "household": pd.DataFrame(
            {"household_id": np.arange(n_households, dtype="int64")}
        ),
    }
    # A non-constant weight vector, one entry per household.
    household_weights = Weights(
        values=50.0 + np.arange(n_households, dtype=float), kind=kind
    )
    return Frame(
        tables,
        EntitySchema(group_entities=("household",)),
        {"household": household_weights},
    )
+ fitted = fit(frame, ["age"], ["income"], n_estimators=10, seed=0) + drawn = fitted.predict(frame)["income"].to_numpy() + assert drawn.shape == (frame.n("person"),) + + +def test_post_calibration_default_design_fit_raises_actionable_message() -> None: + """A default (design) fit on a calibrated frame names ``weights="calibrated"``. + + After calibration the frame carries calibrated weights, but the fit's + default is ``weights="design"``. Because kinds only move forward, the old + advice ("advance the frame's weights to design") was impossible. The fix + must instead tell the caller to pass ``weights="calibrated"`` — the kind the + frame actually carries. + """ + frame = _cps_shape_frame(kind=WeightKind.CALIBRATED) + with pytest.raises(ValueError) as excinfo: + # Default weights="design" on a calibrated frame. + fit(frame, ["age"], ["income"], n_estimators=10, seed=0) + message = str(excinfo.value) + # Actionable: name the kind to request. + assert "weights='calibrated'" in message + # And it must NOT give the impossible advice to advance to design. + assert "advance the frame's weights to 'design'" not in message + assert "only move forward" in message + + +def test_duplicate_or_overlapping_columns_raise(weight_correlated_frame) -> None: + """Duplicate predictors/targets, or a column that is both, are refused. + + A duplicate target silently fit twice (dict-key collision) then crashed at + predict; a target listed among predictors silently fit P(y|y). Both now + raise naming the culprit. 
+ """ + frame, _, _ = weight_correlated_frame(seed=0, n=200) + with pytest.raises(ValueError, match="Duplicate targets"): + fit(frame, ["age"], ["target", "target"]) + with pytest.raises(ValueError, match="Duplicate predictors"): + fit(frame, ["age", "age"], ["target"]) + with pytest.raises(ValueError, match="both predictor and target"): + fit(frame, ["age", "target"], ["target"]) diff --git a/packages/populace-fit/tests/test_qrf.py b/packages/populace-fit/tests/test_qrf.py new file mode 100644 index 0000000..95050b8 --- /dev/null +++ b/packages/populace-fit/tests/test_qrf.py @@ -0,0 +1,501 @@ +"""Mechanics of the canonical model: regimes, gates, chaining, determinism. + +These tests pin the structural guarantees of the regime-gated, chained, +weighted-bootstrap QRF that are not the weighted-mean contract itself: that the +gate preserves a zero-inflated target's zero mass and sign structure, that +chaining reproduces a cross-target correlation, that the draw count matches the +input, and that a fixed seed makes a freshly fitted model reproducible. 
+""" + +from __future__ import annotations + +import numpy as np +import pytest + +from populace.fit import Regime, fit +from populace.fit.qrf import detect_regime + +# ---------------------------------------------------------------------------- +# Regime detection: structural, unweighted support classification +# ---------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("values", "expected"), + [ + ([1.0, 2.0, 3.0], Regime.POSITIVE_ONLY), + ([-1.0, -2.0, -3.0], Regime.NEGATIVE_ONLY), + ([0.0, 1.0, 0.0, 2.0], Regime.ZERO_INFLATED_POSITIVE), + ([0.0, -1.0, 0.0, -2.0], Regime.ZERO_INFLATED_NEGATIVE), + ([-1.0, 1.0, 2.0], Regime.SIGN_ONLY), + ([-1.0, 0.0, 1.0], Regime.THREE_SIGN), + ([0.0, 0.0, 0.0], Regime.DEGENERATE_ZERO), + ], +) +def test_detect_regime_classifies_support(values, expected) -> None: + """The regime is read from which signs appear in the (unweighted) support.""" + assert detect_regime(np.array(values), zero_atol=1e-6) == expected + + +def test_regime_detection_ignores_weights(weight_correlated_frame) -> None: + """Regime detection is structural: the same regime weighted or not. + + Which signs *exist* is a property of the variable, not the weighting, so a + weighted and an unweighted fit of the same target detect the same regime. 
+ """ + frame, _, _ = weight_correlated_frame(seed=0) + weighted = fit(frame, ["age", "is_male"], ["target"], n_estimators=20, seed=0) + unweighted = fit( + frame, ["age", "is_male"], ["target"], weights="none", n_estimators=20, seed=0 + ) + assert weighted.regimes() == unweighted.regimes() + assert weighted.regimes()["target"] == Regime.POSITIVE_ONLY + + +# ---------------------------------------------------------------------------- +# Gates: a zero-inflated target's draws preserve zero mass and sign structure +# ---------------------------------------------------------------------------- + + +def test_zero_inflated_draws_preserve_zero_mass_and_signs( + zero_inflated_frame, +) -> None: + """The gate reproduces the zero share and both signs, with no zero-crossing. + + A single ungated regressor would smear the zeros into small nonzero values + and interpolate across the zero gap. The gate must instead (1) put roughly + the training share of draws at exactly zero, (2) keep some draws on each + sign, and (3) never produce a value strictly between the negative and + positive clusters that no training row occupies. + """ + frame, target = zero_inflated_frame(seed=0) + fitted = fit(frame, ["x"], ["target"], n_estimators=80, seed=0) + assert fitted.regimes()["target"] == Regime.THREE_SIGN + + draws = fitted.predict(frame)["target"].to_numpy() + + true_zero_share = float((np.abs(target) <= 1e-6).mean()) + draw_zero_share = float((draws == 0.0).mean()) + # Zero mass is preserved (not smeared away), within a sampling tolerance. + assert draw_zero_share == pytest.approx(true_zero_share, abs=0.07) + + # Both signs survive — the negative tail is not dropped. + assert (draws > 0).any() + assert (draws < 0).any() + + # No zero-crossing interpolation: drawn magnitudes lie in the support of + # their sign, never in the empty gap between the clusters. 
+ min_train_pos = float(target[target > 1e-6].min()) + max_train_neg = float(target[target < -1e-6].max()) + nonzero = draws[draws != 0.0] + assert (nonzero[nonzero > 0] >= 0.5 * min_train_pos).all() + assert (nonzero[nonzero < 0] <= 0.5 * max_train_neg).all() + + +def test_gate_retains_a_rare_low_weight_class_and_draws_it( + rare_positive_frame, +) -> None: + """The gate keeps a vanishingly-rare sign class and can draw it. + + The reviewer's repro: ~10 positive rows at weight 1 among ~4990 zeros at + weight 50 (weighted positive share ~4e-5). The old gate was fit on an n-of-n + weighted bootstrap, which drew the positive rows with probability ~4e-5 and + so produced a single-class gate that drew the positive sign with probability + zero (0 positive draws across millions). Fitting the gate directly with + ``sample_weight`` keeps every row, so: + + (a) the fitted gate retains both classes (0 and 1); and + (b) across seeds the model produces a positive draw at least once — the + positive regime is reachable, not collapsed to zero. + """ + frame, target, _ = rare_positive_frame(seed=0) + # The target is zero-inflated positive: zeros plus a positive tail. + fitted = fit(frame, ["signal", "noise"], ["target"], n_estimators=60, seed=0) + assert fitted.regimes()["target"] == Regime.ZERO_INFLATED_POSITIVE + + # (a) The gate retained both sign classes (0 and 1); the bootstrap dropped 1. + gate = fitted._target_models["target"].gate + assert set(np.asarray(gate.classes_).tolist()) == {0, 1} + + # (b) Positive draws appear. One predict is ~n*4e-5 ~ 0.2 expected positives, + # so aggregate several seeds to make the assertion robust: the share is small + # but emphatically not zero (the bug drew exactly zero, forever). 
+ total_positive = 0 + for _ in range(8): + draws = fitted.predict(frame)["target"].to_numpy() + total_positive += int((draws > 0).sum()) + assert total_positive > 0, ( + "the rare positive regime was never drawn; the gate collapsed to the " + "zero class (the reproduced bootstrap bug)" + ) + + +def test_gate_consistency_guard_rejects_a_dropped_class(make_person_frame) -> None: + """The internal-consistency guard fires if the gate drops a training class. + + Regime detection found a sign class, but the fitted gate's ``classes_`` lack + it — drawing it at probability zero would silently lose that sign. The fit + must raise rather than ship an inconsistent gate. We force the inconsistency + by monkeypatching the gate factory to return a classifier that always fits a + single class. + """ + import populace.fit.qrf as qrf_module + + n = 400 + rng = np.random.default_rng(0) + # A genuine zero-inflated-positive target (so a gate is built). + x = rng.normal(size=n) + target = np.where(rng.random(n) < 0.3, np.abs(rng.normal(1000.0, 100.0, n)), 0.0) + frame = make_person_frame({"x": x, "target": target}) + + class _SingleClassGate: + """A stand-in classifier that always fits only the zero class.""" + + def __init__(self, *args, **kwargs) -> None: + self.classes_ = np.array([0]) + + def fit(self, x, y, sample_weight=None): # noqa: A002 - mirror sklearn + return self + + original = qrf_module._make_gate + qrf_module._make_gate = lambda seed: _SingleClassGate() + try: + with pytest.raises(ValueError, match="Sign gate dropped class"): + fit(frame, ["x"], ["target"], n_estimators=10, seed=0) + finally: + qrf_module._make_gate = original + + +def test_degenerate_zero_target_draws_all_zero(make_person_frame) -> None: + """A target that is constant zero in training draws exactly zero.""" + n = 200 + rng = np.random.default_rng(0) + frame = make_person_frame({"x": rng.normal(size=n), "target": np.zeros(n)}) + fitted = fit(frame, ["x"], ["target"], n_estimators=10, seed=0) + assert 
fitted.regimes()["target"] == Regime.DEGENERATE_ZERO + draws = fitted.predict(frame)["target"].to_numpy() + assert (draws == 0.0).all() + + +def test_nan_target_raises_naming_the_column_and_count(make_person_frame) -> None: + """A target with NaNs is refused at fit, naming the column and NaN count. + + Without the guard, a NaN target is silently relabeled to the zero class — + the sign labels (``y > atol`` / ``y < -atol``) are both False for NaN — so + missing values masquerade as structural zeros, NaN-blind. Fit must instead + raise, naming the offending column and how many values are non-finite. + """ + n = 200 + rng = np.random.default_rng(0) + target = np.abs(rng.normal(1000.0, 100.0, n)) + target[5] = np.nan + target[17] = np.nan + target[42] = np.nan + frame = make_person_frame({"x": rng.normal(size=n), "target": target}) + + with pytest.raises(ValueError, match=r"Target column 'target' contains 3 "): + fit(frame, ["x"], ["target"], n_estimators=10, seed=0) + + +def test_inf_target_is_also_refused(make_person_frame) -> None: + """Infinity in a target is non-finite too, and refused the same way.""" + n = 120 + rng = np.random.default_rng(1) + target = np.abs(rng.normal(500.0, 50.0, n)) + target[3] = np.inf + frame = make_person_frame({"x": rng.normal(size=n), "target": target}) + with pytest.raises(ValueError, match="non-finite"): + fit(frame, ["x"], ["target"], n_estimators=10, seed=0) + + +# ---------------------------------------------------------------------------- +# Chaining: a target conditioned on a prior imputed target tracks correlation +# ---------------------------------------------------------------------------- + + +def test_chaining_reproduces_cross_target_correlation( + correlated_targets_frame, +) -> None: + """Chained draws of two correlated targets keep their correlation. + + ``second`` is a noisy linear function of ``first`` that the predictor ``x`` + alone cannot reconstruct. 
Because the model chains — conditioning + ``second`` on the drawn ``first`` — the drawn pair stays strongly + correlated. Without chaining, the two would be independent given ``x`` and + their drawn correlation would collapse. + """ + frame, true_rho = correlated_targets_frame(seed=0) + assert true_rho > 0.9 # the fixture builds a strong correlation + + fitted = fit(frame, ["x"], ["first", "second"], n_estimators=80, seed=0) + draws = fitted.predict(frame) + drawn_rho = float(np.corrcoef(draws["first"], draws["second"])[0, 1]) + + # The chained draws retain most of the correlation. + assert drawn_rho > 0.8 + + +def test_chaining_columns_grow_along_the_chain(correlated_targets_frame) -> None: + """Each target is fit on the predictors plus the targets chained before it.""" + frame, _ = correlated_targets_frame(seed=0) + fitted = fit(frame, ["x"], ["first", "second"], n_estimators=10, seed=0) + models = fitted._target_models + # first conditions on x only; second on x and the drawn first. + assert models["first"].columns == ("x",) + assert models["second"].columns == ("x", "first") + + +# ---------------------------------------------------------------------------- +# Tail-draw fidelity: full leaf samples, interpolation, drawable extremes +# ---------------------------------------------------------------------------- + +#: The heavy-tail threshold for the weight_correlated fixture: the bulk sits at +#: ~40k and the rare regime at ~600k, so 300k cleanly separates them. +_TAIL_THRESHOLD = 300_000.0 + + +def test_tail_share_tracks_weighted_truth_and_beats_nearest_snap( + weight_correlated_frame, +) -> None: + """Weighted-draw tail share matches truth and beats the pre-fix nearest-snap. + + The fixture's weighted-population share above 300k is ~0.0050. 
The pre-fix + forest kept one sample per leaf (``max_samples_leaf=1``) and snapped each + draw to the nearest of 201 interior grid points, which thinned the + conditional and undershot the tail by roughly a third (share ~0.0035). The + fix — ``max_samples_leaf=None`` (all leaf samples) plus linear + interpolation at the exact per-row quantile — recovers the tail. The + contract: + + (a) the fix's weighted tail share is within ~2x of the weighted truth; and + (b) it is materially closer to the truth than the nearest-snap baseline. + """ + frame, target, weights = weight_correlated_frame(seed=0) + truth_share = float( + np.average((target > _TAIL_THRESHOLD).astype(float), weights=weights) + ) + # The fixture must actually have a tail to test (guard against a fixture + # change collapsing it). + assert truth_share > 0.0 + + fitted = fit(frame, ["age", "is_male"], ["target"], n_estimators=100, seed=1) + draws = fitted.predict(frame)["target"].to_numpy() + fix_share = float( + np.average((draws > _TAIL_THRESHOLD).astype(float), weights=weights) + ) + + # The nearest-snap, one-sample-per-leaf baseline (the pre-fix behavior), + # reconstructed by monkeypatching the draw read-out and forcing + # max_samples_leaf=1. 
+ import populace.fit.qrf as qrf_module + + interior_grid = np.linspace(1.0 / 202.0, 1.0 - 1.0 / 202.0, 201) + + def nearest_snap_draw(self, frame_in, quantiles): + feats = frame_in.loc[:, list(self.columns)].to_numpy(dtype=np.float64) + preds = np.asarray( + self.model.predict(feats, quantiles=list(interior_grid)) + ).reshape(len(feats), len(interior_grid)) + idx = np.clip( + np.rint(quantiles * (len(interior_grid) - 1)).astype(int), + 0, + len(interior_grid) - 1, + ) + return preds[np.arange(len(feats)), idx] + + original_draw = qrf_module._Forest.draw + qrf_module._Forest.draw = nearest_snap_draw + try: + baseline_fitted = fit( + frame, + ["age", "is_male"], + ["target"], + n_estimators=100, + max_samples_leaf=1, + seed=1, + ) + baseline_draws = baseline_fitted.predict(frame)["target"].to_numpy() + finally: + qrf_module._Forest.draw = original_draw + baseline_share = float( + np.average( + (baseline_draws > _TAIL_THRESHOLD).astype(float), weights=weights + ) + ) + + # (a) within ~2x of the weighted truth, both directions. + assert 0.5 * truth_share <= fix_share <= 2.0 * truth_share, ( + f"fix tail share {fix_share:.5f} not within 2x of truth {truth_share:.5f}" + ) + # (b) materially closer to truth than the nearest-snap baseline. + assert abs(fix_share - truth_share) < abs(baseline_share - truth_share), ( + f"fix share {fix_share:.5f} (err " + f"{abs(fix_share - truth_share):.5f}) is not closer to truth " + f"{truth_share:.5f} than the nearest-snap baseline {baseline_share:.5f} " + f"(err {abs(baseline_share - truth_share):.5f})" + ) + + +def test_high_quantile_draw_reaches_the_observed_max() -> None: + """A draw at q->1 reaches the observed conditional max via the grid endpoint. + + The pre-fix grid stopped at q=0.995 and snapped to it, so a lone extreme — + the observed maximum, which sits in the conditional's top atom above + q=0.995 — was unreachable: q=1 is the observed max, not extrapolation, and + the grid excluded it. 
The fix prepends/appends grid points adjacent to 0 and + 1, so the observed max is drawable. This is a direct probe of the forest's + draw at q=1 against the interior-only (winsorized) grid. + """ + from quantile_forest import RandomForestQuantileRegressor + + from populace.fit.qrf import _QUANTILE_GRID, _Forest + + rng = np.random.default_rng(0) + n = 2000 + # All rows share one feature value, so every tree has a single leaf holding + # the whole sample — the conditional is the marginal, with a clean max. + x = np.zeros((n, 1)) + y = rng.normal(100.0, 10.0, n) + observed_max = 1000.0 + y[0] = observed_max # a lone extreme: the observed conditional maximum + model = RandomForestQuantileRegressor( + n_estimators=50, max_samples_leaf=None, random_state=0 + ) + model.fit(x, y) + forest = _Forest(model=model, columns=("x",)) + + import pandas as pd + + probe = pd.DataFrame({"x": np.zeros(1)}) + draw_at_one = forest.draw(probe, np.array([1.0]))[0] + # The endpoint grid reaches the observed max (within sampling tolerance). + assert draw_at_one == pytest.approx(observed_max, rel=0.02) + + # The interior-only grid (the pre-fix winsorized grid) cannot: its top + # quantile q=0.995 reads far below the lone extreme. + interior_grid = np.linspace(1.0 / 202.0, 1.0 - 1.0 / 202.0, 201) + interior_top = float( + np.asarray(model.predict(np.zeros((1, 1)), quantiles=[interior_grid[-1]]))[0] + ) + assert interior_top < 0.5 * observed_max + # And the endpoint of the live grid is the one that closes the gap. 
+ assert _QUANTILE_GRID[-1] > interior_grid[-1] + + +# ---------------------------------------------------------------------------- +# Shape and determinism +# ---------------------------------------------------------------------------- + + +def test_predict_row_count_matches_input(weight_correlated_frame) -> None: + """One drawn row per input row, with the input index preserved.""" + frame, _, _ = weight_correlated_frame(seed=0, n=300) + fitted = fit(frame, ["age", "is_male"], ["target"], n_estimators=10, seed=0) + + # From a Frame. + drawn_frame = fitted.predict(frame) + assert len(drawn_frame) == frame.n("person") + assert list(drawn_frame.columns) == ["target"] + + # From a bare DataFrame subset of arbitrary length and a non-range index. + subset = frame.table("person").iloc[:50].set_axis(range(100, 150)) + drawn_df = fitted.predict(subset) + assert len(drawn_df) == 50 + assert drawn_df.index.tolist() == list(range(100, 150)) + + +def test_fixed_seed_is_deterministic(weight_correlated_frame) -> None: + """Two models fit with the same seed draw identically on the first predict.""" + frame, _, _ = weight_correlated_frame(seed=0, n=400) + a = fit(frame, ["age", "is_male"], ["target"], n_estimators=25, seed=7) + b = fit(frame, ["age", "is_male"], ["target"], n_estimators=25, seed=7) + np.testing.assert_array_equal( + a.predict(frame)["target"].to_numpy(), + b.predict(frame)["target"].to_numpy(), + ) + + +def test_successive_predicts_draw_independently(weight_correlated_frame) -> None: + """Repeated predicts on one fitted model give independent draws. + + The draw RNG advances across calls, so a second predict is not a carbon + copy of the first (draws are samples, not a point estimate). 
+ """ + frame, _, _ = weight_correlated_frame(seed=0, n=400) + fitted = fit(frame, ["age", "is_male"], ["target"], n_estimators=25, seed=7) + first = fitted.predict(frame)["target"].to_numpy() + second = fitted.predict(frame)["target"].to_numpy() + assert not np.array_equal(first, second) + + +def test_different_seeds_give_different_draws(weight_correlated_frame) -> None: + """Changing the seed changes the draws (the seed actually seeds).""" + frame, _, _ = weight_correlated_frame(seed=0, n=400) + a = fit(frame, ["age", "is_male"], ["target"], n_estimators=25, seed=1) + b = fit(frame, ["age", "is_male"], ["target"], n_estimators=25, seed=2) + assert not np.array_equal( + a.predict(frame)["target"].to_numpy(), + b.predict(frame)["target"].to_numpy(), + ) + + +def test_fit_and_draw_use_independent_rng_streams(weight_correlated_frame) -> None: + """The draw RNG is an independent SeedSequence child of the model seed. + + Before the fix the fitted model was seeded with the raw model seed, so its + draw uniforms were bit-identical to the fit's bootstrap-selection uniforms + (max |diff| = 0). The draw stream must instead come from the second spawned + child, independent of the fit's resampling, yet still reproducible. + """ + frame, _, _ = weight_correlated_frame(seed=0, n=400) + seed = 7 + fitted = fit(frame, ["age", "is_male"], ["target"], n_estimators=20, seed=seed) + _, draw_child = np.random.SeedSequence(seed).spawn(2) + expected = np.random.default_rng(draw_child).random(5) + actual = fitted._rng.random(5) # initial draw-stream state, before any predict + np.testing.assert_array_equal(actual, expected) + # And it must NOT be the raw-seed stream that the fit also used. + assert not np.array_equal(actual, np.random.default_rng(seed).random(5)) + + +def test_weighted_gate_reproduces_population_zero_share_not_sample( + make_person_frame, +) -> None: + """The zero gate reproduces the *weighted* zero-share, not the sample's. 
+ + The sample is half zeros, but zeros carry low weight and positives high + weight, so the population (weighted) zero-share is 0.1 while the sample's is + 0.5. The predictor is independent of zero-ness, so a correctly weighted gate + reproduces the marginal weighted zero rate. A gate fit unweighted (or on an + unweighted resample) would reproduce 0.5. + """ + rng = np.random.default_rng(0) + n = 6000 + is_zero = np.arange(n) < n // 2 + target = np.where(is_zero, 0.0, 1.0 + rng.random(n)) + weights = np.where(is_zero, 1.0, 9.0) # weighted zero-share = 3000/30000 = 0.1 + x = rng.normal(size=n) # predictor independent of zero-ness + frame = make_person_frame({"x": x, "target": target}, weights=weights) + draws = fit(frame, ["x"], ["target"], n_estimators=80, seed=0).predict(frame) + draw_zero_share = float((draws["target"].to_numpy() == 0).mean()) + assert abs(draw_zero_share - 0.1) < 0.05 # tracks the weighted population + assert abs(draw_zero_share - 0.5) > 0.2 # decisively not the sample share + + +def test_forests_keep_all_leaf_samples(make_person_frame) -> None: + """max_samples_leaf=None is pinned independently of the readout fix. + + quantile-forest defaults to one sample per leaf, which thins each row's + conditional to ~n_estimators atoms and undershoots tail mass. The tail + fidelity fix sets max_samples_leaf=None; without this explicit pin a revert + to the default passes the rest of the suite (the tail test's baseline also + uses leaf=1, so interpolation alone clears it). This pins the leaf component. 
+ """ + rng = np.random.default_rng(0) + n = 400 + frame = make_person_frame( + {"x": rng.normal(size=n), "target": np.abs(rng.normal(size=n)) + 0.1} + ) + fitted = fit(frame, ["x"], ["target"], n_estimators=8, seed=0) + forest = fitted._target_models["target"].positive.model + assert forest.max_samples_leaf is None diff --git a/packages/populace-fit/tests/test_weighted_fit_contract.py b/packages/populace-fit/tests/test_weighted_fit_contract.py new file mode 100644 index 0000000..60e39c0 --- /dev/null +++ b/packages/populace-fit/tests/test_weighted_fit_contract.py @@ -0,0 +1,101 @@ +"""The behavioral contract: a weighted fit shifts draws toward the weighted truth. + +This is the real realization of the placeholder the kernel left skipped in +``packages/populace-frame/tests/test_contracts.py`` +(``test_weighted_fit_shifts_draws_toward_weighted_truth``). It is the test +DESIGN.md names as the one that would have caught the 2026-06 microimpute +landmine: a ten-line behavioral check that a weighted fit moves the draws toward +the *weighted* population, and that an unweighted fit is impossible to express +without writing ``weights="none"`` and meaning it. + +The frame's donor has a weight-correlated target — about 20% of rows are a +low-weight regime carrying ~10x the bulk's value — so the weighted and +unweighted means differ by well over 3x. The contract: + +a. weighted draws' mean is within ~20% of the true *weighted* mean; +b. ``weights="none"`` draws' mean is within ~20% of the *unweighted* mean; +c. the two differ by more than 3x. +""" + +from __future__ import annotations + +import numpy as np +import pytest + +from populace.fit import NO_WEIGHTS, fit + +# Small forests keep the fit fast; n=5000 (the fixture default) is enough for +# the weighted/unweighted means to be stable across the seed. 
+_N_ESTIMATORS = 60 +_SEED = 1 + + +def test_weighted_fit_shifts_draws_toward_weighted_truth( + weight_correlated_frame, +) -> None: + """A weighted fit tracks the weighted mean; an unweighted fit does not. + + Realizes the kernel's skipped populace-fit contract. The donor target is + large exactly where the weight is small, so: + + (a) weighting the fit by the frame's design weights makes the draws' + unweighted mean land within 20% of the true *weighted* mean; + (b) fitting with ``weights="none"`` makes them land within 20% of the + *unweighted* mean instead; + (c) the two sets of draws differ by more than 3x — weighting is not a + rounding effect, it is the difference between two populations. + """ + frame, target, weights = weight_correlated_frame(seed=0) + + true_weighted_mean = float(np.average(target, weights=weights)) + true_unweighted_mean = float(target.mean()) + # The fixture is constructed so the two truths are far apart; assert it so a + # future fixture change that collapses the gap fails loudly here, not as a + # mysterious contract failure below. + assert true_unweighted_mean > 3.0 * true_weighted_mean + + fitted_weighted = fit( + frame, ["age", "is_male"], ["target"], n_estimators=_N_ESTIMATORS, seed=_SEED + ) + fitted_unweighted = fit( + frame, + ["age", "is_male"], + ["target"], + weights=NO_WEIGHTS, + n_estimators=_N_ESTIMATORS, + seed=_SEED, + ) + + weighted_draws = fitted_weighted.predict(frame)["target"].to_numpy() + unweighted_draws = fitted_unweighted.predict(frame)["target"].to_numpy() + + weighted_mean = float(weighted_draws.mean()) + unweighted_mean = float(unweighted_draws.mean()) + + # (a) weighted draws track the weighted truth. + assert weighted_mean == pytest.approx(true_weighted_mean, rel=0.20) + # (b) unweighted draws track the unweighted truth. + assert unweighted_mean == pytest.approx(true_unweighted_mean, rel=0.20) + # (c) and the two are genuinely different populations. 
+ assert unweighted_mean > 3.0 * weighted_mean + + +def test_design_is_the_default_and_matches_explicit_design( + weight_correlated_frame, +) -> None: + """Omitting ``weights`` weights by design — there is no unweighted default. + + The default-weighted draws must track the weighted truth (not the + unweighted one), proving the operator is weighted *by construction*: a + caller who forgets ``weights`` still gets a weighted fit. + """ + frame, target, weights = weight_correlated_frame(seed=0) + true_weighted_mean = float(np.average(target, weights=weights)) + + default_draws = fit( + frame, ["age", "is_male"], ["target"], n_estimators=_N_ESTIMATORS, seed=_SEED + ).predict(frame)["target"] + + assert float(default_draws.mean()) == pytest.approx( + true_weighted_mean, rel=0.20 + ) diff --git a/packages/populace-frame/src/populace/frame/accounting.py b/packages/populace-frame/src/populace/frame/accounting.py index 8b71bd5..39604ce 100644 --- a/packages/populace-frame/src/populace/frame/accounting.py +++ b/packages/populace-frame/src/populace/frame/accounting.py @@ -53,7 +53,7 @@ def _resolve( "weighted accounting requires numeric or boolean values." ) values = series.to_numpy(dtype=np.float64) - weights = bundle._effective_weights(owner) + weights = bundle.resolve_weights(owner).values return values, weights, owner diff --git a/packages/populace-frame/src/populace/frame/bundle.py b/packages/populace-frame/src/populace/frame/bundle.py index ffe4ec4..5bb3c39 100644 --- a/packages/populace-frame/src/populace/frame/bundle.py +++ b/packages/populace-frame/src/populace/frame/bundle.py @@ -29,6 +29,7 @@ from populace.frame.weights import ( MassChange, MassChangeRecord, + WeightKind, Weights, assert_kind_transition, ) @@ -472,6 +473,83 @@ def weights_for(self, entity: str) -> Weights: ) return self._weights[entity] + def resolve_weights(self, entity: str) -> Weights: + """Return ``entity``'s weights as a typed :class:`Weights`, resolving inheritance. 
+ + Unlike :meth:`weights_for` — which returns *only* the explicit vector + stored for ``entity`` and raises otherwise — this resolves the + *effective* weights and preserves their kind: an entity without its own + stored weights inherits the single weighted group entity's weights + broadcast through membership, and the returned :class:`Weights` carries + that source entity's :class:`~populace.frame.weights.WeightKind`. This + is what lets a person-level fit run on a household-weighted frame: the + person inherits the household's design/importance/calibrated kind, not a + bare ndarray that has dropped the kind. + + The values are exactly those of :meth:`_effective_weights` (so weighted + accounting and a fit weight the same rows identically); the kind is the + stored kind when ``entity`` has its own weights, otherwise the source + group entity's kind. + + Args: + entity: An entity declared by the schema. + + Returns: + The effective :class:`~populace.frame.weights.Weights` for + ``entity``: the stored vector as-is when present, otherwise the + inherited values tagged with the source entity's kind. + + Raises: + ValueError: If ``entity`` is not declared by the schema, or its + weights are ambiguous — the same zero/multiple-weighted-group + conditions :meth:`_effective_weights` raises on (the message + names them), or a group's members carry unequal person-level + weights. + """ + self.table(entity) # validates the entity name + if entity in self._weights: + return self._weights[entity] + values = self._effective_weights(entity) + source_kind = self._inherited_kind(entity) + return Weights(values=values, kind=source_kind) + + def _inherited_kind(self, entity: str) -> WeightKind: + """Kind of the weights ``entity`` inherits when it stores none. 
+ + Mirrors :meth:`_effective_weights`'s resolution *exactly*, so the kind + always names the same source the values come from: + + - the person entity inherits the single weighted group entity's kind; + - a group entity without its own weights derives from the person-level + weights — which is the person's own kind when the person stores + weights, otherwise (recursively) the single weighted group's kind. + + The earlier version always returned the single weighted *group* + entity's kind, which (a) raised on person-weighted frames where + ``_effective_weights`` succeeds via the person path, and (b) could tag + person-derived values with a sibling group's kind. + """ + person_entity = self._schema.person_entity + if entity == person_entity: + candidates = [ + group + for group in self._schema.group_entities + if group in self._weights + ] + if len(candidates) != 1: + raise ValueError( + f"Cannot resolve weights for entity {entity!r}: no explicit " + f"weights and {len(candidates)} weighted group entities " + f"{candidates}; store weights for exactly one entity to " + "broadcast from." + ) + return self._weights[candidates[0]].kind + # A group entity without its own weights derives from the person-level + # weights — the same source _effective_weights uses for its values. + if person_entity in self._weights: + return self._weights[person_entity].kind + return self._inherited_kind(person_entity) + def stratum_mass(self) -> pd.Series: """Weighted person mass per stratum. diff --git a/packages/populace-frame/tests/test_bundle.py b/packages/populace-frame/tests/test_bundle.py index ca73173..a4c08df 100644 --- a/packages/populace-frame/tests/test_bundle.py +++ b/packages/populace-frame/tests/test_bundle.py @@ -247,6 +247,176 @@ def test_non_constant_member_weights_are_refused_for_group_collapse( wsum(uneven, "tax_unit_income") +class TestResolveWeights: + """Public kind-preserving weight resolution (the fit-on-a-grouped-frame fix). 
+ + ``resolve_weights`` returns a typed :class:`Weights` for *any* entity, + resolving inheritance and carrying the source entity's kind — unlike + ``weights_for``, which only returns an entity's own stored vector. This is + what lets a person-level fit run on a household-weighted frame. + """ + + def test_person_resolve_on_household_weighted_frame_carries_kind_and_values( + self, make_bundle + ) -> None: + """A person resolve inherits the household's kind and broadcast values. + + ``weights_for("person")`` raises (no stored person weights); the new + ``resolve_weights("person")`` instead returns the household design + weights broadcast through membership, *as a Weights of kind design* — + not a bare ndarray that has dropped the kind. + """ + bundle = make_bundle(weight_values=(100.0, 200.0)) + # The old accessor still refuses an entity without its own weights. + with pytest.raises(ValueError, match="No weights stored"): + bundle.weights_for("person") + + resolved = bundle.resolve_weights("person") + assert isinstance(resolved, Weights) + # Kind is preserved from the source (household) entity. + assert resolved.kind is WeightKind.DESIGN + # Values are the household weights broadcast onto the 5 persons: + # persons 0-1 in hh 1 (100), persons 2-4 in hh 2 (200). + assert resolved.values.tolist() == [100.0, 100.0, 200.0, 200.0, 200.0] + # And exactly the effective-weight values accounting uses. + np.testing.assert_array_equal( + resolved.values, bundle._effective_weights("person") + ) + + def test_calibrated_household_resolves_to_calibrated_person( + self, make_bundle + ) -> None: + """A calibrated household frame resolves person weights as calibrated. + + The kind moves forward with the source: when the household weights are + calibrated, a person inherits ``calibrated``, so a fit that demands the + kind match sees ``calibrated`` (not a kind the inherited vector lost). 
+ """ + bundle = make_bundle( + weight_values=(100.0, 200.0), kind=WeightKind.CALIBRATED + ) + resolved = bundle.resolve_weights("person") + assert resolved.kind is WeightKind.CALIBRATED + assert resolved.values.tolist() == [100.0, 100.0, 200.0, 200.0, 200.0] + + def test_entity_with_its_own_weights_returns_them_as_is( + self, make_bundle + ) -> None: + """When the entity stores weights, resolve returns that exact object.""" + bundle = make_bundle(weight_values=(100.0, 200.0)) + # The household stores its own weights: returned identically. + assert bundle.resolve_weights("household") is bundle.weights_for("household") + + def test_ambiguity_still_raises(self) -> None: + """Zero or multiple weighted group entities make a person resolve ambiguous. + + ``resolve_weights`` keeps the same ambiguity guard ``_effective_weights`` + has: with two weighted group entities there is no single source kind to + carry, so it refuses. + """ + schema = EntitySchema(group_entities=("household", "tax_unit")) + person = pd.DataFrame( + { + "person_id": range(4), + "person_household_id": [1, 1, 2, 2], + "person_tax_unit_id": [1, 1, 2, 3], + } + ) + household = pd.DataFrame({"household_id": [1, 2]}) + tax_unit = pd.DataFrame({"tax_unit_id": [1, 2, 3]}) + bundle = Frame( + {"person": person, "household": household, "tax_unit": tax_unit}, + schema, + { + "household": Weights( + values=np.array([5.0, 11.0]), kind=WeightKind.DESIGN + ), + "tax_unit": Weights( + values=np.array([1.0, 2.0, 3.0]), kind=WeightKind.DESIGN + ), + }, + ) + with pytest.raises(ValueError, match="weighted group entities"): + bundle.resolve_weights("person") + + def test_unknown_entity_is_named(self, make_bundle) -> None: + """Resolving an undeclared entity raises, naming the schema's entities.""" + bundle = make_bundle() + with pytest.raises(ValueError, match="Unknown entity"): + bundle.resolve_weights("firm") + + def test_group_resolve_on_a_person_only_weighted_frame(self) -> None: + """A group entity derives its 
weights from the person weights. + + Regression: when only the person entity is weighted (no weighted group + entity — the shape the fit suite's own fixtures build), resolving a + group entity must derive its weights from the person weights, exactly + as ``_effective_weights`` does, not raise "0 weighted group entities". + Because all accounting routes through ``resolve_weights``, the earlier + version broke ``wsum``/``wmean``/etc. on group columns of such frames. + """ + schema = EntitySchema(group_entities=("household",)) + person = pd.DataFrame( + {"person_id": range(4), "person_household_id": [1, 1, 2, 2]} + ) + household = pd.DataFrame( + {"household_id": [1, 2], "hh_value": [10.0, 20.0]} + ) + frame = Frame( + {"person": person, "household": household}, + schema, + { + "person": Weights( + values=np.array([1.0, 1.0, 2.0, 2.0]), kind=WeightKind.DESIGN + ) + }, + ) + resolved = frame.resolve_weights("household") + assert resolved.kind is WeightKind.DESIGN # from the person source + assert resolved.values.tolist() == [1.0, 2.0] # member-constant collapse + # accounting on a group column works (it raised before the fix) + assert wsum(frame, "hh_value") == 1.0 * 10.0 + 2.0 * 20.0 # 50.0 + + def test_mixed_kind_resolve_tags_values_with_their_source_kind(self) -> None: + """A resolved kind names the source the *values* came from. + + person calibrated + household design; resolving ``tax_unit`` derives + its values from the person (calibrated) weights, so the kind must be + calibrated too — not the sibling household's design. The earlier + version tagged person-derived values with the group's kind, which would + let ``resolve_fit_weights(..., "design")`` silently fit on calibrated + weights — the exact discipline this API exists to enforce. 
+ """ + schema = EntitySchema(group_entities=("household", "tax_unit")) + person = pd.DataFrame( + { + "person_id": range(4), + "person_household_id": [1, 1, 2, 2], + "person_tax_unit_id": [1, 1, 2, 3], + } + ) + household = pd.DataFrame({"household_id": [1, 2]}) + tax_unit = pd.DataFrame({"tax_unit_id": [1, 2, 3]}) + frame = Frame( + {"person": person, "household": household, "tax_unit": tax_unit}, + schema, + { + "person": Weights( + values=np.array([1.0, 1.0, 2.0, 2.0]), + kind=WeightKind.CALIBRATED, + ), + "household": Weights( + values=np.array([5.0, 11.0]), kind=WeightKind.DESIGN + ), + }, + ) + resolved = frame.resolve_weights("tax_unit") + np.testing.assert_array_equal( + resolved.values, frame._effective_weights("tax_unit") + ) + assert resolved.kind is WeightKind.CALIBRATED # the value source, not design + + class TestStratumMass: def test_mass_per_stratum_via_household_broadcast(self, make_bundle) -> None: strata = pd.Series(["cps", "cps", "syn", "syn", "syn"], index=range(5)) diff --git a/uv.lock b/uv.lock index 717c857..eb9d105 100644 --- a/uv.lock +++ b/uv.lock @@ -12,6 +12,7 @@ resolution-markers = [ [manifest] members = [ + "populace-fit", "populace-frame", "populace-workspace", ] @@ -455,6 +456,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, +] + [[package]] name = "jsonpickle" version = "4.1.2" @@ -905,6 +915,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/e2/74e268a755ce7c0c270667ce2374e7fc7c490770fe842f8861bbea5395b0/policyengine_us-1.722.5-py3-none-any.whl", hash = "sha256:103f923a4dfed86786a495638d3914a60328bb4db9ddc22790729707c55a5fbf", size = 11321479, upload-time = "2026-06-09T09:11:56.25Z" }, ] +[[package]] +name = "populace-fit" +version = "0.1.0" +source = { editable = "packages/populace-fit" } +dependencies = [ + { name = "numpy" }, + { name = "pandas" }, + { name = "populace-frame" }, + { name = "quantile-forest" }, + { name = "scikit-learn" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "numpy", specifier = ">=2" }, + { name = "pandas", specifier = ">=2.3" }, + { name = "populace-frame", editable = "packages/populace-frame" }, + { name = "quantile-forest", specifier = ">=1.3" }, + { name = "scikit-learn", specifier = ">=1.5,<1.9" }, +] + +[package.metadata.requires-dev] +dev = [{ name = "pytest", specifier = ">=8" }] + [[package]] name = "populace-frame" version = "0.1.0" @@ -1172,6 +1211,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "quantile-forest" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scikit-learn" }, + { name = "scipy" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/62/6e/3f1493d4abcce71fdc82ed575475d3e02da7b03375129e84be2622e1532f/quantile_forest-1.4.1.tar.gz", hash = "sha256:713a23c69562b7551ba4a05c22ce9d0e90db6a73d043e760b29c331cb19dc552", size = 486249, upload-time = "2025-09-10T12:48:04.578Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/61/f8ff4e348dc2d265ea97287f921b92bca265229c48be64b94756ecff4078/quantile_forest-1.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:37c2da2ab54aceacdf5292065147f40a073b13cc3844262f0f3cbd5b8a8d928e", size = 955098, upload-time = "2025-09-10T12:47:52.137Z" }, + { url = "https://files.pythonhosted.org/packages/4f/95/75f3eea1c7cc3786c1ffdf4685e79c4979a4ae6ccedfed80362c9162f0d4/quantile_forest-1.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f0436ac7622442c2995cf121e0960332e769791f3f3c7ea62363e8480803bb3", size = 718470, upload-time = "2025-09-10T12:47:53.566Z" }, + { url = "https://files.pythonhosted.org/packages/fe/f1/0f26386bf164ede156099d18e3e4493dd21dc48e329e1be68232e5cf8b52/quantile_forest-1.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a594bd3552507beffa6ca6002143601be5defd5cc7329154f41317110f895f7a", size = 709245, upload-time = "2025-09-10T12:47:54.54Z" }, + { url = "https://files.pythonhosted.org/packages/4f/cd/6501c8c200f34a87e1e94d7ea4f1a9dc842154fbfaa0fe65f072817fbc41/quantile_forest-1.4.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:697c48faf52a04e7e47f97187650d16cecc9c971fe2f83d56854b4a454289f60", size = 2403543, upload-time = "2025-09-10T12:47:55.956Z" }, + { url = "https://files.pythonhosted.org/packages/f2/be/f77c6705e974b23353c43da1cd93e11fe0afc7e859c2d14f748d25cc0376/quantile_forest-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe33f6a8b63b3617568cc1254e1802a70ce3ac23897790f3be10f8db5257fe83", size = 685417, upload-time = "2025-09-10T12:47:57.346Z" }, +] + [[package]] name = "requests" version = "2.34.2" @@ -1225,6 +1282,95 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/15/19/016553f86f207450aebebc2b2b5088d086b901cc8186c02ac4284db3bd88/ruff-0.15.16-py3-none-win_arm64.whl", hash = "sha256:8cd61783afb39638a7133ef0d2dfb1e91277593962f81b5a8423eb0b888a6121", size = 11134555, upload-time = "2026-06-04T16:33:00.136Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash 
= "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, + { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667, upload-time = "2025-12-10T07:08:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524, upload-time = "2025-12-10T07:08:29.822Z" }, + { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133, upload-time = "2025-12-10T07:08:31.865Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223, upload-time = "2025-12-10T07:08:34.166Z" }, + { url = "https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518, upload-time = "2025-12-10T07:08:36.339Z" }, + { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546, upload-time = "2025-12-10T07:08:38.128Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305, upload-time = "2025-12-10T07:08:41.013Z" }, + { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257, upload-time = "2025-12-10T07:08:42.873Z" }, + { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673, upload-time = "2025-12-10T07:08:45.362Z" }, + { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467, upload-time = "2025-12-10T07:08:47.408Z" }, + { url = "https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395, upload-time = "2025-12-10T07:08:49.337Z" }, + { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647, upload-time = "2025-12-10T07:08:51.601Z" }, +] + +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199, upload-time = "2026-02-23T00:19:17.192Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001, upload-time = "2026-02-23T00:19:22.241Z" }, + { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = 
"sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719, upload-time = "2026-02-23T00:19:26.329Z" }, + { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595, upload-time = "2026-02-23T00:19:30.304Z" }, + { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429, upload-time = "2026-02-23T00:19:35.536Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952, upload-time = "2026-02-23T00:19:42.259Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063, upload-time = "2026-02-23T00:19:47.547Z" }, + { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449, upload-time = "2026-02-23T00:19:53.238Z" }, + { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 
36510943, upload-time = "2026-02-23T00:20:50.89Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621, upload-time = "2026-02-23T00:20:55.871Z" }, + { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708, upload-time = "2026-02-23T00:19:58.694Z" }, + { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135, upload-time = "2026-02-23T00:20:03.934Z" }, + { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977, upload-time = "2026-02-23T00:20:07.935Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601, upload-time = "2026-02-23T00:20:12.161Z" }, + { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667, upload-time = "2026-02-23T00:20:17.208Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159, upload-time = "2026-02-23T00:20:23.087Z" }, + { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771, upload-time = "2026-02-23T00:20:28.636Z" }, + { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" }, + { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" }, + { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" }, + { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" }, + { url = 
"https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" }, + { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" }, + { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" }, + { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" }, + { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" }, + { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" }, + { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" }, + { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" }, + { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" }, + { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" }, + { url = 
"https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" }, + { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" }, + { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" }, + { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" }, + { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" }, + { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" }, + { url = 
"https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" }, +] + [[package]] name = "shellingham" version = "1.5.4"