diff --git a/changelog.d/imputer-canonical.breaking.md b/changelog.d/imputer-canonical.breaking.md new file mode 100644 index 0000000..8ed93b0 --- /dev/null +++ b/changelog.d/imputer-canonical.breaking.md @@ -0,0 +1,8 @@ +**Renamed `ZeroInflatedImputer` to the canonical `microimpute.Imputer`** and made it the opinionated default. The previous abstract base class `Imputer` is now `BaseImputer` (still exported). `microimpute.Imputer` is the regime-gated, QRF-based, sequentially-chained imputer: + +- **Sign-regime gating** (`{neg, 0, pos}`) on by default (`signregime=True`); pass `signregime=False` to impute each numeric target with the base model directly (no gate, the `REGIME_NO_GATE` path). +- **QRF base model** by default (`base_imputer_class=QRF`); swap for experiments. +- **Sequential chained-equations imputation is always on** — imputing a list of targets conditions each on the previously-imputed ones, preserving cross-variable joint structure. The old per-variable-independent path and its `sequential` flag are removed. +- The fitted result exposes its state sklearn-style — `regimes_`, `predictors_` (the chained predictor list per target), and `models_` (sub-estimators by role: single/gate/positive/negative). QRF base sub-estimators carry standard `feature_importances_`/`feature_names_in_`: `feature_importances_` is a `{fitted_feature: importance}` dict keyed by the forest's actual fitted columns (so names and values always align, even when a categorical predictor expands into dummy columns), and `feature_names_in_` reports the original input predictor names. + +Migration: replace `from microimpute.models.zero_inflated import ZeroInflatedImputer` with `from microimpute import Imputer`; references to the old base class `Imputer` become `BaseImputer`. The module itself moved from `microimpute.models.zero_inflated` to `microimpute.models.regime_gated`, so update any other imports from it (e.g. the `REGIME_*` constants), and drop any `sequential=...` constructor argument, which is no longer accepted. 
diff --git a/changelog.d/imputer-canonical.fixed.md b/changelog.d/imputer-canonical.fixed.md new file mode 100644 index 0000000..d99f82a --- /dev/null +++ b/changelog.d/imputer-canonical.fixed.md @@ -0,0 +1 @@ +**`Imputer.fit(weight_col=...)` now actually weights numeric-target imputations.** Previously the regime-gated fit forwarded `weight_col` only to the auxiliary non-numeric base imputer; the gate classifiers and per-regime base imputers for numeric targets all trained unweighted, so sampling weights were silently ignored. Weights are now resolved once (column name, array, or Series) and threaded through every nested fit: gate classifiers receive them as `sample_weight` and per-regime base imputers as `weight_col`, sliced with the same row mask as the training slice. Separately, the QRF learner now honors `sample_weight` by weighted bootstrap resampling of its training rows: `quantile_forest` only uses native `sample_weight` as a zero-weight leaf filter (and fully-grown forest leaves are single-sample), so the previous native pass-through left leaf quantile distributions — and every value imputed from them — unweighted. diff --git a/microimpute/__init__.py b/microimpute/__init__.py index 2667c7e..e41536d 100644 --- a/microimpute/__init__.py +++ b/microimpute/__init__.py @@ -46,12 +46,7 @@ ) # Main configuration -from microimpute.config import ( - PLOT_CONFIG, - QUANTILES, - RANDOM_STATE, - VALIDATE_CONFIG, -) +from microimpute.config import PLOT_CONFIG, QUANTILES, RANDOM_STATE, VALIDATE_CONFIG # Import evaluation modules from microimpute.evaluations.cross_validation import cross_validate_model @@ -61,8 +56,10 @@ progressive_predictor_inclusion, ) -# Import main models and utilities -from microimpute.models import OLS, QRF, Imputer, ImputerResults, QuantReg +# Import main models and utilities. ``Imputer`` is the canonical +# regime-gated, QRF-based, sequentially-chained imputer; ``BaseImputer`` +# is the abstract base class all models extend. 
+from microimpute.models import OLS, QRF, BaseImputer, Imputer, ImputerResults, QuantReg # Import data handling functions from microimpute.utils.data import preprocess_data, unnormalize_predictions diff --git a/microimpute/comparisons/autoimpute.py b/microimpute/comparisons/autoimpute.py index 2121c6a..48e3b93 100644 --- a/microimpute/comparisons/autoimpute.py +++ b/microimpute/comparisons/autoimpute.py @@ -26,7 +26,7 @@ TRAIN_SIZE, VALIDATE_CONFIG, ) -from microimpute.models import OLS, QRF, Imputer, QuantReg +from microimpute.models import OLS, QRF, BaseImputer, QuantReg from microimpute.utils.data import ( un_asinh_transform_predictions, unlog_transform_predictions, @@ -121,7 +121,7 @@ class AutoImputeResult(BaseModel): receiver_data : pd.DataFrame Copy of the receiver data with the median-quantile imputations of the best performing model attached. fitted_models : Dict[str, Any] - Mapping model name → fitted Imputer instance. + Mapping model name → fitted BaseImputer instance. cv_results : Dict[str, Dict[str, Any]] Cross-validation results with separate quantile_loss and log_loss metrics for each model. 
""" @@ -203,7 +203,7 @@ def _setup_logging(log_level: str) -> int: def _evaluate_models_parallel( - model_classes: List[Type[Imputer]], + model_classes: List[Type[BaseImputer]], training_data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], @@ -279,7 +279,7 @@ def _evaluate_models_parallel( def _generate_imputations_for_all_models( - model_classes: List[Type[Imputer]], + model_classes: List[Type[BaseImputer]], best_method: str, donor_data: pd.DataFrame, receiver_data: pd.DataFrame, @@ -500,7 +500,7 @@ def autoimpute( # Get model classes if not models: - model_classes: List[Type[Imputer]] = [QRF, OLS, QuantReg] + model_classes: List[Type[BaseImputer]] = [QRF, OLS, QuantReg] if HAS_MATCHING: model_classes.append(Matching) if HAS_MDN: diff --git a/microimpute/comparisons/autoimpute_helpers.py b/microimpute/comparisons/autoimpute_helpers.py index 8cc0f32..f28c9ab 100644 --- a/microimpute/comparisons/autoimpute_helpers.py +++ b/microimpute/comparisons/autoimpute_helpers.py @@ -24,7 +24,7 @@ validate_quantiles, ) from microimpute.evaluations import cross_validate_model -from microimpute.models import Imputer +from microimpute.models import BaseImputer from microimpute.utils.data import preprocess_data log = logging.getLogger(__name__) @@ -237,7 +237,7 @@ def prepare_data_for_imputation( def evaluate_model( - model: Type[Imputer], + model: Type[BaseImputer], data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], @@ -290,7 +290,7 @@ def evaluate_model( def fit_and_predict_model( - model_class: Type[Imputer], + model_class: Type[BaseImputer], training_data: pd.DataFrame, imputing_data: pd.DataFrame, predictors: List[str], diff --git a/microimpute/evaluations/predictor_analysis.py b/microimpute/evaluations/predictor_analysis.py index 938c08b..f7c280f 100644 --- a/microimpute/evaluations/predictor_analysis.py +++ b/microimpute/evaluations/predictor_analysis.py @@ -30,7 +30,7 @@ TRAIN_SIZE, VALIDATE_CONFIG, ) -from microimpute.models import 
Imputer, ImputerResults +from microimpute.models import BaseImputer, ImputerResults from microimpute.utils.type_handling import ( DummyVariableProcessor, VariableTypeDetector, @@ -238,7 +238,7 @@ def leave_one_out_analysis( data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], - model_class: Type[Imputer], + model_class: Type[BaseImputer], weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None, quantiles: List[float] = QUANTILES, train_size: float = TRAIN_SIZE, @@ -254,7 +254,7 @@ def leave_one_out_analysis( data: DataFrame containing the data. predictors: List of predictor column names. imputed_variables: List of variables to impute. - model_class: The Imputer class to use for evaluation (e.g., OLS, QRF, QuantReg). + model_class: The BaseImputer class to use for evaluation (e.g., OLS, QRF, QuantReg). weight_col: Optional column name or array of sampling weights. quantiles: List of quantiles for evaluation (default: [0.1, 0.5, 0.9]). train_size: Proportion of data to use for training (default: 0.8). @@ -375,7 +375,7 @@ def progressive_predictor_inclusion( data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], - model_class: Type[Imputer], + model_class: Type[BaseImputer], weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None, quantiles: Optional[List[float]] = QUANTILES, train_size: Optional[float] = TRAIN_SIZE, @@ -391,7 +391,7 @@ def progressive_predictor_inclusion( data: DataFrame containing the data. predictors: List of candidate predictor column names. imputed_variables: List of variables to impute. - model_class: The Imputer class to use for evaluation (e.g., OLS, QRF, QuantReg). + model_class: The BaseImputer class to use for evaluation (e.g., OLS, QRF, QuantReg). weight_col: Optional column name or array of sampling weights. quantiles: List of quantiles for evaluation. train_size: Proportion of data to use for training. 
@@ -573,7 +573,7 @@ def _evaluate_model_performance( test_data: pd.DataFrame, predictors: List[str], imputed_variables: List[str], - model_class: Type[Imputer], + model_class: Type[BaseImputer], weight_col: Optional[Union[str, np.ndarray, pd.Series]], quantiles: List[float], random_state: int, diff --git a/microimpute/models/__init__.py b/microimpute/models/__init__.py index 4b14f6e..35f5ca5 100644 --- a/microimpute/models/__init__.py +++ b/microimpute/models/__init__.py @@ -2,9 +2,11 @@ This module provides a collection of statistical models for data imputation, including both parametric and non-parametric approaches. Each model extends -the base Imputer class and provides quantile-based predictions. +the base imputer class and provides quantile-based predictions. Available models: + - Imputer: canonical regime-gated, QRF-based, sequentially-chained + imputer (the opinionated default) - OLS: ordinary least squares regression with bootstrapped quantiles - QRF: quantile regression forest for non-parametric quantile regression - QuantReg: linear quantile regression model @@ -13,12 +15,12 @@ (optional, requires pytorch-tabular) Base classes: - - Imputer: abstract base class for all imputation models + - BaseImputer: abstract base class for all imputation models - ImputerResults: container for fitted model and prediction methods """ # Import base classes -from microimpute.models.imputer import Imputer, ImputerResults +from microimpute.models.imputer import BaseImputer, ImputerResults try: from microimpute.models.matching import Matching @@ -34,3 +36,7 @@ from microimpute.models.ols import OLS from microimpute.models.qrf import QRF from microimpute.models.quantreg import QuantReg + +# Canonical opinionated imputer: sign-regime gating + QRF base + sequential +# chained-equations imputation, all on by default. 
+from microimpute.models.regime_gated import Imputer diff --git a/microimpute/models/imputer.py b/microimpute/models/imputer.py index dae46fa..e15dc03 100644 --- a/microimpute/models/imputer.py +++ b/microimpute/models/imputer.py @@ -2,7 +2,7 @@ This module defines the core architecture for imputation models in MicroImpute. It provides two abstract base classes: -1. Imputer - For model initialization and fitting +1. BaseImputer - For model initialization and fitting 2. ImputerResults - For storing fitted models and making predictions All model implementations should extend these classes to ensure a consistent interface. @@ -35,7 +35,7 @@ def predict(self, X: pd.DataFrame, **kwargs) -> pd.Series: return pd.Series(self.constant_value, index=X.index, name=self.variable_name) -class Imputer(ABC): +class BaseImputer(ABC): """ Abstract base class for fitting imputation models. @@ -238,6 +238,47 @@ def preprocess_data_types( self.logger.error(f"Error during data preprocessing: {str(e)}") raise RuntimeError("Failed to preprocess data types") from e + @staticmethod + def _resolve_sample_weights( + X_train: pd.DataFrame, + weight_col: Optional[Union[str, np.ndarray, pd.Series]], + ) -> Optional[np.ndarray]: + """Resolve a weight specification to a validated per-row array. + + Accepts a column name in ``X_train``, a positional array, or a + Series aligned by index, and returns a float array positionally + aligned with ``X_train`` rows (``None`` when no weights given). + + Raises: + ValueError: If a named weight column is missing from + ``X_train``, or any resolved weight is non-positive or + NaN. 
+ """ + if weight_col is None: + return None + if isinstance(weight_col, str): + if weight_col not in X_train.columns: + raise ValueError( + f"Weight column '{weight_col}' not found in training data" + ) + weights = X_train[weight_col] + elif isinstance(weight_col, np.ndarray): + weights = pd.Series(weight_col, index=X_train.index) + else: + weights = weight_col.reindex(X_train.index) + + # Check for NaN AND non-positive values together. NaN weights + # (e.g. from a Series reindex miss) would otherwise propagate + # into sample_weight passed to learners. + weights_arr = np.asarray(weights, dtype=float) + invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0) + if invalid_mask.any(): + raise ValueError( + "Weights must be positive and finite; found " + f"{int(invalid_mask.sum())} non-positive or NaN weight(s)" + ) + return weights_arr + @validate_call(config=VALIDATE_CONFIG) def fit( self, @@ -261,7 +302,7 @@ def fit( X_train: DataFrame containing the training data. predictors: List of column names to use as predictors. imputed_variables: List of column names to impute. - weight_col: Optional name of the column or column array/series containing sampling weights. When provided, `X_train` will be sampled with replacement using this column as selection probabilities before fitting the model. + weight_col: Optional name of the column or column array/series containing sampling weights. When provided, the resolved per-row weights are passed to the model subclass as `sample_weight`, which honors them natively (e.g. OLS->WLS) or by weighted bootstrap resampling (QRF). skip_missing: If True, skip variables missing from training data with warning. If False, raise error for missing variables. 
not_numeric_categorical: Optional list of variable names that should be treated as numeric even if they would normally be detected as @@ -348,30 +389,7 @@ def fit( except Exception as e: raise ValueError(f"Invalid input data for model: {str(e)}") from e - weights = None - if weight_col is not None and isinstance(weight_col, str): - if weight_col not in X_train.columns: - raise ValueError( - f"Weight column '{weight_col}' not found in training data" - ) - weights = X_train[weight_col] - elif weight_col is not None and isinstance(weight_col, np.ndarray): - weights = pd.Series(weight_col, index=X_train.index) - elif weight_col is not None and isinstance(weight_col, pd.Series): - weights = weight_col.reindex(X_train.index) - - if weights is not None: - # Check for NaN AND non-positive values together. Previously only - # (weights <= 0).any() was checked, which returns False for NaN - # weights — those then propagated into .sample() as NaN - # probabilities or corrupted sample_weight passed to learners. - weights_arr = np.asarray(weights, dtype=float) - invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0) - if invalid_mask.any(): - raise ValueError( - "Weights must be positive and finite; found " - f"{int(invalid_mask.sum())} non-positive or NaN weight(s)" - ) + weights_arr = self._resolve_sample_weights(X_train, weight_col) # Identify target types BEFORE preprocessing self.identify_target_types( @@ -393,21 +411,20 @@ def fit( self.imputed_vars_dummy_info = imputed_vars_dummy_info self.original_predictors = original_predictors - # Pass sample_weight through to the subclass so it can use each - # learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all - # support sample_weight). This replaces the previous bootstrap - # resample, which silently discarded weights for the underlying - # estimator and inflated variance / shrank effective sample size. 
- sample_weight = None - if weights is not None: - sample_weight = np.asarray(weights_arr, dtype=float) - # Reindex if preprocess_data_types changed the row ordering - # (it currently does not, but guard against future drift). - if len(sample_weight) != len(X_train): - raise RuntimeError( - "Internal error: sample_weight length no longer matches " - "X_train after preprocessing" - ) + # Pass sample_weight through to the subclass so it can honor the + # weights with whichever mechanism its learner supports: an exact + # native weighted-fit API where available (OLS→WLS), or weighted + # bootstrap resampling where the native API cannot weight the + # predictive distribution (the forest-backed QRF — see + # qrf._weighted_resample). + sample_weight = weights_arr + if sample_weight is not None and len(sample_weight) != len(X_train): + # preprocess_data_types currently preserves row order/count, + # but guard against future drift. + raise RuntimeError( + "Internal error: sample_weight length no longer matches " + "X_train after preprocessing" + ) # Defer actual training to subclass with all parameters fit_kwargs = { diff --git a/microimpute/models/matching.py b/microimpute/models/matching.py index 07952d5..561f757 100644 --- a/microimpute/models/matching.py +++ b/microimpute/models/matching.py @@ -7,7 +7,7 @@ from pydantic import validate_call from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG -from microimpute.models.imputer import Imputer, ImputerResults +from microimpute.models.imputer import BaseImputer, ImputerResults from microimpute.utils.statmatch_hotdeck import nnd_hotdeck_using_rpy2 MatchingHotdeckFn = Callable[ @@ -402,7 +402,7 @@ def _process_matching_results( raise RuntimeError("Failed to create output imputations") from output_error -class Matching(Imputer): +class Matching(BaseImputer): """ Statistical matching model for imputation using nearest neighbor distance hot deck method. 
diff --git a/microimpute/models/mdn.py b/microimpute/models/mdn.py index 003e430..a9ed94e 100644 --- a/microimpute/models/mdn.py +++ b/microimpute/models/mdn.py @@ -14,7 +14,7 @@ from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG from microimpute.models.imputer import ( - Imputer, + BaseImputer, ImputerResults, _ConstantValueModel, ) @@ -800,7 +800,7 @@ def _predict( raise RuntimeError(f"Failed to predict with MDN model: {str(e)}") from e -class MDN(Imputer): +class MDN(BaseImputer): """ Mixture Density Network imputer using PyTorch Tabular. diff --git a/microimpute/models/ols.py b/microimpute/models/ols.py index 6cd6aff..15c6e71 100644 --- a/microimpute/models/ols.py +++ b/microimpute/models/ols.py @@ -10,7 +10,7 @@ from sklearn.linear_model import LogisticRegression from microimpute.config import VALIDATE_CONFIG -from microimpute.models.imputer import Imputer, ImputerResults +from microimpute.models.imputer import BaseImputer, ImputerResults class _LogisticRegressionModel: @@ -462,7 +462,7 @@ def _predict_quantile( ) from e -class OLS(Imputer): +class OLS(BaseImputer): """ Ordinary Least Squares regression model for imputation. diff --git a/microimpute/models/qrf.py b/microimpute/models/qrf.py index 8edc5d0..283bae8 100644 --- a/microimpute/models/qrf.py +++ b/microimpute/models/qrf.py @@ -11,7 +11,7 @@ from sklearn.ensemble import RandomForestClassifier from microimpute.config import VALIDATE_CONFIG -from microimpute.models.imputer import Imputer, ImputerResults +from microimpute.models.imputer import BaseImputer, ImputerResults try: import psutil @@ -39,6 +39,32 @@ def _get_sequential_predictors( return predictors + imputed_variables[:current_variable_index] +def _weighted_resample( + X: pd.DataFrame, + y: pd.Series, + sample_weight: np.ndarray, + rng: np.random.Generator, +) -> Tuple[pd.DataFrame, pd.Series]: + """Materialize sampling weights by weighted bootstrap resampling. 
+ + The forest backends cannot honor ``sample_weight`` in their + predictive distributions: ``quantile_forest`` uses it only as a + zero-weight filter when assembling leaf membership, and fully-grown + forest leaves hold a single training sample each, so weighted + impurity does not move leaf values either. Drawing ``len(X)`` rows + with replacement with probability proportional to weight bakes the + weighted distribution into the training data, so leaf distributions + (and the values imputed from them) reflect the weights. + """ + weights = np.asarray(sample_weight, dtype=float) + probabilities = weights / weights.sum() + sel = rng.choice(len(X), size=len(X), replace=True, p=probabilities) + return ( + X.iloc[sel].reset_index(drop=True), + y.iloc[sel].reset_index(drop=True), + ) + + class _RandomForestClassifierModel: """Internal class to handle classification for categorical/boolean targets.""" @@ -65,10 +91,24 @@ def fit( Note: y should be the ORIGINAL categorical/boolean column, not dummy encoded. + + ``sample_weight`` is materialized by weighted bootstrap + resampling of the training rows (see ``_weighted_resample``) + because fully-grown random-forest leaves are single-sample, so + passing weights to the native ``fit`` would leave predicted + class probabilities effectively unweighted. 
""" self.output_column = y.name self.var_type = var_type + if sample_weight is not None: + X, y = _weighted_resample( + X, + y, + sample_weight, + np.random.default_rng(self.seed), + ) + if var_type == "boolean": # For boolean, convert to 0/1 but keep as single target y_encoded = y.astype(int) @@ -97,11 +137,8 @@ def fit( } self.classifier = RandomForestClassifier(**classifier_params) - fit_kwargs = {} - if sample_weight is not None: - fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float) self.feature_columns = list(X.columns) - self.classifier.fit(X, y_encoded, **fit_kwargs) + self.classifier.fit(X, y_encoded) def _align_features(self, X: pd.DataFrame) -> pd.DataFrame: """Reorder prediction features to the fitted sklearn column contract.""" @@ -181,18 +218,25 @@ def fit( """Fit the QRF model. Note: Assumes X is already preprocessed with categorical encoding - handled by the base Imputer class. + handled by the base imputer class. Args: X: Predictor DataFrame (preprocessed). y: Target Series. - sample_weight: Optional per-row sample weights, passed directly to - the underlying ``RandomForestQuantileRegressor.fit`` so each - row contributes to the weighted-survey estimator rather than - being treated as a bootstrap-resample probability. + sample_weight: Optional per-row sample weights, materialized + by weighted bootstrap resampling of the training rows + (see ``_weighted_resample``). + ``RandomForestQuantileRegressor.fit`` accepts + ``sample_weight`` but only uses it as a zero-weight + filter when assembling leaf membership, so passing it + natively would leave the leaf quantile distributions — + and every value imputed from them — unweighted. """ self.output_column = y.name + if sample_weight is not None: + X, y = _weighted_resample(X, y, sample_weight, self._rng) + # Remove random_state / sample_weight from kwargs if present, since # we set them explicitly below. 
qrf_kwargs_filtered = { @@ -205,11 +249,8 @@ def fit( self.qrf = RandomForestQuantileRegressor( random_state=self.seed, **qrf_kwargs_filtered ) - fit_kwargs = {} - if sample_weight is not None: - fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float) self.feature_columns = list(X.columns) - self.qrf.fit(X, y.values.ravel(), **fit_kwargs) + self.qrf.fit(X, y.values.ravel()) def _align_features(self, X: pd.DataFrame) -> pd.DataFrame: """Reorder prediction features to the fitted QRF column contract.""" @@ -347,6 +388,89 @@ def __init__( self.constant_targets = constant_targets or {} self.dummy_processor = dummy_processor + @property + def feature_names_in_(self) -> np.ndarray: + """sklearn-style input feature names seen during fit. + + Returns the original (pre-dummy-encoding) predictor names — the + columns the caller actually supplied — as an ``np.ndarray``, + matching the sklearn ``feature_names_in_`` convention. Falls back + to the encoded predictor list only if originals are unavailable. + + Note that for a forest fit on a categorical predictor the encoded + feature columns (and hence ``feature_importances_`` keys) differ + from these input names; ``feature_importances_`` is therefore + keyed by the forest's actual fitted columns rather than paired + positionally with this array. + """ + names = ( + self.original_predictors + if getattr(self, "original_predictors", None) + else self.predictors + ) + return np.asarray(names, dtype=object) + + @property + def feature_importances_(self): + """sklearn-style feature importances from the underlying forest. + + Returns a ``{encoded_feature_name: importance}`` dict keyed by the + forest's ACTUAL fitted feature columns + (``_QRFModel.feature_columns``). 
Keying by the fitted columns — + rather than pairing the raw importance array with + ``feature_names_in_`` — guarantees the names and values are always + self-consistent in length and order, including when a categorical + predictor expands into several dummy columns. + + For a single imputed variable, returns one such dict. For multiple + imputed variables, returns ``{variable: {feature: importance}}``. + + Raises ``AttributeError`` when no QRF-backed importances are + available (e.g. constant or classifier base models), so + ``hasattr`` reports ``False``. + """ + + def _importances_for(variable: str) -> Dict[str, float]: + model = self.models.get(variable) + forest = getattr(model, "qrf", None) + if forest is None: + raise AttributeError( + "feature_importances_ is unavailable: model for " + f"{variable!r} has no underlying forest" + ) + importances = getattr(forest, "feature_importances_", None) + if importances is None: + raise AttributeError( + f"feature_importances_ is unavailable for {variable!r}" + ) + # Key by the forest's actual fitted feature columns so the + # names and values always match in length and order. The QRF + # base model records these as ``feature_columns`` at fit time; + # fall back to the forest's own ``feature_names_in_`` if set. + feature_columns = list(getattr(model, "feature_columns", [])) + if not feature_columns: + fitted_names = getattr(forest, "feature_names_in_", None) + if fitted_names is not None: + feature_columns = list(fitted_names) + importances = np.asarray(importances) + if len(feature_columns) != len(importances): + # Should not happen — the forest's importance vector is by + # construction one entry per fitted column — but guard so + # we never silently pair mismatched names and values. 
+ raise AttributeError( + "feature_importances_ is inconsistent for " + f"{variable!r}: {len(feature_columns)} fitted columns " + f"vs {len(importances)} importances" + ) + return { + name: float(importance) + for name, importance in zip(feature_columns, importances) + } + + if len(self.imputed_variables) == 1: + return _importances_for(self.imputed_variables[0]) + return {var: _importances_for(var) for var in self.imputed_variables} + def _get_encoded_predictors(self, current_predictors: List[str]) -> List[str]: """Get properly encoded predictor columns for sequential imputation. @@ -590,7 +714,7 @@ def _predict( raise RuntimeError(f"Failed to predict with QRF model: {str(e)}") from e -class QRF(Imputer): +class QRF(BaseImputer): """ Quantile Regression Forest model for imputation. @@ -722,7 +846,7 @@ def _fit_model( categorical_targets = getattr(self, "categorical_targets", {}) boolean_targets = getattr(self, "boolean_targets", {}) - # sample_weight is threaded via kwargs from the base Imputer.fit, + # sample_weight is threaded via kwargs from the base imputer fit, # bypassing the nested qrf/rfc structure so both classifier and # regressor paths see the same per-row weights. sample_weight = kwargs.pop("sample_weight", None) @@ -860,9 +984,9 @@ def _fit( X_train: DataFrame containing the training data. predictors: List of column names to use as predictors. imputed_variables: List of column names to impute. - sample_weight: Optional per-row sample weights threaded through - to ``RandomForestQuantileRegressor.fit`` / - ``RandomForestClassifier.fit``. + sample_weight: Optional per-row sample weights, honored by + weighted bootstrap resampling of each model's training + rows (see ``_weighted_resample``). **qrf_kwargs: Additional keyword arguments to pass to QRF. 
Returns: diff --git a/microimpute/models/quantreg.py b/microimpute/models/quantreg.py index c767d38..f315d27 100644 --- a/microimpute/models/quantreg.py +++ b/microimpute/models/quantreg.py @@ -10,7 +10,7 @@ from statsmodels.tools.sm_exceptions import IterationLimitWarning from microimpute.config import VALIDATE_CONFIG -from microimpute.models.imputer import Imputer, ImputerResults +from microimpute.models.imputer import BaseImputer, ImputerResults warnings.filterwarnings("ignore", category=IterationLimitWarning) @@ -268,7 +268,7 @@ def _predict( ) from e -class QuantReg(Imputer): +class QuantReg(BaseImputer): """ Quantile Regression model for imputation. diff --git a/microimpute/models/zero_inflated.py b/microimpute/models/regime_gated.py similarity index 78% rename from microimpute/models/zero_inflated.py rename to microimpute/models/regime_gated.py index fc9b517..d467d9d 100644 --- a/microimpute/models/zero_inflated.py +++ b/microimpute/models/regime_gated.py @@ -1,4 +1,4 @@ -"""Regime-aware zero-inflation wrapper around base imputers. +"""Regime-gated imputation wrapper around base imputers. Tabular microdata variables often fall into distinct *regimes* based on which of {negative, zero, positive} values appear in the training data. @@ -17,7 +17,7 @@ ``max(train_negatives)`` and ``min(train_positives)``, which is not a region any actual record occupies. -``ZeroInflatedImputer`` wraps any base ``Imputer`` and: +``Imputer`` wraps any base imputer and: - Detects the regime automatically at fit time from the training distribution — no per-variable hand configuration required. @@ -32,9 +32,18 @@ - At predict time, routes each record to the base imputer of its gate-assigned regime, guaranteeing no sign-interpolation leaks. +Sign-regime gating can be turned off with ``signregime=False``, in +which case each numeric target is imputed by a single base imputer over +the full training set (the ``REGIME_NO_GATE`` path). 
+ The wrapper is generic over the base imputer — ``QRF`` is the obvious default, but ``MDN``, ``OLS``, or ``Matching`` all compose the same way. +Numeric targets are always imputed sequentially (chained-equations): +each target conditions on the original predictors plus the +previously-imputed numeric targets, so the imputed vector preserves +cross-variable joint structure. + Regime detection is based only on observed support. If the training data contains negative, zero, and positive values, the imputer uses the three-sign architecture. Callers do not provide sign/regime metadata. @@ -49,7 +58,7 @@ from pydantic import validate_call from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG -from microimpute.models.imputer import Imputer, ImputerResults +from microimpute.models.imputer import BaseImputer, ImputerResults from microimpute.models.qrf import QRF # Regime labels. Kept as module-level constants so downstream code can @@ -61,6 +70,9 @@ REGIME_POSITIVE_ONLY = "POSITIVE_ONLY" REGIME_NEGATIVE_ONLY = "NEGATIVE_ONLY" REGIME_DEGENERATE_ZERO = "DEGENERATE_ZERO" +# Used when sign-regime gating is turned off (``signregime=False``): a +# single base imputer over the full training set, no gate. +REGIME_NO_GATE = "NO_GATE" def _make_classifier(kind: str, seed: int): @@ -83,6 +95,21 @@ def _make_classifier(kind: str, seed: int): raise ValueError(f"Unknown classifier_type {kind!r}; expected 'hist_gb' or 'rf'.") +def _subset_weights( + sample_weight: Optional[np.ndarray], + mask: np.ndarray, +) -> Optional[np.ndarray]: + """Slice a per-row weight vector with a boolean row mask. + + Sub-fits on row subsets (e.g. the positive part of a zero-inflated + target) must receive the weights sliced with the same mask so each + weight stays aligned with its row. 
+ """ + if sample_weight is None: + return None + return sample_weight[mask] + + def _detect_regime( y: np.ndarray, *, @@ -129,14 +156,24 @@ def _detect_regime( return REGIME_DEGENERATE_ZERO -class ZeroInflatedImputer(Imputer): - """Imputer that wraps a base Imputer with regime-aware zero-gating. +class Imputer(BaseImputer): + """Canonical imputer: regime-aware zero-gating over a base imputer. + + Wraps a base ``BaseImputer`` (``QRF`` by default), detects each + numeric target's sign regime at fit time, and routes predictions + through the regime-specific gate + base imputer(s). Numeric targets + are always chained (sequential conditioning) in ``imputed_variables`` + order. Args: - base_imputer_class: ``Imputer`` subclass to use for the nonzero - regression step. Defaults to ``QRF``. + base_imputer_class: ``BaseImputer`` subclass to use for the + nonzero regression step. Defaults to ``QRF``. base_imputer_kwargs: Keyword arguments forwarded to the base imputer constructor. ``{}`` by default. + signregime: When ``True`` (default), detect the sign regime per + numeric target and gate accordingly. When ``False``, skip + regime detection and impute each numeric target with a single + base imputer over the full training set (``REGIME_NO_GATE``). zero_atol: Absolute tolerance for "equals zero" in the regime detector. Defaults to 1e-6, matching the upstream ``_MultiSourceBase`` convention. 
@@ -148,20 +185,20 @@ class ZeroInflatedImputer(Imputer): def __init__( self, - base_imputer_class: Optional[Type[Imputer]] = None, + base_imputer_class: Optional[Type[BaseImputer]] = None, base_imputer_kwargs: Optional[Dict[str, Any]] = None, + signregime: bool = True, zero_atol: float = 1e-6, classifier_type: str = "hist_gb", - sequential: bool = True, seed: Optional[int] = RANDOM_STATE, log_level: Optional[str] = "WARNING", ) -> None: super().__init__(seed=seed, log_level=log_level) self.base_imputer_class = base_imputer_class or QRF self.base_imputer_kwargs = dict(base_imputer_kwargs or {}) + self.signregime = bool(signregime) self.zero_atol = float(zero_atol) self.classifier_type = classifier_type - self.sequential = bool(sequential) # Filled in during fit(). self._regimes: Dict[str, str] = {} @@ -170,7 +207,7 @@ def __init__( def _fit(self, *args: Any, **kwargs: Any) -> Any: """Abstract-method placeholder; this class overrides ``fit`` directly.""" raise NotImplementedError( - "ZeroInflatedImputer overrides `fit` directly; `_fit` is not used." + "Imputer overrides `fit` directly; `_fit` is not used." ) def get_regime(self, variable: str) -> str: @@ -196,13 +233,24 @@ def fit( are handled per-variable: regime detection, then composition of gate + base imputer(s) as appropriate. - Returns a ``ZeroInflatedImputerResults`` that routes + ``weight_col`` (a column name, array, or Series of per-row + sampling weights) is resolved once and threaded through every + nested fit: gate classifiers receive it as ``sample_weight`` + and per-regime base imputers as ``weight_col``, sliced to the + regime's row subset. + + Returns a ``RegimeGatedImputerResults`` that routes predictions through each target's regime-specific pipeline. """ self._validate_data(X_train, predictors + imputed_variables) + # Resolve weights once to a validated per-row vector aligned + # with ``X_train`` so every nested fit (gate classifiers and + # per-regime base imputers) trains on the same weighting. 
+ sample_weight = self._resolve_sample_weights(X_train, weight_col) + # Classify target variables as numeric / categorical / boolean / - # constant using the base Imputer's detector. + # constant using the base imputer's detector. self.identify_target_types( X_train, imputed_variables, @@ -240,17 +288,14 @@ def fit( # targets, so the imputed vector preserves cross-variable joint # structure. ``imputed_variables`` order is the chain order; a # single-target list is unaffected (no priors to chain on). + # Chaining is always on. for position, var in enumerate(numeric_targets): - seq_predictors = ( - list(predictors) + numeric_targets[:position] - if self.sequential - else list(predictors) - ) + seq_predictors = list(predictors) + numeric_targets[:position] y = X_train[var].to_numpy(dtype=float, copy=False) - regime = _detect_regime( - y, - zero_atol=self.zero_atol, - ) + if self.signregime: + regime = _detect_regime(y, zero_atol=self.zero_atol) + else: + regime = REGIME_NO_GATE self._regimes[var] = regime bundle = self._fit_single_numeric( X_train=X_train, @@ -258,6 +303,7 @@ def fit( variable=var, regime=regime, y=y, + sample_weight=sample_weight, not_numeric_categorical=nested_not_numeric_categorical, ) bundle["predictors"] = list(seq_predictors) @@ -284,7 +330,7 @@ def fit( else: aux_bundle = None - return ZeroInflatedImputerResults( + return RegimeGatedImputerResults( predictors=self.predictors, imputed_variables=self.imputed_variables, seed=self.seed, @@ -306,10 +352,16 @@ def _fit_single_numeric( variable: str, regime: str, y: np.ndarray, + sample_weight: Optional[np.ndarray] = None, not_numeric_categorical: Optional[List[str]] = None, ) -> Dict[str, Any]: """Fit the gate and base imputer(s) for one numeric target. + ``sample_weight`` is a per-row weight vector positionally + aligned with ``X_train``; it reaches the gate classifier as + ``sample_weight`` and each base imputer as ``weight_col``, + sliced with the same row mask as the training slice. 
+ Returns a bundle dict with the regime, the gate classifier (or None), and the base imputer(s) keyed by their role. """ @@ -318,7 +370,11 @@ def _fit_single_numeric( if regime == REGIME_DEGENERATE_ZERO: return {"kind": "constant", "value": 0.0} - if regime in (REGIME_POSITIVE_ONLY, REGIME_NEGATIVE_ONLY): + if regime in ( + REGIME_POSITIVE_ONLY, + REGIME_NEGATIVE_ONLY, + REGIME_NO_GATE, + ): # No gate; single base imputer on the full training set. return { "kind": "single", @@ -326,6 +382,7 @@ def _fit_single_numeric( X_train, predictors, variable, + sample_weight=sample_weight, not_numeric_categorical=not_numeric_categorical, ), } @@ -333,12 +390,13 @@ def _fit_single_numeric( if regime == REGIME_ZI_POSITIVE: labels = (y > self.zero_atol).astype(int) clf = _make_classifier(self.classifier_type, self.seed) - clf.fit(X_pred, labels) + clf.fit(X_pred, labels, sample_weight=sample_weight) pos_mask = y > self.zero_atol pos_base = self._fit_base_single( X_train.loc[pos_mask], predictors, variable, + sample_weight=_subset_weights(sample_weight, pos_mask), not_numeric_categorical=not_numeric_categorical, ) return { @@ -350,12 +408,13 @@ def _fit_single_numeric( if regime == REGIME_ZI_NEGATIVE: labels = (y < -self.zero_atol).astype(int) clf = _make_classifier(self.classifier_type, self.seed) - clf.fit(X_pred, labels) + clf.fit(X_pred, labels, sample_weight=sample_weight) neg_mask = y < -self.zero_atol neg_base = self._fit_base_single( X_train.loc[neg_mask], predictors, variable, + sample_weight=_subset_weights(sample_weight, neg_mask), not_numeric_categorical=not_numeric_categorical, ) return { @@ -369,7 +428,7 @@ def _fit_single_numeric( # plus a base imputer per sign. 
labels = (y > 0).astype(int) clf = _make_classifier(self.classifier_type, self.seed) - clf.fit(X_pred, labels) + clf.fit(X_pred, labels, sample_weight=sample_weight) pos_mask = y > 0 neg_mask = ~pos_mask return { @@ -379,12 +438,14 @@ def _fit_single_numeric( X_train.loc[pos_mask], predictors, variable, + sample_weight=_subset_weights(sample_weight, pos_mask), not_numeric_categorical=not_numeric_categorical, ), "negative_base": self._fit_base_single( X_train.loc[neg_mask], predictors, variable, + sample_weight=_subset_weights(sample_weight, neg_mask), not_numeric_categorical=not_numeric_categorical, ), } @@ -397,7 +458,7 @@ def _fit_single_numeric( np.where(y < -self.zero_atol, 0, 1), ) clf = _make_classifier(self.classifier_type, self.seed) - clf.fit(X_pred, labels) + clf.fit(X_pred, labels, sample_weight=sample_weight) pos_mask = y > self.zero_atol neg_mask = y < -self.zero_atol return { @@ -407,12 +468,14 @@ def _fit_single_numeric( X_train.loc[pos_mask], predictors, variable, + sample_weight=_subset_weights(sample_weight, pos_mask), not_numeric_categorical=not_numeric_categorical, ), "negative_base": self._fit_base_single( X_train.loc[neg_mask], predictors, variable, + sample_weight=_subset_weights(sample_weight, neg_mask), not_numeric_categorical=not_numeric_categorical, ), } @@ -424,9 +487,14 @@ def _fit_base_single( X_train: pd.DataFrame, predictors: List[str], variable: str, + sample_weight: Optional[np.ndarray] = None, not_numeric_categorical: Optional[List[str]] = None, ) -> ImputerResults: - """Fit a single base Imputer on a (possibly filtered) slice.""" + """Fit a single base imputer on a (possibly filtered) slice. + + ``sample_weight`` must already be sliced to ``X_train``'s rows; + it is forwarded to the base imputer's ``weight_col``. 
+ """ imputer = self.base_imputer_class( log_level="ERROR", **self.base_imputer_kwargs, @@ -435,11 +503,12 @@ def _fit_base_single( X_train=X_train, predictors=predictors, imputed_variables=[variable], + weight_col=sample_weight, not_numeric_categorical=not_numeric_categorical, ) -class ZeroInflatedImputerResults(ImputerResults): +class RegimeGatedImputerResults(ImputerResults): """Fitted regime-aware imputer ready for prediction.""" def __init__( @@ -498,6 +567,56 @@ def predict( } return self._predict_single_draw(X_test, quantile=None, **kwargs) + @property + def regimes_(self) -> Dict[str, str]: + """Detected sign regime per numeric target. + + Maps each numeric (regime-gated) target to its regime label + (one of the ``REGIME_*`` constants, or ``REGIME_NO_GATE`` when + sign-regime gating is disabled). + """ + return dict(self._regimes) + + @property + def predictors_(self) -> Dict[str, List[str]]: + """Chained predictor list per numeric target. + + Each target maps to the predictor columns used, in order — the + original predictors followed by the previously-imputed targets + this variable was chained on. + """ + return { + var: list(self._per_variable[var]["predictors"]) + for var in self._per_variable + } + + @property + def models_(self) -> Dict[str, Dict[str, Any]]: + """Fitted sub-estimators per numeric target, keyed by role. + + Each target maps to a ``{role: estimator}`` dict translating the + internal bundle keys to sklearn-style roles: ``base`` -> + ``single``, ``classifier`` -> ``gate``, ``positive_base`` -> + ``positive``, ``negative_base`` -> ``negative``. Only actual + fitted models are included; bookkeeping keys (``kind``, + ``predictors``, ``value``) are skipped, and a ``constant``-kind + bundle yields an empty role dict. 
+ """ + key_to_role = { + "base": "single", + "classifier": "gate", + "positive_base": "positive", + "negative_base": "negative", + } + out: Dict[str, Dict[str, Any]] = {} + for var, bundle in self._per_variable.items(): + roles: Dict[str, Any] = {} + for key, role in key_to_role.items(): + if key in bundle: + roles[role] = bundle[key] + out[var] = roles + return out + def _predict_single_draw( self, X_test: pd.DataFrame, @@ -623,7 +742,8 @@ def _predict_single_variable( # Classes are [0=neg, 1=zero, 2=pos] per the fit encoding. cumulative = np.cumsum(probas, axis=1) u = self._rng.random(n) - # Each row i is assigned to class argmax over k of (cumulative[i,k] >= u[i]). + # Each row i is assigned to class argmax over k of + # (cumulative[i,k] >= u[i]). class_indices = (cumulative >= u[:, None]).argmax(axis=1) classes = clf.classes_[class_indices] values = np.zeros(n, dtype=float) @@ -699,7 +819,7 @@ def _predict(self, *args: Any, **kwargs: Any) -> Any: but the abstract method still must be satisfied. """ raise NotImplementedError( - "ZeroInflatedImputerResults overrides `predict` directly; " + "RegimeGatedImputerResults overrides `predict` directly; " "`_predict` is not used." 
) @@ -707,11 +827,12 @@ def _predict(self, *args: Any, **kwargs: Any) -> Any: __all__ = [ "REGIME_DEGENERATE_ZERO", "REGIME_NEGATIVE_ONLY", + "REGIME_NO_GATE", "REGIME_POSITIVE_ONLY", "REGIME_SIGN_ONLY", "REGIME_THREE_SIGN", "REGIME_ZI_NEGATIVE", "REGIME_ZI_POSITIVE", - "ZeroInflatedImputer", - "ZeroInflatedImputerResults", + "Imputer", + "RegimeGatedImputerResults", ] diff --git a/microimpute/utils/statmatch_hotdeck.py b/microimpute/utils/statmatch_hotdeck.py index 9cbf14e..8abbf95 100644 --- a/microimpute/utils/statmatch_hotdeck.py +++ b/microimpute/utils/statmatch_hotdeck.py @@ -106,7 +106,7 @@ def nnd_hotdeck_using_rpy2( r_match = ro.StrVector(matching_variables) r_z = ro.StrVector(z_variables) - # Extract optional donor sample weights (threaded from Imputer.fit + # Extract optional donor sample weights (threaded from BaseImputer.fit # when weight_col was supplied). StatMatch accepts these via the # ``weight.don`` R argument; we pop it from matching_kwargs so that # other kwargs pass through unchanged. diff --git a/tests/test_models/test_imputers.py b/tests/test_models/test_imputers.py index bd7016f..f5fdea2 100644 --- a/tests/test_models/test_imputers.py +++ b/tests/test_models/test_imputers.py @@ -1,8 +1,8 @@ """ -Comprehensive test module for the Imputer abstract class and its implementations. +Comprehensive test module for the BaseImputer abstract class and its implementations. This module tests the compatibility and interchangeability of different -imputer models through the common Imputer interface, including edge cases +imputer models through the common BaseImputer interface, including edge cases and error handling. 
""" @@ -90,7 +90,7 @@ def data_with_edge_cases() -> pd.DataFrame: @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_init_signatures(model_class: Type[Imputer]) -> None: +def test_init_signatures(model_class: Type[BaseImputer]) -> None: """Test that all models can be initialized without required arguments.""" model = model_class() assert model.predictors is None, ( @@ -105,7 +105,7 @@ def test_init_signatures(model_class: Type[Imputer]) -> None: "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) def test_fit_predict_interface( - model_class: Type[Imputer], diabetes_data: pd.DataFrame + model_class: Type[BaseImputer], diabetes_data: pd.DataFrame ) -> None: """Test the fit and predict methods for each model.""" quantiles = QUANTILES @@ -146,7 +146,7 @@ def test_fit_predict_interface( @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_categorical_variables(model_class: Type[Imputer]) -> None: +def test_categorical_variables(model_class: Type[BaseImputer]) -> None: """Test that models handle categorical variables correctly.""" np.random.seed(42) data = pd.DataFrame( @@ -180,7 +180,7 @@ def test_categorical_variables(model_class: Type[Imputer]) -> None: @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_boolean_variables(model_class: Type[Imputer]) -> None: +def test_boolean_variables(model_class: Type[BaseImputer]) -> None: """Test that models handle boolean variables correctly.""" np.random.seed(42) data = pd.DataFrame( @@ -214,7 +214,7 @@ def test_boolean_variables(model_class: Type[Imputer]) -> None: "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) def test_imputation_bool_targets( - model_class: Type[Imputer], + model_class: Type[BaseImputer], ) -> None: """Test imputing boolean target variables.""" diabetes = load_diabetes() @@ -242,7 +242,7 @@ def test_imputation_bool_targets( 
"model_class", CATEGORICAL_MODELS, ids=lambda cls: cls.__name__ ) def test_imputation_categorical_targets( - model_class: Type[Imputer], + model_class: Type[BaseImputer], ) -> None: """Test imputing categorical target variables.""" diabetes = load_diabetes() @@ -310,7 +310,7 @@ def test_imputation_categorical_targets( "model_class", CATEGORICAL_MODELS, ids=lambda cls: cls.__name__ ) def test_categorical_return_probs_false( - model_class: Type[Imputer], + model_class: Type[BaseImputer], ) -> None: """Test that categorical imputation with return_probs=False returns DataFrame.""" diabetes = load_diabetes() @@ -358,7 +358,7 @@ def test_categorical_return_probs_false( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) def test_single_predictor( - model_class: Type[Imputer], simple_data: pd.DataFrame + model_class: Type[BaseImputer], simple_data: pd.DataFrame ) -> None: """Test models with only one predictor.""" X_train, X_test = preprocess_data(simple_data) @@ -382,7 +382,7 @@ def test_single_predictor( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) def test_multiple_targets( - model_class: Type[Imputer], diabetes_data: pd.DataFrame + model_class: Type[BaseImputer], diabetes_data: pd.DataFrame ) -> None: """Test models with multiple target variables.""" predictors = ["age", "sex", "bmi", "bp"] @@ -414,7 +414,7 @@ def test_multiple_targets( @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_constant_predictor(model_class: Type[Imputer]) -> None: +def test_constant_predictor(model_class: Type[BaseImputer]) -> None: """Test models with a constant predictor (no variance).""" np.random.seed(42) data = pd.DataFrame( @@ -446,7 +446,7 @@ def test_constant_predictor(model_class: Type[Imputer]) -> None: @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_constant_target(model_class: Type[Imputer]) -> None: +def test_constant_target(model_class: 
Type[BaseImputer]) -> None: """Test models with a constant target variable.""" np.random.seed(42) @@ -473,7 +473,7 @@ def test_constant_target(model_class: Type[Imputer]) -> None: @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_highly_correlated_predictors(model_class: Type[Imputer]) -> None: +def test_highly_correlated_predictors(model_class: Type[BaseImputer]) -> None: """Test models with highly correlated predictors.""" np.random.seed(42) n_samples = 100 @@ -509,7 +509,7 @@ def test_highly_correlated_predictors(model_class: Type[Imputer]) -> None: "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) def test_weighted_training( - model_class: Type[Imputer], diabetes_data: pd.DataFrame + model_class: Type[BaseImputer], diabetes_data: pd.DataFrame ) -> None: """Ensure models can be trained using sampling weights.""" X_train, _ = preprocess_data(diabetes_data) @@ -635,7 +635,7 @@ def test_weighted_fit_differs_from_unweighted( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) def test_extreme_quantiles( - model_class: Type[Imputer], simple_data: pd.DataFrame + model_class: Type[BaseImputer], simple_data: pd.DataFrame ) -> None: """Test models with extreme quantile values.""" X_train, X_test = preprocess_data(simple_data) @@ -659,7 +659,9 @@ def test_extreme_quantiles( @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_single_quantile(model_class: Type[Imputer], simple_data: pd.DataFrame) -> None: +def test_single_quantile( + model_class: Type[BaseImputer], simple_data: pd.DataFrame +) -> None: """Test models with a single quantile.""" X_train, X_test = preprocess_data(simple_data) @@ -698,7 +700,7 @@ def test_string_column_validation() -> None: @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_missing_predictors_in_test(model_class: Type[Imputer]) -> None: +def 
test_missing_predictors_in_test(model_class: Type[BaseImputer]) -> None: """Test behavior when test data is missing predictor columns.""" np.random.seed(42) train_data = pd.DataFrame( @@ -741,7 +743,9 @@ def test_missing_predictors_in_test(model_class: Type[Imputer]) -> None: _REPRODUCIBILITY_MODELS, ids=lambda cls: cls.__name__, ) -def test_reproducibility(model_class: Type[Imputer], simple_data: pd.DataFrame) -> None: +def test_reproducibility( + model_class: Type[BaseImputer], simple_data: pd.DataFrame +) -> None: # Note: MDN is excluded because PyTorch MPS (Apple Silicon) doesn't support # deterministic operations, making reproducibility tests unreliable. """Test that models produce reproducible results.""" @@ -770,7 +774,7 @@ def test_reproducibility(model_class: Type[Imputer], simple_data: pd.DataFrame) @pytest.mark.parametrize( "model_class", ALL_IMPUTER_MODELS, ids=lambda cls: cls.__name__ ) -def test_large_number_of_predictors(model_class: Type[Imputer]) -> None: +def test_large_number_of_predictors(model_class: Type[BaseImputer]) -> None: """Test models with many predictors.""" np.random.seed(42) n_samples = 50 diff --git a/tests/test_models/test_qrf.py b/tests/test_models/test_qrf.py index e3099f7..3dda32f 100644 --- a/tests/test_models/test_qrf.py +++ b/tests/test_models/test_qrf.py @@ -170,6 +170,110 @@ def test_qrf_model_prediction_reorders_to_fitted_feature_order() -> None: assert not predictions.isna().any() +def test_qrf_feature_importances_self_consistent_with_categorical() -> None: + """Regression test: feature_importances_ keys and values must align. + + A categorical predictor expands into several dummy columns, so the + forest's importance vector is LONGER than the list of original input + predictor names. The prior implementation paired + ``feature_names_in_`` (original names) positionally with the raw + importance array — a silent length/order mismatch under dummy + encoding. 
The fix keys ``feature_importances_`` by the forest's + actual fitted columns (always self-consistent), and reports the + original predictor names via ``feature_names_in_``. + """ + rng = np.random.default_rng(0) + n = 400 + cat = rng.choice(["red", "green", "blue"], size=n) + offset = pd.Series(cat).map({"red": 0.0, "green": 5.0, "blue": 10.0}) + data = pd.DataFrame( + { + "x": rng.normal(size=n), + "color": cat, + "y": 20.0 + offset.to_numpy() + rng.exponential(1.0, size=n), + } + ) + + fitted = QRF().fit( + data, + predictors=["x", "color"], + imputed_variables=["y"], + n_estimators=20, + ) + + # feature_names_in_ is the ORIGINAL input predictor names (sklearn + # convention), as an ndarray. + assert isinstance(fitted.feature_names_in_, np.ndarray) + assert set(fitted.feature_names_in_) == {"x", "color"} + + importances = fitted.feature_importances_ + assert isinstance(importances, dict) + # Keyed by the forest's actual fitted columns -> dummy expansion means + # MORE importance entries than original predictor names. This is the + # exact mismatch the old positional pairing got wrong. + qrf_model = fitted.models["y"] + assert set(importances) == set(qrf_model.feature_columns) + assert len(importances) == len(qrf_model.qrf.feature_importances_) + assert len(importances) > len(fitted.feature_names_in_) + # Every value is finite and they sum to ~1 (sklearn importances). + assert all(np.isfinite(v) for v in importances.values()) + assert abs(sum(importances.values()) - 1.0) < 1e-6 + # The expanded color dummies carry signal -> at least one nonzero. 
+ color_keys = [k for k in importances if str(k).startswith("color")] + assert color_keys + assert any(importances[k] > 0 for k in color_keys) + + +def test_qrf_feature_importances_multiple_variables_is_nested() -> None: + """Multiple imputed variables -> {var: {feature: importance}}.""" + rng = np.random.default_rng(1) + n = 400 + data = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y1": rng.exponential(1.0, size=n) + 1.0, + "y2": rng.exponential(1.0, size=n) + 1.0, + } + ) + fitted = QRF().fit( + data, + predictors=["x1", "x2"], + imputed_variables=["y1", "y2"], + n_estimators=15, + ) + importances = fitted.feature_importances_ + assert set(importances) == {"y1", "y2"} + for var in ("y1", "y2"): + per_var = importances[var] + assert isinstance(per_var, dict) + assert set(per_var) == set(fitted.models[var].feature_columns) + + +def test_qrf_feature_importances_absent_for_classifier_base() -> None: + """Non-QRF bases (classifier/constant) must NOT expose importances. + + ``feature_importances_`` raises ``AttributeError`` for a categorical + (classifier) target so ``hasattr`` reports ``False``, matching the + sklearn ``hasattr``-probe convention used by callers. 
+ """ + rng = np.random.default_rng(2) + n = 300 + data = pd.DataFrame( + { + "x": rng.normal(size=n), + "cat_target": rng.choice(["a", "b", "c"], size=n), + } + ) + fitted = QRF().fit( + data, + predictors=["x"], + imputed_variables=["cat_target"], + n_estimators=15, + ) + assert not hasattr(fitted, "feature_importances_") + + def test_qrf_beta_distribution_sampling(): """Test different mean_quantile values for beta distribution sampling.""" np.random.seed(42) @@ -283,7 +387,9 @@ def test_qrf_row_filter_applies_common_training_mask() -> None: assert not predictions[0.5].isna().any().any() -def test_qrf_target_filters_reject_unknown_targets(simple_data: pd.DataFrame) -> None: +def test_qrf_target_filters_reject_unknown_targets( + simple_data: pd.DataFrame, +) -> None: with pytest.raises(ValueError, match="target_filters contains variables"): QRF().fit( simple_data, diff --git a/tests/test_models/test_zero_inflated.py b/tests/test_models/test_regime_gated.py similarity index 69% rename from tests/test_models/test_zero_inflated.py rename to tests/test_models/test_regime_gated.py index 7621c14..0a7452b 100644 --- a/tests/test_models/test_zero_inflated.py +++ b/tests/test_models/test_regime_gated.py @@ -1,4 +1,4 @@ -"""Regime-aware zero-inflation wrapper around base imputers. +"""Canonical regime-gated ``Imputer`` over base imputers. Tabular microdata variables fall into seven regimes based on which of {negative, zero, positive} values appear in the training data: @@ -24,7 +24,7 @@ 1. Regime detection from training data is correct. 2. Predictions respect the detected regime (no zero leaks, no sign-interpolation between positive and negative regimes). -3. Fit/predict lifecycle matches the base `Imputer` contract. +3. Fit/predict lifecycle matches the base ``BaseImputer`` contract. 4. Pure support detection: any observed negative, zero, or positive support participates in regime selection. 
""" @@ -62,31 +62,31 @@ class TestRegimeDetection: """Regime auto-detection from the training distribution.""" def test_regime_detection_is_importable(self) -> None: - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer - assert ZeroInflatedImputer is not None + assert Imputer is not None def test_positive_plus_zero_is_zi_positive(self) -> None: """97% zeros + 3% positives → ZI_POSITIVE regime.""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) y = np.where(rng.random(500) > 0.97, rng.exponential(100, 500), 0.0) data = _deterministic_frame(500, y) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) imputer.fit(data, predictors=["age", "income_bin"], imputed_variables=["y"]) assert imputer.get_regime("y") == "ZI_POSITIVE" def test_negative_plus_zero_is_zi_negative(self) -> None: """97% zeros + 3% negatives → ZI_NEGATIVE regime (mirror).""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) y = np.where(rng.random(500) > 0.97, -rng.exponential(100, 500), 0.0) data = _deterministic_frame(500, y) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) imputer.fit(data, predictors=["age", "income_bin"], imputed_variables=["y"]) assert imputer.get_regime("y") == "ZI_NEGATIVE" @@ -96,7 +96,7 @@ def test_three_sign_mass_is_three_sign(self) -> None: Fixture models a capital-gains-like distribution: 70% zero, 15% positive, 15% negative, with distinct pos/neg means. 
""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) n = 1000 @@ -108,28 +108,28 @@ def test_three_sign_mass_is_three_sign(self) -> None: y[neg_mask] = -rng.exponential(300, size=neg_mask.sum()) data = _deterministic_frame(n, y) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) imputer.fit(data, predictors=["age", "income_bin"], imputed_variables=["y"]) assert imputer.get_regime("y") == "THREE_SIGN" def test_positive_only_is_positive_only(self) -> None: """All positive, no zeros → POSITIVE_ONLY (no gate, raw base imputer).""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) y = rng.exponential(100, size=500) # strictly positive data = _deterministic_frame(500, y) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) imputer.fit(data, predictors=["age", "income_bin"], imputed_variables=["y"]) assert imputer.get_regime("y") == "POSITIVE_ONLY" def test_constant_zero_is_degenerate(self) -> None: """All zeros → DEGENERATE_ZERO, predictions are exactly 0.""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer data = _deterministic_frame(500, np.zeros(500)) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) imputer.fit(data, predictors=["age", "income_bin"], imputed_variables=["y"]) assert imputer.get_regime("y") == "DEGENERATE_ZERO" @@ -140,7 +140,7 @@ def test_rare_negative_tail_triggers_three_sign(self) -> None: The negative mass is real and should trigger THREE_SIGN without caller-supplied support/sign metadata. 
""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) n = 500 @@ -154,7 +154,7 @@ def test_rare_negative_tail_triggers_three_sign(self) -> None: assert (y == 0).sum() > 0, "fixture precondition" data = _deterministic_frame(n, y) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) imputer.fit(data, predictors=["age", "income_bin"], imputed_variables=["y"]) assert imputer.get_regime("y") == "THREE_SIGN" @@ -163,10 +163,10 @@ class TestPredictionsRespectRegime: """Predicted values must lie in the regime's support.""" def _fit_predict(self, y: np.ndarray, n_pred: int = 200) -> pd.Series: - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer data = _deterministic_frame(len(y), y, seed=1) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) result = imputer.fit( data, predictors=["age", "income_bin"], imputed_variables=["y"] ) @@ -218,7 +218,7 @@ def test_three_sign_predictions_do_not_leak_across_zero(self) -> None: strictly below `max(train_negatives)` on records the gate calls negative. 
No record should land in the interpolated band between max(neg) and min(pos).""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) n = 2000 @@ -232,7 +232,7 @@ def test_three_sign_predictions_do_not_leak_across_zero(self) -> None: y[neg_mask] = -(100 + rng.exponential(300, size=neg_mask.sum())) data = _deterministic_frame(n, y, seed=2) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) result = imputer.fit( data, predictors=["age", "income_bin"], imputed_variables=["y"] ) @@ -259,13 +259,13 @@ def test_three_sign_predictions_do_not_leak_across_zero(self) -> None: class TestBaseImputerParity: - """Single-regime behavior must match the base Imputer (no gate overhead).""" + """Single-regime behavior must match the base imputer (no gate overhead).""" def test_positive_only_matches_bare_qrf(self) -> None: - """For a strictly-positive target, ZeroInflatedImputer should + """For a strictly-positive target, Imputer should produce equivalent distributions to QRF directly (since regime detection returns POSITIVE_ONLY and no gate is applied).""" - from microimpute.models.zero_inflated import ZeroInflatedImputer + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(0) y = rng.exponential(100, size=500) # strictly positive @@ -276,7 +276,7 @@ def test_positive_only_matches_bare_qrf(self) -> None: data, predictors=["age", "income_bin"], imputed_variables=["y"] ) - wrapped = ZeroInflatedImputer(base_imputer_class=QRF) + wrapped = Imputer(base_imputer_class=QRF) wrapped_result = wrapped.fit( data, predictors=["age", "income_bin"], imputed_variables=["y"] ) @@ -305,11 +305,111 @@ def test_positive_only_matches_bare_qrf(self) -> None: ) +class TestWeightedFit: + """``weight_col`` must reach every nested numeric-target fit. 
+ + Regression tests for the silent weight-discard bug: ``fit`` only + forwarded ``weight_col`` to the auxiliary non-numeric base imputer, + so gate classifiers and per-regime base imputers for numeric + targets all trained unweighted and ``weight_col`` had no effect on + numeric imputations. + """ + + def test_weight_col_shifts_numeric_base_imputer(self) -> None: + """Weighted draws must track the weighted training distribution. + + Donor data mixes a heavily-downweighted high-value block + (weight 0.001, y ~ 1e6) into a dominant low-value block + (weight 1, y ~ 1e4), so the weighted and unweighted means + differ by ~20x. An unweighted fit imputes means near the + unweighted training mean; a weighted fit must not. + """ + from microimpute.models.regime_gated import Imputer + + rng = np.random.default_rng(0) + n = 5000 + weights = np.where(rng.random(n) < 0.2, 0.001, 1.0) + y = np.where(weights < 0.5, 1_000_000.0, 10_000.0) * (1 + 0.1 * rng.random(n)) + donor = pd.DataFrame({"x": rng.normal(size=n), "y": y, "weight": weights}) + target = pd.DataFrame({"x": rng.normal(size=2000)}) + + unweighted_mean = y.mean() + weighted_mean = np.average(y, weights=weights) + + means = {} + for weight_col in (None, "weight"): + imputer = Imputer(seed=0, log_level="WARNING") + fitted = imputer.fit( + X_train=donor, + predictors=["x"], + imputed_variables=["y"], + weight_col=weight_col, + ) + means[weight_col] = fitted.predict(target.copy())["y"].mean() + + assert abs(means["weight"] - weighted_mean) / weighted_mean < 0.2, ( + f"Weighted fit imputed mean {means['weight']:.0f}, expected " + f"within 20% of the weighted training mean {weighted_mean:.0f}." + ) + assert abs(means[None] - unweighted_mean) / unweighted_mean < 0.2, ( + f"Unweighted fit imputed mean {means[None]:.0f}, expected " + f"within 20% of the unweighted training mean " + f"{unweighted_mean:.0f}." 
+ ) + assert means[None] / means["weight"] > 5.0, ( + "Weighted and unweighted fits imputed similar means " + f"({means[None]:.0f} vs {means['weight']:.0f}); weight_col " + "appears to be silently ignored for numeric targets." + ) + + def test_weight_col_shifts_gate_classifier(self) -> None: + """Weights must also reach the zero-gate classifier fit. + + Donor data is 50/50 zero vs positive, but the positive rows + are downweighted 20x, so the weighted positive share is ~4.8%. + A weighted ZI_POSITIVE fit must draw far fewer positives than + an unweighted one. + """ + from microimpute.models.regime_gated import Imputer + + rng = np.random.default_rng(0) + n = 4000 + positive = rng.random(n) < 0.5 + weights = np.where(positive, 0.05, 1.0) + y = np.where(positive, 100.0 + 50.0 * rng.random(n), 0.0) + donor = pd.DataFrame({"x": rng.normal(size=n), "y": y, "weight": weights}) + target = pd.DataFrame({"x": rng.normal(size=2000)}) + + shares = {} + for weight_col in (None, "weight"): + imputer = Imputer(seed=0, log_level="WARNING") + fitted = imputer.fit( + X_train=donor, + predictors=["x"], + imputed_variables=["y"], + weight_col=weight_col, + ) + assert fitted.regimes_["y"] == "ZI_POSITIVE" + preds = fitted.predict(target.copy())["y"] + shares[weight_col] = float((preds > 0).mean()) + + # Unweighted positive share is ~0.5; weighted is ~0.048. + assert shares[None] > 0.35, ( + f"Unweighted fit drew {shares[None]:.3f} positive; expected ~0.5." + ) + assert shares["weight"] < 0.15, ( + f"Weighted fit drew {shares['weight']:.3f} positive; expected " + "~0.05 — the gate classifier appears to ignore weights." 
+ ) + + class TestSequentialPredictorTyping: """Numeric overrides must reach nested per-regime base imputers.""" - def test_chained_numeric_targets_stay_numeric_in_nested_base_fits(self) -> None: - from microimpute.models.zero_inflated import ZeroInflatedImputer + def test_chained_numeric_targets_stay_numeric_in_nested_base_fits( + self, + ) -> None: + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(42) y1 = np.tile(np.arange(20, dtype=float), 10) @@ -328,7 +428,7 @@ def test_chained_numeric_targets_stay_numeric_in_nested_base_fits(self) -> None: } ) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) fitted = imputer.fit( data, predictors=["x"], @@ -342,8 +442,10 @@ def test_chained_numeric_targets_stay_numeric_in_nested_base_fits(self) -> None: assert set(predictions.columns) == {"y1", "y2"} assert not predictions.isna().any().any() - def test_not_numeric_categorical_applies_to_chained_base_predictors(self) -> None: - from microimpute.models.zero_inflated import ZeroInflatedImputer + def test_not_numeric_categorical_applies_to_chained_base_predictors( + self, + ) -> None: + from microimpute.models.regime_gated import Imputer rng = np.random.default_rng(42) n = 180 @@ -358,7 +460,7 @@ def test_not_numeric_categorical_applies_to_chained_base_predictors(self) -> Non } ) - imputer = ZeroInflatedImputer(base_imputer_class=QRF) + imputer = Imputer(base_imputer_class=QRF) fitted = imputer.fit( data, predictors=["x"], diff --git a/tests/test_models/test_regime_gated_chaining.py b/tests/test_models/test_regime_gated_chaining.py new file mode 100644 index 0000000..9b34b3f --- /dev/null +++ b/tests/test_models/test_regime_gated_chaining.py @@ -0,0 +1,178 @@ +"""Sequential (chained-equations) imputation + fitted state in Imputer. 
+ +A correct chained imputer conditions each target on the previously imputed +ones, so it reproduces cross-variable correlation that is *not* explained +by the shared predictors. We construct two targets correlated through an +unobserved latent factor and confirm that imputing them together (one +chained list call) recovers that correlation, while imputing them in +separate per-variable calls (the old microplex per-column pattern) does +not. We also check the sklearn-style fitted attributes (``regimes_``, +``predictors_``, ``models_``) and the sub-estimators' standard +``feature_importances_`` / ``feature_names_in_``, including their +self-consistency under a categorical predictor. +""" + +import numpy as np +import pandas as pd + +from microimpute.models.qrf import QRF +from microimpute.models.regime_gated import REGIME_NO_GATE, Imputer + + +def _make_latent_correlated_frame(n: int, seed: int) -> pd.DataFrame: + """Two positive targets A, B that share a latent factor L *with + opposite sign*, so they are strongly NEGATIVELY correlated. + + The shared predictor X explains almost none of the A-B relationship; + it runs through L, which is never observed. An imputer that draws B + independently of A cannot reproduce this dependence. Only one that + conditions B on the already-imputed A recovers it, because A reveals + L (given X, A pins down L, which then pins down B). 
+ """ + rng = np.random.default_rng(seed) + x = rng.normal(size=n) + latent = rng.normal(size=n) # unobserved + a = 10.0 + 0.2 * x + 1.5 * latent + 0.3 * rng.normal(size=n) + b = 20.0 + 0.2 * x - 1.5 * latent + 0.3 * rng.normal(size=n) + return pd.DataFrame({"x": x, "a": a, "b": b}) + + +def _chained_correlation() -> tuple[float, float]: + """Impute [a, b] together (chained) and return (imputed, true) corr.""" + train = _make_latent_correlated_frame(n=4000, seed=0) + test = _make_latent_correlated_frame(n=4000, seed=1) + fitted = Imputer(seed=0).fit( + X_train=train, predictors=["x"], imputed_variables=["a", "b"] + ) + preds = fitted.predict(test[["x"]]) + return ( + float(np.corrcoef(preds["a"], preds["b"])[0, 1]), + float(np.corrcoef(test["a"], test["b"])[0, 1]), + ) + + +def _independent_correlation() -> float: + """Impute a and b in *separate* single-variable calls (the old + microplex per-column pattern), then measure their correlation.""" + train = _make_latent_correlated_frame(n=4000, seed=0) + test = _make_latent_correlated_frame(n=4000, seed=1) + a = ( + Imputer(seed=0) + .fit(X_train=train, predictors=["x"], imputed_variables=["a"]) + .predict(test[["x"]])["a"] + ) + b = ( + Imputer(seed=0) + .fit(X_train=train, predictors=["x"], imputed_variables=["b"]) + .predict(test[["x"]])["b"] + ) + return float(np.corrcoef(a, b)[0, 1]) + + +def test_chaining_recovers_joint_correlation(): + seq_corr, true_corr = _chained_correlation() + indep_corr = _independent_correlation() + + # The true A-B correlation is strongly negative (opposite latent loads). + assert true_corr < -0.85, true_corr + # One chained list call conditions b on the already-imputed a (which + # reveals the latent factor), recovering the true negative dependence. + assert seq_corr < -0.7, seq_corr + assert abs(seq_corr - true_corr) < 0.15, (seq_corr, true_corr) + # Imputing a and b in separate calls never lets b see a, so it misses + # most of the dependence. 
+ assert seq_corr < indep_corr - 0.4, (seq_corr, indep_corr) + + +def test_fitted_attributes_expose_regimes_models_predictors(): + train = _make_latent_correlated_frame(n=2000, seed=4) + fitted = Imputer(seed=0).fit( + X_train=train, predictors=["x"], imputed_variables=["a", "b"] + ) + + # Sklearn-style fitted state, no bespoke lineage object. + assert set(fitted.regimes_) == {"a", "b"} + + # a is imputed first: only the original predictor; b is chained on a. + assert fitted.predictors_["a"] == ["x"] + assert fitted.predictors_["b"] == ["x", "a"] + + # models_ maps roles to fitted sub-estimators. + roles = fitted.models_["b"] + assert roles, "expected at least one fitted sub-estimator" + assert all(v is not None for v in roles.values()) + + # At least one QRF base sub-estimator exposes the standard sklearn + # fitted attributes, keyed by b's chained predictors. + qrf_roles = [roles[r] for r in ("single", "positive", "negative") if r in roles] + assert qrf_roles, "expected a QRF base sub-estimator by role" + bases_with_importances = [ + base for base in qrf_roles if hasattr(base, "feature_importances_") + ] + assert bases_with_importances, ( + "expected a base estimator exposing feature_importances_" + ) + base = bases_with_importances[0] + assert hasattr(base, "feature_importances_") + assert set(base.feature_names_in_).issubset({"x", "a"}), list( + base.feature_names_in_ + ) + + +def test_signregime_false_disables_gating(): + train = _make_latent_correlated_frame(n=1500, seed=6) + fitted = Imputer(signregime=False, seed=0).fit( + X_train=train, predictors=["x"], imputed_variables=["a", "b"] + ) + assert fitted.regimes_["a"] == REGIME_NO_GATE + # Still produces values and still chains predictors. 
+ preds = fitted.predict(train[["x"]]) + assert set(preds.columns) >= {"a", "b"} + assert fitted.predictors_["b"] == ["x", "a"] + + +def test_single_target_chained_predictor_is_just_the_originals(): + """A one-variable list has no prior to chain on.""" + train = _make_latent_correlated_frame(n=1500, seed=2) + fitted = Imputer(seed=7).fit( + X_train=train, predictors=["x"], imputed_variables=["a"] + ) + assert fitted.predictors_["a"] == ["x"] + + +def test_base_estimators_expose_self_consistent_feature_attributes(): + """Wrapper sub-estimators carry self-consistent sklearn attributes. + + Each chained numeric target's QRF base sub-estimator must expose a + ``feature_importances_`` dict keyed by the forest's ACTUAL fitted + columns (so names and values always align in length and order) and a + ``feature_names_in_`` reporting the original input predictor names. + ``b`` is chained on ``a``, so its base sub-estimator's fitted columns + are exactly ``{x, a}`` — the very name/value pairing the old + positional implementation could misalign. (Self-consistency under a + categorical predictor that expands into dummy columns is covered at + the ``QRFResults`` level in test_qrf.py, since the wrapper's gate + path requires numeric predictors.) + """ + train = _make_latent_correlated_frame(n=1200, seed=11) + fitted = Imputer(base_imputer_class=QRF, seed=0).fit( + X_train=train, predictors=["x"], imputed_variables=["a", "b"] + ) + # The per-variable single QRF bases each expose a self-consistent dict. + base_a = fitted.models_["a"]["single"] + base_b = fitted.models_["b"]["single"] + for base, var, expected_names in ( + (base_a, "a", {"x"}), + (base_b, "b", {"x", "a"}), + ): + # feature_names_in_ is the original input predictor names. + assert set(base.feature_names_in_) == expected_names + importances = base.feature_importances_ + # Single imputed variable per base -> a flat {feature: importance} + # dict keyed by the forest's actual fitted columns. 
+ assert isinstance(importances, dict) + qrf_model = base.models[var] + assert set(importances) == set(qrf_model.feature_columns) + # One value per fitted column — no silent zip-shortening. + assert len(importances) == len(qrf_model.qrf.feature_importances_) + assert all(np.isfinite(v) for v in importances.values()) diff --git a/tests/test_models/test_zero_inflated_chaining.py b/tests/test_models/test_zero_inflated_chaining.py deleted file mode 100644 index 2a8b885..0000000 --- a/tests/test_models/test_zero_inflated_chaining.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Sequential (chained-equations) imputation in ZeroInflatedImputer. - -A correct chained imputer conditions each target on the previously -imputed ones, so it reproduces cross-variable correlation that is *not* -explained by the shared predictors. We construct two targets that are -correlated through an unobserved latent factor and confirm that the -sequential imputer recovers that correlation while the non-sequential -(per-variable independent) imputer does not. -""" - -import numpy as np -import pandas as pd - -from microimpute.models.zero_inflated import ZeroInflatedImputer - - -def _make_latent_correlated_frame(n: int, seed: int) -> pd.DataFrame: - """Two positive targets A, B that share a latent factor L *with - opposite sign*, so they are strongly NEGATIVELY correlated. - - The shared predictor X explains almost none of the A-B relationship; - it runs through L, which is never observed. An imputer that draws B - independently of A cannot reproduce this dependence. Only one that - conditions B on the already-imputed A recovers it, because A reveals - L (given X, A pins down L, which then pins down B). 
- """ - rng = np.random.default_rng(seed) - x = rng.normal(size=n) - latent = rng.normal(size=n) # unobserved - a = 10.0 + 0.2 * x + 1.5 * latent + 0.3 * rng.normal(size=n) - b = 20.0 + 0.2 * x - 1.5 * latent + 0.3 * rng.normal(size=n) - return pd.DataFrame({"x": x, "a": a, "b": b}) - - -def _imputed_correlation(sequential: bool) -> tuple[float, float]: - train = _make_latent_correlated_frame(n=4000, seed=0) - test = _make_latent_correlated_frame(n=4000, seed=1) - - imp = ZeroInflatedImputer(sequential=sequential, seed=0) - fitted = imp.fit( - X_train=train, - predictors=["x"], - imputed_variables=["a", "b"], - ) - preds = fitted.predict(test[["x"]]) - imputed_corr = float(np.corrcoef(preds["a"], preds["b"])[0, 1]) - true_corr = float(np.corrcoef(test["a"], test["b"])[0, 1]) - return imputed_corr, true_corr - - -def test_sequential_chaining_recovers_joint_correlation(): - seq_corr, true_corr = _imputed_correlation(sequential=True) - indep_corr, _ = _imputed_correlation(sequential=False) - - # The true A-B correlation is strongly negative (opposite latent loads). - assert true_corr < -0.85, true_corr - # Chained imputation conditions b on the already-imputed a (which - # reveals the latent factor), recovering the true negative dependence - # almost exactly. - assert seq_corr < -0.7, seq_corr - assert abs(seq_corr - true_corr) < 0.15, (seq_corr, true_corr) - # Non-sequential per-variable imputation never sees a when drawing b, - # so it misses most of the dependence. 
- assert seq_corr < indep_corr - 0.4, (seq_corr, indep_corr) - - -def test_single_target_is_unaffected_by_sequential_flag(): - """A one-variable list has no prior to chain on, so the flag is a no-op.""" - train = _make_latent_correlated_frame(n=1500, seed=2) - test = _make_latent_correlated_frame(n=1500, seed=3) - - out = {} - for sequential in (True, False): - imp = ZeroInflatedImputer(sequential=sequential, seed=7) - fitted = imp.fit(X_train=train, predictors=["x"], imputed_variables=["a"]) - out[sequential] = fitted.predict(test[["x"]])["a"].to_numpy() - - np.testing.assert_allclose(out[True], out[False])