Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog.d/imputer-canonical.breaking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
**Renamed `ZeroInflatedImputer` to the canonical `microimpute.Imputer`** and made it the opinionated default. The previous abstract base class `Imputer` is now `BaseImputer` (still exported). `microimpute.Imputer` is the regime-gated, QRF-based, sequentially-chained imputer:

- **Sign-regime gating** (`{neg, 0, pos}`) on by default (`signregime=True`); pass `signregime=False` to impute each numeric target with the base model directly (no gate, the `REGIME_NO_GATE` path).
- **QRF base model** by default (`base_imputer_class=QRF`); swap for experiments.
- **Sequential chained-equations imputation is always on** — imputing a list of targets conditions each on the previously-imputed ones, preserving cross-variable joint structure. The old per-variable-independent path and its `sequential` flag are removed.
- The fitted result exposes its state sklearn-style — `regimes_`, `predictors_` (the chained predictor list per target), and `models_` (sub-estimators by role: single/gate/positive/negative). QRF base sub-estimators carry standard `feature_importances_`/`feature_names_in_`: `feature_importances_` is a `{fitted_feature: importance}` dict keyed by the forest's actual fitted columns (so names and values always align, even when a categorical predictor expands into dummy columns), and `feature_names_in_` reports the original input predictor names.

Migration: replace `from microimpute.models.zero_inflated import ZeroInflatedImputer` with `from microimpute import Imputer`; references to the old base class `Imputer` become `BaseImputer`.
1 change: 1 addition & 0 deletions changelog.d/imputer-canonical.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
**`Imputer.fit(weight_col=...)` now actually weights numeric-target imputations.** Previously the regime-gated fit forwarded `weight_col` only to the auxiliary non-numeric base imputer; the gate classifiers and per-regime base imputers for numeric targets all trained unweighted, so sampling weights were silently ignored. Weights are now resolved once (column name, array, or Series) and threaded through every nested fit: gate classifiers receive them as `sample_weight` and per-regime base imputers as `weight_col`, sliced with the same row mask as the training slice. Separately, the QRF learner now honors `sample_weight` by weighted bootstrap resampling of its training rows: `quantile_forest` only uses native `sample_weight` as a zero-weight leaf filter (and fully-grown forest leaves are single-sample), so the previous native pass-through left leaf quantile distributions — and every value imputed from them — unweighted.
13 changes: 5 additions & 8 deletions microimpute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,7 @@
)

# Main configuration
from microimpute.config import (
PLOT_CONFIG,
QUANTILES,
RANDOM_STATE,
VALIDATE_CONFIG,
)
from microimpute.config import PLOT_CONFIG, QUANTILES, RANDOM_STATE, VALIDATE_CONFIG

# Import evaluation modules
from microimpute.evaluations.cross_validation import cross_validate_model
Expand All @@ -61,8 +56,10 @@
progressive_predictor_inclusion,
)

# Import main models and utilities
from microimpute.models import OLS, QRF, Imputer, ImputerResults, QuantReg
# Import main models and utilities. ``Imputer`` is the canonical
# regime-gated, QRF-based, sequentially-chained imputer; ``BaseImputer``
# is the abstract base class all models extend.
from microimpute.models import OLS, QRF, BaseImputer, Imputer, ImputerResults, QuantReg

# Import data handling functions
from microimpute.utils.data import preprocess_data, unnormalize_predictions
Expand Down
10 changes: 5 additions & 5 deletions microimpute/comparisons/autoimpute.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
TRAIN_SIZE,
VALIDATE_CONFIG,
)
from microimpute.models import OLS, QRF, Imputer, QuantReg
from microimpute.models import OLS, QRF, BaseImputer, QuantReg
from microimpute.utils.data import (
un_asinh_transform_predictions,
unlog_transform_predictions,
Expand Down Expand Up @@ -121,7 +121,7 @@ class AutoImputeResult(BaseModel):
receiver_data : pd.DataFrame
Copy of the receiver data with the median-quantile imputations of the best performing model attached.
fitted_models : Dict[str, Any]
Mapping model name → fitted Imputer instance.
Mapping model name → fitted BaseImputer instance.
cv_results : Dict[str, Dict[str, Any]]
Cross-validation results with separate quantile_loss and log_loss metrics for each model.
"""
Expand Down Expand Up @@ -203,7 +203,7 @@ def _setup_logging(log_level: str) -> int:


def _evaluate_models_parallel(
model_classes: List[Type[Imputer]],
model_classes: List[Type[BaseImputer]],
training_data: pd.DataFrame,
predictors: List[str],
imputed_variables: List[str],
Expand Down Expand Up @@ -279,7 +279,7 @@ def _evaluate_models_parallel(


def _generate_imputations_for_all_models(
model_classes: List[Type[Imputer]],
model_classes: List[Type[BaseImputer]],
best_method: str,
donor_data: pd.DataFrame,
receiver_data: pd.DataFrame,
Expand Down Expand Up @@ -500,7 +500,7 @@ def autoimpute(

# Get model classes
if not models:
model_classes: List[Type[Imputer]] = [QRF, OLS, QuantReg]
model_classes: List[Type[BaseImputer]] = [QRF, OLS, QuantReg]
if HAS_MATCHING:
model_classes.append(Matching)
if HAS_MDN:
Expand Down
6 changes: 3 additions & 3 deletions microimpute/comparisons/autoimpute_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
validate_quantiles,
)
from microimpute.evaluations import cross_validate_model
from microimpute.models import Imputer
from microimpute.models import BaseImputer
from microimpute.utils.data import preprocess_data

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -237,7 +237,7 @@ def prepare_data_for_imputation(


def evaluate_model(
model: Type[Imputer],
model: Type[BaseImputer],
data: pd.DataFrame,
predictors: List[str],
imputed_variables: List[str],
Expand Down Expand Up @@ -290,7 +290,7 @@ def evaluate_model(


def fit_and_predict_model(
model_class: Type[Imputer],
model_class: Type[BaseImputer],
training_data: pd.DataFrame,
imputing_data: pd.DataFrame,
predictors: List[str],
Expand Down
12 changes: 6 additions & 6 deletions microimpute/evaluations/predictor_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
TRAIN_SIZE,
VALIDATE_CONFIG,
)
from microimpute.models import Imputer, ImputerResults
from microimpute.models import BaseImputer, ImputerResults
from microimpute.utils.type_handling import (
DummyVariableProcessor,
VariableTypeDetector,
Expand Down Expand Up @@ -238,7 +238,7 @@ def leave_one_out_analysis(
data: pd.DataFrame,
predictors: List[str],
imputed_variables: List[str],
model_class: Type[Imputer],
model_class: Type[BaseImputer],
weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None,
quantiles: List[float] = QUANTILES,
train_size: float = TRAIN_SIZE,
Expand All @@ -254,7 +254,7 @@ def leave_one_out_analysis(
data: DataFrame containing the data.
predictors: List of predictor column names.
imputed_variables: List of variables to impute.
model_class: The Imputer class to use for evaluation (e.g., OLS, QRF, QuantReg).
model_class: The BaseImputer class to use for evaluation (e.g., OLS, QRF, QuantReg).
weight_col: Optional column name or array of sampling weights.
quantiles: List of quantiles for evaluation (default: [0.1, 0.5, 0.9]).
train_size: Proportion of data to use for training (default: 0.8).
Expand Down Expand Up @@ -375,7 +375,7 @@ def progressive_predictor_inclusion(
data: pd.DataFrame,
predictors: List[str],
imputed_variables: List[str],
model_class: Type[Imputer],
model_class: Type[BaseImputer],
weight_col: Optional[Union[str, np.ndarray, pd.Series]] = None,
quantiles: Optional[List[float]] = QUANTILES,
train_size: Optional[float] = TRAIN_SIZE,
Expand All @@ -391,7 +391,7 @@ def progressive_predictor_inclusion(
data: DataFrame containing the data.
predictors: List of candidate predictor column names.
imputed_variables: List of variables to impute.
model_class: The Imputer class to use for evaluation (e.g., OLS, QRF, QuantReg).
model_class: The BaseImputer class to use for evaluation (e.g., OLS, QRF, QuantReg).
weight_col: Optional column name or array of sampling weights.
quantiles: List of quantiles for evaluation.
train_size: Proportion of data to use for training.
Expand Down Expand Up @@ -573,7 +573,7 @@ def _evaluate_model_performance(
test_data: pd.DataFrame,
predictors: List[str],
imputed_variables: List[str],
model_class: Type[Imputer],
model_class: Type[BaseImputer],
weight_col: Optional[Union[str, np.ndarray, pd.Series]],
quantiles: List[float],
random_state: int,
Expand Down
12 changes: 9 additions & 3 deletions microimpute/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

This module provides a collection of statistical models for data imputation,
including both parametric and non-parametric approaches. Each model extends
the base Imputer class and provides quantile-based predictions.
the base imputer class and provides quantile-based predictions.

Available models:
- Imputer: canonical regime-gated, QRF-based, sequentially-chained
imputer (the opinionated default)
- OLS: ordinary least squares regression with bootstrapped quantiles
- QRF: quantile regression forest for non-parametric quantile regression
- QuantReg: linear quantile regression model
Expand All @@ -13,12 +15,12 @@
(optional, requires pytorch-tabular)

Base classes:
- Imputer: abstract base class for all imputation models
- BaseImputer: abstract base class for all imputation models
- ImputerResults: container for fitted model and prediction methods
"""

# Import base classes
from microimpute.models.imputer import Imputer, ImputerResults
from microimpute.models.imputer import BaseImputer, ImputerResults

try:
from microimpute.models.matching import Matching
Expand All @@ -34,3 +36,7 @@
from microimpute.models.ols import OLS
from microimpute.models.qrf import QRF
from microimpute.models.quantreg import QuantReg

# Canonical opinionated imputer: sign-regime gating + QRF base + sequential
# chained-equations imputation, all on by default.
from microimpute.models.regime_gated import Imputer
101 changes: 59 additions & 42 deletions microimpute/models/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This module defines the core architecture for imputation models in MicroImpute.
It provides two abstract base classes:
1. Imputer - For model initialization and fitting
1. BaseImputer - For model initialization and fitting
2. ImputerResults - For storing fitted models and making predictions

All model implementations should extend these classes to ensure a consistent interface.
Expand Down Expand Up @@ -35,7 +35,7 @@ def predict(self, X: pd.DataFrame, **kwargs) -> pd.Series:
return pd.Series(self.constant_value, index=X.index, name=self.variable_name)


class Imputer(ABC):
class BaseImputer(ABC):
"""
Abstract base class for fitting imputation models.

Expand Down Expand Up @@ -238,6 +238,47 @@ def preprocess_data_types(
self.logger.error(f"Error during data preprocessing: {str(e)}")
raise RuntimeError("Failed to preprocess data types") from e

@staticmethod
def _resolve_sample_weights(
    X_train: pd.DataFrame,
    weight_col: Optional[Union[str, np.ndarray, pd.Series]],
) -> Optional[np.ndarray]:
    """Resolve a weight specification to a validated per-row array.

    Accepts a column name in ``X_train``, a positional array, or a
    Series aligned by index, and returns a float array positionally
    aligned with ``X_train`` rows (``None`` when no weights given).

    Args:
        X_train: Training data whose index defines the row alignment.
        weight_col: ``None``, the name of a column in ``X_train``, a
            NumPy array taken positionally, or a Series that is
            reindexed onto ``X_train.index``.

    Returns:
        A float array with one weight per row of ``X_train``, or
        ``None`` when ``weight_col`` is ``None``.

    Raises:
        ValueError: If a named weight column is missing from
            ``X_train``, or any resolved weight is non-positive, NaN,
            or infinite.
    """
    if weight_col is None:
        return None
    if isinstance(weight_col, str):
        if weight_col not in X_train.columns:
            raise ValueError(
                f"Weight column '{weight_col}' not found in training data"
            )
        weights = X_train[weight_col]
    elif isinstance(weight_col, np.ndarray):
        weights = pd.Series(weight_col, index=X_train.index)
    else:
        weights = weight_col.reindex(X_train.index)

    # Reject every weight that is not a strictly positive, finite
    # number. ``np.isfinite`` covers NaN (e.g. from a Series reindex
    # miss) as well as +/-inf; an ``isnan``-only check would let +inf
    # through and into the sample_weight passed to learners, despite
    # the "positive and finite" contract stated in the error message.
    weights_arr = np.asarray(weights, dtype=float)
    invalid_mask = ~np.isfinite(weights_arr) | (weights_arr <= 0)
    if invalid_mask.any():
        raise ValueError(
            "Weights must be positive and finite; found "
            f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
        )
    return weights_arr

@validate_call(config=VALIDATE_CONFIG)
def fit(
self,
Expand All @@ -261,7 +302,7 @@ def fit(
X_train: DataFrame containing the training data.
predictors: List of column names to use as predictors.
imputed_variables: List of column names to impute.
weight_col: Optional name of the column or column array/series containing sampling weights. When provided, `X_train` will be sampled with replacement using this column as selection probabilities before fitting the model.
weight_col: Optional name of the column or column array/series containing sampling weights. When provided, the resolved per-row weights are passed to the model subclass as `sample_weight`, which honors them natively (e.g. OLS->WLS) or by weighted bootstrap resampling (QRF).
skip_missing: If True, skip variables missing from training data with warning. If False, raise error for missing variables.
not_numeric_categorical: Optional list of variable names that should
be treated as numeric even if they would normally be detected as
Expand Down Expand Up @@ -348,30 +389,7 @@ def fit(
except Exception as e:
raise ValueError(f"Invalid input data for model: {str(e)}") from e

weights = None
if weight_col is not None and isinstance(weight_col, str):
if weight_col not in X_train.columns:
raise ValueError(
f"Weight column '{weight_col}' not found in training data"
)
weights = X_train[weight_col]
elif weight_col is not None and isinstance(weight_col, np.ndarray):
weights = pd.Series(weight_col, index=X_train.index)
elif weight_col is not None and isinstance(weight_col, pd.Series):
weights = weight_col.reindex(X_train.index)

if weights is not None:
# Check for NaN AND non-positive values together. Previously only
# (weights <= 0).any() was checked, which returns False for NaN
# weights — those then propagated into .sample() as NaN
# probabilities or corrupted sample_weight passed to learners.
weights_arr = np.asarray(weights, dtype=float)
invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0)
if invalid_mask.any():
raise ValueError(
"Weights must be positive and finite; found "
f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
)
weights_arr = self._resolve_sample_weights(X_train, weight_col)

# Identify target types BEFORE preprocessing
self.identify_target_types(
Expand All @@ -393,21 +411,20 @@ def fit(
self.imputed_vars_dummy_info = imputed_vars_dummy_info
self.original_predictors = original_predictors

# Pass sample_weight through to the subclass so it can use each
# learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all
# support sample_weight). This replaces the previous bootstrap
# resample, which silently discarded weights for the underlying
# estimator and inflated variance / shrank effective sample size.
sample_weight = None
if weights is not None:
sample_weight = np.asarray(weights_arr, dtype=float)
# Reindex if preprocess_data_types changed the row ordering
# (it currently does not, but guard against future drift).
if len(sample_weight) != len(X_train):
raise RuntimeError(
"Internal error: sample_weight length no longer matches "
"X_train after preprocessing"
)
# Pass sample_weight through to the subclass so it can honor the
# weights with whichever mechanism its learner supports: an exact
# native weighted-fit API where available (OLS→WLS), or weighted
# bootstrap resampling where the native API cannot weight the
# predictive distribution (the forest-backed QRF — see
# qrf._weighted_resample).
sample_weight = weights_arr
if sample_weight is not None and len(sample_weight) != len(X_train):
# preprocess_data_types currently preserves row order/count,
# but guard against future drift.
raise RuntimeError(
"Internal error: sample_weight length no longer matches "
"X_train after preprocessing"
)

# Defer actual training to subclass with all parameters
fit_kwargs = {
Expand Down
4 changes: 2 additions & 2 deletions microimpute/models/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pydantic import validate_call

from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
from microimpute.models.imputer import Imputer, ImputerResults
from microimpute.models.imputer import BaseImputer, ImputerResults
from microimpute.utils.statmatch_hotdeck import nnd_hotdeck_using_rpy2

MatchingHotdeckFn = Callable[
Expand Down Expand Up @@ -402,7 +402,7 @@ def _process_matching_results(
raise RuntimeError("Failed to create output imputations") from output_error


class Matching(Imputer):
class Matching(BaseImputer):
"""
Statistical matching model for imputation using nearest neighbor distance
hot deck method.
Expand Down
4 changes: 2 additions & 2 deletions microimpute/models/mdn.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from microimpute.config import RANDOM_STATE, VALIDATE_CONFIG
from microimpute.models.imputer import (
Imputer,
BaseImputer,
ImputerResults,
_ConstantValueModel,
)
Expand Down Expand Up @@ -800,7 +800,7 @@ def _predict(
raise RuntimeError(f"Failed to predict with MDN model: {str(e)}") from e


class MDN(Imputer):
class MDN(BaseImputer):
"""
Mixture Density Network imputer using PyTorch Tabular.

Expand Down
4 changes: 2 additions & 2 deletions microimpute/models/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sklearn.linear_model import LogisticRegression

from microimpute.config import VALIDATE_CONFIG
from microimpute.models.imputer import Imputer, ImputerResults
from microimpute.models.imputer import BaseImputer, ImputerResults


class _LogisticRegressionModel:
Expand Down Expand Up @@ -462,7 +462,7 @@ def _predict_quantile(
) from e


class OLS(Imputer):
class OLS(BaseImputer):
"""
Ordinary Least Squares regression model for imputation.

Expand Down
Loading