diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml index 16384e0..0727381 100644 --- a/.github/workflows/workflow.yml +++ b/.github/workflows/workflow.yml @@ -24,7 +24,7 @@ jobs: uses: ./.github/actions/setup-uv-env with: python-version: ${{ env.PYTHON_VERSION }} - install-args: "--extra report --extra rna --extra tabpfn" + install-args: "--extra report --extra rna --extra tabpfn --extra tabicl" # - name: Run type checks #needs a lot of type fixing # run: | @@ -61,7 +61,7 @@ jobs: uses: ./.github/actions/setup-uv-env with: python-version: ${{ matrix.python-version }} - install-args: "--extra rna --extra report --extra tabpfn --extra clustering + install-args: "--extra rna --extra report --extra tabpfn --extra tabicl --extra clustering --group test_duration" #---------------------------------------------- @@ -101,7 +101,7 @@ jobs: uses: ./.github/actions/setup-uv-env with: python-version: ${{ env.PYTHON_VERSION }} - install-args: "--extra rna --extra report --extra tabpfn --extra clustering + install-args: "--extra rna --extra report --extra tabpfn --extra tabicl --extra clustering --group test_duration" - name: Download TabPFN checkpoint @@ -143,7 +143,7 @@ jobs: uses: ./.github/actions/setup-uv-env with: python-version: ${{ env.PYTHON_VERSION }} - install-args: "--extra report --extra rna --extra tabpfn" + install-args: "--extra report --extra rna --extra tabpfn --extra tabicl" - name: Python Semantic Release uses: python-semantic-release/python-semantic-release@v10.2.0 diff --git a/pyproject.toml b/pyproject.toml index c59907d..06864cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,12 @@ tabpfn = [ "torch>=2.3.0,<3", ] +### TabICL +tabicl = [ + "tabicl>=2.1.1", + "torch>=2.11.0", +] + [dependency-groups] dev = [ "ipykernel", diff --git a/src/mother/ml/models/m_tabicl.py b/src/mother/ml/models/m_tabicl.py new file mode 100644 index 0000000..bc2ebcd --- /dev/null +++ b/src/mother/ml/models/m_tabicl.py @@ -0,0 +1,1200 @@ +""" 
+Purposes +-------- +This module provides Mother-compatible wrappers for TabICL models and exposes +Optuna-ready hyperparameter handling through the MotherML API. + +More information on the following repository: https://github.com/soda-inria/tabicl + +Hyperparameters Tuned by Optuna +------------------------------- +The shared tuning logic in _TabICLHyperParams.get_hyperparameter_space suggests: + +- n_estimators + - Type: int + - Range: [1, 12] + - Meaning: number of ensemble estimators + +- softmax_temperature (only if present in model init params, typically classifier) + - Type: float + - Range: [0.5, 2.0] + - Meaning: softmax temperature for probability calibration + +- average_logits (only if present in model init params, typically classifier) + - Type: categorical + - Values: [True, False] + - Meaning: average logits before softmax vs average probabilities after softmax + +- outlier_threshold (only if present in model init params, typically regressor) + - Type: float + - Range: [2.0, 8.0] + - Meaning: clipping threshold used to handle outlier context examples +""" + +import logging +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union + +import numpy as np +import pandas as pd +from optuna.trial import FixedTrial, Trial +from six import iteritems +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import ( + GroupKFold, + KFold, + StratifiedGroupKFold, + StratifiedKFold, +) +from sklearn.utils import check_X_y +from sklearn.utils.validation import check_is_fitted +from tabicl import TabICLClassifier, TabICLRegressor + +from mother.ml.core import AbstractMotherPipeline +from mother.ml.models import utils + +module_logger = logging.getLogger(__name__) + +# Default quantile for regression uncertainty estimation (interquartile range) +DEFAULT_QUANTILES: list[float] = [0.25, 0.5, 0.75] + + +class _TabICLHyperParams(AbstractMotherPipeline): + """Shared 
Mother-style parameter management mixin for TabICL estimators. + + Provides a unified implementation of `get_params`, `set_params`, + `get_hyperparameter_space` and input validation that is inherited by + both :class:`TabICLClassifierMother` and :class:`TabICLRegressorMother`. + + This class is not intended to be instantiated directly. It must be combined + with a concrete ``TabICLClassifier`` or ``TabICLRegressor`` via multiple + inheritance, and ``_store_initial_params`` must be called at the end of the + child ``__init__`` once the parent TabICL constructor has run. + + Attributes + ---------- + _init_params : dict + Snapshot of all constructor parameters taken after the parent TabICL + ``__init__`` has executed. Used as the single source of truth by + `get_params`, `set_params`, and the conditional logic in + `get_hyperparameter_space`. + _is_fitted : bool + Guard flag set to ``True`` in `fit`. Prevents parameter + modification after the model has been trained. + """ + + def __init__(self) -> None: + self._init_params: dict = {} + self._is_fitted: bool = False + + def _store_initial_params(self) -> None: + """Snapshot the current instance attributes into ``_init_params``. + + Must be called at the **end** of the child class ``__init__``, after + ``super().__init__(**kwargs)`` has run, so that all hyperparameters + assigned by the parent TabICL constructor (via ``self.xxx = ...``) + are captured. + """ + non_optimised_params = {"_init_params", "_is_fitted"} + self._init_params = {key: value for key, value in self.__dict__.items() if key not in non_optimised_params} + + def get_hyperparameter_space(self, X, y, trial: Union[Trial, FixedTrial], prefix: str = "") -> dict: + """Suggest hyperparameters for an Optuna trial. + + The set of suggested parameters depends on which keys are present in + ``_init_params`` (populated from the concrete child class defaults): + + - ``n_estimators`` is always suggested (range [1, 12]). 
+ - ``softmax_temperature`` is suggested only for classifiers. + - ``average_logits`` is suggested only for classifiers, and is forced + to ``False`` when ``n_estimators == 1`` (averaging is meaningless + with a single estimator). + - ``outlier_threshold`` is suggested only for regressors. + + All parameter names passed to Optuna are prefixed with ``prefix`` to + avoid name collisions when multiple models share the same trial. + The returned dict keys are also prefixed for use with + `set_params`. + + Parameters + ---------- + X : array-like + Training features (not used directly, required by the interface). + y : array-like + Training targets (not used directly, required by the interface). + trial : optuna.trial.Trial + Current Optuna trial object used to sample hyperparameter values. + prefix : str, default "" + String prepended to every parameter name, both in Optuna and in + the returned dict (e.g. ``"clf__"`` for pipeline compatibility). + + Returns + ------- + dict + Mapping of prefixed parameter names to sampled values. 
+ """ + suggested_params: dict[str, object] = { + "n_estimators": trial.suggest_int(prefix + "n_estimators", 1, 12, log=False), + } + + # Conditionally suggest parameters based on the model type, inferred from the presence of keys in _init_params + is_classifier = isinstance(self, TabICLClassifier) + is_regressor = isinstance(self, TabICLRegressor) + + if is_classifier and "softmax_temperature" in self._init_params: + suggested_params["softmax_temperature"] = trial.suggest_float( + prefix + "softmax_temperature", 0.5, 2.0, log=False + ) + + if is_classifier and "average_logits" in self._init_params: + if suggested_params["n_estimators"] == 1: + suggested_params["average_logits"] = False + else: + suggested_params["average_logits"] = trial.suggest_categorical(prefix + "average_logits", (True, False)) + + if is_regressor and "outlier_threshold" in self._init_params: + suggested_params["outlier_threshold"] = trial.suggest_float( + prefix + "outlier_threshold", 2.0, 8.0, log=False + ) + + if isinstance(trial, Trial): + module_logger.info( + "Suggested TabICL parameters in trial %s: %s", + trial.number, + suggested_params, + ) + else: + module_logger.info("Fixed Trial TabICL parameters: %s", suggested_params) + + return utils.add_prefix_to_dict_keys(suggested_params, prefix=prefix) + + def get_params(self, deep=True) -> dict: + """Return the hyperparameters stored in ``_init_params``. + + Overrides the default sklearn MRO resolution so that the Mother + parameter store is always used instead of ``TabICLClassifier``'s + own ``get_params``. + + Parameters + ---------- + deep : bool, default True + Ignored; kept for sklearn API compatibility. + + Returns + ------- + dict + Current hyperparameter names and values. + """ + return dict(self._init_params) + + def set_params(self, **params): + """Update hyperparameter values before the model is fitted. 
+
+        Updates both ``_init_params`` (the Mother store) and the
+        corresponding instance attributes so that TabICL uses the new
+        values on the next `fit` call.
+
+        Modification is blocked after fitting (``_is_fitted=True``) and
+        an error is logged instead of raising, to keep pipeline behaviour
+        predictable.
+
+        Parameters
+        ----------
+        **params
+            Keyword arguments mapping parameter names to new values.
+            Unknown keys (not in ``_init_params`` and not an instance
+            attribute) are silently ignored.
+
+        Returns
+        -------
+        self
+        """
+        if self._is_fitted:
+            module_logger.error("The model is already fitted. You cannot change the parameters in the fitted model.")
+            return self
+
+        for key, value in params.items():
+            if key in self._init_params:
+                self._init_params[key] = value
+
+            if hasattr(self, key):
+                setattr(self, key, value)
+
+        return self
+
+    @staticmethod
+    def _check_input_type(X, y: Optional[Union[np.ndarray, pd.Series]]) -> None:
+        """Validate types and shapes of ``X`` and ``y`` before fitting.
+
+        Raises a :exc:`TypeError` or :exc:`ValueError` early so that
+        errors are surfaced before TabICL runs its own internal checks.
+        NaN values are allowed because TabICL handles missing data natively.
+
+        The final validation is delegated to scikit-learn's ``check_X_y``.
+        The keyword that relaxes its finiteness check was renamed from
+        ``force_all_finite`` to ``ensure_all_finite`` in scikit-learn 1.6 and
+        the old name was removed in 1.8, so the keyword is chosen from the
+        installed function's signature instead of being hard-coded.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Feature matrix.
+        y : np.ndarray or pd.Series of shape (n_samples,)
+            Target vector.
+
+        Raises
+        ------
+        TypeError
+            If ``X`` or ``y`` is not array-like.
+        ValueError
+            If ``X`` or ``y`` is empty, or if ``check_X_y`` rejects the pair
+            (for example inconsistent lengths).
+        """
+        import inspect  # local import: only used to probe the scikit-learn signature
+
+        if not hasattr(X, "shape") or (X.shape is None):
+            raise TypeError(f"X must be array-like, not {type(X)}.")
+        if y is None or (not hasattr(y, "shape")) or (y.shape is None):
+            raise TypeError(f"y must be np.ndarray or pd.Series, not {type(y)}")
+        if (X.shape[0] == 0) or (y.shape[0] == 0):
+            raise ValueError("X and y must not be empty.")
+
+        # Pick whichever spelling of the "allow NaN" switch this scikit-learn
+        # release understands; prefer the new name when both are accepted.
+        finite_kwarg = (
+            "ensure_all_finite"
+            if "ensure_all_finite" in inspect.signature(check_X_y).parameters
+            else "force_all_finite"
+        )
+
+        # Allow nan values since TabICL can handle them
+        # Ensure_2d is True to enforce 2D X (n_samples, n_features), y remains 1D
+        check_X_y(X, y, ensure_2d=True, **{finite_kwarg: "allow-nan"})
+
+
+# =========================================================== Classifier model
+
+
+class TabICLClassifierMother(TabICLClassifier, _TabICLHyperParams):
+    """Mother-compatible wrapper around :class:`tabicl.TabICLClassifier`.
+
+    Combines TabICL's in-context-learning classifier with the MotherML
+    hyperparameter management API (Optuna-ready, sklearn-compatible).
+
+    Parameters
+    ----------
+    n_estimators : int, default 8
+        Number of ensemble estimators. Higher values improve stability at
+        the cost of inference time.
+    softmax_temperature : float, default 0.9
+        Temperature applied to logits before the softmax. Values below 1.0
+        sharpen the distribution; values above 1.0 soften it.
+    average_logits : bool, default True
+        If ``True``, logits are averaged across estimators before the final
+        softmax. Automatically set to ``False`` when ``n_estimators == 1``.
+    allow_auto_download : bool, default True
+        Whether to automatically download the model checkpoint if not found
+        locally.
+    checkpoint_version : str, default "tabicl-classifier-v2-20260212.ckpt"
+        Identifier of the pre-trained checkpoint to load.
+    kv_cache : bool, default False
+        Whether to use key-value caching during inference for faster
+        repeated predictions on the same context.
+    **kwargs
+        Any additional keyword arguments are forwarded to
+        :class:`tabicl.TabICLClassifier`.
+ + Examples + -------- + Basic binary classification: + + >>> import numpy as np + >>> from mother.ml.models.m_tabicl import TabICLClassifierMother + >>> X_train = np.random.rand(50, 8) + >>> y_train = (X_train[:, 0] > 0.5).astype(int) + >>> X_test = np.random.rand(10, 8) + >>> clf = TabICLClassifierMother() + >>> clf.fit(X_train, y_train) + TabICLClassifierMother(...) + >>> y_pred = clf.predict(X_test) + >>> y_proba = clf.predict_proba(X_test) # shape (10, 2) + + Using inside a Mother pipeline with Optuna optimisation: + + >>> from mother.ml.models.m_tabicl import TabICLClassifierMother + >>> clf = TabICLClassifierMother(n_estimators=8, softmax_temperature=0.9) + >>> # clf.get_hyperparameter_space(X, y, trial) returns an Optuna-ready dict + """ + + def __init__(self, **kwargs): + """Initialise the classifier with Mother defaults merged with user kwargs. + + Initialisation order: + + 1. ``_TabICLHyperParams.__init__`` sets ``_init_params={}`` and + ``_is_fitted=False``. + 2. `default_parameters` values are injected into ``kwargs`` only + for keys not already supplied by the caller. + 3. ``super().__init__(**kwargs)`` runs ``TabICLClassifier.__init__``, + which assigns every parameter as an instance attribute + (``self.n_estimators = n_estimators``, etc.). + 4. `_store_initial_params` snapshots ``self.__dict__`` into + ``_init_params`` for use by `get_params` and + `get_hyperparameter_space`. + """ + _TabICLHyperParams.__init__(self) + + # Set default parameters if not provided in kwargs, keeping user choices + for key, value in self.default_parameters().items(): + kwargs.setdefault(key, value) + + super().__init__(**kwargs) + self._store_initial_params() + + def get_params(self, deep=True) -> dict: + """Return the classifier's hyperparameters from the Mother store. + + Explicitly overrides ``TabICLClassifier.get_params`` (first in MRO) + to guarantee the Mother ``_init_params`` dict is always returned. 
+
+        Parameters
+        ----------
+        deep : bool, default True
+            Ignored; kept for sklearn API compatibility.
+
+        Returns
+        -------
+        dict
+            Current hyperparameter names and values.
+        """
+        return dict(self._init_params)
+
+    def set_params(self, **params):
+        """Update classifier hyperparameters before fitting.
+
+        Mirrors the pattern of `get_params`: explicitly overrides
+        ``TabICLClassifier.set_params`` so that both the Mother store
+        (``_init_params``) and the underlying ``TabICLClassifier`` instance
+        attributes are updated in sync.
+
+        The parent implementation is called first; the Mother store is only
+        updated once that call has returned, so an exception raised by the
+        parent cannot leave ``_init_params`` ahead of the instance attributes.
+
+        Parameters
+        ----------
+        **params
+            Keyword arguments mapping parameter names to new values.
+            Keys that are not already present in ``_init_params`` are not
+            added to the store.
+
+        Returns
+        -------
+        self
+            Returned unchanged (with an error logged) when the model is
+            already fitted.
+        """
+        if self._is_fitted:
+            module_logger.error("The model is already fitted. You cannot change the parameters in the fitted model.")
+            return self
+
+        # super class: TabICLClassifier to update the parameters
+        super().set_params(**params)
+
+        # Mirror the accepted values into the Mother store (known keys only).
+        self._init_params.update({key: value for key, value in params.items() if key in self._init_params})
+        return self
+
+    def default_parameters(self, prefix: str = "") -> dict:
+        """Return the default hyperparameter values for this classifier.
+
+        Used during ``__init__`` to fill in any parameters not explicitly
+        provided by the caller, and by the MotherML tuning infrastructure
+        to know which parameters exist for this model. Default parameters
+        selected based on TabICL github's repository.
+
+        Parameters
+        ----------
+        prefix : str, default ""
+            Optional prefix prepended to every key (e.g. ``"clf__"`` for
+            pipeline use).
+
+        Returns
+        -------
+        dict
+            Default hyperparameter names (optionally prefixed) and values.
+ """ + return utils.add_prefix_to_dict_keys( + { + "n_estimators": 8, + "softmax_temperature": 0.9, + "average_logits": True, + "allow_auto_download": True, + "checkpoint_version": "tabicl-classifier-v2-20260212.ckpt", + "kv_cache": False, + }, + prefix=prefix, + ) + + def fit( + self, + X: Union[np.ndarray, pd.DataFrame], + y: Union[np.ndarray, pd.Series], + ) -> "TabICLClassifierMother": + """Fit the TabICL classifier on labelled data. + + Validates inputs, then delegates to + `tabicl.TabICLClassifier.fit`. Sets ``_is_fitted=True`` to + block further parameter modification via `set_params`. + + Parameters + ---------- + X : Union[np.ndarray, pd.DataFrame] of shape (n_samples, n_features) + Feature matrix. Lists are accepted and automatically converted + to a column vector with a warning. + y : Union[np.ndarray, pd.Series] of shape (n_samples,) + Class labels (integer or string). Lists are accepted and + automatically converted. + + Returns + ------- + self + The fitted classifier instance. + + Raises + ------ + TypeError + If ``X`` or ``y`` is not array-like. + ValueError + If ``X`` or ``y`` is empty. + """ + if isinstance(X, list): + X = np.asarray(X) + if X.ndim == 1: + X = X.reshape(-1, 1) + module_logger.warning( + "X is given as list type. It is converted into np.array with shape %s.", + X.shape, + ) + if isinstance(y, list): + y = np.array(y) + module_logger.warning( + "y is given as list type. It is converted into np.array with shape %s.", + y.shape, + ) + + self._check_input_type(X, y) + + # Fit the TabICLClassifier on the data + super().fit(np.array(X), np.array(y)) + self._is_fitted = True + return self + + +# =========================================================== Regressor Model +class TabICLRegressorMother(TabICLRegressor, _TabICLHyperParams): + """Mother-compatible wrapper around :class:`tabicl.TabICLRegressor`. + + Combines TabICL's in-context-learning regressor with MotherML's + hyperparameter and pipeline API. 
+ + Parameters + ---------- + n_estimators : int, default 8 + Number of ensemble estimators. Higher values typically improve + prediction stability at the cost of runtime. + outlier_threshold : float, default 4.0 + Clipping threshold used by TabICL to reduce sensitivity to extreme + context examples. + allow_auto_download : bool, default True + Whether to automatically download model checkpoints if needed. + kv_cache : bool, default False + Whether to enable key-value cache acceleration for repeated inference. + **kwargs + Any additional keyword arguments accepted by + :class:`tabicl.TabICLRegressor`. + + Examples + -------- + Basic regression: + + >>> import numpy as np + >>> from mother.ml.models.m_tabicl import TabICLRegressorMother + >>> X_train = np.random.rand(60, 6) + >>> y_train = X_train[:, 0] * 2.5 - X_train[:, 1] + np.random.normal(0, 0.05, 60) + >>> X_test = np.random.rand(8, 6) + >>> reg = TabICLRegressorMother() + >>> reg.fit(X_train, y_train) + TabICLRegressorMother(...) + >>> y_pred = reg.predict(X_test) + + Regression with uncertainty estimates: + + >>> uncertainty_df = reg.predict_uncertainty(X_test) + >>> list(uncertainty_df.columns) + ['mean_predictions', 'knowledge_uncertainty', 'data_uncertainty', 'total_uncertainty'] + """ + + def __init__(self, **kwargs): + # Initialize empty hyperparameter dictionary + _TabICLHyperParams.__init__(self) + + # Update the kwargs with hyperparameters with the defaults parameters + for key, value in self.default_parameters().items(): + kwargs.setdefault(key, value) + + # Set up the TabICLRegressor original class with the hyperparameters + super().__init__(**kwargs) + + # Copy the hyperparameters in the child class (for optuna optimization) + self._store_initial_params() + + def get_params(self, deep=True) -> dict: + """Return the regressor's hyperparameters from the Mother store. + + Parameters + ---------- + deep : bool, default True + Ignored; kept for sklearn API compatibility. 
+
+        Returns
+        -------
+        dict
+            Current hyperparameter names and values.
+        """
+        return _TabICLHyperParams.get_params(self, deep=deep)
+
+    def set_params(self, **params):
+        """Update regressor hyperparameters before fitting.
+
+        The parent implementation is called first; the Mother store is only
+        updated once that call has returned, so an exception raised by the
+        parent cannot leave ``_init_params`` ahead of the instance attributes.
+
+        Parameters
+        ----------
+        **params
+            Keyword arguments mapping parameter names to new values.
+            Keys that are not already present in ``_init_params`` are not
+            added to the store.
+
+        Returns
+        -------
+        self
+            Returned unchanged (with an error logged) when the model is
+            already fitted.
+        """
+        if self._is_fitted:
+            module_logger.error("The model is already fitted. You cannot change the parameters in the fitted model.")
+            return self
+
+        # Update the original tabicl regressor class
+        super().set_params(**params)
+
+        # Update the mother wrapper class (known keys only)
+        self._init_params.update({key: value for key, value in params.items() if key in self._init_params})
+
+        return self
+
+    def default_parameters(self, prefix: str = "") -> dict:
+        """Return default hyperparameter values for the regressor.
+
+        Parameters
+        ----------
+        prefix : str, default ""
+            Optional prefix prepended to each key (for pipeline usage).
+
+        Returns
+        -------
+        dict
+            Default hyperparameter names (optionally prefixed) and values.
+        """
+        return utils.add_prefix_to_dict_keys(
+            {
+                "n_estimators": 8,
+                "outlier_threshold": 4.0,
+                "allow_auto_download": True,
+                "kv_cache": False,
+            },
+            prefix=prefix,
+        )
+
+    def fit(
+        self,
+        X: Union[np.ndarray, pd.DataFrame],
+        y: Union[np.ndarray, pd.Series],
+    ) -> "TabICLRegressorMother":
+        """Fit the TabICL regressor on labeled data.
+
+        Parameters
+        ----------
+        X : Union[np.ndarray, pd.DataFrame] of shape (n_samples, n_features)
+            Feature matrix. Lists are accepted and converted to arrays.
+        y : Union[np.ndarray, pd.Series] of shape (n_samples,)
+            Continuous target values. Lists are accepted and converted.
+
+        Returns
+        -------
+        self
+            The fitted regressor instance.
+        """
+        if isinstance(X, list):
+            X = np.asarray(X)
+            if X.ndim == 1:
+                X = X.reshape(-1, 1)
+            module_logger.warning(
+                "X is given as list type. It is converted into np.array with shape %s.",
+                X.shape,
+            )
+        if isinstance(y, list):
+            y = np.array(y)
+            module_logger.warning(
+                "y is given as list type. It is converted into np.array with shape %s.",
+                y.shape,
+            )
+
+        self._check_input_type(X, y)
+
+        # fit the original tabicl regressor class on the data as array type
+        # to be consistent with the sklearn input type
+        super().fit(np.array(X), np.array(y))
+        self._is_fitted = True
+        return self
+
+    def predict_uncertainty(
+        self,
+        X: Union[np.ndarray, pd.DataFrame],
+        return_quantiles: bool = False,
+        quantiles: Optional[List] = None,
+        uncertainty_for_opt: bool = False,
+        **kwargs,
+    ) -> Union[pd.DataFrame, pd.Series, tuple[pd.DataFrame, np.ndarray]]:
+        """Predict target values and estimate a per-sample uncertainty.
+
+        The uncertainty is measured by the interquartile range (0.75 quantile
+        minus 0.25 quantile) of the predicted quantiles for each sample.
+
+        Parameters
+        ----------
+        X : Union[np.ndarray, pd.DataFrame]
+            Input to predict and estimate the uncertainty for. When a
+            DataFrame is given, its index is reused for the output.
+        return_quantiles : bool, default False
+            If ``True``, also return the raw quantile predictions.
+            Ignored when ``uncertainty_for_opt`` is ``True``.
+        quantiles : list or None, default None
+            Extra quantile levels to request. The values in
+            ``DEFAULT_QUANTILES`` are always added when missing, and the
+            merged list is sorted. The caller's sequence is copied, never
+            modified.
+        uncertainty_for_opt : bool, default False
+            If ``True``, return only the ``total_uncertainty`` column as a
+            Series. Takes precedence over ``return_quantiles``.
+        **kwargs
+            Forwarded to ``predict`` alongside ``output_type="quantiles"``
+            and ``alphas``.
+
+        Returns
+        -------
+        pd.DataFrame or pd.Series or tuple[pd.DataFrame, np.ndarray]
+            - Default: a DataFrame with columns
+
+              - ``mean_predictions``: mean over all requested quantile
+                predictions of each sample (so extra ``quantiles`` change it).
+              - ``knowledge_uncertainty``: ``None``, not supported by this
+                method (kept for compatibility).
+              - ``data_uncertainty``: ``None``, not supported by this
+                method (kept for compatibility).
+              - ``total_uncertainty``: interquartile range of each sample.
+
+            - ``uncertainty_for_opt=True``: the ``total_uncertainty`` Series.
+            - ``return_quantiles=True``: a tuple of the DataFrame above and
+              the quantile array of shape (# samples, # quantiles).
+
+        Raises
+        ------
+        TypeError
+            If ``predict`` does not return a NumPy array for
+            ``output_type="quantiles"``.
+        """
+        check_is_fitted(self)
+
+        # Build a private, sorted list of quantile levels. Copying first keeps
+        # the caller's sequence untouched (and lets tuples be passed); the
+        # default quartiles are always needed for the IQR computed below.
+        requested: list = [] if quantiles is None else list(quantiles)
+        alphas: list = sorted(requested + [q for q in DEFAULT_QUANTILES if q not in requested])
+
+        pred_res: Union[np.ndarray, dict] = self.predict(
+            np.array(X), output_type="quantiles", alphas=alphas, **kwargs
+        )
+
+        if not isinstance(pred_res, np.ndarray):
+            raise TypeError(
+                "Expected TabICLRegressor.predict with output_type='quantiles' to return an array, "
+                f"got {type(pred_res)}."
+            )
+
+        output: pd.DataFrame = pd.DataFrame(
+            {
+                "mean_predictions": pred_res.mean(axis=1).tolist(),
+                "knowledge_uncertainty": None,  # Not available for this model
+                "data_uncertainty": None,  # Not available for this model
+                "total_uncertainty": (pred_res[:, alphas.index(0.75)] - pred_res[:, alphas.index(0.25)]).tolist(),
+            },
+        )
+
+        # Apply the correct index if a dataframe is given as input
+        if isinstance(X, pd.DataFrame):
+            output.index = X.index
+
+        # Optimisation callers only need the uncertainty column
+        if uncertainty_for_opt:
+            return output.loc[:, "total_uncertainty"]
+
+        # If return_quantiles is True, also return the quantiles values as a numpy array
+        if return_quantiles:
+            return output, pred_res
+
+        return output
+
+
+class TabICLEmbeddingTransformer(BaseEstimator, TransformerMixin):
+    """Transformer that extracts TabICL row-interaction representations as embeddings.
+
+    Hooks into the ``row_interactor`` output of the underlying TabICL model to
+    capture per-row embedding vectors. These vectors encode both within-row
+    feature interactions and the distributional context of the full dataset.
+
+    For training data, k-fold cross-validation is used to generate out-of-fold
+    embeddings and avoid data leakage. For new data, a single model fitted on
+    all available training samples generates the embeddings.
+
+    Parameters
+    ----------
+    model_type : {'classification', 'regression'}, default='classification'
+        Whether to use a :class:`tabicl.TabICLClassifier` or
+        :class:`tabicl.TabICLRegressor` as the underlying model.
+        Ignored when a pre-fitted *model* is provided.
+    n_folds : int, default=5
+        Number of folds for cross-validation when generating training embeddings.
+    use_kfold : bool, default=True
+        Whether to use k-fold cross-validation for training embeddings.
+        If ``False``, a single model is fitted on all data and its representations
+        for the training set are stored (note: this introduces data leakage for
+        the training embeddings).
+    random_state : int or None, default=None
+        Random seed for reproducibility of k-fold splitting and TabICL ensemble.
+        When ``None``, no ``random_state`` is passed to newly created TabICL
+        estimators.
+    embedding_column_name : str, default='tabiclembedding'
+        Name or prefix for the output embedding columns.
+    return_separate_columns : bool, default=True
+        If ``True``, each embedding dimension is returned as a separate column
+        (e.g. ``tabiclembedding_0``, ``tabiclembedding_1``, …).
+        If ``False``, each row's embedding vector is stored as a single object
+        in one column.
+    model : TabICLClassifier or TabICLRegressor or None, default=None
+        A pre-fitted TabICL estimator. When provided the model is used as-is
+        and k-fold fitting is skipped. Cannot be combined with
+        ``use_kfold=True``.
+    **kwargs
+        Additional keyword arguments forwarded to
+        :class:`tabicl.TabICLClassifier` or :class:`tabicl.TabICLRegressor`
+        when creating new estimators.
+
+    Attributes
+    ----------
+    model : TabICLClassifier or TabICLRegressor
+        The fitted estimator used for representing new data.
+    train_embeddings_ : ndarray of shape (n_samples, embedding_dim)
+        Out-of-fold (or full-data, when ``use_kfold=False``) embeddings for
+        the training samples.
+    input_features_ : list of str or None
+        Column names seen during ``fit``. ``None`` when ``X`` was a NumPy array.
+    train_index_ : Index or None
+        Row index from the training ``DataFrame``.
``None`` when ``X`` was a + NumPy array. + + Examples + -------- + Classification embeddings with out-of-fold training vectors: + + >>> import pandas as pd + >>> from mother.ml.models.m_tabicl import TabICLEmbeddingTransformer + >>> X = pd.DataFrame({"f1": [0.1, 0.2, 0.9, 1.0], "f2": [1.0, 0.9, 0.2, 0.1]}) + >>> y = pd.Series([0, 0, 1, 1]) + >>> emb = TabICLEmbeddingTransformer(model_type="classification", n_folds=2) + >>> X_emb_train = emb.fit_transform(X, y) + >>> X_emb_test = emb.transform(X.iloc[:2]) + + Regression embeddings as one vector column: + + >>> emb_reg = TabICLEmbeddingTransformer( + ... model_type="regression", + ... return_separate_columns=False, + ... n_folds=2, + ... ) + >>> y_reg = pd.Series([1.2, 1.0, 2.5, 2.8]) + >>> X_emb = emb_reg.fit_transform(X, y_reg) + >>> X_emb.columns.tolist() + ['tabiclembedding'] + """ + + def __init__( + self, + model_type: Literal["classification", "regression"] = "classification", + n_folds: int = 5, + use_kfold: bool = True, + random_state: Optional[int] = None, + embedding_column_name: str = "tabiclembedding", + return_separate_columns: bool = True, + model: Optional[Union[TabICLClassifier, TabICLRegressor]] = None, + **kwargs: Any, + ) -> None: + self.model_type = model_type + self.n_folds = n_folds + self.use_kfold = use_kfold + self.random_state = random_state + self.embedding_column_name = embedding_column_name + self.return_separate_columns = return_separate_columns + self.kwargs: Dict[str, Any] = kwargs + self.model = model + self.pre_fitted: bool = model is not None + + # Populated during fit + self.input_features_: Optional[List[str]] = None + self.train_embeddings_: Optional[np.ndarray] = None + self.train_index_: Optional[pd.Index] = None + self._embedding_dim: Optional[int] = None + + if self.pre_fitted and self.use_kfold: + raise ValueError( + "Cannot use k-fold fitting when a pre-fitted model is already given. " + "Set either use_kfold=False or model=None." 
+ ) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + def _make_estimator(self) -> Union[TabICLClassifier, TabICLRegressor]: + """Instantiate a new TabICL estimator with the configured parameters.""" + kwargs = dict(self.kwargs) + + if self.random_state is not None: + kwargs.setdefault("random_state", self.random_state) + + if self.model_type == "classification": + return TabICLClassifier(**kwargs) + + elif self.model_type == "regression": + return TabICLRegressor(**kwargs) + + else: + raise ValueError(f"Invalid model_type '{self.model_type}'. Use 'classification' or 'regression'.") + + def _extract_representations( + self, + fitted_estimator: Union[TabICLClassifier, TabICLRegressor], + X_test: Union[np.ndarray, pd.DataFrame], + ) -> np.ndarray: + """Extract per-row representations from the row-interaction transformer. + + Registers a forward hook on ``fitted_estimator.model_.row_interactor`` + to capture its output tensor during a single ``predict_proba`` / ``predict`` + call. + + Two cases arise depending on whether KV caching is active: + + **Why the hook is placed on row_interactor:** + + TabICL's ``row_interactor`` is a *within-row* transformer: it uses learnable + CLS tokens to aggregate the column embeddings of each row independently + (the row axis ``T`` is a batch dimension — rows do not attend to each other + here). The training context is baked in *before* this step, by + ``col_embedder``, which uses set-attention over all feature columns with + access to the training distribution. By the time ``row_interactor`` runs, + the per-column embeddings already encode the dataset context, making the + output of ``row_interactor`` the cleanest representation of each sample that + is still upstream of the cross-row ICL predictor head. 
+ + There is **no public API** in TabICL for extracting these representations + (unlike TabPFN's ``get_embeddings()``). A PyTorch forward hook is therefore + the correct and minimal approach — it requires no modification of TabICL + internals and is removed immediately after the forward pass. + + Two shapes arise at the hook depending on KV caching: + + * **No KV cache** (default): all rows are concatenated and passed jointly — + ``[X_train | X_test]`` — so the hook captures tensors of shape + ``(B, train_size + test_size, repr_dim)``. Only the test-sample slice + ``[:, train_size:, :]`` is kept. + + * **KV cache active** (``kv_cache="kv"`` or ``kv_cache="repr"``): training + data is handled via pre-computed cached projections, so + ``row_interactor`` is only called on the test samples and captures + ``(B, test_size, repr_dim)``. The training context is still present + because ``col_embedder`` attended to the cached training projections + before producing the inputs to ``row_interactor``. + + Parameters + ---------- + fitted_estimator : TabICLClassifier or TabICLRegressor + A fully fitted TabICL estimator (``fit`` already called). + X_test : array-like of shape (n_test_samples, n_features) + Samples whose representations should be extracted. + + Returns + ------- + ndarray of shape (n_samples, repr_dim) + Per-row representation vectors averaged over all ensemble members. + """ + representations_list: List[np.ndarray] = [] + + def _hook(module, input, output): # noqa: ANN001 + + # Get the output (tensor), detach from calculation graph, move the tensor on the CPU to convert it to numpy + # array and append it to the representations list + representations_list.append(output.detach().cpu().float().numpy()) + + hook = fitted_estimator.model_.row_interactor.register_forward_hook(_hook) + + # Force a forward pass to trigger the hook and populate representations_list. 
+ try: + if self.model_type == "classification": + if not isinstance(fitted_estimator, TabICLClassifier): + raise TypeError( + f"model_type='classification' requires a fitted TabICLClassifier, got {type(fitted_estimator)}." + ) + fitted_estimator.predict_proba(self._to_array(X_test)) + else: + if not isinstance(fitted_estimator, TabICLRegressor): + raise TypeError( + f"model_type='regression' requires a fitted TabICLRegressor, got {type(fitted_estimator)}." + ) + fitted_estimator.predict(self._to_array(X_test)) + # Remove the hook even if prediction fails to avoid side effects. + finally: + hook.remove() + + n_test: int = X_test.shape[0] + + # Get the number of training samples if KV cache is not used (train and test samples passed together to the row + # interactor) + train_size: int = fitted_estimator.n_samples_in_ + + # Each entry in representations_list has shape (batch_B, T (number of lines in row_interactor), repr_dim). + all_repr = np.concatenate(representations_list, axis=0) # (total_estimators, T, repr_dim) + T: int = all_repr.shape[1] + + if T == n_test: + # KV cache was active: row_interactor processed only test samples. + test_repr = all_repr + else: + # No cache: row_interactor processed [X_train | X_test] jointly. + # Slice off the train rows (T == train_size + n_test). + test_repr = all_repr[:, train_size:, :] # (total_estimators, n_test, repr_dim) + + # Average over all ensemble members to obtain a single vector per sample (n_estimators parameters in the model + # class). 
def fit(
    self,
    X: Union[np.ndarray, pd.DataFrame],
    y: Optional[Union[np.ndarray, pd.Series]] = None,
    groups: Optional[Union[np.ndarray, pd.Series]] = None,
) -> "TabICLEmbeddingTransformer":
    """Fit the transformer and compute training embeddings.

    Three paths exist:

    * **Pre-fitted model** (``self.pre_fitted``): the supplied model is used
      as-is and only the training representations are extracted.
    * **K-fold** (``self.use_kfold`` and at least ``n_folds`` samples): one
      estimator is fitted per fold and the embeddings of each validation
      split are written back at their original row positions, giving
      out-of-fold embeddings for the whole training set. A final model is
      then fitted on all data for later use.
    * **Single model** (otherwise): one model is fitted on all data and the
      training embeddings are extracted from that same model (no
      out-of-fold protection).

    Parameters
    ----------
    X : array-like or DataFrame of shape (n_samples, n_features)
        Training data. For a DataFrame, the index and column names are
        remembered in ``train_index_`` and ``input_features_``.
    y : array-like or Series of shape (n_samples,)
        Target values. Mandatory.
    groups : array-like of shape (n_samples,) or None, default=None
        Group labels forwarded to ``_build_fold_iterator`` for group-aware
        splitting.

    Returns
    -------
    self : TabICLEmbeddingTransformer

    Raises
    ------
    ValueError
        If ``y`` is ``None``.
    RuntimeError
        If ``pre_fitted`` is set but no model is available, or if the folds
        do not yield an embedding for every training sample.
    """
    if y is None:
        raise ValueError("TabICL requires target values (y) for fitting.")

    is_df: bool = isinstance(X, pd.DataFrame)
    self.train_index_ = X.index if is_df else None
    self.input_features_ = X.columns.tolist() if is_df else None

    # Convert once and reuse everywhere; ``np.asarray`` also accepts plain
    # lists and does not copy data that is already an ndarray.
    X_arr: np.ndarray = np.asarray(self._to_array(X))
    y_arr: np.ndarray = np.asarray(self._to_array_y(y))
    groups_arr: Optional[np.ndarray] = None if groups is None else np.asarray(groups)
    n_samples: int = X_arr.shape[0]

    if self.pre_fitted:
        if self.model is None:
            raise RuntimeError("Internal error: pre_fitted=True but model is None.")

        # Use the supplied model directly — no fitting required.
        module_logger.info("A pre-fitted model was provided. Extracting training representations without refitting.")
        self.train_embeddings_ = self._extract_representations(self.model, X_arr)

    elif self.use_kfold and n_samples >= self.n_folds:
        # Out-of-fold embeddings: every row is embedded by a model that never saw it.
        module_logger.info("Fitting TabICL with %d-fold cross-validation.", self.n_folds)

        oof_embeddings: Optional[np.ndarray] = None
        filled = np.zeros(n_samples, dtype=bool)

        for train_idx, val_idx in self._build_fold_iterator(X_arr, y_arr, groups_arr):
            fold_est = self._make_estimator()
            fold_est.fit(X_arr[train_idx], y_arr[train_idx])
            val_repr: np.ndarray = self._extract_representations(fold_est, X_arr[val_idx])

            if oof_embeddings is None:
                # The embedding width is only known after the first fold.
                oof_embeddings = np.empty((n_samples, val_repr.shape[1]), dtype=val_repr.dtype)

            # Writing at ``val_idx`` restores the original sample order directly.
            oof_embeddings[val_idx] = val_repr
            filled[val_idx] = True

        if oof_embeddings is None or not filled.all():
            # A shorter matrix would silently misalign with ``train_index_``.
            raise RuntimeError(
                "Cross-validation folds did not produce an out-of-fold embedding for every training sample."
            )
        self.train_embeddings_ = oof_embeddings

        # Train the main model on *all* data for use during transform().
        module_logger.info("Fitting main TabICL model on full training data.")
        self.model = self._make_estimator()
        self.model.fit(X_arr, y_arr)

    else:
        if self.use_kfold:
            module_logger.warning(
                "Number of samples (%d) is less than n_folds (%d). "
                "Falling back to a single model without k-fold.",
                n_samples,
                self.n_folds,
            )
        self.model = self._make_estimator()
        self.model.fit(X_arr, y_arr)

        # No OOF available — use self-context embeddings (data leakage risk).
        self.train_embeddings_ = self._extract_representations(self.model, X_arr)

    self._embedding_dim = self.train_embeddings_.shape[1]
    return self
" + f"Found {len(np.unique(groups))} unique group(s)." + ) + kf = GroupKFold(n_splits=n_splits) + return kf.split(X, y, groups=groups) + else: + kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state) + return kf.split(X) + + def transform( + self, + X: Union[np.ndarray, pd.DataFrame], + ) -> pd.DataFrame: + """Transform samples into TabICL row-interaction representations. + + Generates embeddings for samples using the fitted model through a single forward pass with + hook on the ``row_interactor``. + + Parameters + ---------- + X : array-like or DataFrame of shape (n_samples, n_features) + Samples to embed. + + Returns + ------- + pd.DataFrame + Embeddings. Shape depends on ``return_separate_columns``: + + * ``True`` → ``(n_samples, embedding_dim)`` with columns + ``tabiclembedding_0``, …, ``tabiclembedding_{d-1}``. + * ``False`` → ``(n_samples, 1)`` with a single column + ``tabiclembedding`` whose values are 1-D arrays. + """ + if self.model is None: + raise NotFittedError("The model must be fitted before calling transform. Call fit() first.") + + check_is_fitted(self, "model") + + is_df: bool = isinstance(X, pd.DataFrame) + index = X.index if is_df else None + + if self.input_features_ is not None and is_df: + missing = set(self.input_features_) - set(X.columns) + if missing: + raise ValueError(f"Features {missing} seen during training are missing from X.") + X = X[self.input_features_] + + embeddings = self._extract_representations(self.model, X) + return self._to_dataframe(embeddings, index) + + def fit_transform( # type: ignore[override] + self, + X: Union[np.ndarray, pd.DataFrame], + y: Optional[Union[np.ndarray, pd.Series]] = None, + groups: Optional[Union[np.ndarray, pd.Series]] = None, + ) -> pd.DataFrame: + """Fit and return embeddings for the training data. + + Delegates to :meth:`fit` and then returns the cached + ``train_embeddings_`` computed during fitting (out-of-fold when + ``use_kfold=True``). 
def _to_dataframe(
    self,
    embeddings: np.ndarray,
    index: Optional[pd.Index],
) -> pd.DataFrame:
    """Wrap an embedding matrix in a DataFrame.

    Parameters
    ----------
    embeddings : ndarray
        Embedding matrix with one row per sample.
    index : pd.Index or None
        Row index for the result. ``None`` lets pandas create its default
        index.

    Returns
    -------
    pd.DataFrame
        With ``return_separate_columns`` set, one column per embedding
        dimension named ``<embedding_column_name>_<i>``. Otherwise a single
        column named ``<embedding_column_name>`` holding one row vector per
        cell.
    """
    base_name = self.embedding_column_name

    if not self.return_separate_columns:
        # ``list(embeddings)`` yields one row vector per sample, stored as a cell value.
        return pd.DataFrame({base_name: list(embeddings)}, index=index)

    n_dims = embeddings.shape[1]
    column_names: List[str] = [f"{base_name}_{dim}" for dim in range(n_dims)]
    # ``index=None`` is the pandas default, so no separate branch is needed.
    return pd.DataFrame(embeddings, columns=column_names, index=index)
TabPFNClassifierMother() + elif algorithm == "tabicl": + from mother.ml.models.m_tabicl import TabICLClassifierMother + + model = TabICLClassifierMother() elif algorithm == "lasso": from mother.ml.models.m_lasso import LassoClassifierBinaryMother @@ -50,6 +54,10 @@ def all_regression_algorithms(request): from mother.ml.models.m_tabpfn import TabPFNRegressorMother model = TabPFNRegressorMother() + elif algorithm == "tabicl": + from mother.ml.models.m_tabicl import TabICLRegressorMother + + model = TabICLRegressorMother() elif algorithm == "lasso": from mother.ml.models.m_lasso import LassoRegressorMother diff --git a/test/unit/test_mother_cv.py b/test/unit/test_mother_cv.py index d01c7cd..47881aa 100644 --- a/test/unit/test_mother_cv.py +++ b/test/unit/test_mother_cv.py @@ -237,6 +237,12 @@ def all_classification_algorithms(request) -> BaseEstimator: from mother.ml.models.m_lasso import LassoClassifierBinaryMother model = LassoClassifierBinaryMother() + + elif algorithm == "tabicl": + from mother.ml.models.m_tabicl import TabICLClassifierMother + + model = TabICLClassifierMother() + return model @@ -254,6 +260,10 @@ def all_regression_algorithms(request) -> BaseEstimator: model = TabPFNRegressorMother() elif algorithm == "lasso": model = LassoRegressorMother() + elif algorithm == "tabicl": + from mother.ml.models.m_tabicl import TabICLRegressorMother + + model = TabICLRegressorMother() return model diff --git a/test/unit/test_tabicl.py b/test/unit/test_tabicl.py new file mode 100644 index 0000000..bc24534 --- /dev/null +++ b/test/unit/test_tabicl.py @@ -0,0 +1,480 @@ +import numpy as np +import pandas as pd +import pytest +from optuna.trial import FixedTrial +from sklearn.datasets import load_diabetes, load_wine +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from mother.ml.models.m_tabicl import ( + TabICLClassifierMother, + TabICLEmbeddingTransformer, + TabICLRegressorMother, +) + +# Row-wise transformers 
def get_data_containers(X, y):
    """Return the same dataset in several container flavours for parametrised tests.

    Each entry is a ``(features, target, label)`` tuple:

    * the inputs exactly as given,
    * NumPy array copies of both,
    * a pandas DataFrame / Series pair (inputs that already have those types
      are passed through unchanged).
    """
    as_is = (X, y, "original")
    as_numpy = (np.array(X), np.array(y), "numpy arrays")

    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    series = y if isinstance(y, pd.Series) else pd.Series(y)
    as_pandas = (frame, series, "pandas DataFrame/Series")

    return [as_is, as_numpy, as_pandas]
+ with pytest.raises(TypeError): + self.model.fit(*invalid_input) + + def test_empty_input_raises(self): + X = np.empty((0, 4)) + y = np.empty((0,)) + with pytest.raises(ValueError): + self.model.fit(X, y) + + def test_predict_shape_and_values(self): + self.model.fit(self.X_train, self.y_train) + predictions = self.model.predict(self.X_test) + + assert isinstance(predictions, np.ndarray) + assert predictions.shape == self.y_test.shape + assert np.all(np.isfinite(predictions)) + + def test_predict_with_uncertainty_outputs(self): + self.model.fit(self.X_train, self.y_train) + output, quantiles = self.model.predict_uncertainty(self.X_test, return_quantiles=True) + + assert isinstance(output, pd.DataFrame) + assert isinstance(quantiles, np.ndarray) + assert list(output.columns) == [ + "mean_predictions", + "knowledge_uncertainty", + "data_uncertainty", + "total_uncertainty", + ] + assert quantiles.shape[0] == len(self.X_test) + assert quantiles.shape[1] >= 3 + assert (output["total_uncertainty"] >= 0).all() + + def test_predict_with_uncertainty_opt(self): + self.model.fit(self.X_train, self.y_train) + output = self.model.predict_uncertainty(self.X_test) + output_opt = self.model.predict_uncertainty(self.X_test, uncertainty_for_opt=True) + assert isinstance(output, pd.DataFrame) + assert isinstance(output_opt, pd.Series) + assert output["total_uncertainty"].equals(output_opt) + + def test_hyperparameter_space_regressor(self): + trial = FixedTrial({"reg__n_estimators": 5, "reg__outlier_threshold": 3.2}) + space = self.model.get_hyperparameter_space(self.X_train, self.y_train, trial, prefix="reg__") + + assert space["reg__n_estimators"] == 5 + assert space["reg__outlier_threshold"] == 3.2 + assert "reg__softmax_temperature" not in space + assert "reg__average_logits" not in space + + def test_set_params_after_fit_does_not_change(self): + model = TabICLRegressorMother(n_estimators=1) + model.fit(self.X_train, self.y_train) + before = model.n_estimators + 
@pytest.mark.slow
class TestTabICLClassification:
    """Tests for ``TabICLClassifierMother`` on the wine dataset.

    ``model`` and the train/test split are class attributes shared by all
    tests, so tests that change configuration must work on their own
    instance rather than on ``self.model``.
    """

    model = TabICLClassifierMother(n_estimators=1)
    X, y = load_wine(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    def test_default_parameters(self):
        """The documented defaults are exposed by ``default_parameters``."""
        defaults = self.model.default_parameters()
        assert defaults["n_estimators"] == 8
        assert defaults["softmax_temperature"] == 0.9
        assert defaults["average_logits"] is True
        assert defaults["allow_auto_download"] is True
        assert defaults["kv_cache"] is False

    def test_set_and_get_params(self):
        """``set_params`` values are visible as attributes and via ``get_params``."""
        # Use a private, unfitted instance. Mutating the shared class-level
        # model would leak these settings into every later test that fits
        # ``self.model``, and would make this test depend on running before
        # any fit (see test_set_params_after_fit_does_not_change below).
        model = TabICLClassifierMother(n_estimators=1)
        params = {"n_estimators": 3, "softmax_temperature": 1.2, "average_logits": False}
        model.set_params(**params)
        got = model.get_params()

        for key, value in params.items():
            assert getattr(model, key) == value
            assert got[key] == value

    def test_set_params_raises_error_on_unknown_keys(self):
        """Unknown parameter names are rejected."""
        model = TabICLClassifierMother(n_estimators=1)
        with pytest.raises(ValueError):
            model.set_params(unknown_parameter=123)

    @pytest.mark.parametrize(
        "invalid_input", [(None, None), (1, 2), (0.1, 0.2)], ids=["None", "int not allowed", "float not allowed"]
    )
    def test_invalid_input_raises(self, invalid_input):
        """Non array-like inputs to ``fit`` raise ``TypeError``."""
        with pytest.raises(TypeError):
            self.model.fit(*invalid_input)

    def test_predict_with_data_containers(self, data_containers: tuple):
        """Fitting works for every container format provided by the fixture."""
        # Test if no error is raised when fitting with different data container formats
        self.model.fit(data_containers[0], data_containers[1])

    def test_predict_and_predict_proba_shapes(self):
        """Predictions and probabilities have the expected shapes and are finite."""
        self.model.fit(self.X_train, self.y_train)
        predictions = self.model.predict(self.X_test)
        probabilities = self.model.predict_proba(self.X_test)

        assert predictions.shape == self.y_test.shape
        assert probabilities.shape[0] == len(self.X_test)
        assert probabilities.shape[1] == len(np.unique(self.y_train))
        assert np.all(np.isfinite(predictions))

    def test_hyperparameter_space_classifier(self):
        """Suggested hyperparameters are returned under the given prefix."""
        trial = FixedTrial(
            {
                "clf__n_estimators": 4,
                "clf__softmax_temperature": 1.1,
                "clf__average_logits": True,
                "clf__outlier_threshold": 3.0,
            }
        )
        space = self.model.get_hyperparameter_space(self.X_train, self.y_train, trial, prefix="clf__")

        assert space["clf__n_estimators"] == 4
        assert space["clf__softmax_temperature"] == 1.1
        assert space["clf__average_logits"] is True

    def test_hyperparameter_space_forces_average_logits_with_one_estimator(self):
        """With a single estimator, ``average_logits`` is reported as ``False``."""
        trial = FixedTrial({"clf__n_estimators": 1, "clf__softmax_temperature": 0.8, "clf__outlier_threshold": 3.0})
        space = self.model.get_hyperparameter_space(self.X_train, self.y_train, trial, prefix="clf__")

        assert space["clf__n_estimators"] == 1
        assert space["clf__average_logits"] is False

    def test_set_params_after_fit_does_not_change(self):
        """``set_params`` after ``fit`` leaves ``n_estimators`` unchanged."""
        model = TabICLClassifierMother(n_estimators=1)
        model.fit(self.X_train, self.y_train)
        before = model.n_estimators
        model.set_params(n_estimators=6)
        assert model.n_estimators == before
+ use_kfold=False, + n_estimators=1, + random_state=0, + ) + transformer.fit(self.X, self.regression_y) + bad_X = self.X.drop(columns=["f0"]) + + with pytest.raises(ValueError): + transformer.transform(bad_X) + + def test_return_single_vector_column(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + return_separate_columns=False, + random_state=0, + ) + result = transformer.fit_transform(self.X, self.regression_y) + + assert list(result.columns) == ["tabiclembedding"] + assert result.shape[0] == self.X.shape[0] + assert isinstance(result.iloc[0, 0], np.ndarray) + + def test_prefitted_model_transform(self): + model = TabICLRegressorMother(n_estimators=1).fit(self.X, self.regression_y) + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + model=model, + n_estimators=1, + ) + result = transformer.transform(self.X) + + assert isinstance(result, pd.DataFrame) + assert result.shape[0] == self.X.shape[0] + assert result.shape[1] == TABICL_EMBEDDING_SIZE + + def test_array_input(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + random_state=0, + ) + result = transformer.fit_transform(self.X.to_numpy(), self.regression_y) + assert result.shape[0] == self.X.shape[0] + assert result.shape[1] == TABICL_EMBEDDING_SIZE + + def test_get_feature_names_out(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + random_state=0, + ) + result = transformer.fit_transform(self.X, self.regression_y) + names = transformer.get_feature_names_out() + + assert result.shape[1] == names.shape[0] + + def test_get_feature_names_out_before_fit_raises(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + random_state=0, + ) + with pytest.raises(NotFittedError): + transformer.get_feature_names_out() + + def 
test_fit_with_prefitted_but_missing_model_raises_runtime_error(self): + transformer = TabICLEmbeddingTransformer(model_type="regression", use_kfold=False) + transformer.pre_fitted = True + transformer.model = None + with pytest.raises(RuntimeError): + transformer.fit(self.X, self.regression_y) + + def test_fit_with_prefitted_model_path(self): + model = TabICLRegressorMother(n_estimators=1).fit(self.X, self.regression_y) + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + model=model, + n_estimators=1, + ) + transformer.fit(self.X, self.regression_y) + assert transformer.train_embeddings_ is not None + + def test_fit_falls_back_when_samples_less_than_folds(self): + X_tiny = self.X.iloc[:3].copy() + y_tiny = self.regression_y.iloc[:3].copy() + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=True, + n_folds=5, + n_estimators=1, + random_state=0, + ) + result = transformer.fit_transform(X_tiny, y_tiny) + assert result.shape[0] == 3 + + def test_fit_transform_regression_with_groups_uses_group_kfold(self): + groups = np.random.randint(0, 3, size=self.X.shape[0]) + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=True, + n_folds=3, + n_estimators=1, + random_state=0, + ) + result = transformer.fit_transform(self.X, self.regression_y, groups=groups) + assert result.shape[0] == self.X.shape[0] + + def test_fit_transform_regression_without_groups_uses_kfold(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=True, + n_folds=3, + n_estimators=1, + random_state=0, + ) + result = transformer.fit_transform(self.X, self.regression_y) + assert result.shape[0] == self.X.shape[0] + + def test_fit_transform_classification_without_groups(self): + transformer = TabICLEmbeddingTransformer( + model_type="classification", + use_kfold=True, + n_folds=3, + n_estimators=1, + random_state=0, + ) + result = transformer.fit_transform(self.X, 
self.classification_y) + assert result.shape[0] == self.X.shape[0] + + def test_invalid_model_type_raises(self): + transformer = TabICLEmbeddingTransformer(model_type="unknown", use_kfold=False) # type: ignore + with pytest.raises(ValueError): + transformer.fit_transform(self.X, self.regression_y) + + def test_transform_reorders_dataframe_columns(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + random_state=0, + ) + transformer.fit(self.X, self.regression_y) + + X_reordered = self.X[["f3", "f1", "f2", "f0", "f4", "f5", "f6", "f7"]].copy() + X_reordered["extra"] = 1.0 + out = transformer.transform(X_reordered) + assert out.shape[0] == self.X.shape[0] + + def test_get_feature_names_out_single_column_mode(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + return_separate_columns=False, + random_state=0, + ) + transformer.fit_transform(self.X, self.regression_y) + names = transformer.get_feature_names_out() + assert names.tolist() == ["tabiclembedding"] + + def test_single_column_mode_with_numpy_input_uses_default_index(self): + transformer = TabICLEmbeddingTransformer( + model_type="regression", + use_kfold=False, + n_estimators=1, + return_separate_columns=False, + random_state=0, + ) + out = transformer.fit_transform(self.X.to_numpy(), self.regression_y) + assert isinstance(out.index, pd.RangeIndex) + + def test_extract_representations_wrong_estimator_type_raises(self): + reg = TabICLRegressorMother(n_estimators=1).fit(self.X, self.regression_y) + clf = TabICLClassifierMother(n_estimators=1).fit(self.X, self.classification_y) + + transformer_cls = TabICLEmbeddingTransformer(model_type="classification", use_kfold=False, n_estimators=1) + with pytest.raises(TypeError): + transformer_cls._extract_representations(reg, self.X) + + transformer_reg = TabICLEmbeddingTransformer(model_type="regression", use_kfold=False, n_estimators=1) + with 
pytest.raises(TypeError): + transformer_reg._extract_representations(clf, self.X) diff --git a/uv.lock b/uv.lock index 30777db..b69b548 100644 --- a/uv.lock +++ b/uv.lock @@ -2752,6 +2752,10 @@ rna = [ { name = "scanpy", version = "1.11.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, { name = "scanpy", version = "1.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, ] +tabicl = [ + { name = "tabicl" }, + { name = "torch" }, +] tabpfn = [ { name = "tabpfn" }, { name = "torch" }, @@ -2818,12 +2822,14 @@ requires-dist = [ { name = "scikit-learn", specifier = "<=1.5.0" }, { name = "seaborn", marker = "extra == 'report'", specifier = ">=0.13.2,<0.14" }, { name = "single-source", specifier = ">=0.4.0,<0.5" }, + { name = "tabicl", marker = "extra == 'tabicl'", specifier = ">=2.1.1" }, { name = "tabpfn", marker = "extra == 'tabpfn'", specifier = "==2.1.0" }, + { name = "torch", marker = "extra == 'tabicl'", specifier = ">=2.11.0" }, { name = "torch", marker = "extra == 'tabpfn'", specifier = ">=2.3.0,<3" }, { name = "torch", marker = "extra == 'torch'", specifier = ">=2.3.0,<3" }, { name = "umap-learn", marker = "extra == 'report'", specifier = ">=0.5.7,<0.6" }, ] -provides-extras = ["torch", "report", "rna", "clustering", "tabpfn"] +provides-extras = ["torch", "report", "rna", "clustering", "tabpfn", "tabicl"] [package.metadata.requires-dev] dev = [ @@ -5248,6 +5254,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tabicl" +version = "2.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "einops" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = 
"psutil" }, + { name = "scikit-learn" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "torch" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e0/c3/ef8bcc7645bc1eedae4563df6429f6f5438aaea5e293682a9367bf2c2ddb/tabicl-2.1.1.tar.gz", hash = "sha256:7abdb1fa878e7a1edfa1c6606bf4189e9cccc20935e8011d7e690f4693c9c5c5", size = 224680, upload-time = "2026-04-29T15:57:46.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/d3/f6eadcef58322b1b0253df1c17d9230db9934274f52e0d952458f6c6ab41/tabicl-2.1.1-py3-none-any.whl", hash = "sha256:cb4405cc93335c688bc9bcb703c7944032fcf542b43ebb66820f1a5acb5651b1", size = 252909, upload-time = "2026-04-29T15:57:47.775Z" }, +] + [[package]] name = "tabpfn" version = "2.1.0"