Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/scripts/seqopt_deap_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import tracemalloc
import numpy as np

from aaanalysis.protein_design_pro._backend.seqopt.nsga2 import (
from aaanalysis.protein_design._backend.seqopt.nsga2 import (
fast_non_dominated_sort, crowding_distance, select_nsga2)

try:
Expand Down
2 changes: 1 addition & 1 deletion CONTEXT.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ _Avoid_: interaction (overloaded with the relational/PPI [[relational / interact
### SeqOpt directed-evolution vocabulary

**SeqOpt**:
The **search/optimization** layer of [[design / engineering]] (**[pro]**, `aaanalysis/protein_design_pro/`), the counterpart to [[SeqMut]]'s scoring: it *searches* over sequence variants of **one wild-type** for those that best satisfy several objectives at once. This is **protein engineering** — machine-learning-guided **directed evolution** of an *existing* sequence (Yang et al. 2019; Wittmann et al. 2021) — explicitly **not** *de novo protein design* (building new proteins from the ground up, e.g. RFdiffusion→ProteinMPNN→AlphaFold; Yang et al. 2026), which is out of scope. A `Tool` (`run` → [[Pareto front]] `df_pareto`, `eval` → Pareto-quality metrics), paired with `SeqOptPlot`. Reuses model-bound [[SeqMut]] as the fitness engine and `ShapModel` for residue guidance, so it is `pro` (imports SHAP) even though [[SeqMut]] stays core. Two modes — `"impact"` (SHAP-guided, adaptive) and `"importance"` (feature-importance-guided, greedy) — see [[guidance mode (impact / importance)]]. It realizes the search that [[SeqMut]] and [[combined variant]] defer to.
The **search/optimization** layer of [[design / engineering]] (a **core** class in `aaanalysis/protein_design/`), the counterpart to [[SeqMut]]'s scoring: it *searches* over sequence variants of **one wild-type** for those that best satisfy several objectives at once. This is **protein engineering** — machine-learning-guided **directed evolution** of an *existing* sequence (Yang et al. 2019; Wittmann et al. 2021) — explicitly **not** *de novo protein design* (building new proteins from the ground up, e.g. RFdiffusion→ProteinMPNN→AlphaFold; Yang et al. 2026), which is out of scope. A `Tool` (`run` → [[Pareto front]] `df_pareto`, `eval` → Pareto-quality metrics), paired with `SeqOptPlot`. Reuses model-bound [[SeqMut]] as the fitness engine. Two modes — `"impact"` (SHAP-guided, adaptive) and `"importance"` (feature-importance-guided, greedy) — see [[guidance mode (impact / importance)]]. **Only `mode="impact"` needs `aaanalysis[pro]`** (it imports `ShapModel`/SHAP lazily; if that import fails, constructing `SeqOpt` with `mode="impact"` raises an `ImportError` carrying a `pip install 'aaanalysis[pro]'` hint); everything else (importance mode, the NSGA-II/EA layer, all plots) runs in a base install. It realizes the search that [[SeqMut]] and [[combined variant]] defer to.
_Avoid_: optimizer (overloaded with numerical/perf optimization — this is evolutionary *search*); generator, sampler; **de novo design / protein design** (SeqOpt does protein *engineering* / directed evolution, not generation of new proteins).

**population** / **generation**:
Expand Down
14 changes: 3 additions & 11 deletions aaanalysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .feature_engineering import AAclust, AAclustPlot, SequenceFeature, NumericalFeature, CPP, CPPGrid, CPPPlot
from .pu_learning import dPULearn, dPULearnPlot
from .explainable_ai import TreeModel
from .protein_design import AAMut, AAMutPlot, SeqMut, SeqMutPlot
from .protein_design import AAMut, AAMutPlot, SeqMut, SeqMutPlot, SeqOpt, SeqOptPlot
from .plotting import (plot_get_clist, plot_get_cmap, plot_get_cdict,
plot_settings, plot_legend, plot_gcfs, plot_rank)
from .metrics import (comp_auc_adjusted, comp_bic_score, comp_kld,
Expand Down Expand Up @@ -54,6 +54,8 @@
"AAMutPlot",
"SeqMut",
"SeqMutPlot",
"SeqOpt",
"SeqOptPlot",
"TreeModel",
# "ShapModel" # SHAP
"plot_get_clist",
Expand Down Expand Up @@ -118,16 +120,6 @@ def missing_feature_stub(feature_name, error, mode="pro"):
globals()["ShapModel"] = missing_feature_stub("ShapModel", e, mode="pro")


try:
from .protein_design_pro import SeqOpt, SeqOptPlot
__all__.extend(["SeqOpt", "SeqOptPlot"])
except ImportError as e:
SeqOpt = None
SeqOptPlot = None
globals()["SeqOpt"] = missing_feature_stub("SeqOpt", e, mode="pro")
globals()["SeqOptPlot"] = missing_feature_stub("SeqOptPlot", e, mode="pro")


try:
from .seq_analysis_pro import comp_seq_sim
__all__.append("comp_seq_sim")
Expand Down
16 changes: 11 additions & 5 deletions aaanalysis/protein_design/__init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
"""
Protein design: per-mutation and per-sequence ΔCPP feature-impact analysis.
Protein design / engineering: per-mutation and per-sequence ΔCPP feature-impact analysis
and multi-objective directed-evolution optimization.

Public objects: AAMut(+Plot), SeqMut(+Plot).
Public objects: AAMut(+Plot), SeqMut(+Plot), SeqOpt(+Plot).
Consumes CPP feature impact from ``feature_engineering`` to score amino-acid mutations
(``AAMut``) and whole-sequence variants (``SeqMut``); each is paired with a plot class
for visualization.
(``AAMut``) and whole-sequence variants (``SeqMut``); ``SeqOpt`` searches the variant space
(NSGA-II) for the multi-objective Pareto front. Each is paired with a plot class. ``SeqOpt`` is
a core class — only its SHAP-guided ``mode="impact"`` needs ``aaanalysis[pro]`` (imported lazily).

See ``.claude/rules/code-conventions.md`` for conventions, ``CONTEXT.md`` for domain
terms (protein-design (mutation / ΔCPP) vocabulary).
terms (protein-design (mutation / ΔCPP) + SeqOpt directed-evolution vocabulary).
"""
from ._aamut import AAMut
from ._aamut_plot import AAMutPlot
from ._seqmut import SeqMut
from ._seqmut_plot import SeqMutPlot
from ._seqopt import SeqOpt
from ._seqopt_plot import SeqOptPlot

__all__ = [
"AAMut",
"AAMutPlot",
"SeqMut",
"SeqMutPlot",
"SeqOpt",
"SeqOptPlot",
]
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
This is a script for the frontend of the SeqOpt class (**[pro]**): SHAP-guided, fuzzy-labeled
multi-objective directed-evolution optimization over sequence variants of one wild-type. SeqOpt
reuses a model-bound SeqMut as its fitness engine and ShapModel for per-generation residue
guidance, and is therefore gated behind the ``pro`` extra.
This is a script for the frontend of the SeqOpt class: multi-objective directed-evolution
optimization over sequence variants of one wild-type, reusing a model-bound SeqMut as its
fitness engine. SeqOpt is a core class; only its SHAP-guided ``mode="impact"`` (per-generation
ShapModel refit) needs the optional ``aaanalysis[pro]`` dependency, imported lazily.
"""
from typing import Optional, List, Any, Callable, Tuple, Dict
import numpy as np
Expand All @@ -11,8 +11,9 @@
import aaanalysis.utils as ut
from aaanalysis.template_classes import Tool
from aaanalysis.feature_engineering._sequence_feature import SequenceFeature
from aaanalysis.protein_design import SeqMut
from aaanalysis.explainable_ai_pro import ShapModel
from ._seqmut import SeqMut
# ShapModel (and its heavy SHAP dependency) is imported lazily inside mode="impact" only, so
# SeqOpt stays importable in a base install; mode="impact" then needs aaanalysis[pro].
from ._backend.seqopt.genome import canonical, apply_genome, variant_label
from ._backend.seqopt.run import evolve_nsga2, evolve_greedy
from ._backend.seqopt.nsga2 import normalize_objectives_, rank_and_crowding
Expand Down Expand Up @@ -108,8 +109,9 @@ def residue_weights_(df_feat, col, base):
# II Main Functions
class SeqOpt(Tool):
"""
Sequence Optimizer (SeqOpt) class (**[pro]**, requires ``aaanalysis[pro]``) for SHAP-guided,
multi-objective directed evolution over sequence variants [Breimann24a]_.
Sequence Optimizer (SeqOpt) class for multi-objective directed evolution over sequence
variants [Breimann24a]_. Core class; only the SHAP-guided ``mode="impact"`` needs
``aaanalysis[pro]`` (``mode="importance"`` and everything else run in a base install).

``SeqOpt`` performs **protein engineering**, not **de novo protein design**. The two are
distinct paradigms: *de novo design* builds **new proteins from the ground up** rather than
Expand Down Expand Up @@ -186,9 +188,10 @@ def __init__(self,
mode : str, default='impact'
Residue-guidance mode. ``'impact'`` refits :class:`ShapModel` every generation under
fuzzy labeling (the new variant's prediction score as a soft label vs. the balanced
reference) and mutates the strongest-``feat_impact`` residues. ``'importance'`` uses
the static ``feat_importance`` ranking from ``df_feat`` (no SHAP, no refit) and walks
positions highest-first.
reference) and mutates the strongest-``feat_impact`` residues — this is the only
SeqOpt feature that needs ``aaanalysis[pro]`` (SHAP), imported lazily. ``'importance'``
uses the static ``feat_importance`` ranking from ``df_feat`` (no SHAP, no refit) and
walks positions highest-first (no pro dependency).
model : object, optional
A fitted classifier exposing ``predict_proba`` used as the fitness engine (the
``delta_pred`` objective) and, in ``mode='impact'``, as the model whose attribution
Expand Down Expand Up @@ -227,12 +230,23 @@ def __init__(self,
target_class=target_class)
self._model = model
self._target_class = target_class
# mode='impact' needs the labeled reference set for the per-generation ShapModel refit.
# mode='impact' needs the labeled reference set + the (pro) ShapModel for the
# per-generation refit. ShapModel is imported lazily so SeqOpt stays core-importable.
self._ShapModel = None
if self._mode == "impact":
if model is None or df_seq_ref is None or labels is None:
raise ValueError("mode='impact' requires 'model', 'df_seq_ref' and 'labels' "
"(the balanced reference the per-generation ShapModel refit is "
"anchored on); use mode='importance' for a SHAP-free run.")
try:
from aaanalysis.explainable_ai_pro import ShapModel
except ImportError as e:
raise ImportError(
"SeqOpt mode='impact' needs the SHAP-based ShapModel. Install via:\n"
"\n\tpip install 'aaanalysis[pro]'\n\n"
"or use mode='importance' (feature-importance-guided; no pro dependency)."
) from e
self._ShapModel = ShapModel
ut.check_df_seq(df_seq=df_seq_ref)
labels = ut.check_labels(labels=labels, len_required=len(df_seq_ref))
self._df_seq_ref = df_seq_ref
Expand Down Expand Up @@ -335,7 +349,7 @@ def _impact_weights(self, df_seq, df_feat, best_genome, wt_seq, base, jmd_n_len,
p_var = float(proba[-1]) if proba.ndim == 1 and len(proba) else float(np.ravel(proba)[-1])
p_var = min(max(p_var, 0.0), 1.0)
labels_fuzzy = list(np.asarray(self._labels, dtype=float)) + [p_var]
sm = ShapModel(random_state=self._random_state, verbose=False)
sm = self._ShapModel(random_state=self._random_state, verbose=False)
sm.fit(X, labels_fuzzy, fuzzy_labeling=True)
df_imp = sm.add_feat_impact(df_feat.copy(), samples=int(len(X) - 1), names="var", drop=True)
impact_cols = [c for c in df_imp.columns if c.startswith(ut.COL_FEAT_IMPACT)
Expand Down Expand Up @@ -401,8 +415,10 @@ def run(self,
Parameters
----------
df_seq : pd.DataFrame, shape (1, n_seq_info)
The single wild-type, in the **position-based** format (``sequence``, ``tmd_start``,
``tmd_stop``). See :meth:`SequenceFeature.get_df_parts` for the full specification.
DataFrame containing an ``entry`` column with unique protein identifiers, in the
**position-based** format (``sequence``, ``tmd_start``, ``tmd_stop``). See
:meth:`SequenceFeature.get_df_parts` for the full ``df_seq`` format specification.
Must hold **exactly one** wild-type sequence (SeqOpt optimizes one sequence per run).
df_feat : pd.DataFrame
CPP feature set (output of :meth:`CPP.run`) defining the features and the residue
attribution (``feat_importance`` / ``feat_impact``, ``positions``) the search reads.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is a script for the frontend of the SeqOptPlot class (**[pro]**) for visualizing SeqOpt
This is a script for the frontend of the SeqOptPlot class for visualizing SeqOpt
multi-objective directed-evolution results: the Pareto-front objective scatter and the
per-generation hypervolume convergence trace.
"""
Expand Down Expand Up @@ -38,8 +38,7 @@ def _objective_cols(df_pareto):
# II Main Functions
class SeqOptPlot:
"""
Plotting class for :class:`SeqOpt` (Sequence Optimizer) results (**[pro]**, requires
``aaanalysis[pro]``) [Breimann24a]_.
Plotting class for :class:`SeqOpt` (Sequence Optimizer) results [Breimann24a]_.

Visualizes the Pareto front produced by :meth:`SeqOpt.run`: a 2-D objective scatter colored
by non-dominated rank, and the per-generation hypervolume convergence trace.
Expand Down
19 changes: 0 additions & 19 deletions aaanalysis/protein_design_pro/__init__.py

This file was deleted.

Empty file.
9 changes: 9 additions & 0 deletions docs/adr/0043-seqopt-optimization-layer.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ dependency forced": that decision keeps `SeqMut` and the `protein_design` **core
the new optimizer lives in a separate `*_pro` module. `SeqMut` (core) remains the fitness engine and
is imported by `SeqOpt`.

> **Amended (2026-06-26): D1 reversed — `SeqOpt`/`SeqOptPlot` are now CORE.** Only the SHAP-guided
> `mode="impact"` actually needs SHAP, and it is the minority path (`mode="importance"`, the whole
> NSGA-II/EA layer, and the plots are all SHAP-free). `ShapModel` is therefore imported **lazily**
> inside the impact path, so SeqOpt is importable in a base install; constructing `SeqOpt` with
> `mode="impact"` when that import fails raises an `ImportError` carrying a
> `pip install 'aaanalysis[pro]'` hint. `SeqOpt`/`SeqOptPlot` and the `_backend/seqopt/` code moved
> from `protein_design_pro/` into the core `protein_design/` subpackage, and the now-empty
> `protein_design_pro` package was removed. The pro-gating in `aaanalysis/__init__.py` is replaced
> by core exports.

**D2 — Two guidance modes, named after their `df_feat` attribution column.**
- `mode="impact"` (default, headline): per-round `ShapModel` refit under **fuzzy labeling** →
fresh per-residue `|feat_impact|` → an **adaptive NSGA-II** population evolves the Pareto front.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/index/docstring_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ Rules:
-
* - ``SeqOpt`` / ``SeqOptPlot``
- ``seqopt`` / ``seqopt_plot``
- ``pro``
- core (``mode="impact"`` needs ``pro``)
* - ``TreeModel``
- ``tm``
-
Expand Down
3 changes: 2 additions & 1 deletion docs/source/index/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ Added

**Protein Design**

- **SeqOpt — multi-objective protein engineering** (``[pro]``): A new ``SeqOpt`` optimizer
- **SeqOpt — multi-objective protein engineering** (**core**; only ``mode="impact"`` needs
``aaanalysis[pro]``): A new ``SeqOpt`` optimizer
(with ``SeqOptPlot``) performs **machine-learning-guided directed evolution** of one
wild-type — searching the Pareto front across several objectives at once, with a
model-bound ``SeqMut`` as the fitness engine and a re-implementation of NSGA-II for
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/api_tests/test_backend_import_hygiene.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"num_feat": {"_numerical_feature"},
"aaclust": {"_aaclust", "_aaclust_plot"},
},
"protein_design_pro": {
"protein_design": {
"seqopt": {"_seqopt"},
},
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""This is a script to test SeqOpt.run / SeqOpt.eval and SeqOptPlot (**[pro]**).
"""This is a script to test SeqOpt.run / SeqOpt.eval and SeqOptPlot.

Guarded by ``shap`` (the whole protein_design_pro subpackage imports ShapModel); skipped in a
core-only environment. Tiny deterministic wild-type + real-scale df_feat so the genuine ΔCPP /
SeqOpt is a core class — these tests run in a base install. Only the SHAP-guided
``mode="impact"`` tests skip without ``shap`` (they call ``pytest.importorskip("shap")``
locally). Tiny deterministic wild-type + real-scale df_feat so the genuine ΔCPP /
SequenceFeature engine runs while staying fast.
"""
import numpy as np
Expand All @@ -13,8 +14,7 @@
import aaanalysis as aa
import aaanalysis.utils as ut

pytest.importorskip("shap")
from aaanalysis.protein_design_pro import SeqOpt, SeqOptPlot # noqa: E402
from aaanalysis.protein_design import SeqOpt, SeqOptPlot

settings.register_profile("ci", deadline=None)
settings.load_profile("ci")
Expand Down Expand Up @@ -201,6 +201,19 @@ def test_delta_pred_without_model_raises(self, wt, df_feat):


class TestSeqOptInit:
def test_seqopt_is_core(self):
    """SeqOpt and SeqOptPlot are exported from the top-level package as core classes."""
    # Both names must be listed in the package's public API.
    exported = aa.__all__
    assert all(name in exported for name in ("SeqOpt", "SeqOptPlot"))
    # The exported object is the real class from the core subpackage, not a stub.
    assert aa.SeqOpt.__module__ == "aaanalysis.protein_design._seqopt"

def test_impact_without_shap_raises_pro_hint(self, model, wt, monkeypatch):
    """mode="impact" without the pro dependency fails with the aaanalysis[pro] install hint."""
    import sys
    # A None entry in sys.modules makes importing that module raise ImportError, which
    # simulates a base install; monkeypatch undoes the entry when the test finishes.
    blocked_module = "aaanalysis.explainable_ai_pro"
    monkeypatch.setitem(sys.modules, blocked_module, None)
    init_kwargs = dict(mode="impact", model=model, df_seq_ref=wt, labels=[1], random_state=0)
    with pytest.raises(ImportError, match=r"aaanalysis\[pro\]"):
        SeqOpt(**init_kwargs)

def test_impact_requires_reference(self, model):
with pytest.raises(ValueError, match="mode='impact'"):
SeqOpt(mode="impact", model=model, df_seq_ref=None, labels=None)
Expand All @@ -215,6 +228,7 @@ def test_df_scales_accepted(self, model):
assert so is not None

def test_impact_mode_runs(self, wt, df_feat):
pytest.importorskip("shap")
pytest.importorskip("sklearn")
from sklearn.ensemble import RandomForestClassifier
ref = pd.DataFrame({ut.COL_ENTRY: [f"R{i}" for i in range(8)],
Expand All @@ -234,6 +248,7 @@ def test_impact_mode_runs(self, wt, df_feat):
assert _non_dominated(df)

def test_impact_mode_df_seq_ref_with_extra_columns(self, wt, df_feat):
pytest.importorskip("shap")
# Regression: a reference from load_dataset carries jmd_n/tmd/jmd_c/label columns; the
# per-generation ShapModel refit must keep only the position-based columns (else the
# appended variant row NaN-trips check_df_seq).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from hypothesis import given, settings
import hypothesis.strategies as some

from aaanalysis.protein_design_pro._backend.seqopt.nsga2 import (
from aaanalysis.protein_design._backend.seqopt.nsga2 import (
normalize_objectives_, fast_non_dominated_sort, crowding_distance, crowded_better,
dcd_tournament, select_nsga2)
from aaanalysis.protein_design_pro._backend.seqopt.metrics import hypervolume, spread
from aaanalysis.protein_design_pro._backend.seqopt.genome import (
from aaanalysis.protein_design._backend.seqopt.metrics import hypervolume, spread
from aaanalysis.protein_design._backend.seqopt.genome import (
canonical, apply_genome, variant_label, random_genome, init_population, repair,
crossover_uniform, crossover_npoint, mutate)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
deap = pytest.importorskip("deap")
from deap import base, creator, tools # noqa: E402

from aaanalysis.protein_design_pro._backend.seqopt.nsga2 import ( # noqa: E402
from aaanalysis.protein_design._backend.seqopt.nsga2 import ( # noqa: E402
normalize_objectives_, fast_non_dominated_sort, crowding_distance, select_nsga2)

settings.register_profile("ci", deadline=None)
Expand Down
Loading