alex-mextner · alex-mextner · Jun 17, 2026 · Jun 16, 2026 · Jun 17, 2026
diff --git a/riglib/cli.py b/riglib/cli.py
diff --git a/riglib/config.py b/riglib/config.py
@@ -105,6 +105,11 @@ class LoadedConfig:
     global_path: Path | None = None
     repo_path: Path | None = None
     layers: list[str] = field(default_factory=list)
+    # Provenance: which config FILE last declared each top-level key. Built during the cascade
+    # (global first, then repo/explicit overwrites), so a key present only in the global config
+    # maps to the global path even when a repo rig.yaml is also loaded. Used to name the CORRECT
+    # source file in an unknown-item error instead of always blaming the repo file.
+    key_sources: dict[str, Path] = field(default_factory=dict)
 
     @property
     def agent_tools_source(self) -> str | None:
@@ -120,6 +125,36 @@ def defaults(self) -> dict[str, Any]:
         d = self.data.get("defaults")
         return d if isinstance(d, dict) else {}
 
+    @property
+    def primary_config_path(self) -> Path:
+        """The most-specific config file backing this config, for naming in error messages.
+
+        The repo ``rig.yaml`` OVERRIDES the global config, so an invalid key is most likely
+        the repo one — name it first; fall back to the global path, then to a notional repo
+        path so an error always points somewhere concrete to edit.
+        """
+        if self.repo_path is not None:
+            return self.repo_path
+        if self.global_path is not None:
+            return self.global_path
+        return repo_config_path(self.repo_root)
+
+    def source_for_key(self, dotted_key: str) -> Path:
+        """The config FILE that declared ``dotted_key`` (e.g. ``mcp.items.review``).
+
+        Resolves provenance by the TOP-LEVEL key (``mcp``): a key that came solely from the
+        global ``~/.config/rig/config.yaml`` names the global file, while a key the repo
+        ``rig.yaml`` set (or overrode) names the repo file. This is what lets an unknown-item
+        error point at the file actually carrying the stale entry — e.g. a removed MCP slot
+        left in the global config is reported against the global file, not the repo's.
+
+        Falls back to :attr:`primary_config_path` when the key has no tracked provenance (a
+        config built directly in tests, or a key that is not top-level).
+        """
+        top = dotted_key.split(".", 1)[0]
+        src = self.key_sources.get(top)
+        return src if src is not None else self.primary_config_path
+
 
 def load(
     repo_root: Path,
@@ -138,34 +173,51 @@ def load(
     layers: list[str] = []
     gpath: Path | None = None
     rpath: Path | None = None
+    # Track which file last declared each top-level key, in cascade order (global, then
+    # repo/explicit). A key the repo layer sets overwrites the global provenance — so the final
+    # mapping names the file the user must edit to remove the offending key.
+    key_sources: dict[str, Path] = {}
+
+    def _merge_layer(path: Path, label: str) -> None:
+        """Merge one config file into the accumulator and record its key provenance + layer.
+
+        Single seam so a new layer can't forget to update ``key_sources`` (the bug the
+        provenance feature exists to avoid). ``merged`` is rebound via ``nonlocal`` because
+        ``_deep_merge`` returns a fresh dict rather than mutating in place.
+        """
+        nonlocal merged
+        data = _load_yaml(path)
+        merged = _deep_merge(merged, data)
+        for k in data:
+            key_sources[k] = path
+        layers.append(f"{label}:{path}")
 
     if include_global:
         gpath = global_config_path()
         if gpath.is_file():
-            merged = _deep_merge(merged, _load_yaml(gpath))
-            layers.append(f"global:{gpath}")
+            _merge_layer(gpath, "global")
 
     if explicit_config is not None:
         rpath = explicit_config.resolve()
         if not rpath.is_file():
             raise ConfigError(f"--config file not found: {rpath}")
-        merged = _deep_merge(merged, _load_yaml(rpath))
-        layers.append(f"config:{rpath}")
+        _merge_layer(rpath, "config")
     else:
         rpath = repo_config_path(repo_root)
         if rpath.is_file():
-            merged = _deep_merge(merged, _load_yaml(rpath))
-            layers.append(f"repo:{rpath}")
+            _merge_layer(rpath, "repo")
 
     validate(merged)
     merged.pop("scope", None)  # `scope` is a removed legacy key — drop it so it never
     # lingers in loaded.data, gets re-serialized, or is mistaken for a live setting.
+    key_sources.pop("scope", None)
     return LoadedConfig(
         data=merged,
         repo_root=repo_root,
         global_path=gpath if gpath and gpath.is_file() else None,
         repo_path=rpath if rpath and rpath.is_file() else None,
         layers=layers,
+        key_sources=key_sources,
     )
 
 

diff --git a/riglib/errors.py b/riglib/errors.py
@@ -0,0 +1,273 @@
+"""Error system v2 — every rig error is WHAT / WHY / HOW-to-fix, with a stable exit code.
+
+Runtime reach: raised by config/catalog/plan/status code paths; rendered by the top-level
+CLI handler (:func:`guard`, wired into ``cli.main``). Centralizing the shape here means a
+human staring at a failure always gets the same three things — what happened, why (the root
+cause, the offending config FILE PATH + key), and a concrete command/edit to fix it — and a
+script always gets a meaningful, *stable* exit code per failure class.
+
+Invariants:
+- The exit-code constants below are a PUBLIC CONTRACT. Scripts/CI branch on them; changing a
+  value is a breaking change. They are documented in ``rig --help`` and ``docs/config-schema.md``.
+- They follow the ``structured-exit-codes`` skill: 0 success, 2 invalid-config/usage,
+  127 missing-dependency (shell convention), plus rig-specific classes (drift/unknown-item/
+  missing-target/not-a-repo) that a caller wants to distinguish.
+- Stdlib-only at import time (AGENTS.md hard rule) — no third-party imports here.
+
+History: born from two same-day prod failures whose errors were thin and undiagnosable —
+``unknown mcp item(s): review (known: none)`` (no hint it was a REMOVED slot or how to remove
+it) and a dead hook path surfacing only as a generic harness "PreToolUse error". This module
+exists so neither recurs: the removed-slot registry names ``mcp.items.review`` and prints the
+exact removal PR + fix, and missing-target names the gone file + how to regenerate it.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+# ── stable, per-class exit codes (PUBLIC CONTRACT — do not renumber) ───────────────
+EXIT_OK = 0
+# 1 is reserved for an UNEXPECTED/internal failure (an unhandled exception): a caller can
+# tell "rig itself crashed" (1) from "rig diagnosed a known problem" (>=2).
+EXIT_INTERNAL = 1
+EXIT_CONFIG = 2  # malformed / invalid config value (usage class, per the skill)
+EXIT_DRIFT = 3  # `rig status` found drift (config and disk disagree)
+EXIT_UNKNOWN_ITEM = 4  # config names a catalog item that doesn't exist (typo / removed slot)
+EXIT_MISSING_TARGET = 5  # config references a path/binary that's gone on disk
+EXIT_NOT_A_REPO = 6  # a repo-scoped command run outside a git repository
+EXIT_MISSING_DEP = 127  # a required external tool/binary isn't installed (shell convention)
+
+
+@dataclass
+class RigError(Exception):
+    """A structured, renderable error: WHAT happened / WHY / HOW to fix + an exit code.
+
+    ``what`` — the symptom, one line ("unknown mcp item: reviewr").
+    ``why``  — the root cause + context: the offending CONFIG FILE PATH + key when relevant.
+    ``fix``  — a concrete command or edit the user can run/make right now.
+    ``exit_code`` — the failure class (one of the EXIT_* constants); subclasses pin it.
+    """
+
+    what: str
+    why: str = ""
+    fix: str = ""
+    exit_code: int = EXIT_INTERNAL
+
+    def __post_init__(self) -> None:
+        # Exception's own machinery wants args set; keep str(e) == the WHAT line.
+        super().__init__(self.what)
+
+    def __str__(self) -> str:
+        return self.what
+
+
+@dataclass
+class ConfigError(RigError):
+    """Malformed/invalid config — a bad value, type, or unknown key. Fail-closed (exit 2)."""
+
+    exit_code: int = EXIT_CONFIG
+
+
+@dataclass
+class UnknownItemError(RigError):
+    """Config names a catalog item that doesn't exist (typo or a removed slot). Exit 4."""
+
+    exit_code: int = EXIT_UNKNOWN_ITEM
+
+
+@dataclass
+class MissingTargetError(RigError):
+    """Config references a path/binary that's gone on disk (a dead hook path, …). Exit 5."""
+
+    exit_code: int = EXIT_MISSING_TARGET
+
+
+@dataclass
+class NotARepoError(RigError):
+    """A repo-scoped command was run outside a git repository. Exit 6."""
+
+    exit_code: int = EXIT_NOT_A_REPO
+
+
+@dataclass
+class MissingDepError(RigError):
+    """A required external tool/binary isn't installed. Exit 127 (shell convention)."""
+
+    exit_code: int = EXIT_MISSING_DEP
+
+
+# ── rendering ──────────────────────────────────────────────────────────────────────
+def _c(code: str, s: str, color: bool) -> str:
+    return f"\033[{code}m{s}\033[0m" if color else s
+
+
+def render(err: RigError, *, color: bool = True) -> str:
+    """Render a :class:`RigError` as the consistent 3-part block (what / why / fix).
+
+    Always shows the WHAT (prefixed ``error:``); shows WHY and FIX only when populated, so a
+    terse error doesn't print empty labels. The label words ``why``/``fix`` always appear when
+    their field is set — the contract the CLI handler and tests rely on.
+    """
+    lines = [_c("31", f"error: {err.what}", color)]
+    if err.why:
+        lines.append(_c("2", "  why: ", color) + err.why)
+    if err.fix:
+        lines.append(_c("32", "  fix: ", color) + err.fix)
+    return "\n".join(lines)
+
+
+def guard(fn) -> int:
+    """Run ``fn`` and translate any :class:`RigError` into render() + its exit code.
+
+    The single top-level CLI handler: a command body raises a structured error and this turns
+    it into a consistent printed block + the stable per-class exit code. A non-RigError
+    (a real bug) is NOT swallowed — it propagates so the stack trace is visible.
+    """
+    try:
+        return fn()
+    except RigError as exc:
+        import sys
+
+        print(render(exc, color=sys.stdout.isatty()))
+        return exc.exit_code
+
+
+# ── did-you-mean (Levenshtein) ──────────────────────────────────────────────────────
+def _levenshtein(a: str, b: str) -> int:
+    """Classic edit distance, stdlib-only (small strings — item names — so O(n*m) is fine)."""
+    if a == b:
+        return 0
+    if not a:
+        return len(b)
+    if not b:
+        return len(a)
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        cur = [i]
+        for j, cb in enumerate(b, 1):
+            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
+        prev = cur
+    return prev[-1]
+
+
+def did_you_mean(bad: str, candidates: set[str]) -> str | None:
+    """The nearest catalog name to ``bad`` within a sensible edit-distance threshold.
+
+    Returns ``None`` when the catalog is empty or nothing is close enough — so we never
+    fabricate a bogus suggestion for a wildly different token. The threshold scales with the
+    typo length (a longer name tolerates more edits) but is capped so "zzzzzzzz" → "review"
+    is rejected.
+    """
+    if not candidates:
+        return None
+    best = min(candidates, key=lambda c: (_levenshtein(bad, c), c))
+    dist = _levenshtein(bad, best)
+    # allow up to ~40% of the longer string's length, capped at 3 — enough for a real typo,
+    # tight enough to reject an unrelated word.
+    threshold = min(3, max(1, round(0.4 * max(len(bad), len(best)))))
+    return best if dist <= threshold else None
+
+
+# ── removed / deprecated slot registry ──────────────────────────────────────────────
+@dataclass(frozen=True)
+class RemovedSlot:
+    """A catalog slot that USED to exist and was removed — so its config key is now invalid.
+
+    ``reason`` names WHY/WHEN it went (the PR + the rationale); the error builder turns it
+    into a precise "remove ``<key>`` from ``<config path>``" fix instead of a useless
+    "unknown item (known: none)".
+    """
+
+    category: str
+    name: str
+    reason: str
+
+
+# Seeded with the motivating case. Keyed (category, name). Add an entry whenever a catalog
+# slot is removed so a lingering config reference explains itself instead of looking like a typo.
+_REMOVED_SLOTS: dict[tuple[str, str], RemovedSlot] = {
+    ("mcp", "review"): RemovedSlot(
+        category="mcp",
+        name="review",
+        reason="removed in agent-tools #32: review is a CLI + skill, not an MCP server "
+        "(the `review --mcp` entrypoint was dropped in the subcommand refactor)",
+    ),
+}
+
+
+def removed_slot(category: str, name: str) -> RemovedSlot | None:
+    """Look up a (category, name) in the removed-slot registry; ``None`` if it was never a slot."""
+    return _REMOVED_SLOTS.get((category, name))
+
+
+# ── error builders (the heuristics, assembled into structured errors) ───────────────
+def unknown_item_error(
+    *,
+    category: str,
+    key: str,
+    bad: str,
+    known: set[str],
+    config_path: str,
+) -> UnknownItemError:
+    """Build the error for a config that names a non-existent catalog item.
+
+    Priority of explanation (most specific first):
+      1. **removed slot** — the name was a real slot that got removed: cite the PR + tell the
+         user to remove ``<key>`` from ``<config path>``.
+      2. **empty category** — the catalog has NO slots in this category: tell them to remove the
+         whole ``<category>`` block, NOT "known: none".
+      3. **did-you-mean** — the catalog has slots and one is close: suggest it.
+      4. **fallthrough** — list the known names so they can pick a valid one.
+
+    ``key`` is the dotted config key (``mcp.items.review``), ``bad`` the bare item name
+    (``review``), ``config_path`` the offending file — all three appear in the output.
+    """
+    removed = removed_slot(category, bad)
+    if removed is not None:
+        return UnknownItemError(
+            what=f"removed {category} slot: {bad}",
+            why=f"`{key}` ({removed.reason}); declared in {config_path}",
+            fix=f"remove `{key}` from {config_path}",
+        )
+
+    if not known:
+        return UnknownItemError(
+            what=f"unknown {category} item: {bad}",
+            why=f"the {category} catalog has no slots (declared in {config_path})",
+            fix=f"remove the `{category}` block from {config_path} — there are no {category} "
+            f"slots to enable",
+        )
+
+    suggestion = did_you_mean(bad, known)
+    if suggestion is not None:
+        return UnknownItemError(
+            what=f"unknown {category} item: {bad}",
+            why=f"`{key}` names an item not in the {category} catalog (declared in {config_path})",
+            fix=f"did you mean `{suggestion}`? fix `{key}` in {config_path}",
+        )
+
+    known_list = ", ".join(sorted(known))
+    return UnknownItemError(
+        what=f"unknown {category} item: {bad}",
+        why=f"`{key}` names an item not in the {category} catalog (declared in {config_path})",
+        fix=f"use one of: {known_list} — or remove `{key}` from {config_path}",
+    )
+
+
+def missing_target_error(
+    *,
+    what_kind: str,
+    target: str,
+    why: str,
+    regen: str,
+) -> MissingTargetError:
+    """Build the error for a config that points at a path/binary that's gone on disk.
+
+    ``what_kind`` is a short noun ("hook", "binary", "skill"); ``target`` the missing path;
+    ``why`` the root cause; ``regen`` how to recreate it (a concrete command).
+    """
+    return MissingTargetError(
+        what=f"missing {what_kind}: {target}",
+        why=why,
+        fix=regen,
+    )