diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de6f684..b99a374 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,28 @@ jobs: - name: Smoke run: bash tests/smoke.sh + # The REAL tmux reboot-cycle e2e (apply → execute the boot script → assert a server comes up + # with config + a session; a fake claude child → non-empty cc-map; a real resurrect snapshot; + # old-boot cleanup). Opt-in (RIG_TMUX_E2E=1) + network (clones the real plugins), so it lives in + # its OWN job — the hermetic `pytest -q` above stays offline-safe. Auto-skips if tmux/git/network + # is unavailable (never a false block); proves the reboot fix on every PR when they ARE present. + tmux-e2e: + name: tmux reboot-cycle e2e + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install tmux + run: sudo apt-get update && sudo apt-get install -y tmux + - name: Install package + test deps + run: pip install -e '.[test]' pyyaml + - name: Real tmux e2e (opt-in) + env: + RIG_TMUX_E2E: "1" + run: python -m pytest tests/test_tmux_e2e.py -q -rs + secret-scan: # Mirrors the agent-tools secret-scan gate (gitleaks, pinned). Block tier: a # high-confidence finding fails the job. diff --git a/AGENTS.md b/AGENTS.md index 20cfc6f..df8117c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -46,12 +46,19 @@ decided by TTY + config + flags. `init` is the canonical onboarding command (the - **Agent-hook `cmd` is always written absolute.** The `agents-hooks/v1` runner rejects relative paths; the install action rewrites the `/ABSOLUTE/PATH/TO/...` placeholder to the real script path in the agent-tools checkout. -- **Never mutate a LIVE running service.** rig prepares on-disk artifacts; the user reloads. 
- The `tmux` block writes `rig.tmux.conf` + the managed scripts + a boot launchd plist and - wires `~/.tmux.conf`, but NEVER runs `tmux source-file` against the user's live server and - NEVER `launchctl load`s the boot plist (that would disrupt an active session). The `models` - schedule is the one exception (a non-interactive cron is safe to (re)load). Migration backs - up the original (`~/.tmux.conf.rig-bak`) and never overwrites an existing backup. +- **Never mutate a LIVE running service in a way that disrupts an active session.** rig prepares + on-disk artifacts; the user reloads their config. The `tmux` block writes `rig.tmux.conf` + the + managed scripts + the boot script + a boot launchd plist and wires `~/.tmux.conf`, but NEVER + runs `tmux source-file` against the user's live server (that would re-apply config under their + feet). **The tmux LIVE ACTIVATION is the deliberate exception** (a clean machine must end up + FULLY working with zero manual steps, CTO 2026-06-16): on `rig apply` rig also clones the + plugins, creates `~/.tmux/resurrect`, `launchctl load -w`s the BOOT agent, takes a first + `resurrect save`, and cleans continuum's stale boot. These are SAFE for an active session — the + boot agent's script is idempotent (`has-session` → exit 0, never spawns a duplicate or touches + existing panes), and a first `resurrect save` is read-only w.r.t. the live session. It mirrors + the `models` schedule exception (a non-interactive launchd agent is safe to (re)load). Gate the + whole activation behind `RIG_TMUX_DRY_RUN` (the unit suite + CI set it). Migration backs up the + original (`~/.tmux.conf.rig-bak-<timestamp>`, timestamped) and never overwrites an existing backup. ## The integration seam (agent-tools) @@ -64,11 +71,18 @@ should hard-code agent-tools paths. - `python -m pytest -q` — the unit suite. 
Fast, hermetic; uses a fake agent-tools checkout (`tests/conftest.py::fake_agent_tools`) and `tmp_path` — tests never touch the real HOME - or a real agent-tools checkout. + or a real agent-tools checkout. The autouse guards `RIG_TMUX_DRY_RUN=1` / + `_isolate_scheduler` keep the tmux live-activation + the scheduler out of the suite. +- `RIG_TMUX_E2E=1 python -m pytest -q tests/test_tmux_e2e.py` — the **opt-in** real-tmux e2e + (the acceptance gate for the tmux reboot cycle: it drives a REAL tmux server on a private + `-L` socket and clones the real plugins, so it needs tmux + git + network). It is OFF in the + default `pytest` run to keep that hermetic; the tmux BFS / artifact logic it proves is ALSO + covered hermetically by the unit suite (`test_pane_has_claude_*` etc.). Auto-skips offline. - `bash tests/smoke.sh` — end-to-end: `--help`, `doctor`, a headless `init` against a sample config in a throwaway repo with an isolated `HOME`, idempotency, status, pytest. Needs a real agent-tools checkout (`RIG_AGENT_TOOLS_SOURCE`); self-skips the apply leg - without one. + without one. The init leg sets `RIG_TMUX_DRY_RUN=1` so the tmux artifacts land without the + live activation. - Add a test with every behavior change. TDD red-first is the house style. 
## Style diff --git a/docs/config-schema.md b/docs/config-schema.md index 29bacd7..885aaf4 100644 --- a/docs/config-schema.md +++ b/docs/config-schema.md @@ -486,6 +486,9 @@ tmux: session: main # the canonical session name boot: enabled: true # a launchd agent that brings tmux up after a macOS reboot + login_shell: + enabled: true # restored panes are LOGIN shells (so ~/.zprofile/PATH is sourced) + shell: "" # "" → resolve the user's $SHELL at apply; else an absolute path ``` | Key | Type | Default | Meaning | @@ -503,7 +506,10 @@ tmux: | `cc_restore.enabled` | bool | `true` | wire cc-save/cc-restore via resurrect post-save/post-restore hooks | | `anti_sprawl.enabled` | bool | `true` | install the attach-or-create entry script | | `anti_sprawl.session` | str | `main` | the one canonical session name | -| `boot.enabled` | bool | `true` | write a launchd agent (macOS) that starts tmux after a reboot | +| `boot.enabled` | bool | `true` | write a launchd agent (macOS) that runs the boot script after a reboot, and `launchctl load -w` it on apply | +| `boot.label` | str | `ai.hyperide.tmux-boot` | the launchd agent label (and plist filename stem) | +| `login_shell.enabled` | bool | `true` | set a **login-shell** `default-command` so restored panes source `~/.zprofile`/PATH (resurrect otherwise restores a non-login shell with a broken env) | +| `login_shell.shell` | str | `""` | login shell path. `""` resolves the user's `$SHELL` at apply (falling back to `/bin/zsh` then `/bin/sh`); a non-empty override **must be an absolute path** to the shell binary (a relative name or a command-with-args is rejected, so it can't silently produce a broken `default-command`) and is used verbatim. The path is **baked at generation** — NOT a tmux `${SHELL}` reference, because tmux rejects `${VAR:-default}` and would abort the whole config | **Apply mechanism — import-preferred, managed-block fallback.** @@ -537,14 +543,35 @@ a weeks-stale session. 
rig's generator pins the order: plugin options → cc-res **Moshi tweak (opt-in, BEFORE continuum init)** → resurrect init → **continuum init LAST** → tpm init last-of-all. So the Moshi tweak can never wipe continuum's hook again. +**Boot + live activation (clean machine → fully working, zero manual steps).** A `rig apply` +with `boot.enabled` writes a launchd agent whose entrypoint is the generated **boot script** +(`tmux-boot.sh`), then **`launchctl load -w`**s it so it fires at login across reboots. The boot +script runs `tmux new-session -d` (NOT `tmux start-server`): a bare `start-server` starts an +**empty** server that loads neither the config nor any plugin (tmux sources the conf only on the +first session), so `@continuum-restore` never fires — `tmux ls` says "no server running" after +login. Creating a session loads `~/.tmux.conf` → the sourced `rig.tmux.conf` → continuum → +restore. The boot script is idempotent (`has-session` → exit 0), so a warm login never spawns a +duplicate. On the same apply rig also: creates `~/.tmux/resurrect` (absent → resurrect writes no +snapshot → nothing to restore); **clones** `tpm` + `tmux-resurrect` + `tmux-continuum` into +`~/.tmux/plugins` if missing (so the `@plugin` decls resolve on a clean machine); takes a first +`resurrect save` (so a reboot has something to restore); and on macOS **cleans continuum's own +stale boot** (its `osx_iterm/terminal_start_tmux.sh` Login Items + an old `Tmux.Start` launchd +agent) that would otherwise compete with rig's boot agent. Every step is idempotent and non-fatal +(an offline machine just skips the clone and retries on the next apply). Set `RIG_TMUX_DRY_RUN=1` +to write the on-disk artifacts but skip all live activation (CI / containers). 
+ **cc-restore — per-window Claude Code resume by exact session id.** rig installs two managed scripts and wires them via `@resurrect-hook-post-save-all` / `@resurrect-hook-post-restore-all`: -- **`cc-save.sh`** — for every pane whose command is `claude`, take its cwd, find the **newest** - session id under `~/.claude/projects/<encoded-cwd>/`, and write a `window/pane → cwd → - session_id` map. **Encoding (verified against real on-disk dirs):** the projects-dir name is - the cwd with **every `/` and `.` replaced by `-`** (e.g. `/Users/u/.files` → - `-Users-u--files`). +- **`cc-save.sh`** — for every pane whose **process tree** contains a `claude` process, take its + cwd, find the **newest** session id under `~/.claude/projects/<encoded-cwd>/`, and write a + `window/pane → cwd → session_id` map. **Detection is by the process TREE, not the command + string:** Claude Code shows up in `pane_current_command` as its VERSION (e.g. `2.1.178`), and + the real `claude` process is a CHILD of the pane's shell — so cc-save walks the pane's + descendants (`ps -eo pid,ppid,comm`) for a process whose command is `claude`. (Filtering on + `pane_current_command == claude` matched nothing → an empty map → cc never resumed.) + **Encoding (verified against real on-disk dirs):** the projects-dir name is the cwd with + **every `/` and `.` replaced by `-`** (e.g. `/Users/u/.files` → `-Users-u--files`). - **`cc-restore.sh`** — after a reboot, for each mapped window run `claude --resume <session-id>` — **only into a fresh shell pane** (never on top of a running `claude`). A stale/missing id falls back to `claude --continue` (most-recent session in that cwd) so a reboot is never left @@ -572,17 +599,19 @@ user's shell rc): `[ -z "$TMUX" ] && exec ~/.config/rig/tmux/tmux-attach.sh`. On there is **one** canonical tmux path — the rumored second wrapper (`ln`/`.ln.conf`) does not exist here (`/bin/ln` is coreutils; no `~/.ln.conf`), so there is nothing to reconcile against. -**boot.** rig's launchd agent is the **single** boot path. 
It writes -`~/Library/LaunchAgents/ai.hyperide.tmux-boot.plist` (`RunAtLoad`) that runs `tmux start-server` -at login; `@continuum-restore 'on'` then restores the saved session into it — less iTerm-coupled -than the old `osx_iterm_start_tmux.sh` approach. **rig deliberately keeps `@continuum-boot 'off'` -in the generated config:** `@continuum-boot 'on'` would make tmux-continuum install its OWN, -untracked boot artifact (the iTerm-coupled `Tmux.Start.plist` on macOS / a systemd user unit on -Linux) — a second, competing boot path rig can't manage. So continuum handles *restore*, rig's -launchd agent handles *boot*. **rig writes the plist but does NOT `launchctl load` it,** and never -runs `tmux source-file` against the user's **live** server — the on-disk result is prepared; the -**user reloads / reboots** when ready (so an active session is never disrupted). The boot path can -only be fully proven by an actual reboot. +**boot.** rig's launchd agent is the **single** boot path; the mechanics are in the +"Boot + live activation" section above — in short: the agent's `RunAtLoad` plist runs the +generated **boot script** (`tmux-boot.sh` → `tmux new-session -d`, NOT a bare `tmux start-server`, +which would start an empty server with no conf/plugins loaded), `@continuum-restore 'on'` restores +the saved session into it, and `rig apply` **`launchctl load -w`s** the agent so it fires at login. +**rig deliberately keeps `@continuum-boot 'off'`** in the generated config: `@continuum-boot 'on'` +would make tmux-continuum install its OWN, untracked boot artifact (the iTerm-coupled +`Tmux.Start.plist` on macOS / a systemd user unit on Linux) — a second, competing boot path; rig +also **cleans** that stale artifact on macOS. So continuum handles *restore*, rig's launchd agent +handles *boot*. 
rig never runs `tmux source-file` against the user's **live** server (the user +reloads their config when ready); the boot agent's script is idempotent (`has-session` → exit 0, +no duplicate, no pane touched), so loading it does not disrupt an active session. The +boot-from-cold path can only be fully proven by an actual reboot. --- diff --git a/rig.yaml b/rig.yaml index e573bf4..cae3fd8 100644 --- a/rig.yaml +++ b/rig.yaml @@ -102,4 +102,5 @@ harness: # moshi: { enabled: true } # opt-in; emitted BEFORE continuum init (root-cause fix) # cc_restore: { enabled: true } # per-window `claude --resume ` after a reboot # anti_sprawl: { enabled: true, session: main } # attach-or-create one canonical session -# boot: { enabled: true } # launchd agent brings tmux up after a macOS reboot +# boot: { enabled: true } # launchd agent brings tmux up after a macOS reboot (load -w on apply) +# login_shell: { enabled: true } # restored panes are login shells (~/.zprofile/PATH sourced) diff --git a/riglib/actions/runner.py b/riglib/actions/runner.py index 3b1edcd..9cc17c6 100644 --- a/riglib/actions/runner.py +++ b/riglib/actions/runner.py @@ -958,6 +958,18 @@ def _schedule_dry_run() -> bool: return os.environ.get("RIG_SCHEDULE_DRY_RUN", "").strip().lower() in ("1", "true", "yes") +def _tmux_dry_run() -> bool: + """Honor RIG_TMUX_DRY_RUN — write the tmux artifacts but DON'T run the LIVE activation. + + The live activation (clone tpm/resurrect/continuum, create ~/.tmux/resurrect, ``launchctl + load -w`` the boot agent, take a first ``resurrect save``, clean continuum's stale macOS + boot Login Items) is real network + daemon + ``tmux``-server access — unwanted in CI / + containers / the unit suite. With the flag set, the on-disk artifacts still land; the live + effects are skipped. Mirrors :func:`_schedule_dry_run`. 
+ """ + return os.environ.get("RIG_TMUX_DRY_RUN", "").strip().lower() in ("1", "true", "yes") + + def _do_provision_schedule(action: Action, on_conflict: str) -> ActionResult: """Install the daily model-freshness schedule IF MISSING (idempotent). @@ -1074,6 +1086,22 @@ def _launchctl_loaded(label: str) -> bool: return res.returncode == 0 +def _launchctl_load_enable(plist: Path) -> int: + """``launchctl load -w <plist>`` — load the agent AND enable it across reboots (``-w``). + + Separate from :func:`_launchctl` because ``-w`` is a FLAG that must be its own argv token + (``["launchctl", "load", "-w", <plist>]``), not folded into the verb. Used to actually FIRE + the tmux boot agent at login — rig previously wrote the plist but never loaded it (DEFECT 1). + """ + try: + res = subprocess.run( + ["launchctl", "load", "-w", str(plist)], capture_output=True, text=True, timeout=20 + ) + except (OSError, subprocess.SubprocessError): + return 1 + return res.returncode + + # ── git/gh shells (isolated for testability) ────────────────────────────────────── def _git_global(key: str) -> str | None: try: @@ -1130,6 +1158,7 @@ def tmux_plan_from_action(action: Action): cc_restore=dict(opts.get("cc_restore", {}) or {}), anti_sprawl=dict(opts.get("anti_sprawl", {}) or {}), boot=dict(opts.get("boot", {}) or {}), + login_shell=dict(opts.get("login_shell", {}) or {}), ) @@ -1206,8 +1235,12 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: ) # 2) the managed scripts (chmod +x) — cc-save/cc-restore always, the anti-sprawl - # attach-or-create entry when enabled. `managed_scripts()` is the ONE source apply and - # drift share, so they can't diverge on which scripts exist. 
+ # `boot_script_conflicted` records a DIFFERING boot script left untouched under skip: the plist + # the launchd agent runs points at THIS script, so loading the agent would run a stale boot + # script — suppress the load in that case too (review P1), like the plist/conf conflicts below. + boot_script_conflicted = False for path, body in plan.managed_scripts(): out = fsutil.write_file(path, body, on_conflict) if out.status == "error": @@ -1229,6 +1262,8 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: changed = True details.append(f"restored +x on {path.name}") if conflict_skip: + if path == plan.boot_script_path: + boot_script_conflicted = True # a pre-existing DIFFERING file at rig's script path was left untouched under # on_conflict=skip — but the generated config wires a resurrect hook at this path, so # resurrect would run the user's/stale file. SURFACE it in the detail, but do NOT set @@ -1240,6 +1275,11 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: ) - # 3) the boot launchd plist (macOS) — written, never loaded (the user reboots). + # 3) the boot launchd plist (macOS) — written here; the LIVE activation (step 5) loads it. + # `boot_plist_conflicted` records a DIFFERING plist left untouched under on_conflict=skip: the + # activation must then NOT `launchctl load -w` it (that would ENABLE a stale/unmanaged boot + # path despite skip semantics — codex finding). Same for a conflict-skipped ~/.tmux.conf below. + boot_plist_conflicted = False + boot_plist_changed = False # plist (re)written this apply → an already-loaded agent is reloaded. 
if plan.boot_enabled and sys.platform == "darwin": plan.boot_plist_path.parent.mkdir(parents=True, exist_ok=True) boot_out = fsutil.write_file(plan.boot_plist_path, plan.render_boot_plist(), on_conflict) @@ -1249,9 +1289,11 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: extra_backups.append(boot_out.backup) if boot_out.status != "skipped": changed = True + boot_plist_changed = True details.append(f"wrote boot plist {plan.boot_plist_path.name} (load on next login/reboot)") elif not boot_out.detail.startswith("identical"): # rig OWNS the boot plist — a DIFFERING one left untouched under skip is stale. + boot_plist_conflicted = True skipped_conflicts.append( f"{plan.boot_plist_path.name} differs and on_conflict=skip — NOT updated " f"(re-run with backup/overwrite to refresh the boot plist)" @@ -1272,6 +1314,7 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: details.append(f"backed up original → {backup_target.name}") changed = True + conf_conflicted = False desired_conf = _tmux_conf_with_managed( plan, existing, splice_managed_block, neutralize_inline_rig_lines ) @@ -1281,6 +1324,7 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: # which go through fsutil.write_file's skip path). A non-existent conf is always created # (there's nothing to conflict with). backup/overwrite both proceed to write. 
if on_conflict == "skip" and existing: + conf_conflicted = True details.append(f"~/.tmux.conf differs but on_conflict=skip — left unwired ({conf.name})") else: conf.parent.mkdir(parents=True, exist_ok=True) @@ -1288,6 +1332,29 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: changed = True details.append(f"wired {plan.apply_mode} into {conf.name}") + # 5) LIVE activation (DEFECTS 1/4/5/6) — make a CLEAN machine fully working with no manual + # steps: create ~/.tmux/resurrect, clone missing plugins, launchctl-load the boot agent, + # take a FIRST resurrect save (only if none exists), clean continuum's stale macOS boot. Each + # step is idempotent (skip-if-present) and runs even when the file-write path was a no-op (a + # deleted plugin must be re-cloned on re-apply). RIG_TMUX_DRY_RUN skips ALL of it (CI/unit). + # Returns (real changes, warnings): ONLY real changes mark `changed` (so a steady-state + # re-apply stays a no-op); warnings (failed clone/launchctl — offline) are surfaced but never + # inflate ApplyReport.changed (codex/opus idempotency finding). + # + # boot_load_safe gates the launchctl-load: if the boot plist, the BOOT SCRIPT it runs, OR the + # ~/.tmux.conf wiring was CONFLICT-skipped (left stale/unwired), loading the agent would enable + # a stale/unmanaged boot path despite skip semantics — so we suppress the load then (review + # findings). Plugins / the resurrect dir / the first save are still safe to run. + act_changes, act_warnings = _tmux_activate( + plan, + boot_load_safe=not (boot_plist_conflicted or boot_script_conflicted or conf_conflicted), + boot_plist_changed=boot_plist_changed, + ) + if act_changes: + changed = True + details.extend(act_changes) + skipped_conflicts.extend(act_warnings) + if not changed: # nothing was written. 
If a managed script was conflict-skipped, that is UNRESOLVED drift # (the hook points at an unmanaged file) — report it as `skipped` with the warning, NOT a @@ -1311,6 +1378,250 @@ def _do_provision_tmux(action: Action, on_conflict: str) -> ActionResult: return ActionResult(action, status, f"tmux/config: {'; '.join(details)}", headline) +def _tmux_activate( + plan, *, boot_load_safe: bool = True, boot_plist_changed: bool = False +) -> tuple[list[str], list[str]]: + """Bring the rig-managed tmux LIVE on this machine (DEFECTS 1/4/5/6). + + Returns ``(changes, warnings)``: ``changes`` are real mutations the run performed (the caller + marks the apply ``changed`` ONLY for these, so a steady-state re-apply with nothing to do is a + genuine no-op); ``warnings`` are non-fatal degradations (a failed clone / launchctl on an + offline machine) — surfaced to the operator but NEVER counted as a change (else every re-apply + would falsely report ``created`` — codex/opus idempotency finding). + + ``boot_load_safe`` (caller-supplied): when False — the boot plist / boot script / ``~/.tmux.conf`` + wiring was CONFLICT-skipped (left stale/unwired) — the launchctl-load is SUPPRESSED so we never + enable a stale/unmanaged boot path despite on_conflict=skip (review finding). The non-boot steps + (plugins / resurrect dir / first save) still run; they don't risk activating a stale boot. + + ``boot_plist_changed`` (caller-supplied): the boot plist was (re)written this apply. We load the + agent when it is NOT loaded; we only UNLOAD-then-reload an ALREADY-loaded agent when the plist + CHANGED. A steady-state re-apply (loaded + unchanged) does nothing — so we never restart the + agent every run (re-spawning a ``main`` session on the live server) and a transient load failure + can't disable a working unchanged agent (review findings). 
+ + Steps, each idempotent and non-fatal (a clean machine must end up FULLY working with zero + manual steps; a partial/offline machine degrades, never aborts the whole apply): + + 4) create ``~/.tmux/resurrect`` so resurrect can write its ``tmux_resurrect_*.txt`` + snapshot (absent dir = no snapshot ever written = nothing to restore on reboot). + 6) clone the canonical tmux plugins (tpm + resurrect + continuum) into ``~/.tmux/plugins`` + if MISSING (default branch, one-shot, never auto-upgraded — see tmux.PLUGINS' trust + contract), so the ``@plugin`` declarations actually resolve on a clean machine. + 1) on macOS, ``launchctl load -w`` the boot agent so it FIRES at login (rig used to write + the plist but never load it). The boot script itself is idempotent (``has-session`` → + exit 0), so loading it never disrupts an active session. + 5) on macOS, clean continuum's OWN stale boot (``osx_iterm/terminal_start_tmux.sh`` Login + Items + an old ``Tmux.Start`` launchd agent) that competes with rig's boot agent — gated + on ``boot.enabled`` (if the user opted OUT of rig boot, never nuke their own autostart). + 6b) take a FIRST ``resurrect save`` ONLY when no snapshot exists yet — so a re-apply never + re-saves (idempotency) and never clobbers a good snapshot with an empty/partial one. + + ``RIG_TMUX_DRY_RUN`` skips every live step (the file artifacts already landed in the caller). + """ + if _tmux_dry_run(): + return [], [] + + from .. import tmux as tmod + + changes: list[str] = [] + warnings: list[str] = [] + + # 4) the resurrect snapshot dir. + resurrect_dir = plan.home / ".tmux" / "resurrect" + if not resurrect_dir.is_dir(): + resurrect_dir.mkdir(parents=True, exist_ok=True) + changes.append("created ~/.tmux/resurrect") + + # 6) clone missing plugins (idempotent: skip a COMPLETE checkout that already exists). 
A + # partial dir from a prior failed clone is NOT treated as installed — it is removed and + # re-cloned, so the "offline retries next apply" contract holds (codex finding). + plugins_dir = plan.home / ".tmux" / "plugins" + plugins_dir.mkdir(parents=True, exist_ok=True) + for name, (repo, entrypoint) in tmod.PLUGINS.items(): + dest = plugins_dir / name + if (dest / entrypoint).exists(): + continue + if dest.exists(): + # a partial/broken checkout (entrypoint missing) — clear it so the clone retries clean. + shutil.rmtree(dest, ignore_errors=True) + rc = _git_clone(repo, dest) + if rc == 0: + changes.append(f"installed plugin {name}") + else: + # non-fatal (offline / no git): a WARNING, not a change — must not inflate `changed`. + shutil.rmtree(dest, ignore_errors=True) # drop any partial dir the failed clone left. + warnings.append(f"plugin {name} NOT installed (clone failed rc={rc} — offline?)") + + # 1) launchctl-load the boot agent (macOS) so it fires at login. `-w` enables it across reboots. + # Load ONLY when there is real work: + # - NOT loaded → load it (no unload first — nothing to unload; a steady-state re-apply where + # it's already loaded does NOTHING, so apply stays a no-op AND we never restart it every run + # / re-spawn a `main` session on the live server: review Low). + # - loaded BUT the plist was rewritten this apply (boot_plist_changed) → unload then load -w so + # launchd picks up the new definition (codex: a stale loaded job must be refreshed). + # We unload ONLY in the changed-plist branch, so a transient load failure can leave the agent + # off only when we deliberately refreshed a CHANGED plist (surfaced as a warning) — never for an + # unchanged steady-state agent (review Medium: unconditional unload could disable a working one). + # SUPPRESSED entirely when boot_load_safe is False — the plist / boot script / ~/.tmux.conf was + # conflict-skipped (stale/unwired), so loading would enable a stale boot path (review finding). 
+ # + # rig_boot_active tracks whether rig's REPLACEMENT boot agent is actually in place after this + # block — freshly loaded, or already-loaded-and-still-safe. The stale-boot cleanup (step 5) is + # gated on it: removing continuum's own autostart (Login Items / Tmux.Start) while rig has NOT + # got a working replacement loaded would leave the machine with NO tmux autostart at all on the + # next login (a conflict-skip / offline / launchctl-failure path). So we only clean once rig's + # boot is confirmed active (review finding). + rig_boot_active = False + if plan.boot_enabled and sys.platform == "darwin" and plan.boot_plist_path.is_file(): + if not boot_load_safe: + warnings.append( + "boot agent NOT loaded — the boot plist, boot script, or ~/.tmux.conf was " + "conflict-skipped (stale/unwired); re-run with on_conflict=backup/overwrite to load it" + ) + elif not _launchctl_loaded(plan.boot_label): + rc = _launchctl_load_enable(plan.boot_plist_path) + if rc != 0: + warnings.append(f"boot agent NOT loaded (launchctl rc={rc})") + else: + changes.append(f"launchctl load -w {plan.boot_plist_path.name}") + rig_boot_active = True + elif boot_plist_changed: + # refresh a CHANGED plist into the already-running agent: unload then load -w. + _launchctl("unload", str(plan.boot_plist_path)) + rc = _launchctl_load_enable(plan.boot_plist_path) + if rc != 0: + warnings.append( + f"boot agent reload FAILED (launchctl rc={rc}) — it may be left unloaded; " + f"re-run `rig apply` or `launchctl load -w {plan.boot_plist_path}`" + ) + else: + changes.append(f"reloaded boot agent {plan.boot_plist_path.name} (plist changed)") + rig_boot_active = True + else: + # already loaded + safe + unchanged → rig's boot is in place (steady-state re-apply). 
+ rig_boot_active = True + + # 5) clean continuum's stale macOS boot (Login Items + old Tmux.Start agent) — macOS only, only + # when rig owns boot (don't remove the user's own autostart if they opted out of rig boot), AND + # only when rig's REPLACEMENT boot is actually active (never strip the last autostart while our + # own replacement failed to load / was conflict-skipped — review finding). + if plan.boot_enabled and sys.platform == "darwin" and rig_boot_active: + if _clean_stale_continuum_boot(plan): + changes.append("cleaned stale continuum boot (Login Items / old Tmux.Start)") + + # 6b) take a FIRST resurrect save ONLY if no snapshot exists yet — so a re-apply is a no-op and + # an existing good snapshot is never clobbered by an empty/partial one (opus finding). + if not _resurrect_snapshot_exists(plan) and _tmux_resurrect_save(plan) == 0: + changes.append("took first resurrect save") + + return changes, warnings + + +def _resurrect_snapshot_exists(plan) -> bool: + """True if a resurrect snapshot already exists (so we DON'T re-take a first save). resurrect + writes ``tmux_resurrect_*.txt`` files plus a ``last`` symlink to the newest; either signals a + prior save.""" + resurrect_dir = plan.home / ".tmux" / "resurrect" + if not resurrect_dir.is_dir(): + return False + if (resurrect_dir / "last").exists(): + return True + return any(resurrect_dir.glob("tmux_resurrect_*.txt")) + + +def _git_clone(repo: str, dest: Path) -> int: + """Shallow-clone ``repo`` to ``dest``. 
Returns 0 on success, non-zero on any failure (so the + caller treats an offline/no-git machine as 'plugin not installed', never an apply abort).""" + if not shutil.which("git"): + return 127 + try: + res = subprocess.run( + ["git", "clone", "--depth", "1", repo, str(dest)], + capture_output=True, text=True, timeout=120, + ) + except (OSError, subprocess.SubprocessError): + return 1 + return res.returncode + + +def _tmux_resurrect_save(plan) -> int: + """Take a resurrect snapshot of the CURRENT live tmux server. Returns 0 on success, non-zero + otherwise. Non-fatal: a machine with no running server (or no resurrect plugin) just doesn't + get a save (a later apply, after the user starts tmux, retries). + + Saves the server ONLY IF one is ALREADY running — it does NOT start a server to save (opus + finding): booting a bare ``main`` session and immediately saving could snapshot an empty, + pre-restore state and clobber a good snapshot. resurrect ships a standalone ``scripts/save.sh`` + that writes the snapshot without a key-binding; we invoke it directly. Does NOT depend on the + boot script existing (so it still works when ``boot.enabled`` is false but a server is up — + the boot script is only written when boot is enabled). + """ + tmux_bin = shutil.which("tmux") + if not tmux_bin: + return 127 + save_script = plan.home / ".tmux" / "plugins" / "tmux-resurrect" / "scripts" / "save.sh" + if not save_script.is_file(): + return 1 + try: + # only save when a server is already running — never start one just to snapshot it. + # Probe with `list-sessions`, NOT `has-session`: a bare `has-session` (no `-t`) resolves a + # target session from $TMUX / the most-recent session, so OUTSIDE tmux (a launchd/cron/plain + # apply — exactly how rig runs) it can return non-zero even when a server IS up, silently + # skipping the first save. `list-sessions` exits 0 iff any server with a session is alive, + # regardless of attach context (1 + "no server running" otherwise). 
(review finding) + probe = subprocess.run( + [tmux_bin, "list-sessions"], capture_output=True, text=True, timeout=10 + ) + if probe.returncode != 0: + return 1 # no live server → nothing to save (a later apply retries when one is up). + res = subprocess.run( + ["bash", str(save_script)], capture_output=True, text=True, timeout=30, + ) + except (OSError, subprocess.SubprocessError): + return 1 + return res.returncode + + +def _clean_stale_continuum_boot(plan) -> bool: + """Disable/remove continuum's OWN macOS boot artifacts that compete with rig's boot agent + (DEFECT 5). Returns True if there WAS stale state and it was cleaned; False on a clean + machine (so a re-apply is a no-op — true idempotency, not "ran osx_disable.sh again"). + + continuum, when ``@continuum-boot on`` was ever set, installs an iTerm/Terminal-coupled boot: + - ``~/.tmux/plugins/tmux-continuum/scripts/…`` registers ``osx_iterm_start_tmux.sh`` / + ``osx_terminal_start_tmux.sh`` as macOS Login Items, AND + - an old ``Tmux.Start`` launchd agent (``~/Library/LaunchAgents/Tmux.Start.plist``). + Both fight rig's single launchd boot agent. The stale-boot SIGNAL is the old + ``Tmux.Start.plist`` (continuum writes it when its boot is enabled; rig never does). Only when + that signal is present do we (a) run continuum's documented ``osx_disable.sh`` to un-register + its Login Items, and (b) ``launchctl bootout`` + remove the old plist. A machine with no + Tmux.Start plist has no stale boot → nothing to do → return False. + """ + old_plist = plan.home / "Library" / "LaunchAgents" / "Tmux.Start.plist" + if not old_plist.is_file(): + return False # no stale continuum boot present — idempotent no-op. + + # un-register continuum's Login Items via its own documented disable script (if installed). 
+ osx_disable = ( + plan.home / ".tmux" / "plugins" / "tmux-continuum" / "scripts" / "osx_disable.sh" + ) + if osx_disable.is_file(): + try: + subprocess.run(["bash", str(osx_disable)], capture_output=True, text=True, timeout=20) + except (OSError, subprocess.SubprocessError): + pass + # bootout + unload + remove the old Tmux.Start launchd agent (continuum's iTerm boot). + uid = os.getuid() if hasattr(os, "getuid") else 0 + _launchctl("bootout", f"gui/{uid}/Tmux.Start") + _launchctl("unload", str(old_plist)) + try: + old_plist.unlink() + except OSError: + pass + return True + + def _tmux_conf_with_managed(plan, existing: str, splice, neutralize) -> str: """The desired ``~/.tmux.conf`` text for the plan's apply mode (pure). diff --git a/riglib/config.py b/riglib/config.py index b0e0424..cd91785 100644 --- a/riglib/config.py +++ b/riglib/config.py @@ -467,6 +467,7 @@ def _validate_github(gh: dict[str, Any]) -> None: "cc_restore", "anti_sprawl", "boot", + "login_shell", } _TMUX_SUBKEYS = { "resurrect": {"processes", "capture_pane_contents"}, @@ -475,6 +476,7 @@ def _validate_github(gh: dict[str, Any]) -> None: "cc_restore": {"enabled"}, "anti_sprawl": {"enabled", "session"}, "boot": {"enabled", "label"}, + "login_shell": {"enabled", "shell"}, } @@ -558,7 +560,7 @@ def _validate_tmux(t: dict[str, Any]) -> None: f"tmux.continuum.save_interval must be an int >= 1, got {interval!r}" ) - for sub in ("moshi", "cc_restore", "anti_sprawl", "boot"): + for sub in ("moshi", "cc_restore", "anti_sprawl", "boot", "login_shell"): block = t.get(sub, {}) if isinstance(block, dict): value = block.get("enabled") @@ -574,3 +576,18 @@ def _validate_tmux(t: dict[str, Any]) -> None: label = boot.get("label") if label is not None and not isinstance(label, str): raise ConfigError(f"tmux.boot.label must be a string, got {label!r}") + login = t.get("login_shell", {}) + if isinstance(login, dict): + shell = login.get("shell") + if shell is not None: + if not isinstance(shell, str): + raise 
ConfigError(f"tmux.login_shell.shell must be a string, got {shell!r}") + # Empty string → "resolve $SHELL at apply". A NON-empty override must be an ABSOLUTE + # path to the shell BINARY ONLY — no relative name, and NO arguments (whitespace). rig + # appends ` -l` itself; a value like `/bin/zsh -l` would render `'/bin/zsh -l' -l`, + # making tmux try to exec a binary literally named "/bin/zsh -l" (review P2). + if shell and (not shell.startswith("/") or any(c.isspace() for c in shell)): + raise ConfigError( + "tmux.login_shell.shell must be an absolute path to the shell BINARY with no " + f"arguments (rig adds `-l`), or empty to use $SHELL — got {shell!r}" + ) diff --git a/riglib/drift.py b/riglib/drift.py index bf60fac..d6ced13 100644 --- a/riglib/drift.py +++ b/riglib/drift.py @@ -25,6 +25,7 @@ _git_global, _launchctl_loaded, _read_crontab, + _tmux_dry_run, build_hook_descriptor, crontab_with_managed, descriptor_text, @@ -706,6 +707,41 @@ def _check_tmux(action: Action, report: DriftReport) -> None: "(a later line can undo the generated ordering) — apply re-appends it") ) + # 5) live activation state (DEFECTS 4/6): the resurrect snapshot dir + the plugin checkouts. + # apply now MANAGES these; status must surface them so a clean machine's "no snapshot / no + # plugins" doesn't read as in-sync (codex finding). A missing/partial plugin or absent + # resurrect dir is `missing` drift apply reconciles. (launchd loaded-state is intentionally + # NOT checked here — `rig status` must stay read-only + offline, and a `launchctl list` probe + # is a live-daemon query; the plist content drift above already covers the agent definition.) + # + # RIG_TMUX_DRY_RUN suppresses the LIVE half of apply (no plugin clone, no resurrect dir), so + # status MUST suppress the matching live-state drift too — else status would report drift apply + # is deliberately not converging (apply/status would disagree, and `rig status` could never read + # in-sync under the flag, e.g. CI/smoke). 
The file artifacts (sections 1-4) are written even + # under dry-run, so their drift stays checked above; only this live section is gated. Reuse the + # runner's flag helper so the dry-run truthiness has ONE definition across apply + status. + if _tmux_dry_run(): + return + + from .tmux import PLUGINS + + resurrect_dir = plan.home / ".tmux" / "resurrect" + if not resurrect_dir.is_dir(): + report.items.append( + DriftItem("missing", "tmux", action.item, resurrect_dir, + "~/.tmux/resurrect missing (resurrect writes no snapshot → nothing to " + "restore on reboot) — apply creates it") + ) + plugins_dir = plan.home / ".tmux" / "plugins" + for name, (_repo, entry) in PLUGINS.items(): + dest = plugins_dir / name + if not (dest / entry).exists(): + report.items.append( + DriftItem("missing", "tmux", action.item, dest, + f"tmux plugin {name} not installed (the @plugin decl won't resolve) " + "— apply clones it") + ) + def _file_drift(report: DriftReport, action: Action, path: Path, desired: str, label: str) -> None: """Append a ``missing`` (absent) or ``modified`` (content differs) DriftItem for a rig file.""" diff --git a/riglib/plan.py b/riglib/plan.py index 91f5dba..495fe4e 100644 --- a/riglib/plan.py +++ b/riglib/plan.py @@ -809,6 +809,19 @@ def _build_tmux(config: LoadedConfig, plan: InstallPlan) -> None: # target is plan-resolved.) conf_path = _expand(str(t.get("conf_path", "~/.tmux.conf")), config.repo_root) generated_dir = _expand(str(t.get("generated_dir", "~/.config/rig/tmux")), config.repo_root) + + # RESOLVE the login shell to a CONCRETE path ONCE here (plan time), not per render. An empty + # `login_shell.shell` means "use the user's $SHELL"; resolving it at render would make + # rig.tmux.conf depend on $SHELL/FS at the moment of EACH render — so `rig apply` (one $SHELL) + # and `rig status` (launchd/cron/CI, different/empty $SHELL) would render DIFFERENT + # default-command lines → permanent flapping drift apply "fixes" every run (review Medium). 
By + # baking the path into the action options, render/drift are deterministic and idempotent. + login_shell = dict(t.get("login_shell", {}) or {}) + if login_shell.get("enabled", True) is not False and not login_shell.get("shell"): + from .tmux import resolve_login_shell + + login_shell["shell"] = resolve_login_shell() + plan.actions.append( Action( kind="provision_tmux", @@ -826,6 +839,7 @@ def _build_tmux(config: LoadedConfig, plan: InstallPlan) -> None: "cc_restore": dict(t.get("cc_restore", {}) or {}), "anti_sprawl": dict(t.get("anti_sprawl", {}) or {}), "boot": dict(t.get("boot", {}) or {}), + "login_shell": login_shell, }, ) ) diff --git a/riglib/tmux.py b/riglib/tmux.py index cd70f76..2f6c6ac 100644 --- a/riglib/tmux.py +++ b/riglib/tmux.py @@ -37,6 +37,7 @@ from __future__ import annotations +import os import shlex import shutil from dataclasses import dataclass @@ -62,6 +63,31 @@ # explicit so a re-apply pins it rather than relying on the plugin default drifting. DEFAULT_SAVE_INTERVAL = 15 +# DEFECT 6: the tmux plugins rig installs on a CLEAN machine (empty ~/.tmux/plugins). The +# `@plugin` declarations in rig.tmux.conf only RESOLVE once these are cloned; tpm itself reads +# them. ONE source of truth — ``{dir_name: (clone_url, real_entrypoint)}`` — consumed by the +# activation (clone-if-missing), the completeness check (is this a full checkout?), and drift, +# so they can NEVER disagree on the entrypoint. NB the entrypoint is the PLUGIN's own basename, +# NOT the repo dir name: tmux-resurrect ships ``resurrect.tmux`` (not ``tmux-resurrect.tmux``), +# tmux-continuum ships ``continuum.tmux``, tpm ships ``tpm``. The generated +# ``run-shell ~/.tmux/plugins/<dir_name>/<real_entrypoint>`` lines use these exact names, so the completeness +# check MUST match them or a REAL checkout is judged partial and re-cloned every apply (the +# entrypoint-name-drift bug review caught).
+# +# TRUST / UPDATE CONTRACT (explicit — these ARE the canonical upstream repos tpm itself clones): +# the activation does a one-shot ``git clone --depth 1`` of each repo's DEFAULT BRANCH the first +# time it is MISSING, and then NEVER touches it again (an existing complete checkout is left +# exactly as-is — the user owns plugin updates via tpm's own ``prefix + U``). rig deliberately +# does NOT pin a commit SHA: tpm's whole model is "clone the canonical plugin repos", a pinned SHA +# would silently rot and diverge from what every other tpm user runs, and the user can already pin +# via their own tooling. So the stored value is the repo URL (not ``url@sha``); the contract is +# "install the canonical plugin if absent, never auto-upgrade" — NOT "pin an exact ref". +PLUGINS = { + "tpm": ("https://github.com/tmux-plugins/tpm", "tpm"), + "tmux-resurrect": ("https://github.com/tmux-plugins/tmux-resurrect", "resurrect.tmux"), + "tmux-continuum": ("https://github.com/tmux-plugins/tmux-continuum", "continuum.tmux"), +} + # The canonical single-session name for the anti-sprawl attach-or-create entry. A reconnect # re-attaches THIS session instead of spawning a duplicate. DEFAULT_SESSION = "main" @@ -85,6 +111,11 @@ CC_MAP_NAME = "cc-sessions.map" ATTACH_NAME = "tmux-attach.sh" RIG_CONF_NAME = "rig.tmux.conf" +# The boot entrypoint the launchd agent runs. DEFECT 1: the agent must NOT run a bare +# `tmux start-server` (an EMPTY server — tmux loads ~/.tmux.conf only on the FIRST session, so +# continuum-restore never fires). This script does `tmux new-session -d` (which loads the conf +# → the sourced rig.tmux.conf → continuum → restore) so a cold boot actually comes up restored. +BOOT_NAME = "tmux-boot.sh" # The base backup suffix for the migrated ~/.tmux.conf. 
The runner writes a UNIQUE timestamped # backup (`.rig-bak-<timestamp>`) on every migrating apply (see runner._timestamped_backup_path), so a @@ -115,6 +146,8 @@ class TmuxPlan: anti_sprawl_session: str boot_enabled: bool boot_label: str + login_shell_enabled: bool + login_shell: str # concrete shell path (baked in by the plan); "" → render falls back to resolve_login_shell() # ── resolved artifact paths ────────────────────────────────────────────────────── @property @@ -137,9 +170,14 @@ def cc_map_path(self) -> Path: def attach_path(self) -> Path: return self.generated_dir / ATTACH_NAME + @property + def boot_script_path(self) -> Path: + return self.generated_dir / BOOT_NAME + def managed_scripts(self) -> list[tuple[Path, str]]: """The (path, body) pairs apply writes and drift checks — ONE source so they can't - diverge: cc-save + cc-restore always, the attach-or-create entry when anti-sprawl is on. + diverge: cc-save + cc-restore always, the attach-or-create entry when anti-sprawl is on, + the boot script when boot is on (the launchd agent's entrypoint — DEFECT 1). """ scripts = [ (self.cc_save_path, self.render_cc_save()), @@ -147,6 +185,8 @@ def managed_scripts(self) -> list[tuple[Path, str]]: ] if self.anti_sprawl_enabled: scripts.append((self.attach_path, self.render_attach_script())) + if self.boot_enabled: + scripts.append((self.boot_script_path, self.render_boot_script())) return scripts @property @@ -199,6 +239,34 @@ def render_rig_conf(self) -> str: out.append(f"set -g @resurrect-capture-pane-contents '{'on' if self.capture_pane_contents else 'off'}'") out.append(f"set -g @continuum-restore '{'on' if self.continuum_restore else 'off'}'") out.append(f"set -g @continuum-save-interval '{self.save_interval}'") + # DEFECT 3 (the reboot bug): resurrect restores panes with a NON-login shell + # (its `default-command ''`), so ~/.zprofile (PATH, etc.) is NOT sourced → restored panes + # have a broken env.
Set a LOGIN-shell default-command so every (new AND restored) pane + # sources the full login env. Default-on; configurable via tmux.login_shell. + # + # CRITICAL (live-cycle bug): the shell path is a CONCRETE path resolved at GENERATION time, + # NOT a tmux `${SHELL}` reference. tmux expands `${VAR}` itself in a double-quoted option + # value but does NOT support the `${VAR:-default}` bashism — `set -g default-command + # "${SHELL:-/bin/sh} -l"` makes tmux abort the WHOLE source-file with "invalid environment + # variable" at that line, so continuum/tpm/everything after it never loads → an empty, + # config-less server (caught only by the REAL e2e, never by a parse-check). So we bake the + # path. DETERMINISM: the plan resolves the shell ONCE (plan._build_tmux) and bakes the + # concrete path into the action — so render does NOT read $SHELL/FS here and rig.tmux.conf + # is identical across applies/status regardless of the ambient $SHELL (review Medium: a + # per-render resolve made drift flap). The `or resolve_login_shell()` is only a fallback + # for a direct build_tmux() with an empty shell (tests); the real path is plan-baked. + if self.login_shell_enabled: + shell = self.login_shell or resolve_login_shell() + # tmux's `default-command` takes ONE option value, which tmux then runs as a SHELL + # command (`sh -c "<value>"`). So the value must be a SINGLE tmux argument whose INNER + # text, when the shell parses it, is `<shell-path> -l`. We therefore: shell-quote the + # path (so a path with a space stays one argv for the inner shell), append ` -l`, and + # wrap the whole thing in tmux double-quotes so tmux sees it as one value. e.g. a plain + # path → set -g default-command "/bin/zsh -l"; a spaced path → + # set -g default-command "'/Apps/My Shell/zsh' -l". A bare path without metachars stays + # unquoted by shlex (readable common case); only a risky path gets the inner quotes.
+ command = f"{shlex.quote(shell)} -l" + out.append(f'set -g default-command "{command}"') # @continuum-boot is emitted ONLY as the on/off that matches rig's OWN boot mechanism. # CRITICAL: `@continuum-boot 'on'` makes continuum install its OWN unmanaged boot artifact # (the iTerm-coupled `Tmux.Start.plist` on macOS / a systemd user unit on Linux) — exactly @@ -267,6 +335,14 @@ def import_line(self) -> str: def render_cc_save(self) -> str: """cc-save: record each claude pane's cwd → newest Claude Code session id for that cwd. + DEFECT 2 (the reboot bug): Claude Code does NOT show up as ``claude`` in tmux's + ``pane_current_command`` — it shows as its VERSION string (e.g. ``2.1.178``), and the + REAL ``claude`` process is a CHILD of the pane's shell. Filtering on + ``pane_current_command == claude`` therefore matched NOTHING → the map stayed empty → cc + never resumed. So cc-save now walks the pane's process TREE: it takes ``pane_pid`` and + recursively descends children (``ps -eo pid,ppid,comm``) looking for a process whose + command basename is ``claude``. A pane with a ``claude`` descendant is a cc pane. + Encoding (VERIFIED on a real machine, see module/test docs): the projects dir name is the cwd with every ``/`` AND ``.`` replaced by ``-`` (so ``/Users/u/.files`` → ``-Users-u--files``). The newest ``*.jsonl`` (by mtime) under that dir is the latest @@ -282,9 +358,12 @@ def render_cc_save(self) -> str: projects = "$HOME/.claude/projects" return f"""#!/usr/bin/env bash # rig-managed: cc-save — GENERATED by rig from rig.yaml. Do not hand-edit. -# Records, for every tmux pane currently running `claude`, a map line: +# Records, for every tmux pane whose process TREE contains a `claude` process, a map line: # :. # so cc-restore can relaunch `claude --resume ` in the right window after a reboot. +# WHY a tree walk and not `pane_current_command == claude`: Claude Code shows up in +# `pane_current_command` as its VERSION (e.g. 
2.1.178); the real `claude` process is a CHILD of +# the pane's shell. So we descend the pane PID's children and match a process named `claude`. # Encoding: the ~/.claude/projects/ dir name is the pane cwd with every '/' and '.' # replaced by '-' (verified against real on-disk dirs). # Limitation: the session id is per-CWD (newest jsonl), not strictly per-pane — two claude @@ -312,14 +391,41 @@ def render_cc_save(self) -> str: basename "$newest" .jsonl }} +# Snapshot the whole process table ONCE (pid ppid comm) — walking the tree per pane against a +# live `ps` each time would race; one snapshot is consistent and cheap. `comm` is the basename +# tmux/ps report (`claude`), not the full argv, so a path like /opt/homebrew/bin/claude still +# reports `claude`. Stored as parallel maps pid->ppid and pid->comm for an O(depth) descent. +PS_SNAPSHOT=$(ps -eo pid=,ppid=,comm= 2>/dev/null || true) + +pane_has_claude() {{ + # BFS over the descendants of the pane's pid; return 0 if any descendant's command is `claude`. + local root="$1" + local -a queue=("$root") + local pid ppid comm cur + while [ "${{#queue[@]}}" -gt 0 ]; do + cur="${{queue[0]}}" + queue=("${{queue[@]:1}}") + # scan the snapshot for: (a) `cur`'s own command, and (b) `cur`'s direct children to enqueue. + while read -r pid ppid comm; do + [ -n "$pid" ] || continue + if [ "$pid" = "$cur" ]; then + case "$comm" in + claude|*/claude) return 0 ;; + esac + fi + if [ "$ppid" = "$cur" ]; then + queue+=("$pid") + fi + done <<< "$PS_SNAPSHOT" + done + return 1 +}} + : > "$MAP_FILE" -# iterate every pane; emit a map line only for panes whose command is `claude`. -tmux list-panes -a -F '#{{session_name}}:#{{window_index}}.#{{pane_index}} #{{pane_current_command}} #{{pane_current_path}}' \\ - | while IFS=$'\\t' read -r addr cmd cwd; do - case "$cmd" in - claude|*/claude) ;; - *) continue ;; - esac +# iterate every pane; emit a map line only for panes whose process tree contains `claude`. 
+tmux list-panes -a -F '#{{session_name}}:#{{window_index}}.#{{pane_index}} #{{pane_pid}} #{{pane_current_path}}' \\ + | while IFS=$'\\t' read -r addr pane_pid cwd; do + pane_has_claude "$pane_pid" || continue enc=$(encode_cwd "$cwd") sid=$(newest_session_id "$PROJECTS/$enc") || continue printf '%s\\t%s\\t%s\\n' "$addr" "$cwd" "$sid" >> "$MAP_FILE" @@ -405,31 +511,63 @@ def render_attach_script(self) -> str: else exec tmux new-session -s "$SESSION" fi +""" + + # ── the boot script (DEFECT 1: load the conf via a real session, then restore) ──── + def render_boot_script(self) -> str: + """The launchd agent's entrypoint: bring tmux up AT LOGIN with the config LOADED. + + DEFECT 1 (the reboot bug): the old plist ran ``tmux start-server`` directly, which starts + a server WITHOUT loading ~/.tmux.conf or any plugin (tmux sources the conf only on the + FIRST session), so ``@continuum-restore`` never fired → an EMPTY server → ``tmux ls`` said + "no server running" after login. This script instead creates a detached session + (``tmux new-session -d``), which DOES load the conf → the sourced ``rig.tmux.conf`` → + continuum's ``run-shell`` init → (with ``@continuum-restore on``) the saved session is + restored INTO the server. Idempotent: if the canonical session already exists (a warm + login), it does nothing rather than spawn a duplicate (anti-sprawl at boot). + """ + tmux_bin = _resolve_tmux_bin() + session = shlex.quote(self.anti_sprawl_session) + # A non-default conf must be passed via `-f` (tmux only auto-loads ~/.tmux.conf), else the + # boot session starts WITHOUT the managed config → continuum/resurrect never set → no + # restore. Default path needs no -f. shlex.quote the path (a HOME with a space). + f_arg = "" + if self.conf_path != self.home / ".tmux.conf": + f_arg = f" -f {shlex.quote(str(self.conf_path))}" + tmux_q = shlex.quote(tmux_bin) + return f"""#!/usr/bin/env bash +# rig-managed: tmux boot — GENERATED by rig from rig.yaml. Do not hand-edit. 
+# The launchd agent runs THIS at login. It creates a detached session (which loads ~/.tmux.conf +# → the sourced rig.tmux.conf → continuum), so `@continuum-restore on` restores the saved +# session into the server. Merely starting a bare server would NOT load the conf (it loads only +# on the first session) → continuum never fires → "no server running" after login. +set -euo pipefail +TMUX_BIN={tmux_q} +SESSION={session} +# already up (warm login) → do nothing (no duplicate session — anti-sprawl at boot). +if "$TMUX_BIN" has-session -t "$SESSION" 2>/dev/null; then + exit 0 +fi +# create the detached session — this loads the conf and triggers continuum-restore. +"$TMUX_BIN"{f_arg} new-session -d -s "$SESSION" """ # ── the boot launchd plist (macOS) ─────────────────────────────────────────────── def render_boot_plist(self) -> str: - """A launchd agent that brings tmux up at login so continuum can restore. + """A launchd agent that runs the boot SCRIPT at login so continuum can restore. - Less iTerm-coupled than the old ``osx_iterm_start_tmux.sh`` approach: it simply starts - a detached tmux server (``tmux start-server``) at load; continuum (``@continuum-boot - on`` + ``@continuum-restore on``) then restores the saved session into it. ``KeepAlive`` - is false — we only need it to fire once at login. + DEFECT 1: the plist's single program argument is the generated boot script (NOT a bare + ``tmux start-server`` — that starts an EMPTY server with no conf/plugins loaded, so + continuum-restore never fires). The script does ``tmux new-session -d`` to load the conf + and trigger the restore. ``KeepAlive`` is false — we only need it to fire once at login; + ``rig apply`` ``launchctl load -w``s it so it is enabled across reboots. """ - tmux_bin = _resolve_tmux_bin() # plistlib gives idiomatic, escape-safe XML (no hand-rolled string concat / injection). 
import plistlib - # Honor a non-default conf path: tmux only auto-loads ~/.tmux.conf, so a custom - # conf_path must be passed via `-f` or the login server starts WITHOUT the managed - # config (continuum/resurrect options never set → no restore). Default path needs no -f. - args = [tmux_bin] - if self.conf_path != self.home / ".tmux.conf": - args += ["-f", str(self.conf_path)] - args.append("start-server") payload = { "Label": self.boot_label, - "ProgramArguments": args, + "ProgramArguments": [str(self.boot_script_path)], "RunAtLoad": True, "KeepAlive": False, } @@ -452,6 +590,39 @@ def _resurrect_token(name: str) -> str: _TMUX_FALLBACK_PATHS = ("/opt/homebrew/bin/tmux", "/usr/local/bin/tmux", "/usr/bin/tmux") +def resolve_login_shell() -> str: + """The user's login shell as a CONCRETE path, resolved from a STABLE source. + + Baked into the action at PLAN time, NOT a tmux ``${SHELL}`` reference (tmux's option-value env + expansion rejects ``${VAR:-default}`` and a bare ``${SHELL}`` is fragile under launchd — a + wrong ref aborts the WHOLE source-file with "invalid environment variable", so continuum/tpm + never load). + + CRITICAL — resolve from the PASSWD DATABASE, not ``$SHELL`` (review P1): ``apply`` and + ``status`` each rebuild a FRESH plan, so resolving from the volatile ``$SHELL`` env var would + bake a DIFFERENT shell when the two run under different environments (``SHELL=/bin/bash rig + apply`` then ``SHELL=/usr/bin/fish rig status``, or a launchd/cron status with no $SHELL) → + permanent flapping drift. ``pwd.getpwuid(getuid()).pw_shell`` is the user's REAL login shell + from the system account database — IDENTICAL across every invocation regardless of the ambient + env. We use it first; only if it is unavailable/empty do we fall back to ``$SHELL``, then + ``/bin/zsh`` (macOS default), then ``/bin/sh`` (always present). 
+ """ + try: + import pwd + + passwd_shell = pwd.getpwuid(os.getuid()).pw_shell + if passwd_shell.startswith("/"): + return passwd_shell + except (KeyError, OSError, ImportError, AttributeError): + pass # no passwd entry / non-POSIX — fall through to the env/default chain. + env_shell = os.environ.get("SHELL", "") + if env_shell.startswith("/"): + return env_shell + if Path("/bin/zsh").exists(): + return "/bin/zsh" + return "/bin/sh" + + def _resolve_tmux_bin() -> str: """The tmux binary path for the boot plist: prefer PATH, else the first existing common location (Intel /usr/local, Apple-silicon /opt/homebrew, system /usr/bin) — never a blind @@ -478,6 +649,7 @@ def build_tmux( cc_restore: dict | None = None, anti_sprawl: dict | None = None, boot: dict | None = None, + login_shell: dict | None = None, ) -> TmuxPlan: """Resolve the desired :class:`TmuxPlan` from the (already-validated) tmux config block. @@ -485,7 +657,7 @@ def build_tmux( HOME-relative ``conf_path`` / ``generated_dir`` are expanded against it. Every nested knob defaults to the safe, root-cause-fixing value; an empty block yields the full default config (claude in resurrect, capture-pane on, continuum restore+boot, Moshi off, cc-restore - on, anti-sprawl on, boot on). + on, anti-sprawl on, boot on, login-shell default-command on). 
""" resurrect = resurrect or {} continuum = continuum or {} @@ -493,6 +665,7 @@ def build_tmux( cc_restore = cc_restore or {} anti_sprawl = anti_sprawl or {} boot = boot or {} + login_shell = login_shell or {} def _expand(p: str | Path) -> Path: s = str(p) @@ -531,6 +704,10 @@ def _knob(block: dict, key: str, default): anti_sprawl_session=str(_knob(anti_sprawl, "session", DEFAULT_SESSION)), boot_enabled=bool(_knob(boot, "enabled", True)), boot_label=str(_knob(boot, "label", DEFAULT_BOOT_LABEL)), + login_shell_enabled=bool(_knob(login_shell, "enabled", True)), + # An explicit shell override is used verbatim; "" means "resolve $SHELL in the shell at + # pane-spawn time" (the safe default — the login server inherits the user's $SHELL). + login_shell=str(_knob(login_shell, "shell", "")), ) diff --git a/tests/conftest.py b/tests/conftest.py index 3edcace..f375064 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -78,6 +78,22 @@ def _fake_write_crontab(contents): monkeypatch.setattr(runner, "_write_crontab", _fake_write_crontab) +@pytest.fixture(autouse=True) +def _isolate_tmux_activation(monkeypatch): + """Never let a test run the LIVE tmux activation (clone plugins / launchctl / first save). + + ``_do_provision_tmux`` now ALSO activates the rig-managed tmux on a clean machine: it clones + tpm/resurrect/continuum, creates ~/.tmux/resurrect, ``launchctl load -w``s the boot agent, + takes a first ``resurrect save``, and cleans continuum's stale macOS boot Login Items + (DEFECTS 1/4/5/6). Those are real network + daemon + ``tmux``-server effects. This guard sets + ``RIG_TMUX_DRY_RUN=1`` suite-wide so the file-write path still runs (and is asserted) while the + live activation is skipped. The DEDICATED activation tests + the REAL e2e clear/override this + (``monkeypatch.delenv`` or ``setenv(..., "0")``) and stub the seams, so they exercise the real + logic safely. Mirrors ``_isolate_scheduler``. 
+ """ + monkeypatch.setenv("RIG_TMUX_DRY_RUN", "1") + + @pytest.fixture def fake_agent_tools(tmp_path: Path) -> Path: """A minimal but structurally-valid agent-tools checkout.""" diff --git a/tests/smoke.sh b/tests/smoke.sh index be23cc9..5b5d012 100755 --- a/tests/smoke.sh +++ b/tests/smoke.sh @@ -34,6 +34,10 @@ else TMP="$(mktemp -d)" trap 'rm -rf "$TMP"' EXIT export HOME="$TMP/home"; mkdir -p "$HOME" + # exercise the tmux catalog area through the real CLI but NEVER touch the live machine: the + # dry-run guard writes the on-disk artifacts (config + boot script + plist) into the throwaway + # HOME while skipping every live step (no plugin clone, no `launchctl load`, no resurrect save). + export RIG_TMUX_DRY_RUN=1 git config --global user.email smoke@rig.test git config --global user.name rig-smoke ( cd "$TMP" && git init -q ) @@ -51,9 +55,15 @@ ci: items: secret-scan: { enabled: true, tier: block } ship: { enabled: true, install_to: ~/bin, gh_alias: false } -mcp: - items: - review: { enabled: true, command: "review --mcp" } +tmux: + enabled: true + apply: import + resurrect: { processes: [ssh, psql], capture_pane_contents: true } + continuum: { restore: true, save_interval: 15, boot: true } + cc_restore: { enabled: true } + anti_sprawl: { enabled: true, session: main } + boot: { enabled: true } + login_shell: { enabled: true } YAML # dry-run first (must write nothing) @@ -74,6 +84,17 @@ YAML [[ -f "$HOME/.claude/skills/$sk_name/SKILL.md" ]] || fail "harness skill link does not resolve" pass "rig init --yes installed skills + CI + dispatcher + harness skill links" + # tmux v2: the managed config + boot script (DEFECT 1) land on disk (dry-run skips only the + # LIVE steps — plugin clone / launchctl load — not the artifact writes). 
+ [[ -f "$HOME/.config/rig/tmux/rig.tmux.conf" ]] || fail "tmux: rig.tmux.conf not generated" + [[ -f "$HOME/.config/rig/tmux/tmux-boot.sh" ]] || fail "tmux: boot script (DEFECT 1) not generated" + grep -Eq '^[[:space:]]*[^#].*new-session -d' "$HOME/.config/rig/tmux/tmux-boot.sh" \ + || fail "tmux: boot script must use 'new-session -d', not a bare start-server (DEFECT 1)" + # require the real directive, not a comment that merely mentions it (codex finding) + grep -Eq '^[[:space:]]*set -g default-command' "$HOME/.config/rig/tmux/rig.tmux.conf" \ + || fail "tmux: login-shell 'set -g default-command' (DEFECT 3) not in generated config" + pass "rig init --yes generated tmux v2 config + boot script (login-shell, new-session -d)" + # idempotency: a second apply changes nothing (no created/updated/backed_up in summary) out="$($RIG apply -C "$TMP" --config "$TMP/rig.yaml" 2>&1)" summary="$(echo "$out" | grep '^Summary:' || true)" diff --git a/tests/test_tmux.py b/tests/test_tmux.py index f6e003d..f436d48 100644 --- a/tests/test_tmux.py +++ b/tests/test_tmux.py @@ -117,6 +117,38 @@ def test_tmux_boot_label_must_be_string(): validate({"version": 1, "tmux": {"boot": {"label": "com.me.tmux"}}}) +def test_tmux_login_shell_block_accepted(): + validate({"version": 1, "tmux": {"login_shell": {"enabled": True, "shell": "/bin/zsh"}}}) + validate({"version": 1, "tmux": {"login_shell": {"enabled": False}}}) + + +def test_tmux_login_shell_enabled_must_be_bool(): + with pytest.raises(ConfigError): + validate({"version": 1, "tmux": {"login_shell": {"enabled": "yes"}}}) + + +def test_tmux_login_shell_shell_must_be_string(): + with pytest.raises(ConfigError): + validate({"version": 1, "tmux": {"login_shell": {"shell": 123}}}) + + +def test_tmux_login_shell_shell_must_be_absolute_path(): + """A non-empty shell override must be an ABSOLUTE path to the BINARY with NO args (rig adds + `-l`). 
A relative name, OR an absolute path WITH args (`/bin/zsh -l` — the review-caught case + that passed `startswith('/')` but rendered `'/bin/zsh -l' -l`), is rejected.""" + for bad in ("zsh", "zsh -l", "bin/zsh", "/bin/zsh -l", "/bin/zsh --login", "/opt/My App/zsh"): + with pytest.raises(ConfigError): + validate({"version": 1, "tmux": {"login_shell": {"shell": bad}}}) + # empty (use $SHELL) and a bare absolute binary path are both fine. + validate({"version": 1, "tmux": {"login_shell": {"shell": ""}}}) + validate({"version": 1, "tmux": {"login_shell": {"shell": "/usr/bin/fish"}}}) + + +def test_tmux_login_shell_unknown_key_rejected(): + with pytest.raises(ConfigError): + validate({"version": 1, "tmux": {"login_shell": {"bogus": True}}}) + + def test_tmux_enabled_null_is_accepted_and_provisions(): """`enabled: null` (explicit None) is valid and, per the docs ('not false' provisions), must NOT be treated as disabled.""" @@ -215,6 +247,185 @@ def test_cc_save_avoids_ls_head_pipe(): assert any("listing" in ln and "newest=" in ln for ln in code_lines) +# ── DEFECT 2: cc-save must detect claude by the pane's PROCESS TREE, not the command string ── +def test_cc_save_walks_pane_process_tree_not_command_string(): + """DEFECT 2 (the reboot bug): cc shows up in `pane_current_command` as its VERSION string + (e.g. `2.1.178`), and the real `claude` process is a CHILD of the pane's shell. Filtering on + `pane_current_command == claude` therefore matched NOTHING → the map was always empty → cc + never resumed. cc-save must walk the pane's process TREE (pane_pid + descendants) for a + process whose command is `claude` / `*/claude`.""" + body = _plan().render_cc_save() + # it must enumerate the pane PID + its descendants (a tree walk), not just read the command. + assert "pane_pid" in body or "#{pane_pid}" in body + # the OLD broken filter on the command string must be GONE from the EXECUTABLE code (a + # comment may still reference it to explain WHY the tree walk replaced it). 
+ code_lines = [ln for ln in body.splitlines() if not ln.lstrip().startswith("#")] + assert not any("pane_current_command" in ln for ln in code_lines) + # a process-tree walk: descend children via ppid (ps -o pid,ppid or pgrep -P). + assert "ppid" in body or "pgrep -P" in body + + +def test_cc_save_matches_claude_basename_in_the_tree(): + """The matched descendant's command is `claude` or `*/claude` (an absolute path to the + binary), never a substring like `claudette` — match the basename.""" + body = _plan().render_cc_save() + assert "claude" in body + # it must record the pane→cwd→session-id map (the whole point) once a claude descendant is found. + assert "MAP_FILE" in body and ".jsonl" in body + + +def test_cc_save_still_records_cwd_and_session_id(): + """Detection changed (tree walk), but the RECORDED data is unchanged: pane addr, cwd, id.""" + body = _plan().render_cc_save() + assert "pane_current_path" in body or "#{pane_current_path}" in body + assert "encode_cwd" in body + assert "newest_session_id" in body + + +def _run_pane_has_claude(snapshot: str, root: str) -> int: + """Extract the REAL `pane_has_claude` BFS from the generated cc-save script and run it against + a SYNTHETIC ps snapshot — HERMETIC (no tmux/network/real processes), so the tree-walk logic is + covered even when the e2e is opted out (opus finding: the BFS was only exercised in the e2e). + Returns the function's exit code (0 = a `claude` descendant of `root` was found).""" + import shlex + import subprocess + import textwrap + + body = _plan().render_cc_save() + # pull out the pane_has_claude() function definition (from its `pane_has_claude() {` to the + # matching closing brace at column 0 — the script formats it that way). 
+ start = body.index("pane_has_claude() {") + end = body.index("\n}\n", start) + len("\n}\n") + fn = body[start:end] + script = textwrap.dedent(f"""\ + set -euo pipefail + PS_SNAPSHOT={shlex.quote(snapshot)} + {fn} + if pane_has_claude {shlex.quote(root)}; then exit 0; else exit 1; fi + """) + return subprocess.run(["bash", "-c", script], capture_output=True, text=True).returncode + + +def test_pane_has_claude_finds_a_direct_child(): + """The pane's direct child is `claude` → detected (the common case: cc is a child of the shell).""" + snap = "100 1 bash\n200 100 claude\n300 100 sleep\n" + assert _run_pane_has_claude(snap, "100") == 0 + + +def test_pane_has_claude_finds_a_deep_descendant(): + """A `claude` two levels down (shell → node → claude) is still found by the BFS.""" + snap = "100 1 bash\n200 100 node\n300 200 claude\n" + assert _run_pane_has_claude(snap, "100") == 0 + + +def test_pane_has_claude_matches_absolute_path_basename(): + """A descendant reported by its absolute path (`/opt/homebrew/bin/claude`) matches `*/claude`.""" + snap = "100 1 zsh\n200 100 /opt/homebrew/bin/claude\n" + assert _run_pane_has_claude(snap, "100") == 0 + + +def test_pane_has_claude_no_match_when_absent(): + """No `claude` anywhere in the tree → not found (exit non-zero).""" + snap = "100 1 bash\n200 100 vim\n300 100 less\n" + assert _run_pane_has_claude(snap, "100") != 0 + + +def test_pane_has_claude_does_not_match_substring(): + """A `claudette` process is NOT a `claude` match (basename equality, not substring).""" + snap = "100 1 bash\n200 100 claudette\n" + assert _run_pane_has_claude(snap, "100") != 0 + + +def test_pane_has_claude_ignores_claude_outside_the_subtree(): + """A `claude` that is NOT a descendant of the queried pane is ignored (no false positive).""" + snap = "100 1 bash\n200 100 sleep\n900 1 claude\n" # 900 is a sibling, not under 100 + assert _run_pane_has_claude(snap, "100") != 0 + + +# ── DEFECT 3: restored panes must be LOGIN shells (so ~/.zprofile / PATH is 
sourced) ───────── +def test_render_default_command_is_a_login_shell(): + """DEFECT 3 (the reboot bug): resurrect restores panes with a NON-login shell (resurrect's + `default-command ''`), so `~/.zprofile` (PATH etc.) is NOT sourced → restored panes have a + broken env. The generated config must set a login-shell default-command.""" + conf = _plan().render_rig_conf() + assert "set -g default-command" in conf + # a login shell: the user's $SHELL with -l (so ~/.zprofile / ~/.bash_profile is sourced). + assert "-l" in conf + line = next(ln for ln in conf.splitlines() if "default-command" in ln) + assert "SHELL" in line or "/bin/" in line + + +def test_login_shell_default_command_is_default_on(): + """The login-shell default-command defaults ON (the safe value); a plan with no login_shell + block still emits it.""" + assert "set -g default-command" in tmux.build_tmux(repo_home=Path("/home/u")).render_rig_conf() + + +def test_login_shell_can_be_disabled(): + """Configurable: `login_shell.enabled: false` omits the default-command (the user keeps + resurrect's default non-login behavior).""" + conf = tmux.build_tmux( + repo_home=Path("/home/u"), login_shell={"enabled": False} + ).render_rig_conf() + assert "set -g default-command" not in conf + + +def test_login_shell_honors_configured_shell(): + """An explicit `login_shell.shell` is used verbatim (e.g. 
a non-default shell binary).""" + conf = tmux.build_tmux( + repo_home=Path("/home/u"), login_shell={"enabled": True, "shell": "/usr/bin/fish"} + ).render_rig_conf() + line = next(ln for ln in conf.splitlines() if "default-command" in ln) + assert "/usr/bin/fish" in line and "-l" in line + + +def test_plan_resolves_login_shell_to_a_concrete_path(fake_agent_tools, tmp_path, monkeypatch): + """DETERMINISM (review): the plan resolves an EMPTY login_shell.shell to a CONCRETE absolute + path at plan time and bakes it — so render does NOT depend on $SHELL/FS at render time.""" + plan = _build({"tmux": {"enabled": True}}, tmp_path, fake_agent_tools) + a = next(act for act in plan.actions if act.kind == "provision_tmux") + baked = a.options["login_shell"]["shell"] + assert baked.startswith("/") and " " not in baked # a concrete absolute binary path + + +def test_login_shell_resolves_from_passwd_not_ambient_shell(monkeypatch): + """The resolver reads the PASSWD database (stable login shell), NOT the volatile $SHELL env — + so it is identical whatever $SHELL is set to (review P1: $SHELL-based resolve flapped).""" + import os + import pwd + + from riglib import tmux as tmod + + real = pwd.getpwuid(os.getuid()).pw_shell + if not real.startswith("/"): + pytest.skip("no absolute passwd shell on this host") + monkeypatch.setenv("SHELL", "/some/other/shell-XYZ") # $SHELL differs from passwd + assert tmod.resolve_login_shell() == real # passwd wins, $SHELL ignored + + +def test_login_shell_deterministic_across_separate_plans_under_different_shell( + fake_agent_tools, tmp_path, monkeypatch +): + """THE real apply→status path (review P1): `apply` and `status` each REBUILD a fresh plan. 
Two + independently-built plans under DIFFERENT $SHELL must bake the SAME shell, so a + `SHELL=/bin/bash rig apply` followed by `SHELL=/usr/bin/fish rig status` does NOT flap drift.""" + from riglib.actions.runner import tmux_plan_from_action + + monkeypatch.setattr(Path, "home", classmethod(lambda cls: tmp_path)) + monkeypatch.setenv("SHELL", "/bin/bash") + plan_apply = _build({"tmux": {"enabled": True}}, tmp_path, fake_agent_tools) + a_apply = next(act for act in plan_apply.actions if act.kind == "provision_tmux") + render_apply = tmux_plan_from_action(a_apply).render_rig_conf() + # a SEPARATE plan rebuild (as `rig status` does) under a DIFFERENT $SHELL. + monkeypatch.setenv("SHELL", "/usr/bin/fish") + plan_status = _build({"tmux": {"enabled": True}}, tmp_path, fake_agent_tools) + a_status = next(act for act in plan_status.actions if act.kind == "provision_tmux") + render_status = tmux_plan_from_action(a_status).render_rig_conf() + line_apply = next(ln for ln in render_apply.splitlines() if "default-command" in ln) + line_status = next(ln for ln in render_status.splitlines() if "default-command" in ln) + assert line_apply == line_status # identical despite the two different ambient $SHELLs + + def test_render_ordering_continuum_hook_is_last_plugin_init(): """THE root-cause guarantee: continuum's run-shell init comes AFTER the Moshi status-right tweak (and after resurrect), so the Moshi tweak can never wipe continuum's autosave hook. @@ -348,7 +559,22 @@ def test_boot_plist_is_well_formed_and_labelled(): assert parsed["Label"] == p.boot_label assert parsed["RunAtLoad"] is True assert parsed["KeepAlive"] is False - assert parsed["ProgramArguments"][-1] == "start-server" + # DEFECT 1: the plist must run the BOOT SCRIPT (which `new-session -d` loads the conf → + # continuum restores), NOT a bare `tmux start-server` (an EMPTY server that loads nothing). 
+ assert parsed["ProgramArguments"][-1] == str(p.boot_script_path) + assert parsed["ProgramArguments"][-1].endswith(tmux.BOOT_NAME) + + +def test_boot_plist_does_not_run_bare_start_server(): + """DEFECT 1 (the reboot bug): `tmux start-server` starts a server WITHOUT loading the conf + or plugins (tmux loads the conf only on the FIRST session), so continuum-restore never + fires → empty server. The plist must invoke the boot SCRIPT instead.""" + import plistlib + + args = plistlib.loads( + tmux.build_tmux(repo_home=Path("/home/u")).render_boot_plist().encode("utf-8") + )["ProgramArguments"] + assert "start-server" not in args def test_boot_label_is_configurable(): @@ -358,40 +584,59 @@ def test_boot_label_is_configurable(): assert p.boot_plist_path.name == "com.me.tmux.plist" -def test_boot_plist_tmux_bin_falls_back_to_existing_path(monkeypatch): - """When tmux isn't on PATH, the boot plist must point at an EXISTING common location, not a - blind Apple-silicon hard-code (codex P2).""" +def test_boot_plist_points_at_the_boot_script(): + """The plist's single program argument is the generated boot script — the one indirection + that lets the boot bring up a REAL session (loading the conf) instead of an empty server.""" import plistlib - monkeypatch.setattr(tmux.shutil, "which", lambda name: None) # not on PATH - # only the Intel path "exists" - monkeypatch.setattr(tmux.Path, "exists", lambda self: str(self) == "/usr/local/bin/tmux") p = tmux.build_tmux(repo_home=Path("/home/u")) - parsed = plistlib.loads(p.render_boot_plist().encode("utf-8")) - assert parsed["ProgramArguments"][0] == "/usr/local/bin/tmux" + args = plistlib.loads(p.render_boot_plist().encode("utf-8"))["ProgramArguments"] + assert args == [str(p.boot_script_path)] -def test_boot_plist_passes_f_for_custom_conf_path(): - """A non-default conf_path must reach the login server via `-f`, else it starts WITHOUT - the managed config (continuum/resurrect never set → no restore) (codex P2).""" - import plistlib 
+# ── boot script (DEFECT 1: new-session -d loads the conf → continuum restores) ────────────── +def test_boot_script_creates_a_session_to_load_the_conf(): + """The boot script must `tmux new-session -d` (which loads ~/.tmux.conf → the sourced + rig.tmux.conf → continuum), NOT `tmux start-server` (an empty server that loads nothing).""" + body = tmux.build_tmux(repo_home=Path("/home/u")).render_boot_script() + assert body.startswith("#!/") + assert "new-session -d" in body + assert "start-server" not in body - p = tmux.build_tmux(repo_home=Path("/home/u"), conf_path="~/.config/tmux/custom.conf") - args = plistlib.loads(p.render_boot_plist().encode("utf-8"))["ProgramArguments"] - assert "-f" in args - assert any(a.endswith("custom.conf") for a in args) - assert args[-1] == "start-server" +def test_boot_script_is_idempotent_attach_or_create(): + """A second login/boot must NOT spawn a duplicate session — the boot script creates the + canonical session only if it does not already exist (anti-sprawl at boot).""" + body = tmux.build_tmux( + repo_home=Path("/home/u"), anti_sprawl={"enabled": True, "session": "main"} + ).render_boot_script() + assert "has-session" in body + assert "main" in body -def test_boot_plist_omits_f_for_default_conf_path(): - """The default ~/.tmux.conf is auto-loaded by tmux, so no `-f` is emitted (keeps the - common-case plist minimal).""" - import plistlib - p = tmux.build_tmux(repo_home=Path("/home/u")) # default conf_path - args = plistlib.loads(p.render_boot_plist().encode("utf-8"))["ProgramArguments"] - assert "-f" not in args - assert args[-1] == "start-server" +def test_boot_script_passes_f_for_custom_conf_path(): + """A non-default conf_path must reach the boot session via `-f`, else the session starts + WITHOUT the managed config (continuum/resurrect never set → no restore).""" + body = tmux.build_tmux( + repo_home=Path("/home/u"), conf_path="~/.config/tmux/custom.conf" + ).render_boot_script() + assert "-f" in body + assert 
"custom.conf" in body + + +def test_boot_script_omits_f_for_default_conf_path(): + """The default ~/.tmux.conf is auto-loaded by tmux, so no `-f` is emitted.""" + body = tmux.build_tmux(repo_home=Path("/home/u")).render_boot_script() + assert " -f " not in body + + +def test_boot_script_tmux_bin_falls_back_to_existing_path(monkeypatch): + """When tmux isn't on PATH, the boot script must reference an EXISTING common location, not a + blind Apple-silicon hard-code (codex P2).""" + monkeypatch.setattr(tmux.shutil, "which", lambda name: None) # not on PATH + monkeypatch.setattr(tmux.Path, "exists", lambda self: str(self) == "/usr/local/bin/tmux") + body = tmux.build_tmux(repo_home=Path("/home/u")).render_boot_script() + assert "/usr/local/bin/tmux" in body # ── anti-sprawl attach-or-create ───────────────────────────────────────────────────────── @@ -665,13 +910,38 @@ def test_plan_resolves_relative_paths_against_repo_root(fake_agent_tools, tmp_pa def test_plan_carries_resolved_knobs(fake_agent_tools, tmp_path): plan = _build( {"tmux": {"enabled": True, "apply": "block", "moshi": {"enabled": True}, - "continuum": {"save_interval": 9}}}, + "continuum": {"save_interval": 9}, + "login_shell": {"enabled": False, "shell": "/bin/zsh"}}}, tmp_path, fake_agent_tools, ) a = next(a for a in plan.actions if a.kind == "provision_tmux") assert a.options["apply_mode"] == "block" assert a.options["moshi"]["enabled"] is True assert a.options["continuum"]["save_interval"] == 9 + assert a.options["login_shell"] == {"enabled": False, "shell": "/bin/zsh"} + + +def test_apply_login_shell_default_command_lands_in_generated_conf(tmp_path, monkeypatch): + """The login-shell default-command (DEFECT 3) reaches the generated rig.tmux.conf on apply.""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + runner._do_provision_tmux(_tmux_action(home), "backup") + conf = (home / ".config" / "rig" / "tmux" 
/ "rig.tmux.conf").read_text() + assert "set -g default-command" in conf and "-l" in conf + + +def test_apply_login_shell_disabled_omits_default_command(tmp_path, monkeypatch): + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + runner._do_provision_tmux(_tmux_action(home, login_shell={"enabled": False}), "backup") + conf = (home / ".config" / "rig" / "tmux" / "rig.tmux.conf").read_text() + assert "set -g default-command" not in conf # ── install (runner) — import mode, idempotent, migration + backup ─────────────────────── @@ -701,6 +971,19 @@ def _tmux_action(home, **over): ) +def _stage_live_tmux_state(home): + """Create the LIVE activation state (the resurrect dir + complete plugin checkouts) a real + apply would. Drift now checks these (codex finding), but the unit suite applies under + RIG_TMUX_DRY_RUN (no live activation), so an in-sync drift test must stage them itself — + otherwise drift correctly reports 'no plugins / no resurrect dir'. 
Uses the REAL plugin + entrypoints from tmux.PLUGINS (NOT `<name>.tmux` — resurrect ships `resurrect.tmux`).""" + (home / ".tmux" / "resurrect").mkdir(parents=True, exist_ok=True) + plugins = home / ".tmux" / "plugins" + for name, (_repo, entry) in tmux.PLUGINS.items(): + (plugins / name).mkdir(parents=True, exist_ok=True) + (plugins / name / entry).write_text("#!/usr/bin/env bash\n", encoding="utf-8") + + def test_apply_import_writes_generated_file_and_import_line(tmp_path, monkeypatch): from riglib.actions import runner @@ -1114,10 +1397,53 @@ def test_drift_tmux_in_sync_after_apply(tmp_path, monkeypatch): monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) a = _tmux_action(home) runner._do_provision_tmux(a, "backup") + _stage_live_tmux_state(home) # a real apply would have created plugins + resurrect dir report = detect(InstallPlan(actions=[a])) assert not [d for d in report.items if d.category == "tmux"] +def test_drift_tmux_live_state_missing(tmp_path, monkeypatch): + """DEFECTS 4/6 drift: a missing resurrect dir / plugin checkout is surfaced by `rig status` + so a clean machine doesn't read as in-sync while apply still has live work to do (codex + finding). Here the live state is NOT staged → drift reports both.""" + from riglib.actions import runner + from riglib.drift import detect + from riglib.plan import InstallPlan + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + a = _tmux_action(home) + runner._do_provision_tmux(a, "backup") # writes the config artifacts (dry-run: no live activation) + # status checks the LIVE state only on a real machine (under RIG_TMUX_DRY_RUN apply skips the + # live activation, so status correctly suppresses the matching live drift to stay consistent).
+ monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + drift = [d for d in detect(InstallPlan(actions=[a])).items if d.category == "tmux"] + assert any("resurrect" in d.detail and d.direction == "missing" for d in drift) + assert any("plugin" in d.detail and "tpm" in str(d.target) for d in drift) + + +def test_drift_tmux_live_state_suppressed_under_dry_run(tmp_path, monkeypatch): + """Under RIG_TMUX_DRY_RUN apply skips the LIVE activation (no plugin clone, no resurrect + dir), so status MUST suppress the matching live-state drift — else status would report drift + apply deliberately won't converge (apply/status disagree, and status could never read in-sync + under the flag, e.g. CI/smoke). The file-artifact drift stays checked; only the live half is + gated. (The autouse _isolate_tmux_activation fixture already sets the flag for this test.)""" + from riglib.actions import runner + from riglib.drift import detect + from riglib.plan import InstallPlan + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + a = _tmux_action(home) + runner._do_provision_tmux(a, "backup") # writes config artifacts; dry-run skips live activation + # the resurrect dir + plugin checkouts are absent (dry-run never created them), but with the + # flag set status must NOT flag them — only the live half is gated, file artifacts stay in-sync. 
+ drift = [d for d in detect(InstallPlan(actions=[a])).items if d.category == "tmux"] + assert not any("resurrect" in d.detail or "plugin" in d.detail for d in drift), drift + + def test_drift_tmux_modified_generated_file(tmp_path, monkeypatch): from riglib.actions import runner from riglib.drift import detect @@ -1201,6 +1527,7 @@ def test_drift_tmux_block_mode_missing_and_modified(tmp_path, monkeypatch): monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) a = _tmux_action(home, apply_mode="block") runner._do_provision_tmux(a, "backup") + _stage_live_tmux_state(home) # a real apply would have created plugins + resurrect dir # in sync right after apply assert not [d for d in detect(InstallPlan(actions=[a])).items if d.category == "tmux"] # tamper INSIDE the managed block → 'modified' (content compared, not just sentinels) @@ -1228,6 +1555,7 @@ def test_drift_tmux_boot_plist(tmp_path, monkeypatch): monkeypatch.setattr(_sys, "platform", "darwin") a = _tmux_action(home, boot={"enabled": True}) runner._do_provision_tmux(a, "backup") + _stage_live_tmux_state(home) # a real apply would have created plugins + resurrect dir assert not [d for d in detect(InstallPlan(actions=[a])).items if d.category == "tmux"] # delete the boot plist → drift surfaces it (it was previously unchecked). (home / "Library" / "LaunchAgents" / "ai.hyperide.tmux-boot.plist").unlink() @@ -1311,6 +1639,435 @@ def test_apply_surfaces_generated_file_backups(tmp_path, monkeypatch): assert "backups:" in res.detail +# ── DEFECTS 1/4/5/6: live activation (plugins, resurrect dir, first save, launchctl, cleanup) ── +def _activation_seams(monkeypatch, *, plugins_present=False, launchctl_loaded=False): + """Stub the live seams an activation touches and return a record of the calls made. 
+ + Records git clones, launchctl verbs, tmux `resurrect save` runs, and continuum boot-cleanup + runs — so a test can assert WHICH side effects an activation performed without any real + network / daemon / tmux-server access. + """ + from riglib.actions import runner + + rec = {"clones": [], "launchctl": [], "load_w": [], "saves": 0, "cleanups": 0} + + def _clone(repo, dest): + rec["clones"].append((repo, str(dest))) + Path(dest).mkdir(parents=True, exist_ok=True) + return 0 + + def _load_w(plist): + rec["load_w"].append(str(plist)) + return 0 + + def _cleanup(plan): + rec["cleanups"] += 1 + return True + + monkeypatch.setattr(runner, "_git_clone", _clone) + monkeypatch.setattr(runner, "_launchctl", lambda verb, arg: rec["launchctl"].append((verb, arg)) or 0) + monkeypatch.setattr(runner, "_launchctl_load_enable", _load_w) + monkeypatch.setattr(runner, "_launchctl_loaded", lambda label: launchctl_loaded) + monkeypatch.setattr(runner, "_tmux_resurrect_save", lambda plan: rec.update(saves=rec["saves"] + 1) or 0) + monkeypatch.setattr(runner, "_clean_stale_continuum_boot", _cleanup) + return rec + + +def test_activation_creates_resurrect_dir(tmp_path, monkeypatch): + """DEFECT 4: ~/.tmux/resurrect must EXIST after apply, else resurrect never writes a + snapshot (tmux_resurrect_*.txt) → nothing to restore on reboot.""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) # run the real activation + _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home), "backup") + assert (home / ".tmux" / "resurrect").is_dir() + + +def test_activation_clones_missing_plugins(tmp_path, monkeypatch): + """DEFECT 6: on a CLEAN machine ~/.tmux/plugins is empty, so the @plugin declarations don't + resolve. 
Activation must clone tpm + resurrect + continuum.""" + from riglib.actions import runner + from riglib import tmux as tmod + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home), "backup") + cloned = {repo for repo, _dest in rec["clones"]} + assert cloned == {url for url, _entry in tmod.PLUGINS.values()} + + +def test_activation_skips_already_present_plugins(tmp_path, monkeypatch): + """Idempotent: a COMPLETE plugin checkout that already exists is NOT re-cloned.""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + # pre-create a COMPLETE tpm checkout (its `tpm` entrypoint present) so it's "already installed". 
+ tpm = home / ".tmux" / "plugins" / "tpm" + tpm.mkdir(parents=True) + (tpm / "tpm").write_text("#!/usr/bin/env bash\n", encoding="utf-8") + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home), "backup") + cloned_dests = {dest for _repo, dest in rec["clones"]} + assert not any(d.endswith("/tpm") for d in cloned_dests) # tpm not re-cloned + assert any(d.endswith("/tmux-resurrect") for d in cloned_dests) # the missing ones still cloned + + +def test_activation_recloned_partial_plugin_dir(tmp_path, monkeypatch): + """A partial/broken plugin dir (from a failed clone) is NOT treated as installed — it is + cleared and re-cloned, so the 'offline retries next apply' contract holds (codex finding).""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + # a tpm dir that EXISTS but is missing its `tpm` entrypoint → partial → must be re-cloned. + partial = home / ".tmux" / "plugins" / "tpm" + partial.mkdir(parents=True) + (partial / "stray.txt").write_text("leftover from a failed clone\n", encoding="utf-8") + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home), "backup") + assert any(d.endswith("/tpm") for _r, d in rec["clones"]), "partial tpm was not re-cloned" + + +def test_activation_launchctl_loads_the_boot_agent_with_w(tmp_path, monkeypatch): + """DEFECT 1: `rig apply` must `launchctl load -w` the boot agent (it previously wrote the + plist but NEVER loaded it → the agent didn't fire). 
`-w` enables it across reboots.""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + # a `launchctl load -w <plist>` must have been issued for the boot plist (via the dedicated + # _launchctl_load_enable helper, which builds `launchctl load -w <plist>` with -w as its own + # token — see the helper's docstring for why it is separate from the 2-arg _launchctl). + plist = str(home / "Library" / "LaunchAgents" / "ai.hyperide.tmux-boot.plist") + assert rec["load_w"] == [plist], rec["load_w"] + + +def test_activation_takes_a_first_resurrect_save(tmp_path, monkeypatch): + """DEFECT 6: after apply, take a first `resurrect save` so there is a snapshot to restore.""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home), "backup") + assert rec["saves"] >= 1 + + +def test_activation_cleans_stale_continuum_boot(tmp_path, monkeypatch): + """DEFECT 5: continuum's own osx_iterm/terminal_start_tmux.sh register as macOS Login Items + and compete with rig's launchd agent.
Activation must clean them (osx_disable.sh / bootout).""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + assert rec["cleanups"] >= 1 + + +def test_activation_suppresses_boot_load_when_plist_conflict_skipped(tmp_path, monkeypatch): + """on_conflict=skip + a DIFFERING boot plist on disk: activation must NOT launchctl-load it + (loading a stale/unmanaged boot path despite skip semantics — codex finding).""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + # pre-seed a DIFFERING boot plist so on_conflict=skip leaves it stale (conflict-skipped). + la = home / "Library" / "LaunchAgents" + la.mkdir(parents=True) + (la / "ai.hyperide.tmux-boot.plist").write_text("STALE\n", encoding="utf-8") + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "skip") + assert rec["load_w"] == [] # the stale boot agent was NOT loaded + # AND the stale-continuum-boot cleanup must NOT run: rig has not loaded its replacement, so + # stripping continuum's own autostart now would leave NO tmux autostart at all (review finding). 
+ assert rec["cleanups"] == 0 + + +def test_activation_does_not_reload_already_loaded_unchanged_agent(tmp_path, monkeypatch): + """A steady-state re-apply (agent already loaded, plist unchanged) must NOT unload/reload the + boot agent — so it never restarts it / re-spawns `main` every apply, and a transient load + failure can't disable a working agent (review Medium/Low). The launchctl load is NOT called.""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch, launchctl_loaded=True) + # first apply writes the plist (changed) and would load; then everything is current + loaded. + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + rec["load_w"].clear() + rec["launchctl"].clear() + # second apply: plist unchanged + agent loaded → NO load, NO unload. + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + assert rec["load_w"] == [] # not (re)loaded + assert not any(v == "unload" for v, _a in rec["launchctl"]) # not unloaded + + +def test_activation_reloads_when_plist_changed_and_agent_loaded(tmp_path, monkeypatch): + """When the plist is REWRITTEN (e.g. 
an upgrade changes the boot script path) AND the agent is + already loaded, activation unloads + load -w's it so launchd picks up the new definition.""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch, launchctl_loaded=True) + # pre-seed a DIFFERING plist so this apply REWRITES it (boot_plist_changed) under backup. + la = home / "Library" / "LaunchAgents" + la.mkdir(parents=True) + (la / "ai.hyperide.tmux-boot.plist").write_text("OLD\n", encoding="utf-8") + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + plist = str(la / "ai.hyperide.tmux-boot.plist") + assert rec["load_w"] == [plist] # reloaded the changed plist + assert ("unload", plist) in rec["launchctl"] # unloaded first to refresh the definition + + +def test_failed_clone_warning_surfaced_on_changed_apply(tmp_path, monkeypatch): + """A clone failure on a FIRST (config-writing → changed) apply must still surface the + 'plugin NOT installed' warning in the result detail, not be swallowed (review Low-4).""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + _activation_seams(monkeypatch) + monkeypatch.setattr(runner, "_git_clone", lambda repo, dest: 1) # offline: every clone fails + # a FIRST apply (no config yet) → config is written → status is created/changed. 
+ res = runner._do_provision_tmux(_tmux_action(home), "backup") + assert res.status in ("created", "backed_up") + assert "NOT installed" in res.detail # the offline-clone warning reached the user + + +def test_activation_suppresses_boot_load_when_boot_script_conflict_skipped(tmp_path, monkeypatch): + """on_conflict=skip + a DIFFERING tmux-boot.sh on disk: activation must NOT launchctl-load the + agent (the plist runs THAT script, so loading would run a stale boot script — review P1).""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + # pre-seed a DIFFERING boot script so on_conflict=skip leaves it stale (conflict-skipped). + gen = home / ".config" / "rig" / "tmux" + gen.mkdir(parents=True) + (gen / "tmux-boot.sh").write_text("#!/usr/bin/env bash\necho STALE BOOT\n", encoding="utf-8") + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "skip") + assert rec["load_w"] == [] # the agent that runs the stale boot script was NOT loaded + + +def test_activation_does_not_clean_continuum_boot_when_rig_boot_disabled(tmp_path, monkeypatch): + """The stale-boot cleanup is gated on rig owning boot: if the user opted OUT of rig boot + (boot.enabled false) but relies on continuum's own autostart, activation must NOT nuke it + (opus finding).""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": 
False}), "backup") + assert rec["cleanups"] == 0 # boot disabled → the user's own autostart is left alone + + +def test_activation_re_apply_is_a_noop(tmp_path, monkeypatch): + """Idempotency (opus/codex finding): a second real activation with everything already present + (plugins complete, boot agent loaded, a snapshot on disk) makes NO changes → the apply is a + `skipped` no-op, never a spurious `created` on every run.""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + _activation_seams(monkeypatch, launchctl_loaded=True) # agent already loaded → no re-load change + # nothing stale to clean on a settled machine → cleanup is a no-op (not a change). + monkeypatch.setattr(runner, "_clean_stale_continuum_boot", lambda plan: False) + # pre-stage a fully-activated machine: complete plugin checkouts + a resurrect snapshot. + _stage_live_tmux_state(home) + (home / ".tmux" / "resurrect" / "tmux_resurrect_20260101T000000.txt").write_text( + "snap\n", encoding="utf-8") + # first apply writes the config artifacts; second must be a pure no-op. + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + res2 = runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + assert res2.status == "skipped", res2.detail + + +def test_activation_skips_first_save_when_snapshot_exists(tmp_path, monkeypatch): + """The FIRST save fires ONLY when no snapshot exists — a re-apply must not re-save and risk + clobbering a good snapshot with an empty/partial one (opus finding). 
Exercises the REAL + _tmux_resurrect_save guard (not the seam stub).""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + rec = _activation_seams(monkeypatch) + # a snapshot already exists on disk. + resurrect = home / ".tmux" / "resurrect" + resurrect.mkdir(parents=True) + (resurrect / "tmux_resurrect_20260101T000000.txt").write_text("snap\n", encoding="utf-8") + runner._do_provision_tmux(_tmux_action(home), "backup") + assert rec["saves"] == 0 # snapshot present → no re-save + + +def test_failed_clone_is_a_warning_not_a_change(tmp_path, monkeypatch): + """A failed plugin clone (offline) is surfaced as a WARNING but must NOT mark the apply + `changed` — else every re-apply falsely reports `created` (codex/opus idempotency finding). + With nothing else to do (snapshot present, plugins the only gap), a clone-failure-only run is + `skipped`, and the warning is in the detail.""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + _activation_seams(monkeypatch) # stub launchctl/cleanup/save; the clone is overridden below + # make every clone fail (offline) … + monkeypatch.setattr(runner, "_git_clone", lambda repo, dest: 1) + # nothing stale to clean (so a clone-failure is the ONLY activation outcome). + monkeypatch.setattr(runner, "_clean_stale_continuum_boot", lambda plan: False) + # … and pre-stage the resurrect dir + a snapshot so NOTHING else is a change. 
+ resurrect = home / ".tmux" / "resurrect" + resurrect.mkdir(parents=True) + (resurrect / "tmux_resurrect_20260101T000000.txt").write_text("snap\n", encoding="utf-8") + # second apply (config already current from a first) so only activation could change anything. + runner._do_provision_tmux(_tmux_action(home), "backup") + res2 = runner._do_provision_tmux(_tmux_action(home), "backup") + assert res2.status == "skipped", res2.detail # a failed clone did NOT inflate `changed` + assert "NOT installed" in res2.detail # the warning is still surfaced + + +def test_resurrect_save_does_not_start_a_server(tmp_path, monkeypatch): + """_tmux_resurrect_save must NOT start a server just to snapshot it (it would save an empty + pre-restore state). With no live server it returns non-zero and runs no save (opus finding).""" + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + # install a real save.sh so the function gets past its existence check. + save = home / ".tmux" / "plugins" / "tmux-resurrect" / "scripts" / "save.sh" + save.parent.mkdir(parents=True) + save.write_text("#!/usr/bin/env bash\ntouch ${SAVED_MARKER:-/dev/null}\n", encoding="utf-8") + calls: list[list[str]] = [] + + def fake_run(cmd, **kw): + calls.append(cmd) + import subprocess as sp + # `tmux list-sessions` → non-zero (no server). Anything else shouldn't be reached. + return sp.CompletedProcess(cmd, 1, stdout="", stderr="no server running") + + monkeypatch.setattr(runner.subprocess, "run", fake_run) + monkeypatch.setattr(runner.shutil, "which", lambda n: "/usr/bin/tmux" if n == "tmux" else None) + plan = tmux.build_tmux(repo_home=home) + rc = runner._tmux_resurrect_save(plan) + assert rc != 0 # no live server → no save + # it probed for a live server (list-sessions) but NEVER ran save.sh and NEVER started a server. 
+ assert any("list-sessions" in " ".join(c) for c in calls) + assert not any("save.sh" in " ".join(c) for c in calls) + assert not any("new-session" in " ".join(c) for c in calls) + + +def test_dry_run_skips_all_live_activation(tmp_path, monkeypatch): + """RIG_TMUX_DRY_RUN=1 writes the on-disk artifacts but performs NO live effect (clone / + launchctl / save / cleanup) — the seam the unit suite + CI rely on.""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.setenv("RIG_TMUX_DRY_RUN", "1") + rec = _activation_seams(monkeypatch) + runner._do_provision_tmux(_tmux_action(home, boot={"enabled": True}), "backup") + # the files still land … + assert (home / ".config" / "rig" / "tmux" / "rig.tmux.conf").is_file() + # … but NOTHING live ran. + assert rec["clones"] == [] and rec["launchctl"] == [] + assert rec["saves"] == 0 and rec["cleanups"] == 0 + + +def test_clean_continuum_boot_is_idempotent_noop_when_absent(tmp_path, monkeypatch): + """The cleanup is idempotent: with no stale continuum boot present it does nothing and + never errors (a clean machine has nothing to clean).""" + import sys as _sys + from riglib.actions import runner + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + monkeypatch.setattr(_sys, "platform", "darwin") + monkeypatch.setattr(runner, "_launchctl", lambda verb, arg: 0) + plan = tmux.build_tmux(repo_home=home) + # must not raise even though no Tmux.Start.plist / osx_disable.sh exists. 
+ runner._clean_stale_continuum_boot(plan) + + def test_drift_tmux_reports_all_drifted_regions(tmp_path, monkeypatch): """Two regions drift at once → BOTH reported (no early-return masking the second).""" from riglib.actions import runner diff --git a/tests/test_tmux_e2e.py b/tests/test_tmux_e2e.py new file mode 100644 index 0000000..08ffca5 --- /dev/null +++ b/tests/test_tmux_e2e.py @@ -0,0 +1,417 @@ +"""REAL tmux e2e — the acceptance test for the rig-tmux-v2 reboot-cycle fix. + +What this is +------------ +The #24/#26 tmux provisioning passed unit + tmux parse-check but the LIVE reboot cycle +(apply -> save -> REBOOT -> restore) broke on a real machine — multiple defects that a +pure-render/unit suite cannot catch. This file is the acceptance gate the CTO asked for: it +drives REAL ``tmux`` (a real server, real sessions, the actual generated scripts) in a throwaway +``$HOME`` and asserts the whole cycle works with ZERO manual steps. It is the proof that a +clean-machine ``rig apply`` leaves tmux persistence FULLY working. + +How it is reached +----------------- +Opt-in: set ``RIG_TMUX_E2E=1`` (plain ``pytest`` skips this module). Every tmux call goes through a +PRIVATE ``-L`` socket (a per-test ``tmux`` shim on PATH injects it), so it NEVER touches the +developer's real tmux server. The fixture teardown kills the spawned server. If ``tmux``/``git`` is +missing or GitHub is unreachable it skips (the unit suite covers render/plan/drift hermetically). + +What it proves (maps 1:1 to the six defects) +-------------------------------------------- +1. boot: the generated boot script (NOT a bare ``start-server``) brings a server UP with the + config LOADED and a session present; ``rig apply`` ``launchctl load -w``s the agent (asserted + on the artifact + the load call, since a test can't reboot). +2.
cc-save: a FAKE ``claude`` child under a pane's shell makes cc-save write a NON-EMPTY + cwd->id map (the old command-string filter wrote nothing); cc-restore would relaunch + ``claude --resume `` into a fresh shell pane. +3. login shell: the generated config sets a login-shell ``default-command``. +4. resurrect dir: ``~/.tmux/resurrect`` exists and a real ``.txt`` snapshot is written. +5. old-boot cleanup: the stale continuum Login Items / ``Tmux.Start`` agent are removed. +6. plugins: tpm + resurrect + continuum are installed so the ``@plugin`` decls resolve, and a + first ``resurrect save`` lands a snapshot. + +Invariants +---------- +- PRIVATE socket only (``-L``). Never the default server. Teardown kills it. +- The generated scripts are run UNMODIFIED (via the PATH ``tmux`` shim) — testing the real + artifact rig writes, not a paraphrase of it. +""" + +from __future__ import annotations + +import os +import shlex +import shutil +import subprocess +import time +import uuid +import socket +from pathlib import Path + +import pytest + +from riglib import tmux as tmod +from riglib.actions import runner + + +def _github_reachable() -> bool: + """True if GitHub's https port is reachable — the e2e clones the real tmux plugins from there + (it needs the REAL resurrect ``save.sh`` to write a real snapshot, the whole acceptance point). + Even under the opt-in flag we still skip (not fail) when offline. Cheap TCP probe, 3s timeout.""" + try: + with socket.create_connection(("github.com", 443), timeout=3): + return True + except OSError: + return False + + +# This module drives a REAL tmux server AND clones the real plugins from GitHub — real network + +# daemon access. 
The repo's plain `python -m pytest -q` is documented as fast + HERMETIC (AGENTS.md), +# so this e2e is OPT-IN via `RIG_TMUX_E2E=1` (codex finding): default CI/pytest stays hermetic and +# offline-safe; the BFS / artifact logic it exercises is ALSO covered hermetically (the unit suite +# + `test_pane_has_claude_*` here, which run with no network). The acceptance gate the CTO runs is +# `RIG_TMUX_E2E=1 pytest tests/test_tmux_e2e.py`. Even when opted in, it skips (never fails) when +# tmux/git is absent or GitHub is unreachable. The autouse RIG_TMUX_DRY_RUN guard (conftest) is +# cleared per-test where a live step is exercised. +_E2E_OPTED_IN = os.environ.get("RIG_TMUX_E2E", "").strip() in ("1", "true", "yes") +pytestmark = pytest.mark.skipif( + not _E2E_OPTED_IN + or shutil.which("tmux") is None + or shutil.which("git") is None + or not _github_reachable(), + reason="real-tmux e2e is opt-in: set RIG_TMUX_E2E=1 (needs tmux + git + network; auto-skips offline)", +) + + +# ── a PATH tmux shim that pins every `tmux …` call to a private -L socket ──────────────────── +def _install_tmux_shim(bindir: Path, socket: str) -> None: + """Write a `tmux` wrapper on PATH that injects `-L ` so the UNMODIFIED generated + scripts (which call bare `tmux`) hit a private server, never the developer's default one.""" + real = shutil.which("tmux") + shim = bindir / "tmux" + shim.write_text( + "#!/usr/bin/env bash\n" + f'exec {real} -L {socket} "$@"\n', + encoding="utf-8", + ) + shim.chmod(0o755) + + +@pytest.fixture +def tmux_env(tmp_path, monkeypatch): + """A throwaway HOME + a private tmux socket + a PATH shim, with teardown that kills the + server. 
Yields (home, socket, run) where `run` executes a command with the shimmed PATH.""" + home = tmp_path / "home" + home.mkdir() + socket = f"rigtest-{uuid.uuid4().hex[:8]}" + bindir = tmp_path / "bin" + bindir.mkdir() + _install_tmux_shim(bindir, socket) + + real_tmux = shutil.which("tmux") + env = dict(os.environ) + env["HOME"] = str(home) + env["PATH"] = f"{bindir}{os.pathsep}{env.get('PATH', '')}" + # NB: the private `-L ` already isolates the server; do NOT also set a deep + # TMUX_TMPDIR — a unix socket path has a ~104-char limit on macOS and a pytest tmp dir blows + # it ("File name too long"). The unique -L name under the default /tmp tmpdir is short + safe. + # don't let an inherited $TMUX (we may run inside tmux) confuse nested calls. + env.pop("TMUX", None) + env.pop("TMUX_TMPDIR", None) + + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(Path, "home", classmethod(lambda cls: home)) + # CRITICAL: the generated boot script bakes its tmux path at GENERATION time via + # `_resolve_tmux_bin` -> `shutil.which("tmux")`, which (in-process) is the REAL tmux on the + # DEFAULT socket — NOT the shim. Left unpatched, the boot script would create a session on + # the user's real tmux server (the exact thing the private -L socket is meant to prevent). + # Point the resolver at the shim so EVERY tmux the rig artifacts invoke hits the private socket. + from riglib import tmux as _tmod + monkeypatch.setattr(_tmod, "_resolve_tmux_bin", lambda: str(bindir / "tmux")) + + def run(cmd, **kw): + return subprocess.run( + cmd, env=env, capture_output=True, text=True, timeout=kw.pop("timeout", 30), **kw + ) + + try: + yield home, socket, run + finally: + # kill the private server (best-effort) — never leave a stray tmux around. 
+ subprocess.run([real_tmux, "-L", socket, "kill-server"], + capture_output=True, text=True, timeout=15) + + +def _wait_for_claude_descendant(run, *, timeout_s=10): + """Poll until a process whose `comm` is `claude` is a descendant of some tmux pane on the + private socket — removes the race between launching the fake claude and `ps` seeing it. + + Uses the SAME tree-walk the cc-save script does, so if this sees the descendant, cc-save will + too. Returns the pane addr when found; raises if it never appears (a real failure, not a flake). + """ + deadline = time.time() + timeout_s + while time.time() < deadline: + panes = run(["tmux", "list-panes", "-a", "-F", "#{session_name}:#{window_index}.#{pane_index} #{pane_pid}"]) + snap = subprocess.run(["ps", "-eo", "pid=,ppid=,comm="], capture_output=True, text=True, timeout=10).stdout + tree = {} + comm = {} + for ln in snap.splitlines(): + parts = ln.split(None, 2) + if len(parts) == 3: + pid, ppid, c = parts + tree.setdefault(ppid, []).append(pid) + comm[pid] = c + for ln in panes.stdout.splitlines(): + addr, _, pane_pid = ln.partition(" ") + stack = [pane_pid.strip()] + seen = set() + while stack: + cur = stack.pop() + if cur in seen: + continue + seen.add(cur) + if comm.get(cur, "").rsplit("/", 1)[-1] == "claude": + return addr + stack.extend(tree.get(cur, [])) + time.sleep(0.3) + raise AssertionError("fake claude never became a visible descendant of a pane within timeout") + + +def _action(home, **over): + from riglib.plan import Action + + options = { + "apply_mode": "import", + "conf_path": str(home / ".tmux.conf"), + "generated_dir": str(home / ".config" / "rig" / "tmux"), + "resurrect": {}, + "continuum": {}, + "moshi": {}, + "cc_restore": {}, + "anti_sprawl": {"enabled": True, "session": "main"}, + "boot": {"enabled": True}, + "login_shell": {}, + } + options.update(over) + return Action(kind="provision_tmux", category="tmux", item="config", + source=home, target=home / ".tmux.conf", options=options) + + +def 
_apply_with_real_plugins(home, monkeypatch): + """Run the full provision WITH real plugin clones + the resurrect dir, but keep launchctl + stubbed (a test can't load a real launch agent without polluting the host). Returns the + ActionResult. The boot script + cc scripts + config are all real on-disk artifacts.""" + monkeypatch.delenv("RIG_TMUX_DRY_RUN", raising=False) + loads: list[str] = [] + monkeypatch.setattr(runner, "_launchctl", lambda verb, arg: 0) + monkeypatch.setattr(runner, "_launchctl_loaded", lambda label: False) + monkeypatch.setattr(runner, "_launchctl_load_enable", lambda plist: loads.append(str(plist)) or 0) + monkeypatch.setattr(runner, "_clean_stale_continuum_boot", lambda plan: False) + # Stub the in-apply first-save: it would boot a session on the host's DEFAULT tmux server + # (the runner's _tmux_resurrect_save calls the boot script with the unshimmed PATH). The e2e + # drives boot + save EXPLICITLY through the private-socket shim instead, so the assertions run + # against an isolated server. We still let the REAL _git_clone + resurrect-dir creation run. + monkeypatch.setattr(runner, "_tmux_resurrect_save", lambda plan: 0) + res = runner._do_provision_tmux(_action(home), "backup") + return res, loads + + +# ── the acceptance e2e ─────────────────────────────────────────────────────────────────────── +def test_clean_machine_apply_brings_tmux_up_with_config_and_session(tmux_env, monkeypatch): + """DEFECTS 1/3/4/6: a clean-HOME apply installs plugins + scripts + config + boot agent, the + boot script brings a REAL server up WITH the config loaded AND a session present, and the + config sets a login-shell default-command.""" + home, socket, run = tmux_env + res, loads = _apply_with_real_plugins(home, monkeypatch) + assert res.status in ("created", "backed_up"), res.detail + + gen = home / ".config" / "rig" / "tmux" + # DEFECT 6: plugins cloned so @plugin decls resolve. 
+ for name in ("tpm", "tmux-resurrect", "tmux-continuum"): + assert (home / ".tmux" / "plugins" / name).is_dir(), f"{name} not installed" + # DEFECT 4: resurrect snapshot dir exists. + assert (home / ".tmux" / "resurrect").is_dir() + # DEFECT 1: the boot agent plist points at the boot SCRIPT (not a bare start-server) and + # rig launchctl-load-enabled it (the load call recorded; a test can't actually reboot). + plist = home / "Library" / "LaunchAgents" / "ai.hyperide.tmux-boot.plist" + if os.uname().sysname == "Darwin": + assert plist.is_file() + assert loads == [str(plist)], loads + boot_script = gen / "tmux-boot.sh" + assert boot_script.is_file() and os.access(boot_script, os.X_OK) + + # DEFECT 3: the generated config sets a login-shell default-command. + conf_text = (gen / "rig.tmux.conf").read_text() + assert "set -g default-command" in conf_text and "-l" in conf_text + + # the ~/.tmux.conf imports the generated file (so a new session loads it). + assert f"source-file '{gen / 'rig.tmux.conf'}'" in (home / ".tmux.conf").read_text() + + # DEFECT 1 — EXECUTE the boot entrypoint (can't reboot in a test): it must bring a server UP + # with the config LOADED and a session present. Run it via the shimmed PATH (private socket). + r = run([str(boot_script)]) + assert r.returncode == 0, f"boot script failed: {r.stderr}" + time.sleep(1.0) # let the detached session + plugin inits settle before querying the server. + # a server is now running with the canonical session. + ls = run(["tmux", "ls"]) + assert ls.returncode == 0, f"`tmux ls` says no server: {ls.stderr or ls.stdout}" + assert "main" in ls.stdout, ls.stdout + # the CONFIG was actually loaded by that first session: a rig-set option is live on the server. 
+ opt = run(["tmux", "show-options", "-g", "@continuum-save-interval"]) + assert "15" in opt.stdout, f"config not loaded — continuum option absent: {opt.stdout!r} {opt.stderr!r}" + # idempotent: a second boot does NOT create a duplicate session (anti-sprawl at boot). + run([str(boot_script)]) + ls2 = run(["tmux", "ls"]) + assert ls2.stdout.count("main") == 1, f"boot spawned a duplicate session: {ls2.stdout}" + + +def test_cc_save_populates_map_from_a_real_claude_child(tmux_env, monkeypatch): + """DEFECT 2 (the headline reboot bug): a FAKE `claude` running as a CHILD of a pane's shell + must make cc-save write a NON-EMPTY cwd->session-id map — the OLD `pane_current_command == + claude` filter wrote nothing because cc shows up as its version string, not `claude`.""" + home, socket, run = tmux_env + _apply_with_real_plugins(home, monkeypatch) + gen = home / ".config" / "rig" / "tmux" + + # A fake `claude` whose process `comm` reports `claude` (the production case: cc shows up as a + # VERSION string in pane_current_command, the real `claude` is a CHILD). The process must have a + # `comm` whose basename is `claude` on BOTH platforms — and the two single-trick approaches each + # fail on one OS: `exec -a claude sleep` rewrites only argv[0] (Linux `comm` still reads `sleep` + # → descendant invisible → CI failed); a COPY of the `sleep` binary won't run on macOS (SIP + # refuses to exec an unsigned copy of a system binary). A SYMLINK named `claude` → the real + # `sleep` works on both: the kernel sets `comm` from the invoked name, so `comm`'s basename is + # `claude` on Linux AND macOS. Run by a LAUNCHER that keeps it a genuine child of the pane shell + # (background it, the shell stays alive) — a bare send-keys `claude &` gets reparented by + # job-control and detaches. 
+ work = home / "proj" + work.mkdir() + fake_claude = home / "fakebin" / "claude" + fake_claude.parent.mkdir() + real_sleep = shutil.which("sleep") or "/bin/sleep" + fake_claude.symlink_to(real_sleep) # symlink named `claude` → comm basename == claude on both OSes + launcher = home / "launch.sh" + launcher.write_text( + f"#!/usr/bin/env bash\n{shlex.quote(str(fake_claude))} 300 &\nsleep 300\n", encoding="utf-8" + ) + launcher.chmod(0o755) + # the pane RUNS the launcher (so claude is a real descendant), in the known cwd. + run(["tmux", "new-session", "-d", "-s", "main", "-c", str(work), str(launcher)]) + _wait_for_claude_descendant(run, timeout_s=10) + + # seed a Claude Code session file under the encoded projects dir for that cwd, so cc-save has + # an id to record (encoding: every '/' and '.' -> '-'). + enc = str(work).replace("/", "-").replace(".", "-") + proj = home / ".claude" / "projects" / enc + proj.mkdir(parents=True) + sid = "11111111-2222-3333-4444-555555555555" + (proj / f"{sid}.jsonl").write_text("{}\n", encoding="utf-8") + + # RUN the real generated cc-save (via the shimmed tmux → private socket). + r = run(["bash", str(gen / "cc-save.sh")]) + assert r.returncode == 0, f"cc-save failed: {r.stderr}" + + map_file = gen / "cc-sessions.map" + assert map_file.is_file(), "cc-save wrote no map file" + lines = [ln for ln in map_file.read_text().splitlines() if ln.strip()] + assert lines, "DEFECT 2: cc-save map is EMPTY — the claude child was not detected via the tree" + # the recorded line must carry both the cwd and the session id for our claude pane.
+ assert any(str(work) in ln and sid in ln for ln in lines), lines + + +def test_cc_restore_relaunches_claude_resume_into_fresh_shell(tmux_env, monkeypatch): + """DEFECT 2 (restore half): with a seeded map, cc-restore sends `cd CWD && claude --resume + ID` into a FRESH shell pane (never on top of a running claude / an editor).""" + home, socket, run = tmux_env + _apply_with_real_plugins(home, monkeypatch) + gen = home / ".config" / "rig" / "tmux" + + work = home / "proj" + work.mkdir() + # A FAKE `claude` on the pane's PATH so the resume that cc-restore types runs a harmless + # sleep (NOT the real Claude Code, which would launch its onboarding TUI). `exec -a claude` + # only sets argv[0] (Linux `comm` still reads `sleep`); fine here — we only assert the resume was TYPED. + fakebin = home / "fakebin" + fakebin.mkdir() + (fakebin / "claude").write_text( + "#!/usr/bin/env bash\nexec -a claude sleep 300\n", encoding="utf-8" + ) + (fakebin / "claude").chmod(0o755) + # a fresh shell pane (bash, no rc) with the fake claude FIRST on PATH, in the known cwd. + run(["tmux", "new-session", "-d", "-s", "main", "-c", str(work), + f"PATH={fakebin}:$PATH exec bash --norc -i"]) + # resolve the real pane addr (window base-index may be 1, not 0). + addr = run(["tmux", "list-panes", "-a", "-F", + "#{session_name}:#{window_index}.#{pane_index}"]).stdout.splitlines()[0] + # seed the projects session file (so the id is "live" → --resume, not --continue). + enc = str(work).replace("/", "-").replace(".", "-") + proj = home / ".claude" / "projects" / enc + proj.mkdir(parents=True) + sid = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + (proj / f"{sid}.jsonl").write_text("{}\n", encoding="utf-8") + # seed the map cc-restore reads. + (gen / "cc-sessions.map").write_text(f"{addr}\t{work}\t{sid}\n", encoding="utf-8") + + # RUN the real cc-restore: it must send `cd CWD && claude --resume ID` into the pane.
+ r = run(["bash", str(gen / "cc-restore.sh")]) + assert r.returncode == 0, f"cc-restore failed: {r.stderr}" + time.sleep(0.5) + cap = run(["tmux", "capture-pane", "-t", addr, "-p"]) + # tmux HARD-WRAPS the pane at the terminal width, so a long command is split across lines + # (`claud\ne`). Join the captured lines (drop the wrap newlines) before substring-matching. + joined = cap.stdout.replace("\n", "") + # the resume command line was typed into the fresh shell pane. + assert "claude --resume" in joined, cap.stdout + assert sid in joined, cap.stdout + assert str(work).replace("\n", "") in joined, cap.stdout + + +def test_resurrect_writes_a_real_snapshot(tmux_env, monkeypatch): + """DEFECTS 4/6: with the resurrect dir present + the plugin installed, a real `resurrect save` + writes a `tmux_resurrect_*.txt` snapshot — so a reboot has something to restore.""" + home, socket, run = tmux_env + _apply_with_real_plugins(home, monkeypatch) + + resurrect_dir = home / ".tmux" / "resurrect" + assert resurrect_dir.is_dir() + save_script = home / ".tmux" / "plugins" / "tmux-resurrect" / "scripts" / "save.sh" + assert save_script.is_file(), "resurrect plugin not installed (save.sh missing)" + + # a real session to snapshot, then run resurrect's own save (private socket via the shim). + run(["tmux", "new-session", "-d", "-s", "main"]) + r = run(["bash", str(save_script)]) + assert r.returncode == 0, f"resurrect save failed: {r.stderr}" + snaps = list(resurrect_dir.glob("tmux_resurrect_*.txt")) + assert snaps, f"no resurrect snapshot written in {resurrect_dir}" + + +def test_old_continuum_boot_cleanup_removes_stale_entries(tmux_env, monkeypatch): + """DEFECT 5: a pre-existing stale continuum boot (its osx_disable.sh + an old Tmux.Start + launch agent) is cleaned by the activation. We MOCK their presence and assert removal.""" + home, socket, run = tmux_env + # simulate continuum's stale boot: an osx_disable.sh under the plugin + a Tmux.Start plist. 
+ cont = home / ".tmux" / "plugins" / "tmux-continuum" / "scripts" + cont.mkdir(parents=True) + disable_ran = home / "disable-ran" + (cont / "osx_disable.sh").write_text( + f"#!/usr/bin/env bash\ntouch {disable_ran}\n", encoding="utf-8" + ) + (cont / "osx_disable.sh").chmod(0o755) + la = home / "Library" / "LaunchAgents" + la.mkdir(parents=True) + old_plist = la / "Tmux.Start.plist" + old_plist.write_text("\n", encoding="utf-8") + + boot_calls: list[tuple[str, str]] = [] + monkeypatch.setattr(runner, "_launchctl", lambda verb, arg: boot_calls.append((verb, arg)) or 0) + plan = tmod.build_tmux(repo_home=home) + cleaned = runner._clean_stale_continuum_boot(plan) + + assert cleaned is True + # continuum's documented disable script was run … + assert disable_ran.is_file(), "osx_disable.sh was not executed" + # … the old Tmux.Start plist was removed … + assert not old_plist.exists(), "stale Tmux.Start.plist not removed" + # … and we issued a bootout/unload for it. + assert any("Tmux.Start" in arg or "Tmux.Start" in verb for verb, arg in boot_calls), boot_calls + # idempotent: a second run (now nothing present) cleans nothing and doesn't error. + assert runner._clean_stale_continuum_boot(plan) is False