From 0bff90db5d3ce927b66c63d34740dfbeb79d9476 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 15:38:13 +0800
Subject: [PATCH 01/19] docs(plans): add pool exhaustion handling + agent auth
 auto-reset plan

---
 ...22-pool-exhaustion-and-agent-auth-reset.md | 403 ++++++++++++++++++
 1 file changed, 403 insertions(+)
 create mode 100644 docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
new file mode 100644
index 0000000..26087ee
--- /dev/null
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -0,0 +1,403 @@
+# Pool Exhaustion Handling + Agent Auth Auto-Reset
+
+## Overview
+
+Two coupled problems observed live (knuth, hermes + `openai_pool` of two Codex OAuth accounts):
+
+1. **Failover flap + Telegram notice spam.** Both pool members hit the OpenAI Codex
+   *usage-limit* 429 (a multi-hour quota window). sluice cools each member for a flat
+   `RateLimitCooldown = 60s`, then `ResolveActive`'s all-cooling **degrade** path re-serves
+   the soonest-recovering member, which 429s again. The result is a perpetual
+   `openai_oauth ⇄ openai_oauth_2` flap emitting ~2 notices/min forever. The sticky-failover
+   fix (#48, `docs/plans/completed/.../20260518-sticky-failover.md`) stopped the *snap-to-0*
+   flap but explicitly deferred the cooldown-window fix on the assumption that sticky made the
+   60s cooldown harmless — the **degrade-path** flap proves it did not.
+
+2. **Agent stuck after pool exhaustion.** When the whole pool is exhausted, hermes latches
+   "usage limit reached", retries 3×, and gives up — it will not self-recover even after the
+   quota window passes, because its local auth state is latched. It needs an explicit
+   `hermes auth reset <target>` to un-latch and retry.
+
+This plan delivers three fixes for (1) and an auto-reset for (2):
+
+- **A1** — correct exhaustion detection: classify the pool as exhausted when *no healthy
+  member exists*, not only when `to == from`; collapse the exhausted notice dedup key so the
+  flap direction cannot produce two keys.
+- **A2** — edge-triggered notices: emit "pool exhausted" once on the healthy→exhausted edge
+  and "pool recovered" once on the way back; no periodic spam.
+- **B1** — honor the real recovery window: derive the cooldown from the upstream
+  `Retry-After` / rate-limit-reset hints (clamped), so a quota-exhausted member stays cooled
+  for the real window instead of being re-probed every 60s. This is what makes the degrade
+  flap structurally impossible and makes "recovered" mean recovered.
+- **Auto-reset (problem 2)** — opt-in, per-pool: when a pool with a configured
+  `auth_reset_target` transitions exhausted→recovered, sluice runs the agent's auth-reset
+  command (hermes profile, **as the runtime UID 10000:10000**) so the agent un-latches.
+
+## Context (from discovery)
+
+- Language/stack: Go, pure-Go SQLite (`modernc.org/sqlite`), go-mitmproxy addon model.
+- Failover logic: `internal/proxy/pool_failover.go` (`handlePoolFailover`, `classifyFailover`,
+  `FormatFailoverNotice`, `shouldEmitPoolNotice`, dedup key `pool+from+to+tag`, 30s window).
+- Resolver/health: `internal/vault/pool.go` (`PoolResolver`, shared `PoolHealth`,
+  `ResolveActive` sticky pointer + all-cooling **degrade returns a still-cooling member with
+  ok=true**, `MarkCooldownScoped`, `CooldownUntil`, `cooling()` = `cooldownUntil.After(now)`,
+  constants `RateLimitCooldown=60s` / `AuthFailCooldown=300s`).
+- Addon state: `internal/proxy/addon.go` (`SluiceAddon` incl. `onFailover`,
+  `poolNoticeMu`/`poolNoticeAt`); `SetOnFailover` on `internal/proxy/server.go`.
+- Failover side-effects wiring: `cmd/sluice/main.go:489` (`srv.SetOnFailover(...)` → durable
+  health write + Telegram notice, fresh `context.WithTimeout(Background,10s)` per send).
+  Container manager var `containerMgr` (main.go:242); `ReloadSecrets` pattern at main.go:673.
+- Agent abstraction: `internal/container/agent_profile.go` (`AgentProfile.ReloadCmd`,
+  `WireMCPCmd`); `ContainerManager` interface `internal/container/types.go`; backends
+  `docker.go` / `apple.go` / `tart.go` / standalone. **`ExecInContainer(ctx,name,cmd)` takes
+  NO user** (`docker.go:18`, `docker_socket.go:245`; `execCreateRequest` at
+  `docker_socket.go:247` has no `User`) → today an exec runs as root, which root-chowns
+  hermes `auth.json` and bricks the gateway (CLAUDE.local.md). `InjectEnvVars` avoids this by
+  chowning inside its script (types.go:299-321) — that trick does NOT help a command that
+  writes files itself, so `hermes auth reset` genuinely must exec as 10000:10000.
+- Pools store + channel-agnostic ops: `internal/store/pools.go`, migrations
+  `internal/store/migrations/000006_credential_pools.*` (down `DROP TABLE`s all three:
+  `credential_pools`, `credential_pool_members` FK→pools(name), `credential_health`;
+  `pool_membership_epoch` machinery in pools.go:78-97), `internal/poolops`.
+- **REST is generated**: `internal/api/api.gen.go` is `DO NOT EDIT`, produced by
+  `make generate` → `oapi-codegen --config config.yaml ../../api/openapi.yaml`
+  (Makefile:64, generate.go). Existing pool routes: `GetApiPools`/`PostApiPools`/
+  `DeleteApiPoolsName`/`GetApiPoolsName`/`PostApiPoolsNameRotate` (api.gen.go:911-935).
+  New surface = edit `openapi.yaml` → `make generate` → implement the new generated
+  `ServerInterface` method in `internal/api/server.go`. Action-route style
+  (`POST .../rotate`) is the established mutation pattern.
+- **No server run loop**: `ListenAndServe` just calls `s.socks.Serve`; shutdown is
+  `GracefulShutdown`/`Close` flipping `s.closed` (server.go:2864/2876/2891). A new monitor
+  goroutine needs its own explicit stop signaled from BOTH shutdown paths.
+- Pools are **not** TOML-exportable (no `[[pool]]` in `policy export`/`examples/config.toml`),
+  so the new field needs no TOML surface.
+- The data-version watcher fires only on DB writes (`internal/store/watcher.go`, main.go:810)
+  — it is NOT a timer, so recovery-edge detection needs its own monitor.
+- `f.Response.Header.Get(...)` is available on the mitmproxy Flow (used in `addon.go` DLP).
+
+## Development Approach
+
+- **Testing approach: Regular** (implement, then table-driven tests in the same task).
+- Complete each task fully (code + tests + green) before the next.
+- **Every task includes new/updated unit tests** (success + error/edge), as separate checklist
+  items. All tests pass before moving on.
+- Preserve all existing pool concurrency invariants (CRITICAL-1 shared `PoolHealth`,
+  identity/epoch-scoped writes, sticky-pointer survival across resolver swaps).
+- Channel feature parity is mandatory (CLAUDE.md): any new store-backed pool field must be
+  reachable from CLI **and** REST **and** Telegram via the channel-agnostic `internal/poolops`.
+- gofumpt before committing (CI enforces it). `go vet`, `golangci-lint`, `-race` on touched
+  packages, `make generate` clean tree, and `-tags=e2e ./e2e/` build.
+
+## Testing Strategy
+
+- **Unit (vault)**: cooldown-from-hints parsing (each header form, clamp floor/ceiling);
+  `HasHealthyMember` mirrors `cooling()` lazy-expiry exactly; `SoonestCooldown` strictly-future;
+  degrade unchanged; exhaustion detection edge.
+- **Unit (proxy)**: exactly one "exhausted" notice per healthy→exhausted edge regardless of
+  flap direction; one "recovered" notice on the reverse edge; recovery monitor reschedules on
+  *unequal* member cooldowns (B1 parks memA 60s, memB 6h → wake at ~60s, recover once);
+  monitor stops on shutdown; B1 cooldown applied from headers.
+- **Unit (store)**: migration up→down→up with a **populated** pool + members + health rows;
+  `auth_reset_target` round-trips; default empty.
+- **Unit (poolops/channels)**: create/update with target through the channel-agnostic layer;
+  one adapter test per channel asserting it routes through poolops (no inline logic).
+- **Unit (container)**: exec-user threading; `ResetAuth` argv per profile; nil-cmd no-ops with
+  a notice; (docker) exec carries the runtime UID.
+- No UI e2e here (backend/CLI only); keep the existing `-tags=e2e` suite building.
+
+## Progress Tracking
+- mark completed items `[x]` immediately; add ➕ for new tasks, ⚠️ for blockers.
+- keep this file in sync if scope shifts during implementation.
+
+## Solution Overview
+
+**B1 — real recovery window.** Add `cooldownFromResponse(class, resp) time.Duration` in
+`pool_failover.go`. Precedence: `Retry-After` (delta-seconds or HTTP-date) → known
+rate-limit-reset headers (`x-ratelimit-reset`, OpenAI `x-ratelimit-reset-requests` /
+`x-ratelimit-reset-tokens`; delta-seconds or epoch). **No body parsing in v1** (deferred to
+Post-Completion until a real Codex 429 is captured). Clamp to `[minFloor(class), maxCooldown]`
+where `maxCooldown = 6h`, `minFloor(rate-limit) = 10s` (a parsed short window must be honored —
+a 60s floor would be too *long* a minimum, not too short — so do NOT floor up to the 60s
+class default), `minFloor(auth-failure) = AuthFailCooldown` (a revoked/expired token must not
+be retried in seconds). When **no** hint header is present, fall back to the class default
+(60s / 300s). `handlePoolFailover` uses this instead of the flat `ttl`. **Task-1 tests assert
+parsing/clamp mechanics generically; they must NOT bake in which header OpenAI's usage-limit
+429 actually sends** — that is resolved by the Post-Completion capture, and a guessed winner
+here would contradict it.
+
+**A1 — correct exhaustion detection.** Add `PoolResolver.HasHealthyMember(pool) bool`
+(RLock, single `now`, true iff some member has `!cooling()` — mirrors `ResolveActive`'s
+`cooling()` lazy-expiry exactly, NOT a status-field check). In `handlePoolFailover`, after
+`MarkCooldownScoped(from)`, set `exhausted = !pr.HasHealthyMember(pool)` (replaces
+`to == from`). When exhausted, collapse the dedup so flap direction yields one key:
+`shouldEmitPoolNotice(pool, "*", "*", "exhausted")` (only the exhausted path; real
+transitions keep `from/to/tag`).
+
+**A2 — edge-triggered notices + recovery monitor (authoritative for the exhausted notice).**
+Per-pool exhaustion state lives on the long-lived `Server` (NOT on `PoolHealth`): exhaustion
+is per-process notification bookkeeping, not per-resolver-generation health, so it must
+survive resolver pointer swaps and must NOT be pruned on a membership change — putting it on
+`PoolHealth` would wrongly prune it. `handlePoolFailover` flips `false→true` on the first
+exhausted classification, emits the one-time "exhausted" notice on that edge, and wakes the
+monitor (non-blocking buffered cap-1 send). **Edge-gating in A2 supersedes A1's 30s-window
+dedup for the exhausted notice** — the edge owns it; the window dedup remains only for real
+`from→to` transitions. A dedicated **recovery monitor** goroutine sleeps until the pool's
+soonest *strictly-future* member cooldown (clamped to a `minReschedule = 1s` floor so a
+just-expired-but-still-unhealthy state can't spin), `Load()`s the current resolver on every
+wake (never caches it), and when `HasHealthyMember(pool)` becomes true flips `true→false`,
+emits "pool recovered", and invokes `onPoolRecovered(pool)`. Server-driven (time-based, not
+traffic-based) because the latched agent will not emit a recovering 2xx on its own. A pool
+removed while exhausted has its state entry cleaned up so no recovered-notice fires for a
+deleted pool.
+
+**Auto-reset (problem 2).** Per-pool `auth_reset_target TEXT` (migration 000007); non-empty =
+opt-in. New `AgentProfile.ResetAuthCmd(target) []string` + `AgentProfile.ExecUser() string`
+(hermes → `"10000:10000"`, openclaw → ""), `ContainerManager.ResetAuth(ctx, target)` parallel
+to `ReloadCmd`/`ReloadSecrets`. `HermesProfile.ResetAuthCmd` → pure-argv
+`["/opt/hermes/.venv/bin/hermes","auth","reset",target]` (no `sh -c`, so no shell-metachar
+threat). `OpenclawProfile.ResetAuthCmd` is nil pending verification that openclaw latches
+(Post-Completion). `onPoolRecovered` (wired in main.go) looks up the recovered pool's
+`auth_reset_target`; if set, calls `containerMgr.ResetAuth(ctx, target)` in a detached
+goroutine with a fresh `context.WithTimeout(Background, …)` (mirroring the `SetOnFailover`
+precedent) and emits an `agent_auth_reset` audit event.
+
+## Technical Details
+
+- **New constants** (`internal/vault/pool.go`): `maxCooldown = 6h`, `minRateLimitFloor = 10s`,
+  `minReschedule = 1s`.
+- **New resolver methods** (`internal/vault/pool.go`, both RLock-only, single `now`):
+  `HasHealthyMember(pool) bool`; `SoonestCooldown(pool) (time.Time, bool)` returning the
+  minimum `cooldownUntil` **strictly greater than a freshly-sampled `now`** (bool=false when
+  no member is currently cooling).
+- **Exec-user plumbing** (Task 6): add `User string` to `execCreateRequest`
+  (`docker_socket.go`), thread an optional user through `ExecInContainer` on both
+  `ContainerClient` and `SocketClient` (default "" = unchanged root behavior), update the
+  `mockClient` in `docker_test.go` and all existing callers (no-op default).
+- **Migration `000007_pool_auth_reset.{up,down}.sql`**: up = `ALTER TABLE credential_pools
+  ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''`. down = the SQLite 12-step rebuild of
+  `credential_pools` only, wrapped in `PRAGMA foreign_keys=OFF;` … `=ON;` so the
+  `credential_pool_members` FK→`credential_pools(name)` is not orphaned/failed (golang-migrate
+  runs each file as a script).
+- **Store** (`internal/store/pools.go`): add `AuthResetTarget` to `Pool`; include in
+  create/list; add `SetPoolAuthResetTarget(name, target) error`. No TOML change (pools aren't
+  TOML-exportable).
+- **Recovery state on `Server`**: `poolExhaustMu sync.Mutex`, `poolExhausted map[string]bool`,
+  `recoveryWake chan struct{}` (buffered cap-1, non-blocking send), `monitorStop chan struct{}`
+  + `monitorStopOnce sync.Once`, `onPoolRecovered func(pool string)` via `SetOnPoolRecovered`.
+  Monitor launched once (in `New`/a `StartMonitors`), stopped idempotently from BOTH `Close`
+  and `GracefulShutdown`.
+- **Audit**: new action `agent_auth_reset` (`Verdict:"recover"`, `Credential:` recovered pool,
+  `Reason:` target). Keep `pool_exhausted` for the entry edge; add a recovered audit/notice.
+- **Target validation**: non-empty, no NUL, allowlisted charset (mirror `ValidateEnvVarKey`
+  style) — argv form, so shell-metachar checks are the wrong model.
+
+## What Goes Where
+- **Implementation Steps** (`[ ]`): all code, migrations, generated-API regen, tests, in-repo docs.
+- **Post-Completion** (no checkboxes): openclaw latch verification + `ResetAuthCmd`; live
+  knuth validation; capturing a real OpenAI 429 to fix B1's header precedence + `maxCooldown`.
+
+## Implementation Steps
+
+### Task 1: B1 — derive cooldown from upstream recovery hints
+
+**Files:**
+- Modify: `internal/vault/pool.go` (constants `maxCooldown`, `minRateLimitFloor`)
+- Modify: `internal/proxy/pool_failover.go`
+- Modify: `internal/proxy/pool_failover_test.go`
+
+- [ ] add `maxCooldown = 6*time.Hour` and `minRateLimitFloor = 10*time.Second` near the TTLs
+- [ ] add `cooldownFromResponse(class failoverClass, resp *http.Response) time.Duration`:
+  parse `Retry-After` (delta-seconds + HTTP-date), then `x-ratelimit-reset` /
+  `x-ratelimit-reset-requests` / `x-ratelimit-reset-tokens` (delta or epoch); clamp to
+  `[minFloor(class), maxCooldown]` (rate-limit floor `minRateLimitFloor`, auth-failure floor
+  `AuthFailCooldown`); **no hint → class default**. No body parsing in v1.
+- [ ] in `handlePoolFailover` replace the flat `ttl` with `cooldownFromResponse(class, f.Response)`
+- [ ] write tests (generic parsing/clamp, NOT a Codex-specific header winner): Retry-After
+  seconds; Retry-After HTTP-date; epoch reset header; delta reset header; no headers → class
+  default; absurd value → `maxCooldown`; tiny rate-limit value honored down to
+  `minRateLimitFloor`; auth-failure floored at `AuthFailCooldown`
+- [ ] run `go test ./internal/proxy/ ./internal/vault/ -race`; gofumpt; vet — pass before Task 2
+
+### Task 2: A1 — exhaustion = no healthy member, collapse dedup key
+
+**Files:**
+- Modify: `internal/vault/pool.go` (`HasHealthyMember`, `SoonestCooldown`)
+- Modify: `internal/proxy/pool_failover.go`
+- Modify: `internal/vault/pool_test.go`, `internal/proxy/pool_failover_test.go`
+
+- [ ] add `HasHealthyMember(pool) bool` (RLock, single `now`, `!cooling()` for some member —
+  mirror `cooling()`/lazy-expiry, not a status field) and `SoonestCooldown(pool)
+  (time.Time, bool)` (strictly-future min vs a fresh `now`)
+- [ ] in `handlePoolFailover` set `exhausted = !pr.HasHealthyMember(pool)` (drop `to == from`);
+  keep computing `to` for the real-transition notice/audit
+- [ ] when `exhausted`, key the dedup as `shouldEmitPoolNotice(pool, "*", "*", "exhausted")`
+  so flap direction can't create two keys (real transitions unchanged)
+- [ ] write tests: 2-member pool both cooling → exhausted true; one healthy (incl. a
+  lazily-expired cooldown still in the map) → false; `SoonestCooldown` skips already-passed
+  entries; exhausted dedup collapses both flap directions to one notice within the window
+  (fail-before/pass-after). NOTE: Task 3 re-gates this to an edge — update this assertion there.
+- [ ] run `go test ./internal/vault/ ./internal/proxy/ -race`; gofumpt; vet — pass before Task 3
+
+### Task 3: A2 — per-pool exhaustion state machine, edge notices, recovery monitor
+
+**Files:**
+- Modify: `internal/proxy/server.go` (state, monitor goroutine, lifecycle, `SetOnPoolRecovered`)
+- Modify: `internal/proxy/addon.go` (exhaustion-edge hook into the server state)
+- Modify: `internal/proxy/pool_failover.go` (entry-edge gating; recovered notice formatter)
+- Modify: `internal/proxy/server_test.go` (+ focused new test file as needed)
+
+- [ ] add `poolExhaustMu`/`poolExhausted`/`recoveryWake`(cap-1)/`monitorStop`+`monitorStopOnce`/
+  `onPoolRecovered` to `Server`; `SetOnPoolRecovered`; document WHY state is on `Server` not
+  `PoolHealth` (survives swaps, must not be pruned on membership change)
+- [ ] launch the monitor once (in `New`/`StartMonitors`); stop it idempotently from BOTH
+  `Close` and `GracefulShutdown` (via `monitorStopOnce`)
+- [ ] gate the "exhausted" notice on the `false→true` edge (one notice on entry; this edge —
+  not the 30s window — is authoritative for the exhausted notice); record state; wake the
+  monitor with a non-blocking `select { case recoveryWake <- struct{}{}: default: }`
+- [ ] implement the monitor: each wake `Load()` the current resolver (never cache); compute
+  `sleep = max(SoonestCooldown(pool).Sub(time.Now()), minReschedule)`; on wake, for each
+  exhausted pool flip `true→false` when `HasHealthyMember` is true, emit "pool recovered",
+  call `onPoolRecovered(pool)`; reschedule while still exhausted; drop state for pools no
+  longer present
+- [ ] add a recovered-notice formatter (plain text, sentence style) alongside
+  `FormatFailoverNotice`
+- [ ] update the Task-2 exhausted-notice test to assert EDGE behavior (one notice per
+  healthy→exhausted edge across many failing responses), not 30s-window behavior
+- [ ] write tests: entry edge emits exactly one exhausted notice; recovery edge emits exactly
+  one recovered notice + one `onPoolRecovered` call; **unequal cooldowns** (memA 60s, memB 6h)
+  → monitor wakes ~60s, recovers once though memB still cools; monitor stops on shutdown; a
+  pool removed while exhausted fires no recovered notice
+- [ ] run `go test ./internal/proxy/ -race`; gofumpt; vet — pass before Task 4
+
+### Task 4: Schema + store for per-pool `auth_reset_target`
+
+**Files:**
+- Create: `internal/store/migrations/000007_pool_auth_reset.up.sql`
+- Create: `internal/store/migrations/000007_pool_auth_reset.down.sql`
+- Modify: `internal/store/pools.go`
+- Modify: `internal/store/pools_test.go`
+
+- [ ] up: `ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''`
+- [ ] down: SQLite 12-step rebuild of `credential_pools` only, wrapped
+  `PRAGMA foreign_keys=OFF;` … `=ON;` so the `credential_pool_members` FK isn't orphaned
+- [ ] add `AuthResetTarget` to `Pool`; include in create/list reads; add
+  `SetPoolAuthResetTarget(name, target) error`
+- [ ] write tests: migrate up→down→up against a **populated** table (pool + members + health
+  rows survive/round-trip); default empty; create with target; set/clear target; list reflects it
+- [ ] run `go test ./internal/store/ -race`; gofumpt; vet — pass before Task 5
+
+### Task 5: Channel parity — `auth_reset_target` on CLI + REST + Telegram
+
+**Files:**
+- Modify: `internal/poolops/*.go` (set/clear + create-with-target operation logic)
+- Modify: `cmd/sluice/pool.go` (flag on `pool create`; `pool set-auth-reset`)
+- Modify: `api/openapi.yaml`; then `make generate` → implement new method in `internal/api/server.go`
+- Modify: Telegram `/pool` handler
+- Modify: matching `_test.go` for poolops + each channel adapter
+
+- [ ] add the operation to `internal/poolops` (channel-agnostic) so all channels are thin
+  adapters (CLAUDE.md anti-pattern note)
+- [ ] CLI: `--auth-reset-target` on `pool create`; `pool set-auth-reset <name> <target|->`
+  (set/clear); show target in `pool status`/`pool list`
+- [ ] REST: accept on `POST /api/pools`; add an **action route** `POST
+  /api/pools/{name}/auth-reset-target` (mirrors the existing `/rotate` style, not a bespoke
+  PATCH); edit `api/openapi.yaml`, run `make generate`, implement the generated
+  `ServerInterface` method in `server.go`
+- [ ] Telegram: accept on `/pool create`; add `/pool set-auth-reset <name> <target|->`
+- [ ] write tests: poolops set/clear/create-with-target; one adapter test per channel
+  asserting it routes through poolops (no inline logic)
+- [ ] run `go test ./... -race` for touched packages; `make generate` clean; gofumpt; vet —
+  pass before Task 6
+
+### Task 6: Add optional exec user to the container exec path (prerequisite for hermes reset)
+
+**Files:**
+- Modify: `internal/container/types.go` (`ExecInContainer` signature / `ContainerClient`)
+- Modify: `internal/container/docker.go`, `internal/container/docker_socket.go`
+  (`execCreateRequest.User`)
+- Modify: `internal/container/docker_test.go` (`mockClient`) + any other `ExecInContainer` callers
+- Modify: `internal/container/agent_profile.go` (`AgentProfile.ExecUser() string`)
+
+- [ ] add `User string` to `execCreateRequest`; thread an optional user arg through
+  `ExecInContainer` on `ContainerClient` + `SocketClient` (empty "" = current root behavior)
+- [ ] update `mockClient` and every existing `ExecInContainer` caller (no-op default, no
+  behavior change)
+- [ ] add `ExecUser() string` to `AgentProfile` (hermes `"10000:10000"`, openclaw "")
+- [ ] write tests: socket exec body carries `User` when set and omits/empties it otherwise;
+  profile `ExecUser` values; existing callers unaffected
+- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 7
+
+### Task 7: Profile `ResetAuthCmd` + `ContainerManager.ResetAuth`
+
+**Files:**
+- Modify: `internal/container/agent_profile.go`
+- Modify: `internal/container/types.go`
+- Modify: `internal/container/docker.go`, `apple.go`, `tart.go`, standalone (`none`)
+- Modify: `internal/container/agent_profile_test.go`, `docker_test.go`
+
+- [ ] add `ResetAuthCmd func(target string) []string` to `AgentProfile`; `HermesProfile` →
+  pure-argv `["/opt/hermes/.venv/bin/hermes","auth","reset",target]`;
+  `OpenclawProfile.ResetAuthCmd` nil (documented; Post-Completion verification)
+- [ ] add `ResetAuth(ctx, target string) error` to `ContainerManager`; implement per backend;
+  nil `ResetAuthCmd` → log notice, return nil (mirror `ReloadSecrets`); docker exec passes
+  `profile.ExecUser()` (Task 6) so hermes runs as 10000:10000
+- [ ] validate `target` (non-empty, no NUL, allowlisted charset) before exec
+- [ ] write tests: argv per profile; nil-cmd no-ops with notice; empty/invalid target
+  rejected; **docker exec uses the runtime UID from `ExecUser` (now passable, Task 6)**
+- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 8
+
+### Task 8: Wire auto-reset on the recovery edge (opt-in, per pool)
+
+**Files:**
+- Modify: `cmd/sluice/main.go` (`srv.SetOnPoolRecovered(...)`)
+- Modify: `internal/proxy/pool_failover.go` / audit usage (`agent_auth_reset` action)
+- Modify: relevant `_test.go`
+
+- [ ] in main.go register `SetOnPoolRecovered`: look up the recovered pool's
+  `auth_reset_target`; if non-empty and `containerMgr != nil`, call
+  `containerMgr.ResetAuth(ctx, target)` in a detached goroutine using a fresh
+  `context.WithTimeout(context.Background(), …)` (never block; never reuse a wake-scoped ctx);
+  log + emit `agent_auth_reset`; empty target → no reset (opt-out default)
+- [ ] keep the recovered Telegram notice (Task 3) and the reset independent (notice always;
+  reset only when target set); a `ResetAuth` error is logged, not fatal
+- [ ] write tests: target set → recovery triggers `ResetAuth(target)` once; no target → no
+  reset, notice still emitted; `ResetAuth` error logged not fatal
+- [ ] run `go test ./... -race` for touched packages; gofumpt; vet — pass before Task 9
+
+### Task 9: Verify acceptance criteria
+- [ ] both members exhausted → exactly one "pool exhausted" notice (no flap stream)
+- [ ] B1 cooldown reflects the upstream window (member not re-probed every 60s)
+- [ ] recovery edge → one "pool recovered" notice; auth reset fires only for pools with a
+  target; hermes reset runs as 10000:10000 (no root-chown of auth.json)
+- [ ] channel parity verified: `auth_reset_target` settable+readable via CLI, REST, Telegram
+- [ ] full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`; `gofumpt -l` clean;
+  `golangci-lint run ./...` 0 issues; `make generate` then `git diff --exit-code
+  internal/api/api.gen.go` clean
+- [ ] independently verify committed HEAD builds + tests pass (do not trust subagent green)
+
+### Task 10: [Final] Documentation
+- [ ] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2
+  exhaustion+edge notices (replacing the per-window dedup wording), per-pool
+  `auth_reset_target` + recovery auto-reset, the `agent_auth_reset` audit action, and the
+  `ResetAuthCmd`/`ExecUser` profile hooks in the Agent Profiles table
+- [ ] cross-check the CLI subcommand list + `api/openapi.yaml` (source of generated `api.gen.go`) in CLAUDE.md
+- [ ] move this plan to `docs/plans/completed/`
+
+## Post-Completion
+*Items requiring manual intervention or external systems — informational only*
+
+**Manual verification:**
+- Deploy `sluice:dev` to knuth (build→scp→load loop), set `auth_reset_target = openai-codex`
+  on `openai_pool`, exhaust both Codex accounts, and confirm: one exhausted notice, no flap;
+  after the real window, one recovered notice + hermes auth un-latches and resumes without a
+  manual `hermes auth reset`; `auth.json` stays owned by 10000.
+- **Capture a real OpenAI Codex usage-limit 429** (headers + body) to confirm which recovery
+  hint is present (`Retry-After` vs `x-ratelimit-reset*` vs body) and fix B1's header
+  precedence / `maxCooldown` accordingly. Add the body-hint parser only if the capture proves
+  no usable header exists.
+
+**External / follow-up:**
+- **openclaw latch**: verify whether openclaw latches on usage-limit and, if so, implement
+  `OpenclawProfile.ResetAuthCmd` (gateway RPC, like `ReloadCmd`). hermes-only until confirmed.
+- Selectable pool strategy (position-priority vs sticky) remains the previously-noted
+  follow-up; out of scope here.

From 96611d2f20e300c5751c25853779d37d12ce0b42 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 15:45:25 +0800
Subject: [PATCH 02/19] feat(proxy): derive pool cooldown from upstream
 recovery hints (B1)

---
 ...22-pool-exhaustion-and-agent-auth-reset.md |  19 ++-
 internal/proxy/pool_failover.go               | 146 +++++++++++++++-
 internal/proxy/pool_failover_test.go          | 157 ++++++++++++++++++
 internal/vault/pool.go                        |  15 ++
 4 files changed, 325 insertions(+), 12 deletions(-)

diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index 26087ee..537d421 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -205,18 +205,21 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/proxy/pool_failover.go`
 - Modify: `internal/proxy/pool_failover_test.go`
 
-- [ ] add `maxCooldown = 6*time.Hour` and `minRateLimitFloor = 10*time.Second` near the TTLs
-- [ ] add `cooldownFromResponse(class failoverClass, resp *http.Response) time.Duration`:
+- [x] add `maxCooldown = 6*time.Hour` and `minRateLimitFloor = 10*time.Second` near the TTLs
+  (exported as `MaxCooldown`/`MinRateLimitFloor` so the proxy package can clamp against them,
+  mirroring `RateLimitCooldown`/`AuthFailCooldown`)
+- [x] add `cooldownFromResponse(class failoverClass, header http.Header) time.Duration`
+  (takes `http.Header` directly so it works against the go-mitmproxy Flow's `Response.Header`):
   parse `Retry-After` (delta-seconds + HTTP-date), then `x-ratelimit-reset` /
   `x-ratelimit-reset-requests` / `x-ratelimit-reset-tokens` (delta or epoch); clamp to
-  `[minFloor(class), maxCooldown]` (rate-limit floor `minRateLimitFloor`, auth-failure floor
+  `[minFloor(class), MaxCooldown]` (rate-limit floor `MinRateLimitFloor`, auth-failure floor
   `AuthFailCooldown`); **no hint → class default**. No body parsing in v1.
-- [ ] in `handlePoolFailover` replace the flat `ttl` with `cooldownFromResponse(class, f.Response)`
-- [ ] write tests (generic parsing/clamp, NOT a Codex-specific header winner): Retry-After
+- [x] in `handlePoolFailover` replace the flat `ttl` with `cooldownFromResponse(class, f.Response.Header)`
+- [x] write tests (generic parsing/clamp, NOT a Codex-specific header winner): Retry-After
   seconds; Retry-After HTTP-date; epoch reset header; delta reset header; no headers → class
-  default; absurd value → `maxCooldown`; tiny rate-limit value honored down to
-  `minRateLimitFloor`; auth-failure floored at `AuthFailCooldown`
-- [ ] run `go test ./internal/proxy/ ./internal/vault/ -race`; gofumpt; vet — pass before Task 2
+  default; absurd value → `MaxCooldown`; tiny rate-limit value honored down to
+  `MinRateLimitFloor`; auth-failure floored at `AuthFailCooldown`
+- [x] run `go test ./internal/proxy/ ./internal/vault/ -race`; gofumpt; vet — pass before Task 2
 
 ### Task 2: A1 — exhaustion = no healthy member, collapse dedup key
 
diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go
index 567c124..fd232dd 100644
--- a/internal/proxy/pool_failover.go
+++ b/internal/proxy/pool_failover.go
@@ -4,6 +4,8 @@ import (
 	"bytes"
 	"fmt"
 	"log"
+	"net/http"
+	"strconv"
 	"strings"
 	"time"
 
@@ -130,6 +132,139 @@ func bodyContainsAny(body []byte, subs ...string) bool {
 	return false
 }
 
+// minFloorForClass returns the lower clamp bound for a derived cooldown. A
+// rate-limited member may be honored down to MinRateLimitFloor (a short real
+// recovery window must be respected — the whole point of B1). An auth failure
+// must never be retried in seconds: a revoked/expired token will not self-heal
+// quickly, so the floor stays at AuthFailCooldown regardless of any hint.
+func minFloorForClass(class failoverClass) time.Duration {
+	if class == failoverAuthFailure {
+		return vault.AuthFailCooldown
+	}
+	return vault.MinRateLimitFloor
+}
+
+// classDefaultCooldown is the flat fallback applied when the upstream sends no
+// usable recovery hint header.
+func classDefaultCooldown(class failoverClass) time.Duration {
+	if class == failoverAuthFailure {
+		return vault.AuthFailCooldown
+	}
+	return vault.RateLimitCooldown
+}
+
+// recoveryHintHeaders are the response headers, in precedence order, from
+// which cooldownFromResponse derives the real recovery window. Retry-After is
+// the standard signal; the x-ratelimit-reset* family is provider-specific
+// (OpenAI emits per-resource reset hints). Each value is parsed as either a
+// delta-seconds count, an HTTP-date (Retry-After only), or — for the reset
+// family — a unix epoch.
+var recoveryHintHeaders = []string{
+	"Retry-After",
+	"x-ratelimit-reset",
+	"x-ratelimit-reset-requests",
+	"x-ratelimit-reset-tokens",
+}
+
+// cooldownFromResponse derives the cooldown duration for a failed pool member
+// from the upstream's recovery hint headers, clamped to the class bounds.
+//
+// Precedence: Retry-After (delta-seconds or HTTP-date), then the
+// x-ratelimit-reset* family (delta-seconds or unix epoch). The first header
+// that yields a positive duration wins. With NO usable hint the flat class
+// default is returned (RateLimitCooldown / AuthFailCooldown), so behavior is
+// unchanged from before B1 on responses that carry no hint.
+//
+// The parsed window is clamped to [minFloorForClass(class), MaxCooldown]: a
+// rate-limit hint may shrink the cooldown down to MinRateLimitFloor (honoring
+// a short real window), an auth failure stays floored at AuthFailCooldown, and
+// any absurd/hostile value is capped at MaxCooldown so a member is never
+// parked indefinitely.
+//
+// No body parsing in v1 (deferred until a real Codex 429 is captured — see the
+// plan's Post-Completion note); only headers are consulted.
+//
+// Takes the response header directly (not *http.Response) so it works
+// uniformly against the go-mitmproxy Flow's Response.Header at the call site
+// and a plain http.Header in tests.
+func cooldownFromResponse(class failoverClass, header http.Header) time.Duration {
+	if header == nil {
+		return classDefaultCooldown(class)
+	}
+	now := time.Now()
+	for _, h := range recoveryHintHeaders {
+		raw := strings.TrimSpace(header.Get(h))
+		if raw == "" {
+			continue
+		}
+		d, ok := parseRecoveryHint(h, raw, now)
+		if !ok || d <= 0 {
+			continue
+		}
+		return clampCooldown(class, d)
+	}
+	return classDefaultCooldown(class)
+}
+
+// parseRecoveryHint parses a single recovery-hint header value into a
+// duration relative to now. It tries, in order:
+//
+//   - bare number: delta-seconds from now, or — for any header other than
+//     Retry-After — a unix epoch when the value is >= 1e9 (~2001-09).
+//   - HTTP-date: an absolute time (the Retry-After form, per RFC 9110; it is
+//     attempted for every header); the duration is its distance from now.
+//   - unit-suffixed duration: OpenAI's "1.5s" / "60ms" form, parsed via
+//     time.ParseDuration.
+//
+// Returns ok=false when the value is negative, not in the future, or
+// unparseable. A bare "0" yields (0, true); cooldownFromResponse discards
+// non-positive durations, so callers must still check d > 0.
+func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool) {
+	// Bare numeric: delta-seconds for Retry-After; for the reset family it may
+	// be either delta-seconds OR a unix epoch. Disambiguate by magnitude — a
+	// value large enough to be a plausible epoch (>= ~ year 2001) is treated
+	// as absolute, otherwise as a delta.
+	if secs, err := strconv.ParseFloat(raw, 64); err == nil {
+		if secs < 0 {
+			return 0, false
+		}
+		const epochThreshold = 1_000_000_000 // ~2001-09; below this, treat as delta-seconds
+		if header != "Retry-After" && secs >= epochThreshold {
+			until := time.Unix(int64(secs), 0)
+			if d := until.Sub(now); d > 0 {
+				return d, true
+			}
+			return 0, false
+		}
+		return time.Duration(secs * float64(time.Second)), true
+	}
+	// HTTP-date (Retry-After absolute form).
+	if t, err := http.ParseTime(raw); err == nil {
+		if d := t.Sub(now); d > 0 {
+			return d, true
+		}
+		return 0, false
+	}
+	// Unit-suffixed duration (e.g. OpenAI "1.5s", "60ms").
+	if d, err := time.ParseDuration(raw); err == nil && d > 0 {
+		return d, true
+	}
+	return 0, false
+}
+
+// clampCooldown bounds a derived cooldown to [minFloorForClass(class),
+// MaxCooldown].
+func clampCooldown(class failoverClass, d time.Duration) time.Duration {
+	floor := minFloorForClass(class)
+	if d < floor {
+		return floor
+	}
+	if d > vault.MaxCooldown {
+		return vault.MaxCooldown
+	}
+	return d
+}
+
 // FailoverEvent describes a completed pool failover. It is handed to the
 // optional onFailover callback (store durability write + Telegram notice)
 // configured via SetOnFailover.
@@ -476,10 +611,13 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) {
 		return
 	}
 
-	ttl := vault.RateLimitCooldown
-	if class == failoverAuthFailure {
-		ttl = vault.AuthFailCooldown
-	}
+	// B1: derive the cooldown window from the upstream's recovery hints
+	// (Retry-After / x-ratelimit-reset*) instead of the flat class TTL, so a
+	// quota-exhausted member is parked for the REAL window rather than being
+	// re-probed every RateLimitCooldown (60s). cooldownFromResponse clamps to
+	// the class bounds and falls back to the flat default when no hint header
+	// is present.
+	ttl := cooldownFromResponse(class, f.Response.Header)
 	until := time.Now().Add(ttl)
 	tag := failoverReasonTag(class, f.Response.StatusCode, bodyTag)
 
diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index 652c984..9fb9d60 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -6,6 +6,7 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync/atomic"
 	"testing"
@@ -1180,3 +1181,159 @@ func TestFailoverToManualRotateParkedPeer(t *testing.T) {
 		t.Fatalf("pool_exhausted audit rows = %d, want 0 (a healthy parked peer exists)", n)
 	}
 }
+
+// TestCooldownFromResponse exercises B1's header-derived cooldown parsing and
+// clamping. These assertions are intentionally GENERIC about the parsing/clamp
+// mechanics and do NOT bake in which header OpenAI's usage-limit 429 actually
+// sends — that header winner is resolved by the Post-Completion live capture.
+func TestCooldownFromResponse(t *testing.T) {
+	now := time.Now()
+	resp := func(class failoverClass, set func(h http.Header)) (failoverClass, http.Header) {
+		h := make(http.Header)
+		if set != nil {
+			set(h)
+		}
+		return class, h
+	}
+
+	tests := []struct {
+		name  string
+		class failoverClass
+		setup func(h http.Header)
+		want  time.Duration
+		// approx: when true, want is treated as a target and the result must
+		// be within tolerance (HTTP-date / absolute-epoch cases lose sub-second
+		// precision against a freshly-sampled now inside cooldownFromResponse).
+		approx bool
+	}{
+		{
+			name:  "no headers falls back to rate-limit class default",
+			class: failoverRateLimited,
+			want:  vault.RateLimitCooldown,
+		},
+		{
+			name:  "no headers falls back to auth-failure class default",
+			class: failoverAuthFailure,
+			want:  vault.AuthFailCooldown,
+		},
+		{
+			name:  "Retry-After delta seconds honored",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "120") },
+			want:  120 * time.Second,
+		},
+		{
+			name:   "Retry-After HTTP-date honored",
+			class:  failoverRateLimited,
+			setup:  func(h http.Header) { h.Set("Retry-After", now.Add(90*time.Second).UTC().Format(http.TimeFormat)) },
+			want:   90 * time.Second,
+			approx: true,
+		},
+		{
+			name:  "x-ratelimit-reset unix epoch honored",
+			class: failoverRateLimited,
+			setup: func(h http.Header) {
+				h.Set("x-ratelimit-reset", strconv.FormatInt(now.Add(300*time.Second).Unix(), 10))
+			},
+			want:   300 * time.Second,
+			approx: true,
+		},
+		{
+			name:  "x-ratelimit-reset-requests delta seconds honored",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("x-ratelimit-reset-requests", "45") },
+			want:  45 * time.Second,
+		},
+		{
+			name:  "x-ratelimit-reset-tokens unit-suffixed duration honored",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("x-ratelimit-reset-tokens", "1500ms") },
+			want:  1500 * time.Millisecond,
+			// 1.5s is below MinRateLimitFloor so it clamps up; assert clamp below.
+		},
+		{
+			name:  "absurd value capped at MaxCooldown",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "999999999") },
+			want:  vault.MaxCooldown,
+		},
+		{
+			name:  "tiny rate-limit value floored at MinRateLimitFloor",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "2") },
+			want:  vault.MinRateLimitFloor,
+		},
+		{
+			name:  "auth-failure hint floored at AuthFailCooldown",
+			class: failoverAuthFailure,
+			setup: func(h http.Header) { h.Set("Retry-After", "5") },
+			want:  vault.AuthFailCooldown,
+		},
+		{
+			name:  "auth-failure long hint honored above its floor",
+			class: failoverAuthFailure,
+			setup: func(h http.Header) { h.Set("Retry-After", "1200") },
+			want:  1200 * time.Second,
+		},
+		{
+			name:  "Retry-After precedence over reset family",
+			class: failoverRateLimited,
+			setup: func(h http.Header) {
+				h.Set("Retry-After", "120")
+				h.Set("x-ratelimit-reset", "30")
+			},
+			want: 120 * time.Second,
+		},
+		{
+			name:  "unparseable header ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "soon-ish") },
+			want:  vault.RateLimitCooldown,
+		},
+		{
+			name:  "negative delta ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "-5") },
+			want:  vault.RateLimitCooldown,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			class, h := resp(tt.class, tt.setup)
+			got := cooldownFromResponse(class, h)
+			if tt.approx {
+				const tol = 2 * time.Second
+				diff := got - tt.want
+				if diff < 0 {
+					diff = -diff
+				}
+				if diff > tol {
+					t.Fatalf("cooldownFromResponse = %v, want ~%v (tol %v)", got, tt.want, tol)
+				}
+				return
+			}
+			// The unit-suffixed 1.5s case clamps up to MinRateLimitFloor.
+			if tt.name == "x-ratelimit-reset-tokens unit-suffixed duration honored" {
+				if got != vault.MinRateLimitFloor {
+					t.Fatalf("cooldownFromResponse = %v, want clamp to %v", got, vault.MinRateLimitFloor)
+				}
+				return
+			}
+			if got != tt.want {
+				t.Fatalf("cooldownFromResponse = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+// TestCooldownFromResponseNilSafe verifies the nil-header guard returns the
+// class default rather than panicking.
+func TestCooldownFromResponseNilSafe(t *testing.T) {
+	if got := cooldownFromResponse(failoverRateLimited, nil); got != vault.RateLimitCooldown {
+		t.Fatalf("nil header: got %v, want %v", got, vault.RateLimitCooldown)
+	}
+	if got := cooldownFromResponse(failoverAuthFailure, nil); got != vault.AuthFailCooldown {
+		t.Fatalf("nil header (auth): got %v, want %v", got, vault.AuthFailCooldown)
+	}
+}
diff --git a/internal/vault/pool.go b/internal/vault/pool.go
index d818fc3..93d8ff5 100644
--- a/internal/vault/pool.go
+++ b/internal/vault/pool.go
@@ -18,6 +18,21 @@ const (
 	AuthFailCooldown  = 300 * time.Second
 )
 
+// Cooldown bounds applied when the cooldown window is derived from an
+// upstream recovery hint (Retry-After / rate-limit-reset headers) instead of
+// the flat class default. MaxCooldown caps an absurd or malicious hint so a
+// member can never be parked indefinitely; MinRateLimitFloor is the lower
+// bound for a rate-limited member — deliberately well below RateLimitCooldown
+// so a SHORT real recovery window from the upstream is honored (B1 replaces
+// the flat 60s with the upstream's real window, which may be far longer for
+// an exhausted quota or shorter for a brief throttle). Exported so the proxy package's
+// cooldownFromResponse can clamp against them (mirrors RateLimitCooldown /
+// AuthFailCooldown, which are likewise consumed from proxy).
+const (
+	MaxCooldown       = 6 * time.Hour
+	MinRateLimitFloor = 10 * time.Second
+)
+
 // ManualRotateReason is the cooldown reason stamped by `sluice pool rotate`
 // when it parks the previously-active member. A member parked for this
 // reason is operationally deprioritized BY AN OPERATOR, not unhealthy: it

From cd4fd871db7d382c828ed920bafbad47c3107dc1 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 15:50:44 +0800
Subject: [PATCH 03/19] feat(proxy): detect pool exhaustion via
 no-healthy-member and collapse exhausted notice dedup (A1)

---
 ...22-pool-exhaustion-and-agent-auth-reset.md |  10 +-
 internal/proxy/pool_failover.go               |  36 ++--
 internal/proxy/pool_failover_test.go          |  71 +++++++
 internal/vault/pool.go                        |  74 +++++++
 internal/vault/pool_test.go                   | 183 ++++++++++++++++++
 5 files changed, 357 insertions(+), 17 deletions(-)

diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index 537d421..b24d35e 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -228,18 +228,18 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/proxy/pool_failover.go`
 - Modify: `internal/vault/pool_test.go`, `internal/proxy/pool_failover_test.go`
 
-- [ ] add `HasHealthyMember(pool) bool` (RLock, single `now`, `!cooling()` for some member —
+- [x] add `HasHealthyMember(pool) bool` (RLock, single `now`, `!cooling()` for some member —
   mirror `cooling()`/lazy-expiry, not a status field) and `SoonestCooldown(pool)
   (time.Time, bool)` (strictly-future min vs a fresh `now`)
-- [ ] in `handlePoolFailover` set `exhausted = !pr.HasHealthyMember(pool)` (drop `to == from`);
+- [x] in `handlePoolFailover` set `exhausted = !pr.HasHealthyMember(pool)` (drop `to == from`);
   keep computing `to` for the real-transition notice/audit
-- [ ] when `exhausted`, key the dedup as `shouldEmitPoolNotice(pool, "*", "*", "exhausted")`
+- [x] when `exhausted`, key the dedup as `shouldEmitPoolNotice(pool, "*", "*", "exhausted")`
   so flap direction can't create two keys (real transitions unchanged)
-- [ ] write tests: 2-member pool both cooling → exhausted true; one healthy (incl. a
+- [x] write tests: 2-member pool both cooling → exhausted true; one healthy (incl. a
   lazily-expired cooldown still in the map) → false; `SoonestCooldown` skips already-passed
   entries; exhausted dedup collapses both flap directions to one notice within the window
   (fail-before/pass-after). NOTE: Task 3 re-gates this to an edge — update this assertion there.
-- [ ] run `go test ./internal/vault/ ./internal/proxy/ -race`; gofumpt; vet — pass before Task 3
+- [x] run `go test ./internal/vault/ ./internal/proxy/ -race`; gofumpt; vet — pass before Task 3
 
 ### Task 3: A2 — per-pool exhaustion state machine, edge notices, recovery monitor
 
diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go
index fd232dd..594278a 100644
--- a/internal/proxy/pool_failover.go
+++ b/internal/proxy/pool_failover.go
@@ -651,23 +651,35 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) {
 		to = next
 	}
 
-	// to == from means ResolveActive degraded back to the member that just
-	// failed: every member is cooling and the soonest-recovering one IS
-	// `from`. There is NO distinct member to fail over to. Emitting a
-	// "<from> -> <from>" cred_failover here (and one Telegram notice per
-	// request) was both meaningless and a notification storm — the agent
-	// retries N times, each retry re-fails on the still-exhausted member
-	// and re-entered this path, producing N identical "failed over A -> A"
-	// notices. Classify it honestly as pool exhaustion instead.
-	exhausted := to == from
+	// A1: the pool is exhausted when NO member is healthy, not merely when
+	// ResolveActive degraded back to `from` (to == from). The old `to == from`
+	// test missed the flap case where the soonest-recovering degrade target is
+	// a DIFFERENT still-cooling member: that produced a meaningless
+	// "<from> -> <other>" cred_failover plus one Telegram notice per agent
+	// retry even though there was no healthy account to serve. HasHealthyMember
+	// mirrors ResolveActive's cooling()/lazy-expiry exactly, so it agrees with
+	// the degrade decision made just above. `to` is still computed for the
+	// real-transition notice/audit (the not-exhausted branch).
+	exhausted := !pr.HasHealthyMember(pool)
 
 	// Deduplicate identical signals within a short window. Concurrent
 	// in-flight requests (pipelined agents) and retries that race the
 	// synchronous MarkCooldown above would otherwise each emit one audit
-	// row + one operator notice. One per (pool,from,to,tag) per window is
-	// all the operator needs; the cooldown itself was already applied
+	// row + one operator notice.
+	//
+	// A1: for the exhausted path collapse the dedup key to
+	// (pool, "*", "*", "exhausted") so the flap direction (which degrade
+	// target ResolveActive happened to pick) cannot mint two distinct keys and
+	// let the retry storm through twice. A real from->to transition keeps its
+	// (pool, from, to, tag) key. The cooldown itself was already applied
 	// unconditionally above, so suppressing the notice loses nothing.
-	if !a.shouldEmitPoolNotice(pool, from, to, tag) {
+	emit := false
+	if exhausted {
+		emit = a.shouldEmitPoolNotice(pool, "*", "*", "exhausted")
+	} else {
+		emit = a.shouldEmitPoolNotice(pool, from, to, tag)
+	}
+	if !emit {
 		return
 	}
 
diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index 9fb9d60..8551be8 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -1118,6 +1118,77 @@ func TestFailoverPoolExhaustedNoSelfFailoverSpam(t *testing.T) {
 	}
 }
 
+// TestFailoverExhaustedDedupCollapsesFlapDirection is the A1 dedup-collapse
+// regression. When both members are exhausted, the degrade target ResolveActive
+// picks can differ depending on which member a given failing response is
+// attributed to (the "flap direction"). The OLD per-(pool,from,to,tag) dedup
+// key minted a DISTINCT key per direction, so a retry storm hitting both
+// members produced two notices even though the pool is in one exhausted state.
+//
+// A1 collapses the exhausted path to the single key (pool,"*","*","exhausted")
+// so the direction can't create two keys.
+//
+// Fail-before (per-tuple key): two responses attributed to memA and memB emit
+// two pool_exhausted rows / two onFailover calls. Pass-after (collapsed key):
+// exactly one of each within the dedup window.
+func TestFailoverExhaustedDedupCollapsesFlapDirection(t *testing.T) {
+	dir := t.TempDir()
+	logPath := filepath.Join(dir, "audit.log")
+	logger, err := audit.NewFileLogger(logPath)
+	if err != nil {
+		t.Fatalf("NewFileLogger: %v", err)
+	}
+	t.Cleanup(func() { _ = logger.Close() })
+
+	addon, _, prPtr := setupPoolAddon(t, "memA", "memB")
+	addon.auditLog = logger
+	client := setupAddonConn(addon, "auth.example.com:443")
+
+	// Both members already genuinely failure-cooled (the pool is exhausted).
+	// Unequal future expiries so the degrade target differs by which member
+	// each request is attributed to (the flap direction the old key split on).
+	prPtr.Load().MarkCooldown("memA", time.Now().Add(5*time.Minute), "429")
+	prPtr.Load().MarkCooldown("memB", time.Now().Add(10*time.Minute), "401")
+
+	var calls int32
+	done := make(chan struct{}, 4)
+	addon.SetOnFailover(func(ev FailoverEvent) {
+		if !ev.Exhausted {
+			t.Errorf("FailoverEvent.Exhausted = false, want true (pool exhausted)")
+		}
+		atomic.AddInt32(&calls, 1)
+		done <- struct{}{}
+	})
+
+	// Two failing responses, attributed to DIFFERENT members (flap directions).
+	for _, member := range []string{"memA", "memB"} {
+		f := newPoolRespFlow(client, 429, []byte(`{"error":"rate_limited"}`))
+		addon.flowInjected.Tag(f.Id, member)
+		addon.Response(f)
+	}
+
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("onFailover callback not invoked")
+	}
+	// Give any erroneous second call a moment to surface before asserting.
+	time.Sleep(50 * time.Millisecond)
+	if got := atomic.LoadInt32(&calls); got != 1 {
+		t.Fatalf("onFailover invoked %d times, want exactly 1 (collapsed exhausted dedup key must absorb both flap directions)", got)
+	}
+
+	if err := logger.Close(); err != nil {
+		t.Fatalf("logger close: %v", err)
+	}
+	if n := auditActionCount(t, logPath, "pool_exhausted"); n != 1 {
+		t.Fatalf("pool_exhausted audit rows = %d, want exactly 1 (one collapsed key across both flap directions)", n)
+	}
+	if n := auditActionCount(t, logPath, "cred_failover"); n != 0 {
+		t.Fatalf("cred_failover audit rows = %d, want 0 (exhausted is not a real failover)", n)
+	}
+}
+
 // TestFailoverToManualRotateParkedPeer is the pool-stranding regression that
 // broke the live agent: `sluice pool rotate` parks the previously-active
 // member (reason ManualRotateReason). That member is healthy, just operator
diff --git a/internal/vault/pool.go b/internal/vault/pool.go
index 93d8ff5..75572f2 100644
--- a/internal/vault/pool.go
+++ b/internal/vault/pool.go
@@ -758,6 +758,80 @@ func (pr *PoolResolver) MergeLiveCooldowns(prev *PoolResolver) {
 	}
 }
 
+// HasHealthyMember reports whether the pool has at least one SERVABLE member.
+// It RLock-reads the shared health map once and evaluates every member against
+// a SINGLE freshly-sampled `now`, mirroring ResolveActive's `cooling()`
+// lazy-expiry semantics (a member whose cooldown is untracked, zero, or
+// lazily-expired vs `now` is healthy — this is NOT a status-field check). It
+// is the A1 exhaustion signal: a pool is exhausted iff !HasHealthyMember(pool).
+// A non-pool name or unknown pool returns false (no member, nothing to serve).
+//
+// A member parked for ManualRotateReason is operator-DEPRIORITIZED, not
+// unhealthy: ResolveActive's degrade path explicitly treats it as a
+// "parked-but-healthy" servable target (preferred over a genuinely failed
+// member). So such a member counts as healthy here too — otherwise a 429 on
+// the rotated-to member while the rotated-from peer is operator-parked would
+// be misclassified as pool exhaustion even though the parked peer is a valid
+// failover target. This keeps HasHealthyMember in agreement with the degrade
+// decision the failover path makes from the same health view.
+func (pr *PoolResolver) HasHealthyMember(pool string) bool {
+	if pr == nil {
+		return false
+	}
+	members, isPool := pr.pools[pool]
+	if !isPool || len(members) == 0 {
+		return false
+	}
+	now := time.Now()
+	pr.health.mu.RLock()
+	defer pr.health.mu.RUnlock()
+	for _, m := range members {
+		h, tracked := pr.health.health[m]
+		if !tracked || h.cooldownUntil.IsZero() || !h.cooldownUntil.After(now) {
+			// Not cooling (untracked, zero, or lazily expired vs `now`).
+			return true
+		}
+		if h.reason == ManualRotateReason {
+			// Operator-parked but servable (matches the degrade path).
+			return true
+		}
+	}
+	return false
+}
+
+// SoonestCooldown returns the minimum member cooldown expiry that is STRICTLY
+// GREATER than a freshly-sampled `now` (i.e. only members currently cooling
+// are considered; an already-passed cooldown still in the map is skipped,
+// mirroring `cooling()`'s lazy-expiry). ok is false when no member of the pool
+// is currently cooling (the recovery monitor uses this to decide whether to
+// reschedule a wake). A non-pool name or unknown pool returns ok=false. It
+// RLock-reads the shared health map once against the single `now`.
+func (pr *PoolResolver) SoonestCooldown(pool string) (time.Time, bool) {
+	if pr == nil {
+		return time.Time{}, false
+	}
+	members, isPool := pr.pools[pool]
+	if !isPool || len(members) == 0 {
+		return time.Time{}, false
+	}
+	now := time.Now()
+	pr.health.mu.RLock()
+	defer pr.health.mu.RUnlock()
+	var soonest time.Time
+	found := false
+	for _, m := range members {
+		h, tracked := pr.health.health[m]
+		if !tracked || h.cooldownUntil.IsZero() || !h.cooldownUntil.After(now) {
+			continue // not currently cooling (lazy-expiry skip)
+		}
+		if !found || h.cooldownUntil.Before(soonest) {
+			soonest = h.cooldownUntil
+			found = true
+		}
+	}
+	return soonest, found
+}
+
 // CooldownUntil returns the in-memory cooldown expiry for a credential and
 // whether it is currently cooling down (future expiry). Exposed as an
 // introspection surface for tests and potential future `pool status`
diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go
index 1318223..4060e8b 100644
--- a/internal/vault/pool_test.go
+++ b/internal/vault/pool_test.go
@@ -940,3 +940,186 @@ func TestSetCurrentMembersPrunesStaleActive(t *testing.T) {
 		t.Fatal("Finding 1: SetCurrentMembers did not prune the stale sticky pointer for a dropped/epoch-bumped member")
 	}
 }
+
+// TestHasHealthyMember pins A1's exhaustion signal: a pool is exhausted iff no
+// member is healthy. The check mirrors ResolveActive's cooling()/lazy-expiry
+// semantics exactly (an already-passed cooldown still in the map counts as
+// healthy) and treats a ManualRotateReason park as servable.
+func TestHasHealthyMember(t *testing.T) {
+	now := time.Now()
+	tests := []struct {
+		name  string
+		setup func(pr *PoolResolver)
+		pool  string
+		want  bool
+	}{
+		{
+			name:  "both healthy",
+			setup: func(pr *PoolResolver) {},
+			pool:  "pool",
+			want:  true,
+		},
+		{
+			name: "one cooling one healthy",
+			setup: func(pr *PoolResolver) {
+				pr.MarkCooldown("a", now.Add(60*time.Second), "429")
+			},
+			pool: "pool",
+			want: true,
+		},
+		{
+			name: "both cooling -> exhausted",
+			setup: func(pr *PoolResolver) {
+				pr.MarkCooldown("a", now.Add(60*time.Second), "429")
+				pr.MarkCooldown("b", now.Add(10*time.Minute), "401")
+			},
+			pool: "pool",
+			want: false,
+		},
+		{
+			name: "lazily-expired cooldown still in map counts as healthy",
+			setup: func(pr *PoolResolver) {
+				// b genuinely cooling; a has a stored cooldown already in the
+				// past (lazy expiry) -> a is healthy, so the pool is not
+				// exhausted.
+				pr.MarkCooldown("a", now.Add(-1*time.Second), "429")
+				pr.MarkCooldown("b", now.Add(10*time.Minute), "401")
+			},
+			pool: "pool",
+			want: true,
+		},
+		{
+			name: "manual-rotate park counts as servable",
+			setup: func(pr *PoolResolver) {
+				// a genuinely failed; b operator-parked (deprioritized but
+				// servable) -> the pool is NOT exhausted.
+				pr.MarkCooldown("a", now.Add(60*time.Second), "429")
+				pr.MarkCooldown("b", now.Add(5*time.Minute), ManualRotateReason)
+			},
+			pool: "pool",
+			want: true,
+		},
+		{
+			name:  "unknown pool",
+			setup: func(pr *PoolResolver) {},
+			pool:  "nope",
+			want:  false,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil)
+			tc.setup(pr)
+			if got := pr.HasHealthyMember(tc.pool); got != tc.want {
+				t.Fatalf("HasHealthyMember(%q) = %v, want %v", tc.pool, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestHasHealthyMemberNil pins the nil-receiver guard.
+func TestHasHealthyMemberNil(t *testing.T) {
+	var pr *PoolResolver
+	if pr.HasHealthyMember("pool") {
+		t.Fatal("nil resolver HasHealthyMember = true, want false")
+	}
+}
+
+// TestSoonestCooldown pins that SoonestCooldown returns the minimum
+// STRICTLY-FUTURE member cooldown, skipping already-passed entries (lazy
+// expiry), and reports ok=false when no member is currently cooling.
+func TestSoonestCooldown(t *testing.T) {
+	now := time.Now()
+	tests := []struct {
+		name     string
+		setup    func(pr *PoolResolver, base time.Time)
+		pool     string
+		wantOK   bool
+		wantBase time.Time // approximate expected soonest when wantOK (derived from the outer now)
+	}{
+		{
+			name:   "no member cooling",
+			setup:  func(pr *PoolResolver, base time.Time) {},
+			pool:   "pool",
+			wantOK: false,
+		},
+		{
+			name: "single cooling member",
+			setup: func(pr *PoolResolver, base time.Time) {
+				pr.MarkCooldown("a", base.Add(60*time.Second), "429")
+			},
+			pool:     "pool",
+			wantOK:   true,
+			wantBase: now.Add(60 * time.Second),
+		},
+		{
+			name: "two cooling -> min wins",
+			setup: func(pr *PoolResolver, base time.Time) {
+				pr.MarkCooldown("a", base.Add(10*time.Minute), "401")
+				pr.MarkCooldown("b", base.Add(60*time.Second), "429")
+			},
+			pool:     "pool",
+			wantOK:   true,
+			wantBase: now.Add(60 * time.Second),
+		},
+		{
+			name: "already-passed entry skipped",
+			setup: func(pr *PoolResolver, base time.Time) {
+				// a is in the past (lazy-expired) -> ignored; b is the only
+				// currently-cooling member.
+				pr.MarkCooldown("a", base.Add(-1*time.Second), "429")
+				pr.MarkCooldown("b", base.Add(120*time.Second), "401")
+			},
+			pool:     "pool",
+			wantOK:   true,
+			wantBase: now.Add(120 * time.Second),
+		},
+		{
+			name: "all passed -> not cooling",
+			setup: func(pr *PoolResolver, base time.Time) {
+				pr.MarkCooldown("a", base.Add(-2*time.Second), "429")
+				pr.MarkCooldown("b", base.Add(-1*time.Second), "401")
+			},
+			pool:   "pool",
+			wantOK: false,
+		},
+		{
+			name:   "unknown pool",
+			setup:  func(pr *PoolResolver, base time.Time) {},
+			pool:   "nope",
+			wantOK: false,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil)
+			// Sample base immediately before setup so the "already-passed"
+			// negative offsets are strictly in the past relative to
+			// SoonestCooldown's own, later time.Now() sample.
+			base := time.Now()
+			tc.setup(pr, base)
+			got, ok := pr.SoonestCooldown(tc.pool)
+			if ok != tc.wantOK {
+				t.Fatalf("SoonestCooldown(%q) ok = %v, want %v", tc.pool, ok, tc.wantOK)
+			}
+			if !tc.wantOK {
+				return
+			}
+			// The stored cooldown is exact (MarkCooldown stores `until`
+			// verbatim); SoonestCooldown returns it unchanged, so compare
+			// against the value set in setup within a small tolerance for the
+			// base/now skew.
+			if d := got.Sub(tc.wantBase); d < -2*time.Second || d > 2*time.Second {
+				t.Fatalf("SoonestCooldown(%q) = %v, want ~%v (delta %v)", tc.pool, got, tc.wantBase, d)
+			}
+		})
+	}
+}
+
+// TestSoonestCooldownNil pins the nil-receiver guard.
+func TestSoonestCooldownNil(t *testing.T) {
+	var pr *PoolResolver
+	if _, ok := pr.SoonestCooldown("pool"); ok {
+		t.Fatal("nil resolver SoonestCooldown ok = true, want false")
+	}
+}

From 2c05a5b65e8c87e3288d2e92f64abb7417051a19 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:01:16 +0800
Subject: [PATCH 04/19] feat(proxy): edge-triggered pool exhaustion notices +
 recovery monitor (A2)

---
 ...22-pool-exhaustion-and-agent-auth-reset.md |  38 ++-
 internal/proxy/addon.go                       |  19 ++
 internal/proxy/pool_failover.go               |  30 +-
 internal/proxy/pool_recovery_monitor_test.go  | 302 ++++++++++++++++++
 internal/proxy/server.go                      | 275 +++++++++++++++-
 5 files changed, 641 insertions(+), 23 deletions(-)
 create mode 100644 internal/proxy/pool_recovery_monitor_test.go

diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index b24d35e..2a5dae4 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -249,28 +249,34 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/proxy/pool_failover.go` (entry-edge gating; recovered notice formatter)
 - Modify: `internal/proxy/server_test.go` (+ focused new test file as needed)
 
-- [ ] add `poolExhaustMu`/`poolExhausted`/`recoveryWake`(cap-1)/`monitorStop`+`monitorStopOnce`/
+- [x] add `poolExhaustMu`/`poolExhausted`/`recoveryWake`(cap-1)/`monitorStop`+`monitorStopOnce`/
   `onPoolRecovered` to `Server`; `SetOnPoolRecovered`; document WHY state is on `Server` not
-  `PoolHealth` (survives swaps, must not be pruned on membership change)
-- [ ] launch the monitor once (in `New`/`StartMonitors`); stop it idempotently from BOTH
-  `Close` and `GracefulShutdown` (via `monitorStopOnce`)
-- [ ] gate the "exhausted" notice on the `false→true` edge (one notice on entry; this edge —
+  `PoolHealth` (survives swaps, must not be pruned on membership change). Also added a separate
+  `onPoolRecoveredNotice` callback (notice always fires; auth-reset stays opt-in per Task 8) and
+  a test-shortenable `recoveryMinReschedule` field.
+- [x] launch the monitor once (in `New`); stop it idempotently from BOTH
+  `Close` and `GracefulShutdown` (via `monitorStopOnce`/`stopMonitors`)
+- [x] gate the "exhausted" notice on the `false→true` edge (one notice on entry; this edge —
   not the 30s window — is authoritative for the exhausted notice); record state; wake the
-  monitor with a non-blocking `select { case recoveryWake <- struct{}{}: default: }`
-- [ ] implement the monitor: each wake `Load()` the current resolver (never cache); compute
+  monitor with a non-blocking `select { case recoveryWake <- struct{}{}: default: }`. Wired via
+  addon `onPoolExhausted` hook (`SetOnPoolExhausted`, mirroring `SetOnFailover`); addon-only
+  tests with no Server fall back to the A1 30s-window collapse.
+- [x] implement the monitor: each wake `Load()` the current resolver (never cache); compute
   `sleep = max(SoonestCooldown(pool).Sub(time.Now()), minReschedule)`; on wake, for each
   exhausted pool flip `true→false` when `HasHealthyMember` is true, emit "pool recovered",
   call `onPoolRecovered(pool)`; reschedule while still exhausted; drop state for pools no
   longer present
-- [ ] add a recovered-notice formatter (plain text, sentence style) alongside
-  `FormatFailoverNotice`
-- [ ] update the Task-2 exhausted-notice test to assert EDGE behavior (one notice per
-  healthy→exhausted edge across many failing responses), not 30s-window behavior
-- [ ] write tests: entry edge emits exactly one exhausted notice; recovery edge emits exactly
-  one recovered notice + one `onPoolRecovered` call; **unequal cooldowns** (memA 60s, memB 6h)
-  → monitor wakes ~60s, recovers once though memB still cools; monitor stops on shutdown; a
-  pool removed while exhausted fires no recovered notice
-- [ ] run `go test ./internal/proxy/ -race`; gofumpt; vet — pass before Task 4
+- [x] add a recovered-notice formatter (plain text, sentence style) alongside
+  `FormatFailoverNotice` (`FormatPoolRecoveredNotice`)
+- [x] update the Task-2 exhausted-notice test to assert EDGE behavior (one notice per
+  healthy→exhausted edge across many failing responses), not 30s-window behavior. Added
+  `TestExhaustedNoticeEdgeAcrossManyResponses` (Server-wired edge path); the existing addon-only
+  Task-2 tests retain the 30s-window assertion for the no-Server fallback path.
+- [x] write tests: entry edge emits exactly one exhausted notice; recovery edge emits exactly
+  one recovered notice + one `onPoolRecovered` call; **unequal cooldowns** (memA short, memB 6h)
+  → monitor wakes ~soonest, recovers once though memB still cools; monitor stops on shutdown
+  (double-stop no panic); a pool removed while exhausted fires no recovered notice
+- [x] run `go test ./internal/proxy/ -race`; gofumpt; vet — pass before Task 4
 
 ### Task 4: Schema + store for per-pool `auth_reset_target`
 
diff --git a/internal/proxy/addon.go b/internal/proxy/addon.go
index d067015..1429fa4 100644
--- a/internal/proxy/addon.go
+++ b/internal/proxy/addon.go
@@ -167,6 +167,16 @@ type SluiceAddon struct {
 	// means failover is in-memory only (no durability, no notice).
 	onFailover func(FailoverEvent)
 
+	// onPoolExhausted, when set, is called from handlePoolFailover on every
+	// response classified as pool-exhausting (no healthy member). It performs
+	// the healthy->exhausted EDGE bookkeeping on the long-lived Server and
+	// returns true ONLY on the false->true edge — that edge, not the addon's
+	// 30s notice window, is authoritative for the "pool exhausted" notice (A2).
+	// When this hook is nil (addon constructed without a Server, e.g. unit
+	// tests that exercise the failover path in isolation) the addon falls back
+	// to its own 30s-window dedup so those tests keep their existing behavior.
+	onPoolExhausted func(pool string) bool
+
 	// persistDone is an optional channel signaled when an async OAuth
 	// token persist goroutine completes. Used by tests to avoid
 	// time.Sleep-based synchronization. Nil in production.
@@ -389,6 +399,15 @@ func (a *SluiceAddon) SetOnFailover(fn func(FailoverEvent)) {
 	a.onFailover = fn
 }
 
+// SetOnPoolExhausted configures the healthy->exhausted edge hook. The Server
+// sets this (mirroring SetOnFailover) so handlePoolFailover can flip the
+// per-pool exhaustion state and wake the recovery monitor. The hook returns
+// true only on the edge, gating the one-time "exhausted" notice. Safe to leave
+// unset: the addon then uses its own 30s-window dedup (legacy behavior).
+func (a *SluiceAddon) SetOnPoolExhausted(fn func(pool string) bool) {
+	a.onPoolExhausted = fn
+}
+
 // UpdateOAuthIndex rebuilds the OAuth token URL index from credential
 // metadata. Called on startup and after credential metadata changes
 // (e.g. SIGHUP hot-reload).
diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go
index 594278a..e719d84 100644
--- a/internal/proxy/pool_failover.go
+++ b/internal/proxy/pool_failover.go
@@ -349,6 +349,15 @@ func FormatFailoverNotice(ev FailoverEvent) string {
 		ev.Pool, ev.From, ev.To, reason)
 }
 
+// FormatPoolRecoveredNotice builds the plain-text, single-line operator notice
+// emitted on the exhausted->recovered edge by the recovery monitor. Kept next
+// to FormatFailoverNotice as a pure, directly-testable function. Plain text /
+// sentence style for the same reason (the notice path sends with no parse
+// mode, so markdown/HTML would render literally).
+func FormatPoolRecoveredNotice(pool string) string {
+	return fmt.Sprintf("Pool %q recovered: a healthy account is available again.", pool)
+}
+
 // poolForResponse maps a response's CONNECT destination back to a pooled
 // binding and returns the pool name + the member that was active for this
 // request. Returns ok=false when the destination is not bound to a pool.
@@ -667,15 +676,24 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) {
 	// synchronous MarkCooldown above would otherwise each emit one audit
 	// row + one operator notice.
 	//
-	// A1: for the exhausted path collapse the dedup key to
-	// (pool, "*", "*", "exhausted") so the flap direction (which degrade
-	// target ResolveActive happened to pick) cannot mint two distinct keys and
-	// let the retry storm through twice. A real from->to transition keeps its
-	// (pool, from, to, tag) key. The cooldown itself was already applied
+	// A2: the "exhausted" notice is gated on the healthy->exhausted EDGE, not
+	// the 30s window. onPoolExhausted (set by the Server) flips the per-pool
+	// exhaustion state on the long-lived Server and returns true only on the
+	// false->true edge, so a retry storm against an already-exhausted pool
+	// emits exactly one notice and the edge also wakes the recovery monitor.
+	// When the hook is unset (addon-only unit tests) fall back to the A1
+	// collapsed 30s-window key so those tests keep their existing behavior.
+	//
+	// A real from->to transition is unchanged: it keeps its (pool, from, to,
+	// tag) 30s-window key. The cooldown itself was already applied
 	// unconditionally above, so suppressing the notice loses nothing.
 	emit := false
 	if exhausted {
-		emit = a.shouldEmitPoolNotice(pool, "*", "*", "exhausted")
+		if a.onPoolExhausted != nil {
+			emit = a.onPoolExhausted(pool)
+		} else {
+			emit = a.shouldEmitPoolNotice(pool, "*", "*", "exhausted")
+		}
 	} else {
 		emit = a.shouldEmitPoolNotice(pool, from, to, tag)
 	}
diff --git a/internal/proxy/pool_recovery_monitor_test.go b/internal/proxy/pool_recovery_monitor_test.go
new file mode 100644
index 0000000..4e298e4
--- /dev/null
+++ b/internal/proxy/pool_recovery_monitor_test.go
@@ -0,0 +1,302 @@
+package proxy
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/nemirovsky/sluice/internal/policy"
+	"github.com/nemirovsky/sluice/internal/store"
+	"github.com/nemirovsky/sluice/internal/vault"
+)
+
+// newMonitorTestServer builds a minimal Server (policy-only, no provider) for
+// exercising the A2 recovery monitor in isolation. The monitor goroutine is
+// already running (started in New); the test stores a pool resolver and drives
+// the monitor via markPoolExhausted. recoveryMinReschedule is shortened so a
+// just-expired-but-unhealthy reschedule does not slow the test.
+func newMonitorTestServer(t *testing.T) *Server {
+	t.Helper()
+	eng, err := policy.LoadFromBytes([]byte(`
+[policy]
+default = "deny"
+`))
+	if err != nil {
+		t.Fatalf("policy load: %v", err)
+	}
+	srv, err := New(Config{ListenAddr: "127.0.0.1:0", Policy: eng})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	srv.recoveryMinReschedule = time.Millisecond
+	t.Cleanup(func() { _ = srv.Close() })
+	return srv
+}
+
+// twoMemberPool builds a 2-member failover pool resolver and stores it on the
+// server. Returns the live resolver.
+func twoMemberPool(t *testing.T, srv *Server, name, a, b string) *vault.PoolResolver {
+	t.Helper()
+	pool := store.Pool{Name: name, Strategy: store.PoolStrategyFailover}
+	pool.Members = []store.PoolMember{
+		{Credential: a, Position: 0},
+		{Credential: b, Position: 1},
+	}
+	pr := vault.NewPoolResolver([]store.Pool{pool}, nil)
+	srv.poolResolver.Store(pr)
+	return pr
+}
+
+// waitFor polls cond up to timeout, returning whether it became true. Avoids a
+// fixed Sleep so the monitor tests stay fast and deterministic.
+func waitFor(timeout time.Duration, cond func() bool) bool {
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
+		if cond() {
+			return true
+		}
+		time.Sleep(time.Millisecond)
+	}
+	return cond()
+}
+
+// TestMarkPoolExhaustedEdge asserts the healthy->exhausted edge fires exactly
+// once: the first call returns true, subsequent calls (the agent's retry storm)
+// return false until the pool recovers.
+func TestMarkPoolExhaustedEdge(t *testing.T) {
+	srv := newMonitorTestServer(t)
+	// Only the edge bookkeeping is under test, so the pool must stay exhausted
+	// for the whole test: scanRecovery drops the state when no resolver knows
+	// the pool and recovers it once a member is healthy. Store a resolver whose
+	// members are all cooling for an hour so neither path clears the state.
+	pr := twoMemberPool(t, srv, "p", "a", "b")
+	far := time.Now().Add(time.Hour)
+	pr.MarkCooldown("a", far, "429")
+	pr.MarkCooldown("b", far, "429")
+
+	if !srv.markPoolExhausted("p") {
+		t.Fatal("first markPoolExhausted = false, want true (healthy->exhausted edge)")
+	}
+	for i := 0; i < 5; i++ {
+		if srv.markPoolExhausted("p") {
+			t.Fatalf("markPoolExhausted call %d = true, want false (already exhausted, no new edge)", i+2)
+		}
+	}
+}
+
+// TestExhaustedNoticeEdgeAcrossManyResponses asserts handlePoolFailover emits
+// exactly one "exhausted" notice across many failing responses once the
+// Server's edge gate is wired (A2: the edge, not the 30s window, is
+// authoritative).
+func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) {
+	srv := newMonitorTestServer(t)
+	addon, _, prPtr := setupPoolAddon(t, "memA", "memB")
+	// Wire the addon's exhaustion edge into the server (mirrors setupInjection).
+	addon.SetOnPoolExhausted(srv.markPoolExhausted)
+	// Share the same resolver between the addon and the server so the monitor
+	// observes the cooldowns the failover path applies.
+	srv.poolResolver.Store(prPtr.Load())
+	srv.addon = addon
+	client := setupAddonConn(addon, "auth.example.com:443")
+
+	// Both members already failure-cooled for a long window: every response is
+	// pool-exhausting.
+	prPtr.Load().MarkCooldown("memA", time.Now().Add(time.Hour), "429")
+	prPtr.Load().MarkCooldown("memB", time.Now().Add(time.Hour), "401")
+
+	var notices int32
+	done := make(chan struct{}, 16)
+	addon.SetOnFailover(func(ev FailoverEvent) {
+		if !ev.Exhausted {
+			t.Errorf("FailoverEvent.Exhausted = false, want true")
+		}
+		atomic.AddInt32(&notices, 1)
+		done <- struct{}{}
+	})
+
+	// Ten back-to-back failing responses (the agent retry storm).
+	for i := 0; i < 10; i++ {
+		f := newPoolRespFlow(client, 429, []byte(`{"error":"rate_limited"}`))
+		addon.flowInjected.Tag(f.Id, "memA")
+		addon.Response(f)
+	}
+
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("onFailover not invoked for the exhausted edge")
+	}
+	// Let any erroneous extra notice surface.
+	time.Sleep(50 * time.Millisecond)
+	if got := atomic.LoadInt32(&notices); got != 1 {
+		t.Fatalf("exhausted notices = %d, want exactly 1 (edge-gated, not per-response)", got)
+	}
+}
+
+// TestRecoveryMonitorEdgeNoticeAndCallback asserts that on the
+// exhausted->recovered edge the monitor fires exactly one recovered notice AND
+// exactly one onPoolRecovered call.
+func TestRecoveryMonitorEdgeNoticeAndCallback(t *testing.T) {
+	srv := newMonitorTestServer(t)
+	pr := twoMemberPool(t, srv, "p", "a", "b")
+
+	var notices, resets int32
+	var mu sync.Mutex
+	var recoveredNotice, recoveredCb string
+	srv.SetOnPoolRecoveredNotice(func(pool string) {
+		mu.Lock()
+		recoveredNotice = pool
+		mu.Unlock()
+		atomic.AddInt32(&notices, 1)
+	})
+	srv.SetOnPoolRecovered(func(pool string) {
+		mu.Lock()
+		recoveredCb = pool
+		mu.Unlock()
+		atomic.AddInt32(&resets, 1)
+	})
+
+	// Both members cooling briefly: the pool is exhausted, then recovers.
+	until := time.Now().Add(40 * time.Millisecond)
+	pr.MarkCooldown("a", until, "429")
+	pr.MarkCooldown("b", until, "429")
+
+	if !srv.markPoolExhausted("p") {
+		t.Fatal("markPoolExhausted = false, want true (edge)")
+	}
+
+	if !waitFor(2*time.Second, func() bool { return atomic.LoadInt32(&notices) == 1 }) {
+		t.Fatalf("recovered notices = %d, want 1", atomic.LoadInt32(&notices))
+	}
+	// Let a wrongful second notice surface.
+	time.Sleep(50 * time.Millisecond)
+	if got := atomic.LoadInt32(&notices); got != 1 {
+		t.Fatalf("recovered notices = %d, want exactly 1", got)
+	}
+	if got := atomic.LoadInt32(&resets); got != 1 {
+		t.Fatalf("onPoolRecovered calls = %d, want exactly 1", got)
+	}
+	mu.Lock()
+	gotNotice, gotCb := recoveredNotice, recoveredCb
+	mu.Unlock()
+	if gotNotice != "p" || gotCb != "p" {
+		t.Fatalf("recovered pool = notice:%q cb:%q, want p/p", gotNotice, gotCb)
+	}
+	// State cleared after recovery: a fresh edge can re-fire.
+	if !srv.markPoolExhausted("p") {
+		t.Fatal("post-recovery markPoolExhausted = false, want true (state was cleared)")
+	}
+}
+
+// TestRecoveryMonitorUnequalCooldowns is the B1-shaped case: memA cools for a
+// short window, memB for a far longer one. The monitor must wake at the SOONEST
+// cooldown (~memA) and recover once when memA becomes healthy, even though memB
+// is still cooling. Uses short injected durations (no 60s sleep).
+func TestRecoveryMonitorUnequalCooldowns(t *testing.T) {
+	srv := newMonitorTestServer(t)
+	pr := twoMemberPool(t, srv, "p", "memA", "memB")
+
+	var notices int32
+	srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) })
+
+	// memA: short (recovers soon). memB: 6h (still cooling at recovery).
+	pr.MarkCooldown("memA", time.Now().Add(40*time.Millisecond), "429")
+	pr.MarkCooldown("memB", time.Now().Add(6*time.Hour), "429")
+
+	if !srv.markPoolExhausted("p") {
+		t.Fatal("markPoolExhausted = false, want true")
+	}
+
+	if !waitFor(2*time.Second, func() bool { return atomic.LoadInt32(&notices) == 1 }) {
+		t.Fatalf("recovered notices = %d, want 1 (wake at soonest cooldown, recover on memA)", atomic.LoadInt32(&notices))
+	}
+	time.Sleep(50 * time.Millisecond)
+	if got := atomic.LoadInt32(&notices); got != 1 {
+		t.Fatalf("recovered notices = %d, want exactly 1 (memB still cooling must not re-fire)", got)
+	}
+	// memB must still be cooling (the recovery was driven solely by memA).
+	if _, cooling := pr.CooldownUntil("memB"); !cooling {
+		t.Fatal("memB no longer cooling, expected its 6h cooldown to persist through memA's recovery")
+	}
+}
+
+// TestRecoveryMonitorPoolRemovedFiresNoNotice asserts a pool removed while
+// exhausted has its state dropped and fires no recovered notice.
+func TestRecoveryMonitorPoolRemovedFiresNoNotice(t *testing.T) {
+	srv := newMonitorTestServer(t)
+	pr := twoMemberPool(t, srv, "p", "a", "b")
+	pr.MarkCooldown("a", time.Now().Add(time.Hour), "429")
+	pr.MarkCooldown("b", time.Now().Add(time.Hour), "429")
+
+	var notices, resets int32
+	srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) })
+	srv.SetOnPoolRecovered(func(string) { atomic.AddInt32(&resets, 1) })
+
+	if !srv.markPoolExhausted("p") {
+		t.Fatal("markPoolExhausted = false, want true")
+	}
+
+	// Remove the pool from the resolver (membership change rebuilds the
+	// resolver with no pools).
+	srv.poolResolver.Store(vault.NewPoolResolver(nil, nil))
+	// Wake the monitor so it observes the removal (the non-blocking send below).
+	srv.markPoolExhausted("p") // already-exhausted -> false, but harmless
+	select {
+	case srv.recoveryWake <- struct{}{}:
+	default:
+	}
+
+	// The state entry must be dropped without any recovered notice/reset.
+	if !waitFor(2*time.Second, func() bool {
+		srv.poolExhaustMu.Lock()
+		_, present := srv.poolExhausted["p"]
+		srv.poolExhaustMu.Unlock()
+		return !present
+	}) {
+		t.Fatal("exhaustion state for removed pool was not dropped")
+	}
+	time.Sleep(50 * time.Millisecond)
+	if got := atomic.LoadInt32(&notices); got != 0 {
+		t.Fatalf("recovered notices = %d, want 0 (removed pool fires no notice)", got)
+	}
+	if got := atomic.LoadInt32(&resets); got != 0 {
+		t.Fatalf("onPoolRecovered calls = %d, want 0 (removed pool triggers no reset)", got)
+	}
+}
+
+// TestRecoveryMonitorStopsCleanly asserts the monitor stops on shutdown and that
+// stopping twice (Close after GracefulShutdown, or vice versa) does not panic.
+func TestRecoveryMonitorStopsCleanly(t *testing.T) {
+	eng, err := policy.LoadFromBytes([]byte("[policy]\ndefault = \"deny\"\n"))
+	if err != nil {
+		t.Fatalf("policy load: %v", err)
+	}
+	srv, err := New(Config{ListenAddr: "127.0.0.1:0", Policy: eng})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	// Double-stop must be idempotent (monitorStopOnce). Calling both shutdown
+	// paths must not panic on a double close of monitorStop. (The second close
+	// of the already-closed listener returns a benign "use of closed network
+	// connection" error, which is unrelated to the monitor and ignored here.)
+	if err := srv.GracefulShutdown(time.Second); err != nil {
+		t.Fatalf("GracefulShutdown: %v", err)
+	}
+	_ = srv.Close() // must not panic on the monitorStop double-close
+	// stopMonitors directly a third time also must not panic.
+	srv.stopMonitors()
+
+	// The monitor goroutine must have returned: a markPoolExhausted edge after
+	// stop must not be serviced (no recovered notice ever fires).
+	var notices int32
+	srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) })
+	pr := twoMemberPool(t, srv, "p", "a", "b")
+	pr.MarkCooldown("a", time.Now().Add(20*time.Millisecond), "429")
+	pr.MarkCooldown("b", time.Now().Add(20*time.Millisecond), "429")
+	srv.markPoolExhausted("p")
+	time.Sleep(100 * time.Millisecond)
+	if got := atomic.LoadInt32(&notices); got != 0 {
+		t.Fatalf("recovered notices = %d, want 0 (monitor stopped, must not service wakes)", got)
+	}
+}
diff --git a/internal/proxy/server.go b/internal/proxy/server.go
index 4a2adbd..fa2c3fa 100644
--- a/internal/proxy/server.go
+++ b/internal/proxy/server.go
@@ -92,6 +92,44 @@ type Server struct {
 	serving      atomic.Bool
 	activeConns  sync.WaitGroup
 
+	// Per-pool exhaustion notification state for the A2 edge-triggered
+	// notices + recovery monitor. This deliberately lives on the long-lived
+	// Server, NOT on PoolHealth, for two reasons:
+	//
+	//  1. It is per-PROCESS notification bookkeeping (have we already told the
+	//     operator this pool is exhausted?), not per-resolver-generation health.
+	//     PoolHealth is rebuilt/merged on every resolver pointer swap (SIGHUP,
+	//     the 2s data-version watcher); the exhaustion edge must survive those
+	//     swaps so a swap mid-exhaustion does not re-fire the "exhausted" notice.
+	//  2. PoolHealth entries are pruned on a membership change (MergeLiveCooldowns
+	//     drops cooldowns for credentials no longer in any pool). Exhaustion state
+	//     keyed by POOL must NOT be pruned that way — it is the recovery monitor,
+	//     not membership churn, that clears it (and only when the pool genuinely
+	//     recovers or is removed). Putting it on PoolHealth would wrongly prune it.
+	//
+	// poolExhausted[pool] is true between the healthy->exhausted edge (set in the
+	// addon's handlePoolFailover via onPoolExhausted) and the recovery monitor
+	// observing HasHealthyMember again. recoveryWake (buffered cap-1) is a
+	// non-blocking edge signal that wakes the monitor. monitorStop is closed once
+	// (monitorStopOnce) from BOTH Close and GracefulShutdown to stop the monitor
+	// goroutine idempotently. onPoolRecovered is the optional recovery-edge
+	// callback (auto-reset wiring, Task 8) set via SetOnPoolRecovered.
+	// onPoolRecoveredNotice is the operator-notice callback, fired on every
+	// recovery edge independent of any auth-reset target (wired to the broker
+	// channels in main.go) — kept separate so the notice always fires while the
+	// reset is opt-in (Task 8).
+	poolExhaustMu         sync.Mutex
+	poolExhausted         map[string]bool
+	recoveryWake          chan struct{}
+	monitorStop           chan struct{}
+	monitorStopOnce       sync.Once
+	onPoolRecovered       func(pool string)
+	onPoolRecoveredNotice func(pool string)
+	// recoveryMinReschedule is the floor for the monitor's per-pool sleep so a
+	// just-expired-but-still-unhealthy state can't spin the loop. Defaults to
+	// minRecoveryReschedule; tests may shorten it for determinism.
+	recoveryMinReschedule time.Duration
+
 	// oauthMetasCache holds the latest credential_meta slice the
 	// server saw via UpdateOAuthIndex. Cached so a later
 	// quicProxy initialization (or re-init) can re-apply it.
@@ -615,7 +653,13 @@ func New(cfg Config) (*Server, error) {
 		return nil, fmt.Errorf("listen: %w", err)
 	}
 
-	srv := &Server{listener: ln}
+	srv := &Server{
+		listener:              ln,
+		poolExhausted:         make(map[string]bool),
+		recoveryWake:          make(chan struct{}, 1),
+		monitorStop:           make(chan struct{}),
+		recoveryMinReschedule: minRecoveryReschedule,
+	}
 
 	// Initialize credential injection handlers when a vault provider is
 	// configured. The resolver may be nil at startup (no bindings yet) and
@@ -668,6 +712,12 @@ func New(cfg Config) (*Server, error) {
 		socks5.WithAssociateHandle(srv.handleAssociate),
 	)
 
+	// Launch the pool recovery monitor exactly once for the lifetime of this
+	// Server. It is started here (not in ListenAndServe) so it is running for
+	// any constructed Server; both Close and GracefulShutdown stop it
+	// idempotently via monitorStopOnce.
+	go srv.runRecoveryMonitor()
+
 	return srv, nil
 }
 
@@ -718,6 +768,13 @@ func (s *Server) setupInjection(cfg Config, _ net.Listener) error {
 		addonOpts = append(addonOpts, WithAuditLogger(cfg.Audit))
 	}
 	s.addon = NewSluiceAddon(addonOpts...)
+	// Wire the addon's exhaustion-edge hook back into the Server's per-pool
+	// notification state. The addon owns the failover classification but the
+	// edge bookkeeping (and the recovery monitor it wakes) live on the
+	// long-lived Server (see the poolExhausted field doc). markPoolExhausted
+	// returns true only on the healthy->exhausted edge, so the addon emits the
+	// "exhausted" notice exactly once per edge.
+	s.addon.SetOnPoolExhausted(s.markPoolExhausted)
 
 	// Load credential metadata once and stash it; we cannot mirror it
 	// into the QUIC proxy yet because that proxy is initialized later
@@ -2833,6 +2890,220 @@ func (s *Server) SetOnFailover(fn func(FailoverEvent)) {
 	}
 }
 
+// SetOnPoolRecovered configures the callback the recovery monitor invokes once,
+// on the exhausted->recovered edge, after it has emitted the "pool recovered"
+// notice. Task 8 wires this in main.go to look up the recovered pool's
+// auth_reset_target and, when set, run the agent's auth-reset command in a
+// detached goroutine. Safe to leave unset (recovery is then notice-only).
+func (s *Server) SetOnPoolRecovered(fn func(pool string)) {
+	s.poolExhaustMu.Lock()
+	s.onPoolRecovered = fn
+	s.poolExhaustMu.Unlock()
+}
+
+// SetOnPoolRecoveredNotice configures the operator-notice callback the recovery
+// monitor fires on every exhausted->recovered edge, independent of whether the
+// pool has an auth-reset target. main.go wires this to fan the
+// FormatPoolRecoveredNotice text out across the broker channels (mirroring the
+// onFailover notice path). Kept separate from onPoolRecovered so the notice
+// always fires while the auth-reset stays opt-in.
+func (s *Server) SetOnPoolRecoveredNotice(fn func(pool string)) {
+	s.poolExhaustMu.Lock()
+	s.onPoolRecoveredNotice = fn
+	s.poolExhaustMu.Unlock()
+}
+
+// minRecoveryReschedule floors the recovery monitor's wake interval so a pool
+// whose soonest cooldown has just expired but which still reports no healthy
+// member (no HasHealthyMember yet) cannot spin the monitor in a tight loop.
+const minRecoveryReschedule = 1 * time.Second
+
+// markPoolExhausted records the healthy->exhausted edge for a pool and returns
+// true ONLY on that edge (false->true). It is the addon's onPoolExhausted hook:
+// the addon calls it on every exhausted classification, but the "exhausted"
+// operator notice is emitted only when this returns true, so a retry storm
+// against an already-exhausted pool produces exactly one notice. On the edge it
+// also wakes the recovery monitor with a non-blocking buffered send.
+//
+// This edge — not the addon's 30s window — is authoritative for the exhausted
+// notice (A2). The window dedup remains only for real from->to transitions.
+func (s *Server) markPoolExhausted(pool string) bool {
+	s.poolExhaustMu.Lock()
+	if s.poolExhausted == nil {
+		s.poolExhausted = make(map[string]bool)
+	}
+	already := s.poolExhausted[pool]
+	if already {
+		s.poolExhaustMu.Unlock()
+		return false
+	}
+	s.poolExhausted[pool] = true
+	s.poolExhaustMu.Unlock()
+
+	// Wake the monitor. Non-blocking: the channel is cap-1, so a wake already
+	// pending coalesces with this one (the monitor re-Loads the resolver and
+	// scans every exhausted pool on each wake regardless of which pool woke it).
+	select {
+	case s.recoveryWake <- struct{}{}:
+	default:
+	}
+	return true
+}
+
+// stopMonitors stops the recovery monitor goroutine idempotently. Closing
+// monitorStop is guarded by monitorStopOnce so calling it from BOTH Close and
+// GracefulShutdown (or twice) cannot panic on a double close.
+func (s *Server) stopMonitors() {
+	s.monitorStopOnce.Do(func() {
+		close(s.monitorStop)
+	})
+}
+
+// runRecoveryMonitor is the A2 recovery monitor. It is server-driven
+// (time-based) because a latched agent will not emit a recovering 2xx on its
+// own — nothing on the traffic path signals recovery, so sluice must poll the
+// in-memory cooldowns itself.
+//
+// On every wake (an exhaustion edge from markPoolExhausted, or its own
+// reschedule timer) it Load()s the CURRENT pool resolver — never caching it
+// across wakes, so a resolver pointer swap (SIGHUP / data-version watcher) is
+// observed immediately. For each pool currently marked exhausted it:
+//
+//   - drops the state entry if the pool no longer exists in the resolver (a
+//     removed pool fires no recovered notice);
+//   - on HasHealthyMember becoming true, flips true->false, emits the "pool
+//     recovered" notice, and invokes onPoolRecovered(pool) once;
+//   - otherwise computes the next wake as max(SoonestCooldown-now,
+//     recoveryMinReschedule) and keeps the soonest such wake across all still-
+//     exhausted pools.
+//
+// A single timer is reused across iterations; the loop selects on the
+// reschedule timer, the wake channel, and monitorStop.
+func (s *Server) runRecoveryMonitor() {
+	timer := time.NewTimer(time.Hour)
+	if !timer.Stop() {
+		<-timer.C
+	}
+	timerArmed := false
+
+	for {
+		next, anyExhausted := s.scanRecovery()
+		if !timer.Stop() && timerArmed {
+			// Drain a fired-but-unread timer so the reset below is clean.
+			select {
+			case <-timer.C:
+			default:
+			}
+		}
+		timerArmed = false
+		if anyExhausted {
+			if next < s.recoveryMinReschedule {
+				next = s.recoveryMinReschedule
+			}
+			timer.Reset(next)
+			timerArmed = true
+		}
+
+		select {
+		case <-s.monitorStop:
+			if timerArmed {
+				timer.Stop()
+			}
+			return
+		case <-s.recoveryWake:
+			// New exhaustion edge (or coalesced edges): re-scan immediately.
+		case <-timer.C:
+			timerArmed = false
+			// Reschedule fired: re-scan to check for recovery.
+		}
+	}
+}
+
+// scanRecovery performs one pass over the pools currently marked exhausted. It
+// returns the soonest reschedule interval among the pools still exhausted (the
+// caller applies the recoveryMinReschedule floor) and whether any pool remains
+// exhausted, i.e. whether to arm the timer. It Load()s the current resolver fresh.
+func (s *Server) scanRecovery() (next time.Duration, anyExhausted bool) {
+	pr := s.poolResolver.Load()
+
+	// Snapshot the exhausted pool names under the lock, then evaluate health
+	// outside it (HasHealthyMember/SoonestCooldown take their own RLock on the
+	// shared PoolHealth and must not be called while holding poolExhaustMu).
+	s.poolExhaustMu.Lock()
+	pools := make([]string, 0, len(s.poolExhausted))
+	for p, ex := range s.poolExhausted {
+		if ex {
+			pools = append(pools, p)
+		}
+	}
+	s.poolExhaustMu.Unlock()
+
+	now := time.Now()
+	for _, pool := range pools {
+		// A removed pool: drop the state, fire no recovered notice.
+		if pr == nil || !pr.IsPool(pool) {
+			s.clearPoolExhausted(pool)
+			continue
+		}
+		if pr.HasHealthyMember(pool) {
+			s.recoverPool(pool)
+			continue
+		}
+		// Still exhausted: schedule the next wake at the soonest cooldown.
+		if until, ok := pr.SoonestCooldown(pool); ok {
+			d := until.Sub(now)
+			if !anyExhausted || d < next {
+				next = d
+			}
+			anyExhausted = true
+		} else {
+			// No member is currently cooling yet HasHealthyMember is false
+			// (e.g. an empty/raced pool). Reschedule at the floor so we
+			// re-evaluate rather than block forever on the wake channel.
+			if !anyExhausted || s.recoveryMinReschedule < next {
+				next = s.recoveryMinReschedule
+			}
+			anyExhausted = true
+		}
+	}
+	return next, anyExhausted
+}
+
+// clearPoolExhausted drops a pool's exhaustion state without firing a recovered
+// notice (used when the pool was removed while exhausted).
+func (s *Server) clearPoolExhausted(pool string) {
+	s.poolExhaustMu.Lock()
+	delete(s.poolExhausted, pool)
+	s.poolExhaustMu.Unlock()
+}
+
+// recoverPool fires the exhausted->recovered edge for a pool: it deletes the
+// pool's state entry (returning early if a concurrent pass already did so),
+// emits the "pool recovered" notice, and invokes onPoolRecovered once. The
+// delete and callback snapshot happen under poolExhaustMu so two concurrent
+// passes cannot both recover the same pool.
+func (s *Server) recoverPool(pool string) {
+	s.poolExhaustMu.Lock()
+	if !s.poolExhausted[pool] {
+		s.poolExhaustMu.Unlock()
+		return
+	}
+	delete(s.poolExhausted, pool)
+	cb := s.onPoolRecovered
+	noticeCb := s.onPoolRecoveredNotice
+	s.poolExhaustMu.Unlock()
+
+	log.Printf("[POOL-FAILOVER] %s", FormatPoolRecoveredNotice(pool))
+	// Notice always fires on the recovery edge (independent of auth-reset).
+	if noticeCb != nil {
+		noticeCb(pool)
+	}
+	// Auth-reset (opt-in, only when a target is configured) is wired here.
+	if cb != nil {
+		cb(pool)
+	}
+}
+
 // EnginePtr returns the shared atomic engine pointer. The Telegram command
 // handler uses this to read and mutate the same engine as the proxy, avoiding
 // split-brain windows during SIGHUP reloads.
@@ -2875,6 +3146,7 @@ func (s *Server) IsListening() bool {
 // Close stops the server by closing the listener and any internal resources.
 func (s *Server) Close() error {
 	s.closed.Store(true)
+	s.stopMonitors()
 	if s.mitmProxy != nil {
 		_ = s.mitmProxy.Close()
 	}
@@ -2890,6 +3162,7 @@ func (s *Server) Close() error {
 // the timeout, or an error if the timeout was exceeded.
 func (s *Server) GracefulShutdown(timeout time.Duration) error {
 	s.closed.Store(true)
+	s.stopMonitors()
 	// Stop accepting new connections.
 	_ = s.listener.Close()
 	if s.mitmProxy != nil {

From 0a6745387954896ebab7806d84aabfbf48a2159d Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:07:28 +0800
Subject: [PATCH 05/19] feat(store): add per-pool auth_reset_target column +
 accessor (migration 000008)

---
 ...22-pool-exhaustion-and-agent-auth-reset.md |  15 +-
 .../000008_pool_auth_reset.down.sql           |  41 ++++
 .../migrations/000008_pool_auth_reset.up.sql  |  15 ++
 internal/store/pools.go                       |  46 ++++-
 internal/store/pools_test.go                  | 193 ++++++++++++++++++
 5 files changed, 295 insertions(+), 15 deletions(-)
 create mode 100644 internal/store/migrations/000008_pool_auth_reset.down.sql
 create mode 100644 internal/store/migrations/000008_pool_auth_reset.up.sql

diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index 2a5dae4..b7e9d47 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -286,14 +286,17 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/store/pools.go`
 - Modify: `internal/store/pools_test.go`
 
-- [ ] up: `ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''`
-- [ ] down: SQLite 12-step rebuild of `credential_pools` only, wrapped
-  `PRAGMA foreign_keys=OFF;` … `=ON;` so the `credential_pool_members` FK isn't orphaned
-- [ ] add `AuthResetTarget` to `Pool`; include in create/list reads; add
+- [x] up: `ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''`
+  (migration `000008_pool_auth_reset` — `000007` was already taken by `pool_membership_epoch`)
+- [x] down: SQLite 12-step rebuild of `credential_pools` only; because golang-migrate runs each
+  script inside a transaction and SQLite ignores `PRAGMA foreign_keys=OFF` while a transaction is
+  open, the FK-referencing `credential_pool_members` rows are snapshotted to a temp table and
+  restored after the rebuild instead (cascade-safe, FK preserved)
+- [x] add `AuthResetTarget` to `Pool`; include in create/list reads; add
   `SetPoolAuthResetTarget(name, target) error`
-- [ ] write tests: migrate up→down→up against a **populated** table (pool + members + health
+- [x] write tests: migrate up→down→up against a **populated** table (pool + members + health
   rows survive/round-trip); default empty; create with target; set/clear target; list reflects it
-- [ ] run `go test ./internal/store/ -race`; gofumpt; vet — pass before Task 5
+- [x] run `go test ./internal/store/ -race`; gofumpt; vet — pass before Task 5
 
 ### Task 5: Channel parity — `auth_reset_target` on CLI + REST + Telegram
 
diff --git a/internal/store/migrations/000008_pool_auth_reset.down.sql b/internal/store/migrations/000008_pool_auth_reset.down.sql
new file mode 100644
index 0000000..353ceaa
--- /dev/null
+++ b/internal/store/migrations/000008_pool_auth_reset.down.sql
@@ -0,0 +1,41 @@
+-- Revert the per-pool auth_reset_target column.
+--
+-- This is the SQLite 12-step table rebuild
+-- (https://www.sqlite.org/lang_altertable.html#otheralter) for
+-- credential_pools: recreate it WITHOUT auth_reset_target, copy the data, drop
+-- the old table, rename. The column set + constraints recreated here match
+-- credential_pools as created by migration 000006 (name PK, strategy CHECK,
+-- created_at default).
+--
+-- credential_pool_members.pool has a FK -> credential_pools(name) with ON
+-- DELETE CASCADE. golang-migrate runs each migration inside a transaction, and
+-- SQLite ignores `PRAGMA foreign_keys=OFF` while a transaction is open, so the
+-- pragma trick used outside migrations cannot disable the cascade here.
+-- Dropping credential_pools would therefore CASCADE-delete every member row.
+--
+-- To keep the FK-referencing member rows (and their epoch column from 000007)
+-- intact across the rebuild, the member rows are snapshotted into a temp table
+-- before the parent table is dropped and restored afterward, once the rebuilt
+-- credential_pools rows exist again to satisfy the FK. credential_health is
+-- not FK-tied to credential_pools, so it is untouched.
+
+CREATE TEMP TABLE _cpm_backup AS
+    SELECT pool, credential, position, epoch FROM credential_pool_members;
+
+CREATE TABLE credential_pools_new (
+    name TEXT PRIMARY KEY,
+    strategy TEXT NOT NULL DEFAULT 'failover' CHECK(strategy IN ('failover')),
+    created_at TEXT NOT NULL DEFAULT (datetime('now'))
+);
+
+INSERT INTO credential_pools_new (name, strategy, created_at)
+    SELECT name, strategy, created_at FROM credential_pools;
+
+DROP TABLE credential_pools;
+
+ALTER TABLE credential_pools_new RENAME TO credential_pools;
+
+INSERT INTO credential_pool_members (pool, credential, position, epoch)
+    SELECT pool, credential, position, epoch FROM _cpm_backup;
+
+DROP TABLE _cpm_backup;
diff --git a/internal/store/migrations/000008_pool_auth_reset.up.sql b/internal/store/migrations/000008_pool_auth_reset.up.sql
new file mode 100644
index 0000000..a126b59
--- /dev/null
+++ b/internal/store/migrations/000008_pool_auth_reset.up.sql
@@ -0,0 +1,15 @@
+-- Per-pool agent auth-reset target.
+--
+-- auth_reset_target is an opt-in, per-pool string naming the agent auth
+-- target sluice resets when the pool transitions exhausted -> recovered (e.g.
+-- the hermes "openai-codex" auth entry). Empty (the default) = opt-out: no
+-- reset runs. Non-empty = sluice invokes the agent profile's auth-reset
+-- command on the recovery edge so a latched agent un-latches and resumes
+-- without a manual operator reset.
+--
+-- A plain ADD COLUMN with a NOT NULL DEFAULT '' is sufficient for SQLite; the
+-- down migration does the 12-step table rebuild because ALTER TABLE ... DROP
+-- COLUMN is unavailable in older SQLite engines (pre-3.35.0), and the column
+-- must be removed while preserving the credential_pool_members FK -> credential_pools.
+
+ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT '';
diff --git a/internal/store/pools.go b/internal/store/pools.go
index 2ee18e2..e71a927 100644
--- a/internal/store/pools.go
+++ b/internal/store/pools.go
@@ -56,10 +56,14 @@ var (
 // Pool is a named group of OAuth credentials backing a single phantom
 // identity. Members are returned ordered by position (failover order).
 type Pool struct {
-	Name      string
-	Strategy  string
-	CreatedAt string
-	Members   []PoolMember
+	Name     string
+	Strategy string
+	// AuthResetTarget is the opt-in, per-pool agent auth-reset target sluice
+	// resets on the exhausted -> recovered edge (e.g. the hermes
+	// "openai-codex" auth entry). Empty (the default) means no reset runs.
+	AuthResetTarget string
+	CreatedAt       string
+	Members         []PoolMember
 }
 
 // PoolMember is one credential entry in a pool. Position determines the
@@ -249,7 +253,7 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e
 	}
 
 	if _, err := tx.Exec(
-		"INSERT INTO credential_pools (name, strategy) VALUES (?, ?)", name, strategy,
+		"INSERT INTO credential_pools (name, strategy, auth_reset_target) VALUES (?, ?, '')", name, strategy,
 	); err != nil {
 		return fmt.Errorf("insert pool %q: %w", name, err)
 	}
@@ -284,13 +288,37 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e
 	return nil
 }
 
+// SetPoolAuthResetTarget sets (or clears, when target is "") the per-pool
+// auth_reset_target. An empty pool name returns an error wrapping
+// ErrPoolNoMembers. A missing pool (no row updated) returns a plain
+// "does not exist" error rather than a wrapped sql.ErrNoRows, so every
+// management channel can surface a "no such pool" message. The target is
+// stored verbatim; charset/validation is the channel/ops layer's concern (the
+// stored string is consumed as argv, never shell-interpolated).
+func (s *Store) SetPoolAuthResetTarget(name, target string) error {
+	if name == "" {
+		return fmt.Errorf("%w: pool name is required", ErrPoolNoMembers)
+	}
+	res, err := s.db.Exec(
+		"UPDATE credential_pools SET auth_reset_target = ? WHERE name = ?", target, name,
+	)
+	if err != nil {
+		return fmt.Errorf("set auth_reset_target for pool %q: %w", name, err)
+	}
+	n, _ := res.RowsAffected()
+	if n == 0 {
+		return fmt.Errorf("pool %q does not exist", name)
+	}
+	return nil
+}
+
 // GetPool returns a pool by name with members ordered by position, or nil if
 // the pool does not exist.
 func (s *Store) GetPool(name string) (*Pool, error) {
 	var p Pool
 	err := s.db.QueryRow(
-		"SELECT name, strategy, created_at FROM credential_pools WHERE name = ?", name,
-	).Scan(&p.Name, &p.Strategy, &p.CreatedAt)
+		"SELECT name, strategy, auth_reset_target, created_at FROM credential_pools WHERE name = ?", name,
+	).Scan(&p.Name, &p.Strategy, &p.AuthResetTarget, &p.CreatedAt)
 	if errors.Is(err, sql.ErrNoRows) {
 		return nil, nil
 	}
@@ -320,7 +348,7 @@ func (s *Store) GetPool(name string) (*Pool, error) {
 
 // ListPools returns all pools with their members ordered by position.
 func (s *Store) ListPools() ([]Pool, error) {
-	rows, err := s.db.Query("SELECT name, strategy, created_at FROM credential_pools ORDER BY name")
+	rows, err := s.db.Query("SELECT name, strategy, auth_reset_target, created_at FROM credential_pools ORDER BY name")
 	if err != nil {
 		return nil, fmt.Errorf("list pools: %w", err)
 	}
@@ -328,7 +356,7 @@ func (s *Store) ListPools() ([]Pool, error) {
 	pools := make(map[string]*Pool)
 	for rows.Next() {
 		var p Pool
-		if err := rows.Scan(&p.Name, &p.Strategy, &p.CreatedAt); err != nil {
+		if err := rows.Scan(&p.Name, &p.Strategy, &p.AuthResetTarget, &p.CreatedAt); err != nil {
 			_ = rows.Close()
 			return nil, fmt.Errorf("scan pool: %w", err)
 		}
diff --git a/internal/store/pools_test.go b/internal/store/pools_test.go
index 3016170..cdf2291 100644
--- a/internal/store/pools_test.go
+++ b/internal/store/pools_test.go
@@ -682,6 +682,15 @@ func TestMigration000006DownUp(t *testing.T) {
 		t.Fatalf("migrator: %v", err)
 	}
 
+	// Migrations newer than 000007 (e.g. 000008_pool_auth_reset) may sit on top
+	// of the schema after New(). This test exercises the 000006/000007 down/up
+	// boundary with relative Steps, so pin the starting version to 000007
+	// first; otherwise a relative Steps(-1) would only undo the newest
+	// migration instead of 000007.
+	if err := m.Migrate(7); err != nil && !errors.Is(err, migrate.ErrNoChange) {
+		t.Fatalf("pin to 000007: %v", err)
+	}
+
 	columnExists := func(table, col string) bool {
 		rows, qerr := s.db.Query("PRAGMA table_info(" + table + ")")
 		if qerr != nil {
@@ -1557,3 +1566,187 @@ func TestRemoveCredentialMetaCASNoOpLeavesHealthIntact(t *testing.T) {
 		t.Error("credential_health wrongly deleted by a CAS no-op (round-11 invariant regressed)")
 	}
 }
+
+// TestMigration000008DownUpPopulated exercises the auth_reset_target column
+// migration (000008) up -> down -> up against a POPULATED schema: a pool with
+// two members and a credential_health row must survive the down (which
+// rebuilds credential_pools while preserving the credential_pool_members FK)
+// and the re-up round-trip. The down migration snapshots the member rows to a
+// temp table and restores them after the rebuild, so they must NOT be lost.
+func TestMigration000008DownUpPopulated(t *testing.T) {
+	dir := t.TempDir()
+	dbPath := filepath.Join(dir, "m.db")
+	s, err := New(dbPath)
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	defer func() { _ = s.Close() }()
+
+	// Populate: oauth members, a pool with a non-empty auth_reset_target, and a
+	// credential_health row for one member.
+	seedOAuthCred(t, s, "acct_a")
+	seedOAuthCred(t, s, "acct_b")
+	if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); err != nil {
+		t.Fatalf("CreatePoolWithMembers: %v", err)
+	}
+	if err := s.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
+		t.Fatalf("SetPoolAuthResetTarget: %v", err)
+	}
+	if err := s.SetCredentialHealth("acct_a", "cooldown", time.Now().Add(time.Hour), "429"); err != nil {
+		t.Fatalf("SetCredentialHealth: %v", err)
+	}
+
+	columnExists := func(table, col string) bool {
+		rows, qerr := s.db.Query("PRAGMA table_info(" + table + ")")
+		if qerr != nil {
+			return false
+		}
+		defer func() { _ = rows.Close() }()
+		for rows.Next() {
+			var cid int
+			var name, ctype string
+			var notnull, pk int
+			var dflt interface{}
+			if scanErr := rows.Scan(&cid, &name, &ctype, &notnull, &dflt, &pk); scanErr != nil {
+				return false
+			}
+			if name == col {
+				return true
+			}
+		}
+		return false
+	}
+
+	if !columnExists("credential_pools", "auth_reset_target") {
+		t.Fatal("credential_pools.auth_reset_target missing after up migration (000008)")
+	}
+
+	src, err := iofs.New(migrationsFS, "migrations")
+	if err != nil {
+		t.Fatalf("iofs: %v", err)
+	}
+	drv, err := migsqlite.WithInstance(s.db, &migsqlite.Config{})
+	if err != nil {
+		t.Fatalf("driver: %v", err)
+	}
+	m, err := migrate.NewWithInstance("iofs", src, "sqlite", drv)
+	if err != nil {
+		t.Fatalf("migrator: %v", err)
+	}
+
+	// Down one step (000008 -> 000007): the column goes; the rebuilt
+	// credential_pools keeps its row; the FK-referencing member rows and the
+	// health row survive (the down script snapshots and restores the member
+	// rows, so the DROP TABLE cascade does not lose them).
+	if err := m.Steps(-1); err != nil {
+		t.Fatalf("down 1 (000008): %v", err)
+	}
+	if columnExists("credential_pools", "auth_reset_target") {
+		t.Error("credential_pools.auth_reset_target still present after 000008 down")
+	}
+	var poolCount, memberCount, healthCount int
+	if err := s.db.QueryRow("SELECT COUNT(*) FROM credential_pools").Scan(&poolCount); err != nil {
+		t.Fatalf("count pools: %v", err)
+	}
+	if poolCount != 1 {
+		t.Errorf("pool row lost in 000008 down rebuild: got %d, want 1", poolCount)
+	}
+	if err := s.db.QueryRow("SELECT COUNT(*) FROM credential_pool_members WHERE pool = 'codex'").Scan(&memberCount); err != nil {
+		t.Fatalf("count members: %v", err)
+	}
+	if memberCount != 2 {
+		t.Errorf("member rows cascade-wiped by 000008 down rebuild: got %d, want 2", memberCount)
+	}
+	if err := s.db.QueryRow("SELECT COUNT(*) FROM credential_health WHERE credential = 'acct_a'").Scan(&healthCount); err != nil {
+		t.Fatalf("count health: %v", err)
+	}
+	if healthCount != 1 {
+		t.Errorf("health row lost in 000008 down rebuild: got %d, want 1", healthCount)
+	}
+
+	// Re-up (000007 -> 000008): the column returns, defaulting to '' for the
+	// existing row (the target value is not preserved across a down, which is
+	// expected — the column was dropped). All populated rows still present.
+	if err := m.Steps(1); err != nil {
+		t.Fatalf("up 1 (re-000008): %v", err)
+	}
+	if !columnExists("credential_pools", "auth_reset_target") {
+		t.Fatal("credential_pools.auth_reset_target missing after re-up migration (000008)")
+	}
+	p, err := s.GetPool("codex")
+	if err != nil || p == nil {
+		t.Fatalf("GetPool after re-up: %+v, %v", p, err)
+	}
+	if p.AuthResetTarget != "" {
+		t.Errorf("auth_reset_target = %q after re-up; want '' (column dropped on down)", p.AuthResetTarget)
+	}
+	if len(p.Members) != 2 {
+		t.Errorf("members not preserved across down/up: got %d, want 2", len(p.Members))
+	}
+}
+
+// TestPoolAuthResetTargetCRUD covers the store accessor: default empty on
+// create, create-then-set, clear, and that list/get reflect the value.
+func TestPoolAuthResetTargetCRUD(t *testing.T) {
+	s := newTestStore(t)
+	seedOAuthCred(t, s, "acct_a")
+	seedOAuthCred(t, s, "acct_b")
+
+	if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}); err != nil {
+		t.Fatalf("CreatePoolWithMembers: %v", err)
+	}
+
+	// Default empty on create.
+	p, err := s.GetPool("codex")
+	if err != nil || p == nil {
+		t.Fatalf("GetPool: %+v, %v", p, err)
+	}
+	if p.AuthResetTarget != "" {
+		t.Errorf("auth_reset_target = %q on fresh pool; want empty", p.AuthResetTarget)
+	}
+
+	// Set a target.
+	if err := s.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
+		t.Fatalf("SetPoolAuthResetTarget set: %v", err)
+	}
+	p, _ = s.GetPool("codex")
+	if p.AuthResetTarget != "openai-codex" {
+		t.Errorf("auth_reset_target = %q after set; want openai-codex", p.AuthResetTarget)
+	}
+
+	// List reflects the value.
+	pools, err := s.ListPools()
+	if err != nil {
+		t.Fatalf("ListPools: %v", err)
+	}
+	var found bool
+	for _, lp := range pools {
+		if lp.Name == "codex" {
+			found = true
+			if lp.AuthResetTarget != "openai-codex" {
+				t.Errorf("ListPools auth_reset_target = %q; want openai-codex", lp.AuthResetTarget)
+			}
+		}
+	}
+	if !found {
+		t.Fatal("pool codex not found in ListPools")
+	}
+
+	// Clear it.
+	if err := s.SetPoolAuthResetTarget("codex", ""); err != nil {
+		t.Fatalf("SetPoolAuthResetTarget clear: %v", err)
+	}
+	p, _ = s.GetPool("codex")
+	if p.AuthResetTarget != "" {
+		t.Errorf("auth_reset_target = %q after clear; want empty", p.AuthResetTarget)
+	}
+
+	// Unknown pool errors.
+	if err := s.SetPoolAuthResetTarget("nope", "x"); err == nil {
+		t.Error("SetPoolAuthResetTarget on a missing pool should error")
+	}
+	// Empty pool name errors.
+	if err := s.SetPoolAuthResetTarget("", "x"); err == nil {
+		t.Error("SetPoolAuthResetTarget with empty name should error")
+	}
+}

From eefe85df68007e513d30bcaee9b4348bf7a39375 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:18:30 +0800
Subject: [PATCH 06/19] feat(pools): expose auth_reset_target on CLI, REST, and
 Telegram

---
 api/openapi.yaml                              |  63 +++++
 cmd/sluice/pool.go                            |  57 ++++-
 cmd/sluice/pool_test.go                       |  91 +++++++
 ...22-pool-exhaustion-and-agent-auth-reset.md |  29 ++-
 internal/api/api.gen.go                       | 231 +++++++++++-------
 internal/api/server.go                        |  51 +++-
 internal/api/server_test.go                   | 102 ++++++++
 internal/poolops/poolops.go                   |  78 +++++-
 internal/poolops/poolops_test.go              | 102 ++++++++
 internal/telegram/commands.go                 |  47 +++-
 internal/telegram/commands_test.go            |  60 +++++
 11 files changed, 800 insertions(+), 111 deletions(-)

diff --git a/api/openapi.yaml b/api/openapi.yaml
index 6ea9098..7437d37 100644
--- a/api/openapi.yaml
+++ b/api/openapi.yaml
@@ -652,6 +652,45 @@ paths:
               schema:
                 $ref: "#/components/schemas/ErrorResponse"
 
+  /api/pools/{name}/auth-reset-target:
+    post:
+      operationId: postApiPoolsNameAuthResetTarget
+      summary: >-
+        Set or clear the per-pool agent auth-reset target run on the
+        exhausted->recovered edge
+      tags: [pools]
+      parameters:
+        - name: name
+          in: path
+          required: true
+          schema:
+            type: string
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SetPoolAuthResetTargetRequest"
+      responses:
+        "200":
+          description: Updated pool
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Pool"
+        "400":
+          description: Invalid target
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "404":
+          description: Pool not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+
   /api/audit/recent:
     get:
       operationId: getApiAuditRecent
@@ -1181,6 +1220,11 @@ components:
         strategy:
           type: string
           description: "Pool strategy (only 'failover' is supported)"
+        auth_reset_target:
+          type: string
+          description: >-
+            Agent auth-reset target run on the exhausted->recovered edge
+            (empty = no reset)
         created_at:
           type: string
           format: date-time
@@ -1207,12 +1251,26 @@ components:
         strategy:
           type: string
           description: "Pool strategy; defaults to 'failover' when omitted"
+        auth_reset_target:
+          type: string
+          description: >-
+            Optional agent auth-reset target run on the exhausted->recovered
+            edge (empty/omitted = no reset)
         members:
           type: array
           description: "Ordered member credential names (failover order)"
           items:
             type: string
 
+    SetPoolAuthResetTargetRequest:
+      type: object
+      required: [auth_reset_target]
+      properties:
+        auth_reset_target:
+          type: string
+          description: >-
+            Agent auth-reset target; an empty string clears it (no reset)
+
     PoolStatus:
       type: object
       required: [name, strategy, active, members]
@@ -1224,6 +1282,11 @@ components:
         active:
           type: string
           description: "Currently active member credential name"
+        auth_reset_target:
+          type: string
+          description: >-
+            Agent auth-reset target run on the exhausted->recovered edge
+            (empty = no reset)
         members:
           type: array
           items:
diff --git a/cmd/sluice/pool.go b/cmd/sluice/pool.go
index 924856b..3418393 100644
--- a/cmd/sluice/pool.go
+++ b/cmd/sluice/pool.go
@@ -13,7 +13,7 @@ import (
 
 func handlePoolCommand(args []string) error {
 	if len(args) == 0 {
-		return fmt.Errorf("usage: sluice pool [create|list|status|rotate|remove]")
+		return fmt.Errorf("usage: sluice pool [create|list|status|rotate|set-auth-reset|remove]")
 	}
 
 	switch args[0] {
@@ -25,10 +25,12 @@ func handlePoolCommand(args []string) error {
 		return handlePoolStatus(args[1:])
 	case "rotate":
 		return handlePoolRotate(args[1:])
+	case "set-auth-reset":
+		return handlePoolSetAuthReset(args[1:])
 	case "remove":
 		return handlePoolRemove(args[1:])
 	default:
-		return fmt.Errorf("unknown pool command: %s (usage: sluice pool [create|list|status|rotate|remove] ...)", args[0])
+		return fmt.Errorf("unknown pool command: %s (usage: sluice pool [create|list|status|rotate|set-auth-reset|remove] ...)", args[0])
 	}
 }
 
@@ -37,12 +39,13 @@ func handlePoolCreate(args []string) error {
 	dbPath := fs.String("db", "data/sluice.db", "path to SQLite database")
 	membersStr := fs.String("members", "", "comma-separated ordered list of oauth credential names (failover order)")
 	strategy := fs.String("strategy", store.PoolStrategyFailover, "pool strategy (only 'failover' is supported)")
+	authResetTarget := fs.String("auth-reset-target", "", "agent auth-reset target run on the exhausted->recovered edge (e.g. openai-codex); empty = no reset")
 	if err := fs.Parse(reorderFlagsBeforePositional(args, fs)); err != nil {
 		return err
 	}
 
 	if fs.NArg() == 0 {
-		return fmt.Errorf("usage: sluice pool create <name> --members a,b[,c] [--strategy failover]")
+		return fmt.Errorf("usage: sluice pool create <name> --members a,b[,c] [--strategy failover] [--auth-reset-target <target>]")
 	}
 	name := fs.Arg(0)
 
@@ -60,7 +63,7 @@ func handlePoolCreate(args []string) error {
 	}
 	defer func() { _ = db.Close() }()
 
-	if err := poolops.Create(db, name, *strategy, members); err != nil {
+	if err := poolops.CreateWithAuthResetTarget(db, name, *strategy, members, *authResetTarget); err != nil {
 		return err
 	}
 
@@ -68,6 +71,9 @@ func handlePoolCreate(args []string) error {
 	for i, m := range members {
 		fmt.Printf("  [%d] %s\n", i, m)
 	}
+	if *authResetTarget != "" {
+		fmt.Printf("auth-reset target: %s\n", *authResetTarget)
+	}
 	fmt.Printf("bind it with: sluice binding add %s --destination <host> [--ports 443]\n", name)
 	return nil
 }
@@ -99,6 +105,9 @@ func handlePoolList(args []string) error {
 			names = append(names, m.Credential)
 		}
 		fmt.Printf("%s (strategy: %s): %s\n", p.Name, p.Strategy, strings.Join(names, ", "))
+		if p.AuthResetTarget != "" {
+			fmt.Printf("  auth-reset target: %s\n", p.AuthResetTarget)
+		}
 	}
 	return nil
 }
@@ -151,6 +160,46 @@ func handlePoolStatus(args []string) error {
 		fmt.Printf("%s[%d] %s  %s\n", marker, m.Position, m.Credential, status)
 	}
 	fmt.Printf("active: %s\n", res.Active)
+	if res.AuthResetTarget != "" {
+		fmt.Printf("auth-reset target: %s\n", res.AuthResetTarget)
+	}
+	return nil
+}
+
+func handlePoolSetAuthReset(args []string) error {
+	fs := flag.NewFlagSet("pool set-auth-reset", flag.ContinueOnError)
+	dbPath := fs.String("db", "data/sluice.db", "path to SQLite database")
+	if err := fs.Parse(reorderFlagsBeforePositional(args, fs)); err != nil {
+		return err
+	}
+	if fs.NArg() < 2 {
+		return fmt.Errorf("usage: sluice pool set-auth-reset <name> <target|->  (a single - clears the target)")
+	}
+	name := fs.Arg(0)
+	target := fs.Arg(1)
+	// A single "-" is the channel-uniform clear sentinel.
+	if target == "-" {
+		target = ""
+	}
+
+	db, err := store.New(*dbPath)
+	if err != nil {
+		return fmt.Errorf("open store: %w", err)
+	}
+	defer func() { _ = db.Close() }()
+
+	if err := poolops.SetAuthResetTarget(db, name, target); err != nil {
+		var nf *poolops.PoolNotFoundError
+		if errors.As(err, &nf) {
+			return fmt.Errorf("pool %q not found", name)
+		}
+		return err
+	}
+	if target == "" {
+		fmt.Printf("pool %q auth-reset target cleared\n", name)
+	} else {
+		fmt.Printf("pool %q auth-reset target set to %q\n", name, target)
+	}
 	return nil
 }
 
diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go
index 6625e56..f3189c9 100644
--- a/cmd/sluice/pool_test.go
+++ b/cmd/sluice/pool_test.go
@@ -95,6 +95,97 @@ func TestHandlePoolCreateListStatusRemove(t *testing.T) {
 	}
 }
 
+// TestHandlePoolAuthResetTarget exercises the CLI adapter for the per-pool
+// auth_reset_target: --auth-reset-target on create, the set-auth-reset
+// subcommand (set and clear via "-"), and that the value reaches the store
+// and is surfaced in list/status output. The CLI is a thin poolops adapter,
+// so this asserts the value round-trips through the store the same way the
+// REST and Telegram adapters do.
+func TestHandlePoolAuthResetTarget(t *testing.T) {
+	dir := t.TempDir()
+	dbPath := setupVaultDB(t, dir)
+	seedPoolCred(t, dbPath, dir, "acct_a")
+	seedPoolCred(t, dbPath, dir, "acct_b")
+
+	out := captureStdout(t, func() {
+		if err := handlePoolCommand([]string{
+			"create", "--db", dbPath, "--members", "acct_a,acct_b",
+			"--auth-reset-target", "openai-codex", "codex",
+		}); err != nil {
+			t.Fatalf("pool create: %v", err)
+		}
+	})
+	if !strings.Contains(out, "auth-reset target: openai-codex") {
+		t.Errorf("create output missing target: %q", out)
+	}
+
+	// Value reached the store.
+	assertStoredAuthResetTarget(t, dbPath, "codex", "openai-codex")
+
+	// Surfaced in list and status.
+	out = captureStdout(t, func() {
+		_ = handlePoolCommand([]string{"list", "--db", dbPath})
+	})
+	if !strings.Contains(out, "auth-reset target: openai-codex") {
+		t.Errorf("list output missing target: %q", out)
+	}
+	out = captureStdout(t, func() {
+		_ = handlePoolCommand([]string{"status", "--db", dbPath, "codex"})
+	})
+	if !strings.Contains(out, "auth-reset target: openai-codex") {
+		t.Errorf("status output missing target: %q", out)
+	}
+
+	// set-auth-reset to a new value.
+	out = captureStdout(t, func() {
+		if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", "other-target"}); err != nil {
+			t.Fatalf("set-auth-reset: %v", err)
+		}
+	})
+	if !strings.Contains(out, "set to \"other-target\"") {
+		t.Errorf("set-auth-reset output = %q", out)
+	}
+	assertStoredAuthResetTarget(t, dbPath, "codex", "other-target")
+
+	// Clear with the "-" sentinel.
+	out = captureStdout(t, func() {
+		if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", "-"}); err != nil {
+			t.Fatalf("set-auth-reset clear: %v", err)
+		}
+	})
+	if !strings.Contains(out, "cleared") {
+		t.Errorf("clear output = %q", out)
+	}
+	assertStoredAuthResetTarget(t, dbPath, "codex", "")
+
+	// Unknown pool and bad usage.
+	if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "missing", "x"}); err == nil {
+		t.Error("expected error for set-auth-reset on missing pool")
+	}
+	if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex"}); err == nil {
+		t.Error("expected usage error for set-auth-reset with too few args")
+	}
+}
+
+func assertStoredAuthResetTarget(t *testing.T, dbPath, pool, want string) {
+	t.Helper()
+	db, err := store.New(dbPath)
+	if err != nil {
+		t.Fatalf("open db: %v", err)
+	}
+	defer func() { _ = db.Close() }()
+	p, err := db.GetPool(pool)
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if p == nil {
+		t.Fatalf("pool %q not found", pool)
+	}
+	if p.AuthResetTarget != want {
+		t.Fatalf("stored AuthResetTarget = %q, want %q", p.AuthResetTarget, want)
+	}
+}
+
 func TestHandlePoolErrorPaths(t *testing.T) {
 	dir := t.TempDir()
 	dbPath := setupVaultDB(t, dir)
diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index b7e9d47..5197770 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -307,19 +307,22 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: Telegram `/pool` handler
 - Modify: matching `_test.go` for poolops + each channel adapter
 
-- [ ] add the operation to `internal/poolops` (channel-agnostic) so all channels are thin
-  adapters (CLAUDE.md anti-pattern note)
-- [ ] CLI: `--auth-reset-target` on `pool create`; `pool set-auth-reset <name> <target|->`
-  (set/clear); show target in `pool status`/`pool list`
-- [ ] REST: accept on `POST /api/pools`; add an **action route** `POST
-  /api/pools/{name}/auth-reset-target` (mirrors the existing `/rotate` style, not a bespoke
-  PATCH); edit `api/openapi.yaml`, run `make generate`, implement the generated
-  `ServerInterface` method in `server.go`
-- [ ] Telegram: accept on `/pool create`; add `/pool set-auth-reset <name> <target|->`
-- [ ] write tests: poolops set/clear/create-with-target; one adapter test per channel
-  asserting it routes through poolops (no inline logic)
-- [ ] run `go test ./... -race` for touched packages; `make generate` clean; gofumpt; vet —
-  pass before Task 6
+- [x] add the operation to `internal/poolops` (channel-agnostic) so all channels are thin
+  adapters (CLAUDE.md anti-pattern note) — `CreateWithAuthResetTarget`, `SetAuthResetTarget`,
+  `validateAuthResetTarget`/`ErrInvalidAuthResetTarget`, target on `StatusResult`
+- [x] CLI: `--auth-reset-target` on `pool create`; `pool set-auth-reset <name> <target|->`
+  (set/clear, `-` clears); show target in `pool status`/`pool list`
+- [x] REST: accept `auth_reset_target` on `POST /api/pools`; action route `POST
+  /api/pools/{name}/auth-reset-target` (mirrors `/rotate`, not a PATCH); edited
+  `api/openapi.yaml`, ran `make generate`, implemented `PostApiPoolsNameAuthResetTarget` in
+  `server.go`
+- [x] Telegram: accept target as the optional 3rd `/pool create` arg; added `/pool
+  set-auth-reset <name> <target|->`
+- [x] write tests: poolops set/clear/create-with-target unit tests; one adapter test per
+  channel (CLI, REST, Telegram) asserting it routes through poolops and the value reaches the
+  store
+- [x] run `go test ./... -race` for touched packages; `make generate` clean (byte-stable,
+  raw output committed); gofumpt; vet — pass before Task 6
 
 ### Task 6: Add optional exec user to the container exec path (prerequisite for hermes reset)
 
diff --git a/internal/api/api.gen.go b/internal/api/api.gen.go
index 5477fa9..e88e9f8 100644
--- a/internal/api/api.gen.go
+++ b/internal/api/api.gen.go
@@ -432,6 +432,9 @@ type CreateMCPUpstreamRequest struct {
 
 // CreatePoolRequest defines model for CreatePoolRequest.
 type CreatePoolRequest struct {
+	// AuthResetTarget Optional agent auth-reset target run on the exhausted->recovered edge (empty/omitted = no reset)
+	AuthResetTarget *string `json:"auth_reset_target,omitempty"`
+
 	// Members Ordered member credential names (failover order)
 	Members []string `json:"members"`
 	Name    string   `json:"name"`
@@ -529,9 +532,11 @@ type MCPUpstream struct {
 
 // Pool defines model for Pool.
 type Pool struct {
-	CreatedAt *time.Time   `json:"created_at,omitempty"`
-	Members   []PoolMember `json:"members"`
-	Name      string       `json:"name"`
+	// AuthResetTarget Agent auth-reset target run on the exhausted->recovered edge (empty = no reset)
+	AuthResetTarget *string      `json:"auth_reset_target,omitempty"`
+	CreatedAt       *time.Time   `json:"created_at,omitempty"`
+	Members         []PoolMember `json:"members"`
+	Name            string       `json:"name"`
 
 	// Strategy Pool strategy (only 'failover' is supported)
 	Strategy string `json:"strategy"`
@@ -582,10 +587,13 @@ type PoolRotateResult struct {
 // PoolStatus defines model for PoolStatus.
 type PoolStatus struct {
 	// Active Currently active member credential name
-	Active   string             `json:"active"`
-	Members  []PoolMemberStatus `json:"members"`
-	Name     string             `json:"name"`
-	Strategy string             `json:"strategy"`
+	Active string `json:"active"`
+
+	// AuthResetTarget Agent auth-reset target run on the exhausted->recovered edge (empty = no reset)
+	AuthResetTarget *string            `json:"auth_reset_target,omitempty"`
+	Members         []PoolMemberStatus `json:"members"`
+	Name            string             `json:"name"`
+	Strategy        string             `json:"strategy"`
 }
 
 // ResolveRequest defines model for ResolveRequest.
@@ -620,6 +628,12 @@ type Rule struct {
 // RuleVerdict defines model for Rule.Verdict.
 type RuleVerdict string
 
+// SetPoolAuthResetTargetRequest defines model for SetPoolAuthResetTargetRequest.
+type SetPoolAuthResetTargetRequest struct {
+	// AuthResetTarget Agent auth-reset target; an empty string clears it (no reset)
+	AuthResetTarget string `json:"auth_reset_target"`
+}
+
 // StatusResponse defines model for StatusResponse.
 type StatusResponse struct {
 	Channels         []ChannelStatus `json:"channels"`
@@ -685,6 +699,9 @@ type PostApiMcpUpstreamsJSONRequestBody = CreateMCPUpstreamRequest
 // PostApiPoolsJSONRequestBody defines body for PostApiPools for application/json ContentType.
 type PostApiPoolsJSONRequestBody = CreatePoolRequest
 
+// PostApiPoolsNameAuthResetTargetJSONRequestBody defines body for PostApiPoolsNameAuthResetTarget for application/json ContentType.
+type PostApiPoolsNameAuthResetTargetJSONRequestBody = SetPoolAuthResetTargetRequest
+
 // PostApiRulesJSONRequestBody defines body for PostApiRules for application/json ContentType.
 type PostApiRulesJSONRequestBody = CreateRuleRequest
 
@@ -762,6 +779,9 @@ type ServerInterface interface {
 	// Pool status (active member + per-member health)
 	// (GET /api/pools/{name})
 	GetApiPoolsName(w http.ResponseWriter, r *http.Request, name string)
+	// Set or clear the per-pool agent auth-reset target run on the exhausted->recovered edge
+	// (POST /api/pools/{name}/auth-reset-target)
+	PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string)
 	// Operator override — advance the active pool member
 	// (POST /api/pools/{name}/rotate)
 	PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request, name string)
@@ -930,6 +950,12 @@ func (_ Unimplemented) GetApiPoolsName(w http.ResponseWriter, r *http.Request, n
 	w.WriteHeader(http.StatusNotImplemented)
 }
 
+// Set or clear the per-pool agent auth-reset target run on the exhausted->recovered edge
+// (POST /api/pools/{name}/auth-reset-target)
+func (_ Unimplemented) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) {
+	w.WriteHeader(http.StatusNotImplemented)
+}
+
 // Operator override — advance the active pool member
 // (POST /api/pools/{name}/rotate)
 func (_ Unimplemented) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request, name string) {
@@ -1559,6 +1585,37 @@ func (siw *ServerInterfaceWrapper) GetApiPoolsName(w http.ResponseWriter, r *htt
 	handler.ServeHTTP(w, r)
 }
 
+// PostApiPoolsNameAuthResetTarget operation middleware
+func (siw *ServerInterfaceWrapper) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request) {
+
+	var err error
+
+	// ------------- Path parameter "name" -------------
+	var name string
+
+	err = runtime.BindStyledParameterWithOptions("simple", "name", chi.URLParam(r, "name"), &name, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true, Type: "string", Format: ""})
+	if err != nil {
+		siw.ErrorHandlerFunc(w, r, &InvalidParamFormatError{ParamName: "name", Err: err})
+		return
+	}
+
+	ctx := r.Context()
+
+	ctx = context.WithValue(ctx, BearerAuthScopes, []string{})
+
+	r = r.WithContext(ctx)
+
+	handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		siw.Handler.PostApiPoolsNameAuthResetTarget(w, r, name)
+	}))
+
+	for _, middleware := range siw.HandlerMiddlewares {
+		handler = middleware(handler)
+	}
+
+	handler.ServeHTTP(w, r)
+}
+
 // PostApiPoolsNameRotate operation middleware
 func (siw *ServerInterfaceWrapper) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request) {
 
@@ -1938,6 +1995,9 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl
 	r.Group(func(r chi.Router) {
 		r.Get(options.BaseURL+"/api/pools/{name}", wrapper.GetApiPoolsName)
 	})
+	r.Group(func(r chi.Router) {
+		r.Post(options.BaseURL+"/api/pools/{name}/auth-reset-target", wrapper.PostApiPoolsNameAuthResetTarget)
+	})
 	r.Group(func(r chi.Router) {
 		r.Post(options.BaseURL+"/api/pools/{name}/rotate", wrapper.PostApiPoolsNameRotate)
 	})
@@ -1969,83 +2029,86 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl
 // Base64 encoded, gzipped, json marshaled Swagger object
 var swaggerSpec = []string{
 
-	"H4sIAAAAAAAC/+Rd73LbuHZ/FQzbmWvPVSTt3dx21veT47g3niYbj+O0H6KMBJNHFq5JgAuActSMZ/oQ",
-	"fcI+SQcHIAlSICkltpztftpYxJ+D8/d3Dg+4X6NYZLngwLWKTr5GKl5BRvGfp3kuxZqmV/BbAUqbn3Ip",
-	"cpCaAQ6IJVANyZzis6WQmflXlFANLzTLIBpFepNDdBIpLRm/jR5GUQJKM041E9xM2nrOkuDPuZDae8C4",
-	"hluQ0cPDKJLwW8EkJNHJJzO7uYWbOfJp/VyRJW7+AbE2658WCdPnXMvN9jFp3EltLCEBrhlN54WCMOlD",
-	"J+462ijKJaznK6pWwXkSqOpY0jBfaZrlu8tFC5EG11qDTFisA89avK83rSeFWP2K8cTMfxR1qgXwTbwH",
-	"vp6vqTTPElCxZLkdGp3zNZOCZ8A1WVPJ6E0KhNMMyFJIkq8o1yIjWtwBJ4ybo1lt29phBTQB2aPq1TkZ",
-	"1//ysl7C1wMhrXEyDZkK64r7hUpJN1Z3hBaxSIPzPLG3pmnI8pRqGBZ3wNQ8YfQI/mOeuA2aLL+k0kwl",
-	"BT5HPlNyY+eMyakmKVClieBAlgzShGSF0uQGiAJNGCd6BTMura8iNyLZjIjQK5D3TIF5SBTINUgiQReS",
-	"K/JyOiWvaEKceyNHGdXxivHbGTejz95eHI/Je55u7HaK6BXVhEoguQRlFMP8OxMJWzJIxuQDaI2zFx5X",
-	"FkQLQgkX/AVkud6QNU0LIDRVwh1UIW05NXydcVpo8cIZAqFpKu6JLFIgR5re3kJC7plekYXjyguaJCez",
-	"Yjr9Oa5Zj3/Dggg54wvzc9+oY6IE7s/hnnhkE6YIF5oIaVTdnO5iadliCbVEragiNwCcSMjEGhKSUV7Q",
-	"NN2McE1Hpb/ujDNFlGZpKeaEUJ4QSu6p5GYsUyQVeNIjLsiSpukNje/sdkzNuGPN8ZhcUqXMDMqJ5Szq",
-	"MKrNAg3GsIAsKjtYkDgFKhUeIwvMt3puF7BWuxiRRWkQixEu5xzGYsbtanhQ1I8xQcU2SwQUwIodNSgW",
-	"acoSUDOOwqQctbTiluBWWY2vqeXlae0v5EzwZcpiPZ5hgGs40R/a4/0wjmzbN0lzsLeM3wXjf0HT7kAM",
-	"X3KITeDqHJEyDnNeZDcNvnSBGH90e/VRg5qQlz1bUc4hfZzwCtyogY9rboRIgfK94pf95WsEvMgQKUAK",
-	"t5Jm0ShaaZ17x6h3voeblRB3cwWxBL2trm/enZ4R+xAVVLFbdCBuHsnpJhU0UVHP2oVMtxf+T7fAx6u3",
-	"uPCb6+tLElumBlYLRUUcUjOvR0ofNNWF2pZVL9/3Z2cbqe1MXx2r96BvW3RDEtjBQI3PYwHQmMCSFqme",
-	"ezC15AuGTwQpfGMMR90FNc3ovii0ITfskda4QcLCPs0+NZkSC7u97tN0cffJzxQkCd2Dw2heyte0DjtK",
-	"1ciM3IC+NxiA+tHKBnUvFo3JdYXADEozuyqSFHnKYrPgibeiHwGP6kVH/nrHM26QCMnohgiD0eALQ3AY",
-	"w5hcQW4B1OX7D9fKhc7ByPl7TiV+yAxhr+TAKtZZNaiz6EDjGJSaI7+2mf3+tDCYCsc4nh6VNJH7FXBi",
-	"djYwU9BCr453KFE0N7hYmnxjZBF8XFoD97G6UX6ny+GY+piKQhjXFsLTWzMxFlxTxhE67KNBZp9HUy0J",
-	"Swlq1S8kN6iUksCnxszNae2PUmhr7cGKRbcijiKcHw7w17g08CQXjGuM83urSBmCnafGZ1Sz2KbETX9Z",
-	"+kQzZ0xe2xlGOYmdM8ZQbN17tQpuHXTtmEkETrVqpAsOGdm040hpYQ4HPJab3KZPV+WJETzhtpbCQYiD",
-	"mtJtwu/OLj/mSkugWbcNy9s9vU4ssozypMvz4qJJwqwKXTY261q7JrxT9wehgZaUq7KA1xJI+QjZekKU",
-	"TpggR05jjkfEYDVMKu/hRon4DvRurK+Z0S2ESyG6PWgGJrVQAbuUCRiVsAN8dTIbK3K0pCwVa5BEmJHG",
-	"MnaXYCeTlZZUw+0mUBESIiXl47+RxDOdP5Wk/MmarMiY1pDsysGSBd0cvCpS6OTgEBbodqZUa5D8uWO4",
-	"hDylMZhI0+E9h4vBPYjUbJDQRvm3Qxx9deKzBh5rCqDbYvfz+wgdjQdEf4uWOuDve5y78bPoR8v1jvfx",
-	"7Ht4WbdpVyH1V7h3jh9rqNxiY4Osa4sek3+r/X79sxoZdDPjC5y/sKNsuG4PIgsfiS0Q9ZQxPN3M+KIB",
-	"AhY2AagoWVSCWphIixVVuYZkPOMzfnl6ffaGKMgo1yxWJ9a6cVpz0fKAbQpnvFzQFujiQkrgOt0QFwkb",
-	"2GNMrgWBLyYVYWYMVvZstbMxzhzb5DWtguHRYtYEPLPohMyiWbQ4Hs/49coebw1cKxKnDP+LhUBEN2DT",
-	"FwRwPmpdSpHNuGKppdt4HCk2Zj/MjULHCOQ0/WDZKEoIMNcG4Qk9aBcDQK/eoAX2wjuMyfuMaaJFLT88",
-	"baU1LbF9CInDzEYJEubSvF0hlKG2iZuWQRs5DkaZLUM9l1LIK1C54CqQ5MciCbswMPOGsys7LOQh3gBN",
-	"9ap7Z1UVneALzfIUZ98Nhk43LbTjRWaC1xUoxMLt/VwupOaMK5C6UTbyYls1TN2xPO8aFWPlZK6g452p",
-	"ycCGdrJjercpHIQdWqoe17Nci5EtEtv0jAIMC3AnSGOIoAbPQtLzEPtTQ/Vvq0F/J7zfuU59uDzgm9A/",
-	"1pZ3SAEuHXr7/vK/lylU8v9nCcvoJPqnSd03MnFNIxOz8zuc8xQZgINrHvRniqgiN8yF5HhX7F/t1p8G",
-	"eGcJ8bKvVJgLxVq5QZcv8Fby5vXT0/XSgMaarSFck4+FSBNxz+cF1yx9tP6KlCo9N/IoJMx7GlL6ODLC",
-	"yBKIxyuMY5sRKYlHe3G/kqPyVwPdDDuHFSDM7FHJuJKQLu5fwRIk8BiSrdDeJPzl9BdsQEB4+vr87fn1",
-	"OZnQnE1yIVI1+WoU8cGVmlZAzK+kei8uq13IzQa7HYQkmZDV+3Q1JmdUSuaw7S1wkCwmiAcmBlWQPC2U",
-	"g4myiHVhoGKK5fEluUlFfId1e7caOWIJ+XOjtv434hogENRfu9dLRBVySWNQRAJPQNZFelz7yL0jIzmV",
-	"TG+ODeBnypyLkgQSLPMnxHqKsuugpL3BTwJ8DamwJTguNIlFkaeQmLTfsKPM4MZbaLc80l7+qpQq47dl",
-	"Z1IwqD0GWqvD+KCSeeTsXX7YMeANNfR0koh5SxfaM4nLtklYz2XznnuqiDU51C+U8r1Rm7tQAWcU2Sf7",
-	"eq68q4ahRRj6O5Jc8YsutVPxsgg96F5wx5FlAG7TxcBhB96qM1Tpa5PGVoHucSO4I/Jb4/i+objywX0x",
-	"+QqUSNfddbmu8tRc8NisTdN7ulHzRsnqu2pUFUFduVZHO+vOTZVolL0UFCkcsjP3e2D076T6qUQhY3je",
-	"wuig3K159pQXyo6VXS2/2ZES4jZgOJpT1xauupqWxZfN3EAC4C56teFo22+2ZoS2GtUHCnHjP0Cy5aaz",
-	"AIHNXfOU8bvd+eF1hAWYkcItjQ3RHLqUVWia9g1Y05QlNVED0dlfrjl31Dxfi7ZtbhkNh7gwIO2DOavj",
-	"EVAJ8rTQq+34c3p54Qp3CjRZM0o+vP14cXY+P728mF+///fzXw1mI2tqEA7yD8WNK9YeAnuTHsz2jC8D",
-	"Mfjq/MM1MVsZ1JxRTm/LaueHtGCx/3rzhXv9zW9JqSIEtchgwpTF4EzCeqLo3cW1zeI11rrccmeCaylS",
-	"cplSDmZja3HKUvPTeDqemlkiB05zFp1EP4+n458jdGMr5Bli+oY53NqylFE+dKQXSXQS/R30ac5OPV2W",
-	"zmxxzl+mU1sP5Np5JJrbthgm+OQfLqWyarmz9rbvbmyp8MND+1X1pTW6mqOul1pZlSmyjMpNdBK9NWg/",
-	"7xw8ijQ1EPxTVHPms1mhya3JV5Y8TKSNn2iyQgV4dylUg3kXiQu5KAdJM9AIbz59jZg5hJFNWSU5sV60",
-	"tiItCxh5nGy74M92MCj9SiSbvYTSJ4sWanlomrYh6uE7VWKn3V2kCEj+tBYijk2Murycvnw0Gpr5ch8F",
-	"Jt9bioI7En55BhJoKoEmm4oZJv82mCkhotAtW3DMJbTTIIbsoUiYnkiI3QH7HIgZemVHhpX/twLkptb+",
-	"lGVMR77CV+0qf52Ooox+YZmBKj9Np+ZPxt2fgUzx80F8Vn0Hawd3ZTlBkIEEuJYm2Lel4w1JxW01zJOJ",
-	"ebYljzXCiV3kYYFH9IT224A2AT7gc7e00Vk7zGeDXcFjw4qqFYlXlPEeTvjFlB4uvCqHHUJDOgs022zx",
-	"3s1XJwnEsTgwrOZJXa2xNczuANVgw+OHkGBz7k6B5KdHo6Fi/jaz3SPXD+l89/RwvvuCIyCufO6hY0d5",
-	"/jJ04AtrZRsZV0y1O6P9TLupk6dJ0myk9vpIt3WybaqIqayfT8FW1JvK+hp/99T1InkMHDVcadyOHy+3",
-	"E4CSi+422cFBSLm/h0Fa4cTQtYd8MFWIVwGnYX5+LjE8vndqXuw8ML7dwS25q4Y/hlt6XoU+sGN87V0p",
-	"Le89EnftsW7rKS9+WH+5dfexZYZWzb7BTfolsR5Ec1bf9Xp6RFPe1tsB0fwqdA30qrMEMA0PDqxZU1fS",
-	"tlhTRZABx1Xy6HfuuJq33A7suCrRB8Cre5XacFwHdBzl/l2RsDLBkKoNaFp1j6/PBO2gp2S+3SF0dvuy",
-	"jcTlCP/gfwddNpO6AYWsXg6Wh7YTdwj/3jGfQLf9K4aHVu1u7uKTHycih9R6ULCVLtc9oUMK7Y08SFjx",
-	"gudeubK76IH9rwo/cMFhDZLAl1woSI4Hcmic7rPMO/dgDt1m0lOl0duXDA+cSfvS6ZUGTZI/YDbtM6CR",
-	"UA8ky516F7BX14S1U7Ls6eWvtm9hGPG4Boc93z0MZchn/vdAnidJ9n3F7nlyn0sYiJEHY/6TuJvmVZ2d",
-	"/Ey/3P+ouexOeod9HZhC2gscYtlx/WnQVWRxPqla6QeC+7s4/1gNPUR09xv2dwjv784uSX2WQABvDqhZ",
-	"k8X5cNTeOv1The3AxeIDx+0G37f5XD77w5bBKwYMh23eULotnQua4T4x29fKZw7aFVeeK2RXBAwF7B2F",
-	"gl3sAz7xEsccwhnibZf9khx7gP5MJncHKFlg/x70hvW5n8oN+lf7D+z/LKsDnUNCpH9Yp4eHx6+0VAVu",
-	"/HKl69FmqvKGRoUsg/56WAZpkJym9oaIlVPZWYdXT47w9pr9yu3k9Svi7vO0c32rfs26u2t5bxtJw1Ps",
-	"47bRfJ7ZX6NAn8tXW216qvc1fbeYuojZ/VbSNV6Yd5d4bkSy+ZOqPlWqFu6jrSm+GcfPg+5wE2m8wyvY",
-	"Dh0cDYanQ+rZ9FFlWDZpd4qs7OF+Vt1tSM4jjBw1r7H8meQgX7g/7PW+4x2dysR+Q2Gwf7SSt7279PuU",
-	"euPeVagjzl1UqrrA/l/5rsHtLXuIpMZJURILXr6pcVeKVizHd1S30NLN96gyQhKxBilZAuR///t/CE3W",
-	"lLtyglNYDJd2tR4Fxev8A/j4Csfs1MZZ3gLxVe9b75iEd3DfA91enoO+FxI/DWGdbHmDJ7D2QVpE8bbT",
-	"Lr3sImXxhlhJhNrX/ee1JO3fgyC/FN7TgXz/61MHBvmWxQHzKlL4cUB+4EWAJ9OASBvGOYEv5ccahmz0",
-	"/Iv7H0jsodxaZGnz8G1j2Trg9ft3b4mjqnk4S4BVVUIVMSMHz8ey8nzDimw/3tKrzlmRapZTqSdLIbMX",
-	"CdW0eb7WFWCWhr5QaI5o36ESHDGqGz9uGKfokfpvxeG0wI2mg77cbnzsJqSrmRWYF4SfwVJQTZqaVBKG",
-	"mrSUIttNl3ZuKrXa9CN1lKLPeq5UDjcffEe2m9eqP9nU469cVvCEqt+6dNrTPYO38apEpJkJeI+wHdr8",
-	"04/C5Yem8PQ2H/ivvqO/cUOe8OCtT2qFEAeeiqny+yS2zPPzAdF3SYBROSw4Na56RiefPvtSsCci8Qri",
-	"O4/3lnrD++bc5gXRT5+N/dmveVsTb9LyVsQ0dZyYnF5euA9/R6MIv8yIV0FPJpOf/vKv4+l4Ov7p5Ofp",
-	"dBo9fH74vwAAAP//eOrONI5qAAA=",
+	"H4sIAAAAAAAC/+R96XIbuXbwq6D6+6pGqkuRnDu+SY2m8kOWlWtV7LFKkpMfpouEug9FXHUDPQCaEuNS",
+	"VR4iT5gnSeEAvRK90JYoO/NrLDaWs299Ts+XIBRJKjhwrYLjL4EKV5BQ/OdJmkqxpvEl/JGB0uanVIoU",
+	"pGaAC0IJVEM0p/hsKWRi/hVEVMORZgkEo0BvUgiOA6Ul47fB4yiIQGnGqWaCm01bz1nk/TkVUlceMK7h",
+	"FmTw+DgKJPyRMQlRcPzJ7K5f4XaOqrB+LsASN/+AUJvzT7KI6TOu5WYbTRq2QhtKiIBrRuN5psAPeh/G",
+	"baiNglTCer6iauXdJ4GqliMN8ZWmSTqcL1qI2HvWGmTEQu151qB9eWm5yUfq14xHZv+TiFPJgK+iPfD1",
+	"fE2leRaBCiVL7dLgjK+ZFDwBrsmaSkZvYiCcJkCWQpJ0RbkWCdHiDjhh3KBmpW3rhhXQCGSHqBd4Mq7/",
+	"6VV5RFUOhLTKyTQkyi8r7hcqJd1Y2RFahCL27quwvbFNQ5LGVEM/uz2qVmFGB+M/ppG7oE7yCyrNVpLh",
+	"c6QzJTd2z5icaBIDVZoIDmTJII5IkilNboAo0IRxolcw49LaKnIjos2ICL0Cec8UmIdEgVyDJBJ0Jrki",
+	"r6ZT8ppGxJk3cpBQHa4Yv51xs/r03fnhmHzg8cZep4heUU2oBJJKUEYwzL8TEbElg2hMrkBr3L2oUGVB",
+	"tCCUcMGPIEn1hqxpnAGhsRIOUYWwpdTQdcZppsWRUwRC41jcE5nFQA40vb2FiNwzvSILR5UjGkXHs2w6",
+	"/SUsSY9/w4IIOeML83PXqkOiBN7P4Z5UwCZMES40EdKIusHufGnJYgG1QK2oIjcAnEhIxBoiklCe0Tje",
+	"jPBMB2X13BlniijN4pzNEaE8IpTcU8nNWqZILBDTAy7IksbxDQ3v7HVMzbgjzeGYXFClzA7KiaUsyjCK",
+	"zQIVxpCALAo9WJAwBioVopF49ls5twdYrV2MyCJXiMUIj3MGYzHj9jREFOVjTFCwzREeAbBsRwkKRRyz",
+	"CNSMIzMpRyktqCW4FVZja0p+VaT2V3Iq+DJmoR7P0MHVjOh3bfG+G0O2bZukQewd43de/5/RuN0Rw0MK",
+	"oXFcrStixmHOs+SmRpe2IKa6unn6qAaNz8qerijnED+NewVuxKAa19wIEQPlO/kv+8uXAHiWYKQAMdxK",
+	"mgSjYKV1WkGjvPkeblZC3M0VhBL0tri+fX9ySuxDFFDFbtGAuH0kpZtY0EgFHWdnMt4++D/cAR8v3+HB",
+	"b6+vL0hoieo5zecVcUlJvA4uXWmqM7XNq066707OZqQ2GL7SV+8A3zbr+jgwQEGNzWOeoDGCJc1iPa+E",
+	"qTld0H1ikMI3RnHUnVfSjOyLTBtw/RZpjRdEzG/T7FOTKTG/2WvHpo26z46TFyQ0Dy5Gq6R8de2wq1QZ",
+	"mZEb0PcmBqBVb2WdesUXjcl1EYGZKM3cqkiUpTELzYHHlROrHvCgPHRUPe9wxk0kQhK6IcLEaPDAMDgM",
+	"YUwuIbUB1MWHq2vlXGev5/yRU4nvMkPYKTmwgnVaLGotOtAwBKXmSK9tYn84yUxMhWscTQ9ymMj9Cjgx",
+	"N5swU9BMrw4HlCjqF5wvTb4xshF8mGsDr8bqRvidLPt96lMKCmFc2xCe3pqNoeCaMo6hwy4SZO55MtGS",
+	"sJSgVt1McotyLgl8atTcYGt/lEJbbfdWLNoFcRTgfr+Dv8ajgUepYFyjn99ZRHIX7Cw1PqOahTYlrtvL",
+	"3CaaPWPyxu4wwknsnjG6Ymvei1Pwaq9px0zCg9Wqli64yMimHQdKC4Mc8FBuUps+XeYYY/CE11oIe0Mc",
+	"lJR2FX5/evExVVoCTdp1WN7uaHVCkSSUR22WFw+NImZF6KJ2WdvZJeCtst8bGmhJucoLeA2G5I+QrMdE",
+	"6YgJcuAk5nBETKyGSeU93CgR3oEeRvqSGO1MuBCiw4JmejWXoEDPNZW3vuj6g1NGZ1TMjiPcQewOIjOe",
+	"+2l4WNFMaYiObFFBQijWgPIW3QI5wOx6IhKmjUv+F8IFwaO8ipWASXuUByIZ4Zl2QVXUDVEUOVhSFpt7",
+	"iTArzeHDpatVAJSWVMPtxlOtEiIm+ePfSFRR659yUH6y5sShPpS7OQnauXuZxdDK3b44pd3QU61B8peO",
+	"LySkMQ3BeMEWy95fqO6Ils0FEa2VplvY0VXDPq3FinUGtFuT3XwShrXGOqMvQCvS44s6HI/xAWjj8/MO",
+	"d/E6O3gAd2lbkfd3uHdOCeu73MbtJuovNXpM/rX0SeXPamQirxlf4P6FXWVDieYisqhGiQuMyPL4It7M",
+	"+KIWoCxsclJAsigYtTBRAFZ75Rqi8YzP+MXJ9elboiChXLNQHVvtxm31Q3MEmxDOeH6gLR6GmZTAdbwh",
+	"zkvX4qIxuRYEHkyaxMwarDraSmxtnUHb5FyNYubBYlYPxmbBMZkFs2BxOJ7x65VFbw1cKxLGDP+LRUqM",
+	"vMCmVhhcViPqpRTJjCsWW7iNxZFiY+7DvM2Hhiff6g7kjaD4gvlSISpM9+pFTxBaXtAIRP03jMmHhGmi",
+	"Rck/5/2c1DTYduVjh9mNHCTMpaBDwzsDbT2mW3p15NDrZbYU9UxKIS9BpYIrTwEiFJHfhIHZ15/52WU+",
+	"C/EWaKxX7TeroiAGDzRJY9x91+s63TbfjeeJcV6XoDBOb97n8jQ1Z1yB1LWSVsW3FcvUHUvTtlUhVnXm",
+	"Clre55rssO8mu6bzmsyF131Hles6jmsQsgFiE56Rh2Ae6nhh9AFUo5mPe5Vs4rnTiK+rj39j6jG4hr6/",
+	"HOWrMhOsew9ITy5c9LZzRnLyRIlIXwLyNVJQSVoKUfz/EpbBcfD/JmV7zcT11kwMEd7jnudIRlzkWMlC",
+	"mCIqSw2fITocmoYUt3VnJBVcfG+cuiqqqVCskaa0maXKSZV93fC0vVuhoWZr8L+6CIWII3HP5xnXLH6y",
+	"NpSYKj03/MgkzDv6drooMkIn5wkNVuhSNyOSA4+q634lB/mvJoo05OwXAD+xRznhckDaqH8JS5DAQ4i2",
+	"oow64K+mv2KfBkbKb87enV2fkQlN2SQVIlaTL0YQH11FbgXE/EqK9gFZ3EJuNtgUIiRJhCzaDtSYnFIp",
+	"mQuzb4GDZCHB0GRiAhySxplyEavMQp0ZSxHjW4QluYlFeIevN9xp5IBF5C+1VxC/EdcngvnFtXsLR1Qm",
+	"lzQERSTwCGT5LgPPPnCvEklKJdObQ5N7MGXwoiSCCN+GRMRairw5I4e9Rk8CfA2xsJVKLjQJRZbGEBEt",
+	"kBx5MjneCrxzlHayVzlXGb/NG7i8/vUpAscyougVsgo4O1dCBvrevr6nVhAxhWoLPE0Ota0S1nLZFOye",
+	"KmJVDuULuXxvxObOV0saBfbJrpYrbSunaOHPQhxIrg5Hl9qJeF6r7zUveOPIEgCvaSNgvwFvlDyKTLoO",
+	"Y6NW6KPCdxSAfH0w4ej1tSHFrlFB4Q66woNLUCJet1cr24p2c8FDczaN7+lGzWuFvG+q3BUAtWWgLQ3I",
+	"g9tg0T50QpDFsM9e6m9JLn6QmrASmQzhZcvFvXy/Am109STD+gfoa7Qf3/KWpsUk/bZVd3K9ikxjV2Wb",
+	"7WkgtA2AFys0Oh2lpLxzaqg9q3dG+WQI0N/PqRtPUG3N8+JhMzcxF3AXHjTj/aZjauzwXTUqEfJR499B",
+	"suWmtdiETYbzmPG74fSodCZ6iBHDLQ0N0BzaVFBoGnctWNOYRSVQPeFP9bj63lEdvwZs29QyegthZqLg",
+	"K4OroxFQCdIoiUfeL85dkdYI+5pRcvXu4/np2fzk4nx+/eHfzn43QTFZUxNCIv2Q3XhiKe7YI/dormd8",
+	"6QlyLs+urom5yqQlCeX0Nq9sX8UZC6uv2Y9cGwa/JbmIEJQiE3THLASnEta+Bu/Pr23FRmNd0x13KriW",
+	"IiYXMeVgLrZ2RFlofh5Px1OzS6TAacqC4+CX8XT8S4DGeYU0w6Sppg7OXBjhQ/dwHgXHwd9Bn6TspCLL",
+	"0qkt7vnrdGprv1w7O0tT257FBJ/8w+WsViwHS29zhmhLhB8fmy0TF1bpSoq6nn5lRSZLEio3wXHwzqRT",
+	"aeviUaCpyXE+BSVlPpsT6tSafGHR40TaqABVVigP7S6EqhHvPHKBBPJB0gQ0Bm2fvgTMIGF4k1fEjq1v",
+	"KLVIywxGFUo27fBnuxiUfi2izU5M6eJFIxZ7rKu2AerxG0Vi0O3OU3g4f1IyEddGRlxeTV89GQz1gkQX",
+	"BCahXoqMOxB+fQEQaCyBRpuCGERIYiLBiIhMN3TBEZfQVoXo04csYnoiIXQIdhkQs/TSrvQL/x8ZyE0p",
+	"/TFLmA6qAl+0Tf1tOgoS+sASE4D9PJ2aPxl3f3pS8c97sVnlLOAAc2UpQZCABLiWxtk3uVNZEovbYlmF",
+	"J+bZFj/WGE4M4YcNPIJn1N9aaOOhAz53RxuZtcuqZLAnVMiwompFwhVlvIMS1WpVBxVe58v2ISGtFbBt",
+	"slT6MApMPH4s9CwraVKWw2yRuN1B1cjw9C7E2yQ+yJH8/GQwFMTfJrZ75Ppyne2e7s92n3MMiAubu2/f",
+	"keOfuw5sTlC2oXbFVLNDv1o/qMvkSRTVG/or/czbMtlUVYyprJ2Pwb6yqAvrG/y9Iq7n0VPEUf2l3G3/",
+	"8Wo7Acip6KYa9x6E5PdXYpCGOzFw7cAfTBXClcdomJ9fig1Pb53qA8Z7jm8HmCU38vp9mKWXFeg9G8Y3",
+	"ldHmfP6WuPHbsoUrH0Cy9nJrBrehhlbMvsJMVktiHRHNaTlz+PwRTT41OiCi+V3oMtArcPHENNy7sCRN",
+	"WUnbIk3hQXoMV06jH9xw1act92y4CtZ7glf3rrpmuPZoOPL72zxhoYI+UeuRtGKetEsF7aLnJL69wYe7",
+	"fZtJwnxFFfG/g84bh92CTBZvX3Ok7cYB7r+C5jPIdnXUdd+i3U5dfPL9eGSfWPcytpDlsv+3T6ArK/fi",
+	"VirOc6dc2Q31YK+zwg+tcFiDJPCQCgXRYU8OjdurJKvg3ZtDN4n0XGn09rDrnjPpKnc6uUGj6E+YTVcJ",
+	"UEuoe5LlVrnz6KvrchuULFfk8nfbjdEf8bi2jR3fPfRlyKfV79K8TJJctRXD8+Quk9DjI/dG/GcxN/Wx",
+	"rEF2ppvvf9ZcdpDcYbcKppB2WEcsW0bdek1FEqaTYmyix7m/D9OPxdJ9ePfqcMYA9/7+9IKUuHgceH1B",
+	"SZokTPu99hb2z+W2PQPue/bbNbpv0zl/9qctgxcE6HfbvCZ0WzLnVcNdfHZVKl/YaRdUeSmXXQDQ57AH",
+	"MgXHBHps4gWu2YcxxMmm3ZIci0B3JpM6BHIS2L97rWGJ93OZweonJvZs/yypPZ1DQsR/WqOHyOPXgooC",
+	"N35B1TXBM1VYQyNClkB/2y+BNEhOYzuCY/mUd9bhbM8BTirary1P3rwmbmCqmetb8avX3d1MQVNJapZi",
+	"F7ON6vPC9hoZ+lK22krTc72v6RoTawNm+NjXNX4cwU1J3Yho85MqPpmrFu7jwTG+GcfP1A4Y9RoPeAXb",
+	"IoOjXve0TzmbPikP8ybtVpblPdwvKrs1zlUAIwf1OaG/kBTkkfvDzk8eDjQqk7IJ/6js1u/30Ib1jYGA",
+	"H6qy0D3TsOdqf1tY8NF9aTp1EdqLhAVOKr4rXbgCbcyn/SIJumGQR+iKv/mzX0P1xn5nZriy2KHKH9Na",
+	"1gZCfZ2kboKy6J78P+Xze6+35CGSGudOSSh4/obTDRiuWIrvdm+hIccfUGSEJEYAJYuA/M9//Teh0Zpy",
+	"V4Zzhh5l257WIaD4yZOevPIS1wxqf85nwqqi97UTZ/4b3Pect4/noO+FxM/n2OAkn+fznL2X1mqcfRwy",
+	"AyJiFm6I5YRv7KP6vOSk/bs3Oc6Z93zJcfULfXtOji2JPeqVxfD9JMeeF2gVnnpYWlPOCTzkH7Tp09Gz",
+	"B/c/ANpBuLVI4jryTWXZQvD6w/t3xEFVR84CYEWVUEXMyl78WJLj1y/I9gNXneKcZLFmKZV6shQyOYqo",
+	"pnX8Gt8mYLHvC7MGRdt7QHDFqGyYumGcokXqHinFbZ5JwL2GibUPgvlkNbEMqzjhF9AUFJO6JOWAoSQt",
+	"pUiGydLgZmwrTd9TJzbarJcqgeDlve+Wh1mt8rN2HfbKZdPPKPqNYe2OrjOcYi0S+HoGXXmEYwTmn1Uv",
+	"nH+MD7G3efR/dqH+1i15RsQbnx30RRyIFVP5h5NsefSXPUbfOQBG5LBQWxuRDo4/fa5ywWJEwhWEdxXa",
+	"W+gN7et764PVnz4b/bP/Nwar4nVY3omQxo4Sk5OLc/c/bghGAX69FkeojyeTn//6z+PpeDr++fiX6XQa",
+	"PH5+/N8AAAD//4Q38/5OcAAA",
 }
 
 // GetSwagger returns the content of the embedded swagger specification file
diff --git a/internal/api/server.go b/internal/api/server.go
index 9740b41..f1066ba 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -1829,8 +1829,16 @@ func (s *Server) PostApiPools(w http.ResponseWriter, r *http.Request) { //nolint
 	if req.Strategy != nil {
 		strategy = *req.Strategy
 	}
-	if err := poolops.Create(s.store, req.Name, strategy, req.Members); err != nil {
-		writeError(w, poolCreateError(err), err.Error(), "")
+	authResetTarget := ""
+	if req.AuthResetTarget != nil {
+		authResetTarget = *req.AuthResetTarget
+	}
+	if err := poolops.CreateWithAuthResetTarget(s.store, req.Name, strategy, req.Members, authResetTarget); err != nil {
+		status := poolCreateError(err)
+		if errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+			status = http.StatusBadRequest
+		}
+		writeError(w, status, err.Error(), "")
 		return
 	}
 
@@ -1908,6 +1916,37 @@ func (s *Server) PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request,
 	_ = json.NewEncoder(w).Encode(out)
 }
 
+// PostApiPoolsNameAuthResetTarget sets (or clears, with an empty string) the
+// per-pool agent auth-reset target. This is the action-route REST counterpart
+// of the CLI `pool set-auth-reset` and Telegram `/pool set-auth-reset`, all
+// routing through poolops.SetAuthResetTarget so the three surfaces cannot
+// drift (channel feature-parity principle). A NUL, LF, or CR in the target is a
+// 400 (poolops.ErrInvalidAuthResetTarget); an unknown pool is 404. On success
+// the updated pool is returned so the caller sees the persisted value.
+func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { //nolint:revive // generated interface name
+	var req SetPoolAuthResetTargetRequest
+	if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body", "")
+		return
+	}
+	if err := poolops.SetAuthResetTarget(s.store, name, req.AuthResetTarget); err != nil {
+		status := poolStatusError(err)
+		if errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+			status = http.StatusBadRequest
+		}
+		writeError(w, status, err.Error(), "")
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	if p, err := s.store.GetPool(name); err == nil && p != nil {
+		_ = json.NewEncoder(w).Encode(storePoolToAPI(*p))
+		return
+	}
+	// The set succeeded; a read-back failure must not report failure. Echo
+	// the persisted value from the request instead.
+	_ = json.NewEncoder(w).Encode(Pool{Name: name, AuthResetTarget: &req.AuthResetTarget})
+}
+
 // DeleteApiPoolsName removes a pool. It refuses (409) while any binding still
 // references it by name; an unknown pool is 404. On the 409 the structured
 // list of referencing bindings (id + destination) is included in the response
@@ -2228,6 +2267,10 @@ func storePoolToAPI(p store.Pool) Pool {
 		Strategy: p.Strategy,
 		Members:  make([]PoolMember, len(p.Members)),
 	}
+	if p.AuthResetTarget != "" {
+		t := p.AuthResetTarget
+		pool.AuthResetTarget = &t
+	}
 	for i, m := range p.Members {
 		pool.Members[i] = PoolMember{Credential: m.Credential, Position: m.Position}
 	}
@@ -2247,6 +2290,10 @@ func poolStatusToAPI(res *poolops.StatusResult) PoolStatus {
 		Active:   res.Active,
 		Members:  make([]PoolMemberStatus, len(res.Members)),
 	}
+	if res.AuthResetTarget != "" {
+		t := res.AuthResetTarget
+		out.AuthResetTarget = &t
+	}
 	for i, m := range res.Members {
 		ms := PoolMemberStatus{
 			Credential: m.Credential,
diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index e80b349..f43092f 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -3042,6 +3042,108 @@ func TestPostApiPoolsNameRotate_Success(t *testing.T) {
 	}
 }
 
+// TestPostApiPoolsNameAuthResetTarget exercises the REST adapter for the
+// per-pool auth_reset_target action route. It asserts the value reaches the
+// store (the REST handler is a thin poolops adapter), that clearing with an
+// empty string works, and that an invalid target is a 400 and an unknown
+// pool a 404.
+func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
+	st := newTestStore(t)
+	enableHTTPChannel(t, st)
+	seedOAuthCred(t, st, "credA", "credB")
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil {
+		t.Fatalf("create pool: %v", err)
+	}
+	srv := api.NewServer(st, nil, nil, "")
+	t.Setenv("SLUICE_API_TOKEN", "tok")
+	handler := newTestHandler(t, srv, st)
+
+	post := func(body string) *httptest.ResponseRecorder {
+		req := httptest.NewRequest("POST", "/api/pools/pool1/auth-reset-target", strings.NewReader(body))
+		req.Header.Set("Authorization", "Bearer tok")
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		handler.ServeHTTP(rec, req)
+		return rec
+	}
+
+	// Set a target.
+	rec := post(`{"auth_reset_target": "openai-codex"}`)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("set: expected 200, got %d: %s", rec.Code, rec.Body.String())
+	}
+	var p api.Pool
+	if err := json.NewDecoder(rec.Body).Decode(&p); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if p.AuthResetTarget == nil || *p.AuthResetTarget != "openai-codex" {
+		t.Fatalf("response AuthResetTarget = %v, want openai-codex", p.AuthResetTarget)
+	}
+	// Reached the store (no inline logic; routed through poolops).
+	got, err := st.GetPool("pool1")
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if got.AuthResetTarget != "openai-codex" {
+		t.Fatalf("stored AuthResetTarget = %q, want openai-codex", got.AuthResetTarget)
+	}
+
+	// Clear with an empty string.
+	rec = post(`{"auth_reset_target": ""}`)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("clear: expected 200, got %d: %s", rec.Code, rec.Body.String())
+	}
+	got, _ = st.GetPool("pool1")
+	if got.AuthResetTarget != "" {
+		t.Fatalf("after clear stored AuthResetTarget = %q, want empty", got.AuthResetTarget)
+	}
+
+	// Invalid target (newline) -> 400.
+	rec = post(`{"auth_reset_target": "bad\ntarget"}`)
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("invalid target: expected 400, got %d: %s", rec.Code, rec.Body.String())
+	}
+
+	// Unknown pool -> 404.
+	req := httptest.NewRequest("POST", "/api/pools/nope/auth-reset-target", strings.NewReader(`{"auth_reset_target": "x"}`))
+	req.Header.Set("Authorization", "Bearer tok")
+	req.Header.Set("Content-Type", "application/json")
+	rec = httptest.NewRecorder()
+	handler.ServeHTTP(rec, req)
+	if rec.Code != http.StatusNotFound {
+		t.Fatalf("unknown pool: expected 404, got %d: %s", rec.Code, rec.Body.String())
+	}
+}
+
+// TestPostApiPools_WithAuthResetTarget asserts the create body accepts
+// auth_reset_target and persists it via poolops.
+func TestPostApiPools_WithAuthResetTarget(t *testing.T) {
+	st := newTestStore(t)
+	enableHTTPChannel(t, st)
+	seedOAuthCred(t, st, "credA", "credB")
+	srv := api.NewServer(st, nil, nil, "")
+	t.Setenv("SLUICE_API_TOKEN", "tok")
+	handler := newTestHandler(t, srv, st)
+
+	body := `{"name": "codex", "members": ["credA", "credB"], "auth_reset_target": "openai-codex"}`
+	req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body))
+	req.Header.Set("Authorization", "Bearer tok")
+	req.Header.Set("Content-Type", "application/json")
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusCreated {
+		t.Fatalf("expected 201, got %d: %s", rec.Code, rec.Body.String())
+	}
+	got, err := st.GetPool("codex")
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if got.AuthResetTarget != "openai-codex" {
+		t.Fatalf("stored AuthResetTarget = %q, want openai-codex", got.AuthResetTarget)
+	}
+}
+
 func TestPostApiPoolsNameRotate_NotFound(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go
index 7e9f691..49a9b3c 100644
--- a/internal/poolops/poolops.go
+++ b/internal/poolops/poolops.go
@@ -29,6 +29,7 @@ type Store interface {
 	RemovePoolIfUnreferenced(name string) (bool, error)
 	ListCredentialHealth() ([]store.CredentialHealth, error)
 	SetCredentialHealthIfPoolMemberEpoch(credential, pool string, epoch int64, status string, cooldownUntil time.Time, reason string) (bool, error)
+	SetPoolAuthResetTarget(name, target string) error
 }
 
 // ErrNoMembers is returned by Create when the member list is empty.
@@ -78,7 +79,9 @@ type StatusResult struct {
 	Name     string
 	Strategy string
 	Active   string
-	Members  []MemberStatus
+	// AuthResetTarget is the per-pool agent auth-reset target (empty = none).
+	AuthResetTarget string
+	Members         []MemberStatus
 }
 
 // RotateResult is the outcome of a successful Rotate.
@@ -107,18 +110,87 @@ func ParseMembers(membersStr string) ([]string, error) {
 	return members, nil
 }
 
+// ErrInvalidAuthResetTarget is returned when a non-empty auth-reset target
+// contains a NUL byte or newline. The target is consumed as argv (never
+// shell-interpolated), so this is a minimal structural guard, not a
+// shell-metachar check; channels that exec the target apply any stricter
+// allowlist at exec time.
+var ErrInvalidAuthResetTarget = errors.New("invalid auth_reset_target: must not contain NUL or newline characters")
+
+// validateAuthResetTarget rejects a non-empty target containing a NUL byte, a
+// line feed, or a carriage return. An empty target ("") is always valid and
+// means "clear / no reset", so callers normalize the clear sentinel first.
+func validateAuthResetTarget(target string) error {
+	if target == "" {
+		return nil
+	}
+	if strings.ContainsAny(target, "\x00\n\r") {
+		return ErrInvalidAuthResetTarget
+	}
+	return nil
+}
+
 // Create creates a pool with the given ordered members. An empty strategy
 // defaults to the only supported strategy (failover). Sentinel errors from
 // the store (namespace collision, static member, unknown member) propagate
 // unchanged so channels can map them.
 func Create(s Store, name, strategy string, members []string) error {
+	return CreateWithAuthResetTarget(s, name, strategy, members, "")
+}
+
+// CreateWithAuthResetTarget is Create plus an optional per-pool auth_reset_target
+// (empty = no reset). The target is validated first, then set in a follow-up
+// SetPoolAuthResetTarget call after the pool exists; the two writes are not atomic,
+// so a failed follow-up leaves the pool created without a target. Channels that
+// don't accept a target call Create; all create adapters share this single path.
+func CreateWithAuthResetTarget(s Store, name, strategy string, members []string, authResetTarget string) error {
 	if strategy == "" {
 		strategy = store.PoolStrategyFailover
 	}
 	if len(members) == 0 {
 		return ErrNoMembers
 	}
-	return s.CreatePoolWithMembers(name, strategy, members)
+	if err := validateAuthResetTarget(authResetTarget); err != nil {
+		return err
+	}
+	if err := s.CreatePoolWithMembers(name, strategy, members); err != nil {
+		return err
+	}
+	if authResetTarget != "" {
+		if err := s.SetPoolAuthResetTarget(name, authResetTarget); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// SetAuthResetTarget sets (target != "") or clears (target == "") the
+// per-pool auth_reset_target. Channels normalize their clear sentinel (CLI /
+// Telegram use a single "-") to "" before calling. The target is validated
+// first; a pool that GetPool reports as missing (nil) then yields a
+// *PoolNotFoundError, and any GetPool error is returned unchanged.
+func SetAuthResetTarget(s Store, name, target string) error {
+	if err := validateAuthResetTarget(target); err != nil {
+		return err
+	}
+	if _, err := mustExist(s, name); err != nil {
+		return err
+	}
+	return s.SetPoolAuthResetTarget(name, target)
+}
+
+// mustExist returns the pool or a *PoolNotFoundError, so channels get the same
+// typed not-found error from SetAuthResetTarget that Status/Rotate/Remove
+// already return.
+func mustExist(s Store, name string) (*store.Pool, error) {
+	p, err := s.GetPool(name)
+	if err != nil {
+		return nil, err
+	}
+	if p == nil {
+		return nil, &PoolNotFoundError{Name: name}
+	}
+	return p, nil
 }
 
 // List returns every configured pool, ordered as the store returns them.
@@ -151,7 +223,7 @@ func Status(s Store, name string) (*StatusResult, error) {
 	}
 
 	now := time.Now()
-	res := &StatusResult{Name: p.Name, Strategy: p.Strategy, Active: active}
+	res := &StatusResult{Name: p.Name, Strategy: p.Strategy, Active: active, AuthResetTarget: p.AuthResetTarget}
 	for _, m := range p.Members {
 		ms := MemberStatus{
 			Credential: m.Credential,
diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go
index 673f867..0f63dbc 100644
--- a/internal/poolops/poolops_test.go
+++ b/internal/poolops/poolops_test.go
@@ -109,6 +109,108 @@ func TestCreateListStatusRotateRemove(t *testing.T) {
 	}
 }
 
+func TestCreateWithAuthResetTarget(t *testing.T) {
+	db := newTestStore(t, "acct_a", "acct_b")
+
+	if err := poolops.CreateWithAuthResetTarget(db, "codex", "", []string{"acct_a", "acct_b"}, "openai-codex"); err != nil {
+		t.Fatalf("CreateWithAuthResetTarget: %v", err)
+	}
+
+	p, err := db.GetPool("codex")
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if p.AuthResetTarget != "openai-codex" {
+		t.Fatalf("AuthResetTarget = %q, want openai-codex", p.AuthResetTarget)
+	}
+
+	st, err := poolops.Status(db, "codex")
+	if err != nil {
+		t.Fatalf("Status: %v", err)
+	}
+	if st.AuthResetTarget != "openai-codex" {
+		t.Fatalf("Status.AuthResetTarget = %q, want openai-codex", st.AuthResetTarget)
+	}
+}
+
+func TestCreateWithEmptyAuthResetTargetDefaultsEmpty(t *testing.T) {
+	db := newTestStore(t, "acct_a")
+	if err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, ""); err != nil {
+		t.Fatalf("CreateWithAuthResetTarget: %v", err)
+	}
+	p, err := db.GetPool("p")
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if p.AuthResetTarget != "" {
+		t.Fatalf("AuthResetTarget = %q, want empty", p.AuthResetTarget)
+	}
+}
+
+func TestCreateWithInvalidAuthResetTarget(t *testing.T) {
+	db := newTestStore(t, "acct_a")
+	err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, "bad\ntarget")
+	if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+		t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err)
+	}
+	// The pool must not have been created when the target is invalid.
+	p, err := db.GetPool("p")
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if p != nil {
+		t.Fatalf("pool created despite invalid target: %+v", p)
+	}
+}
+
+func TestSetAuthResetTargetSetAndClear(t *testing.T) {
+	db := newTestStore(t, "acct_a", "acct_b")
+	if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil {
+		t.Fatalf("Create: %v", err)
+	}
+
+	if err := poolops.SetAuthResetTarget(db, "codex", "openai-codex"); err != nil {
+		t.Fatalf("SetAuthResetTarget set: %v", err)
+	}
+	st, err := poolops.Status(db, "codex")
+	if err != nil {
+		t.Fatalf("Status: %v", err)
+	}
+	if st.AuthResetTarget != "openai-codex" {
+		t.Fatalf("after set AuthResetTarget = %q, want openai-codex", st.AuthResetTarget)
+	}
+
+	if err := poolops.SetAuthResetTarget(db, "codex", ""); err != nil {
+		t.Fatalf("SetAuthResetTarget clear: %v", err)
+	}
+	st, err = poolops.Status(db, "codex")
+	if err != nil {
+		t.Fatalf("Status post-clear: %v", err)
+	}
+	if st.AuthResetTarget != "" {
+		t.Fatalf("after clear AuthResetTarget = %q, want empty", st.AuthResetTarget)
+	}
+}
+
+func TestSetAuthResetTargetUnknownPool(t *testing.T) {
+	db := newTestStore(t)
+	err := poolops.SetAuthResetTarget(db, "missing", "x")
+	var nf *poolops.PoolNotFoundError
+	if !errors.As(err, &nf) {
+		t.Fatalf("SetAuthResetTarget unknown pool err = %v, want PoolNotFoundError", err)
+	}
+}
+
+func TestSetAuthResetTargetInvalid(t *testing.T) {
+	db := newTestStore(t, "acct_a")
+	if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil {
+		t.Fatalf("Create: %v", err)
+	}
+	if err := poolops.SetAuthResetTarget(db, "p", "bad\x00target"); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+		t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err)
+	}
+}
+
 func TestCreateErrors(t *testing.T) {
 	db := newTestStore(t, "acct_a")
 
diff --git a/internal/telegram/commands.go b/internal/telegram/commands.go
index c5c2af5..22f5419 100644
--- a/internal/telegram/commands.go
+++ b/internal/telegram/commands.go
@@ -1187,7 +1187,7 @@ func (h *CommandHandler) handleAudit(args []string) string {
 }
 
 // poolUsage is the usage banner for /pool.
-const poolUsage = "Usage: /pool create <name> <a,b[,c]> | /pool list | /pool status <name> | /pool rotate <name> | /pool remove <name>"
+const poolUsage = "Usage: /pool create <name> <a,b[,c]> [auth-reset-target] | /pool list | /pool status <name> | /pool rotate <name> | /pool set-auth-reset <name> <target|-> | /pool remove <name>"
 
 // handlePool dispatches /pool subcommands to the channel-agnostic
 // internal/poolops package, the same package the CLI and REST API call, so
@@ -1221,6 +1221,11 @@ func (h *CommandHandler) handlePool(args []string) string {
 			return "Usage: /pool rotate <name>"
 		}
 		return h.poolRotate(args[1])
+	case "set-auth-reset":
+		if len(args) < 3 {
+			return "Usage: /pool set-auth-reset <name> <target|->  (a single - clears the target)"
+		}
+		return h.poolSetAuthReset(args[1], args[2])
 	case "remove":
 		if len(args) < 2 {
 			return "Usage: /pool remove <name>"
@@ -1233,14 +1238,19 @@ func (h *CommandHandler) handlePool(args []string) string {
 
 func (h *CommandHandler) poolCreate(args []string) string {
 	if len(args) < 2 {
-		return "Usage: /pool create <name> <a,b[,c]>"
+		return "Usage: /pool create <name> <a,b[,c]> [auth-reset-target]"
 	}
 	name := args[0]
 	members, err := poolops.ParseMembers(args[1])
 	if err != nil {
 		return fmt.Sprintf("Failed to create pool: %v", err)
 	}
-	if err := poolops.Create(h.store, name, "", members); err != nil {
+	// Optional 3rd positional arg is the per-pool auth-reset target.
+	authResetTarget := ""
+	if len(args) >= 3 {
+		authResetTarget = args[2]
+	}
+	if err := poolops.CreateWithAuthResetTarget(h.store, name, "", members, authResetTarget); err != nil {
 		return fmt.Sprintf("Failed to create pool: %v", err)
 	}
 	var b strings.Builder
@@ -1248,10 +1258,31 @@ func (h *CommandHandler) poolCreate(args []string) string {
 	for i, m := range members {
 		fmt.Fprintf(&b, "  [%d] %s\n", i, htmlCode(m))
 	}
+	if authResetTarget != "" {
+		fmt.Fprintf(&b, "auth-reset target: %s\n", htmlCode(authResetTarget))
+	}
 	b.WriteString("Bind it with /policy or " + htmlCode("sluice binding add "+name+" --destination <host>"))
 	return b.String()
 }
 
+func (h *CommandHandler) poolSetAuthReset(name, target string) string {
+	// A single "-" is the channel-uniform clear sentinel.
+	if target == "-" {
+		target = ""
+	}
+	if err := poolops.SetAuthResetTarget(h.store, name, target); err != nil {
+		var nf *poolops.PoolNotFoundError
+		if errors.As(err, &nf) {
+			return fmt.Sprintf("No pool named %s", htmlCode(name))
+		}
+		return fmt.Sprintf("Failed to set auth-reset target: %v", err)
+	}
+	if target == "" {
+		return fmt.Sprintf("Cleared auth-reset target for pool %s", htmlCode(name))
+	}
+	return fmt.Sprintf("Set auth-reset target for pool %s to %s", htmlCode(name), htmlCode(target))
+}
+
 func (h *CommandHandler) poolList() string {
 	pools, err := poolops.List(h.store)
 	if err != nil {
@@ -1269,6 +1300,9 @@ func (h *CommandHandler) poolList() string {
 		}
 		fmt.Fprintf(&b, "%s (strategy: %s): %s\n",
 			htmlCode(p.Name), htmlCode(p.Strategy), htmlCode(strings.Join(names, ", ")))
+		if p.AuthResetTarget != "" {
+			fmt.Fprintf(&b, "  auth-reset target: %s\n", htmlCode(p.AuthResetTarget))
+		}
 	}
 	return b.String()
 }
@@ -1309,6 +1343,9 @@ func (h *CommandHandler) poolStatus(name string) string {
 		fmt.Fprintf(&b, "%s[%d] %s  %s\n", marker, m.Position, htmlCode(m.Credential), status)
 	}
 	fmt.Fprintf(&b, "active: %s\n", htmlCode(res.Active))
+	if res.AuthResetTarget != "" {
+		fmt.Fprintf(&b, "auth-reset target: %s\n", htmlCode(res.AuthResetTarget))
+	}
 	return b.String()
 }
 
@@ -1387,8 +1424,8 @@ MCP Upstreams
 /mcp remove <name>
 
 Credential Pools
-/pool create <name> <a,b[,c]> | /pool list | /pool status <name>
-/pool rotate <name> | /pool remove <name>`
+/pool create <name> <a,b[,c]> [auth-reset-target] | /pool list | /pool status <name>
+/pool rotate <name> | /pool set-auth-reset <name> <target|-> | /pool remove <name>`
 	}
 
 	help += `
diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go
index 9c28dc4..65f6d51 100644
--- a/internal/telegram/commands_test.go
+++ b/internal/telegram/commands_test.go
@@ -2186,6 +2186,66 @@ func TestHandlePoolCreateListStatusRotateRemove(t *testing.T) {
 	}
 }
 
+// TestHandlePoolAuthResetTarget exercises the Telegram adapter for the
+// per-pool auth_reset_target: the optional 3rd create arg, the
+// set-auth-reset subcommand (set and clear via "-"), and that the value
+// reaches the store (the handler is a thin poolops adapter) and is surfaced
+// in list/status output.
+func TestHandlePoolAuthResetTarget(t *testing.T) {
+	s := newTestStore(t)
+	seedPoolOAuthMeta(t, s, "acct_a", "acct_b")
+	h := newTestHandlerWithStore(t, s, nil, "")
+
+	// Create with target as 3rd positional arg.
+	got := h.Handle(&Command{Name: "pool", Args: []string{"create", "codex", "acct_a,acct_b", "openai-codex"}})
+	if !strings.Contains(got, "auth-reset target: ") || !strings.Contains(got, "openai-codex") {
+		t.Fatalf("pool create with target = %q", got)
+	}
+	if p, err := s.GetPool("codex"); err != nil {
+		t.Fatalf("GetPool: %v", err)
+	} else if p == nil || p.AuthResetTarget != "openai-codex" {
+		t.Fatalf("stored pool = %+v, want AuthResetTarget openai-codex", p)
+	}
+
+	// Surfaced in list and status.
+	got = h.Handle(&Command{Name: "pool", Args: []string{"list"}})
+	if !strings.Contains(got, "openai-codex") {
+		t.Errorf("pool list missing target = %q", got)
+	}
+	got = h.Handle(&Command{Name: "pool", Args: []string{"status", "codex"}})
+	if !strings.Contains(got, "auth-reset target: ") || !strings.Contains(got, "openai-codex") {
+		t.Errorf("pool status missing target = %q", got)
+	}
+
+	// set-auth-reset to a new value.
+	got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex", "other-target"}})
+	if !strings.Contains(got, "Set auth-reset target") || !strings.Contains(got, "other-target") {
+		t.Fatalf("set-auth-reset = %q", got)
+	}
+	if p, err := s.GetPool("codex"); err != nil || p == nil || p.AuthResetTarget != "other-target" {
+		t.Fatalf("after set stored pool = %+v (err: %v), want target other-target", p, err)
+	}
+
+	// Clear with "-"; a store error or missing pool fails instead of panicking.
+	got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex", "-"}})
+	if !strings.Contains(got, "Cleared auth-reset target") {
+		t.Fatalf("clear = %q", got)
+	}
+	if p, err := s.GetPool("codex"); err != nil || p == nil || p.AuthResetTarget != "" {
+		t.Fatalf("after clear stored pool = %+v (err: %v), want empty target", p, err)
+	}
+
+	// Unknown pool and usage.
+	got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "ghost", "x"}})
+	if !strings.Contains(got, "No pool named") {
+		t.Errorf("set-auth-reset unknown pool = %q", got)
+	}
+	got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex"}})
+	if !strings.Contains(got, "Usage:") {
+		t.Errorf("set-auth-reset too few args = %q", got)
+	}
+}
+
 func TestHandlePoolCreateNoMembers(t *testing.T) {
 	s := newTestStore(t)
 	h := newTestHandlerWithStore(t, s, nil, "")

From 57525637b06fce77b9640d53fa1ff1861a785bf6 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:23:20 +0800
Subject: [PATCH 07/19] feat(container): thread optional exec user through
 ExecInContainer + AgentProfile.ExecUser

---
 ...22-pool-exhaustion-and-agent-auth-reset.md | 10 ++--
 internal/container/agent_profile.go           | 23 +++++++++
 internal/container/agent_profile_test.go      | 14 ++++++
 internal/container/docker.go                  | 15 ++++--
 internal/container/docker_socket.go           |  9 +++-
 internal/container/docker_socket_test.go      | 49 ++++++++++++++++++-
 internal/container/docker_test.go             | 48 +++++++++++++++++-
 7 files changed, 154 insertions(+), 14 deletions(-)

diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index 5197770..bb9f32e 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -333,14 +333,14 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/container/docker_test.go` (`mockClient`) + any other `ExecInContainer` callers
 - Modify: `internal/container/agent_profile.go` (`AgentProfile.ExecUser() string`)
 
-- [ ] add `User string` to `execCreateRequest`; thread an optional user arg through
+- [x] add `User string` to `execCreateRequest`; thread an optional user arg through
   `ExecInContainer` on `ContainerClient` + `SocketClient` (empty "" = current root behavior)
-- [ ] update `mockClient` and every existing `ExecInContainer` caller (no-op default, no
+- [x] update `mockClient` and every existing `ExecInContainer` caller (no-op default, no
   behavior change)
-- [ ] add `ExecUser() string` to `AgentProfile` (hermes `"10000:10000"`, openclaw "")
-- [ ] write tests: socket exec body carries `User` when set and omits/empties it otherwise;
+- [x] add `ExecUser() string` to `AgentProfile` (hermes `"10000:10000"`, openclaw "")
+- [x] write tests: socket exec body carries `User` when set and omits/empties it otherwise;
   profile `ExecUser` values; existing callers unaffected
-- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 7
+- [x] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 7
 
 ### Task 7: Profile `ResetAuthCmd` + `ContainerManager.ResetAuth`
 
diff --git a/internal/container/agent_profile.go b/internal/container/agent_profile.go
index 8e82f94..6783d03 100644
--- a/internal/container/agent_profile.go
+++ b/internal/container/agent_profile.go
@@ -34,6 +34,26 @@ type AgentProfile struct {
 	// config in place and the operator must wire the MCP gateway
 	// manually before starting the agent.
 	WireMCPCmd func(name, url string) []string
+
+	// execUser is the UID:GID (or user name) that file-writing execs
+	// should run as inside the agent container. Empty means run as the
+	// image's USER (typically root). Hermes runs its gateway as a
+	// non-root runtime user (UID 10000), so an exec that writes a
+	// hermes-owned file (e.g. `hermes auth reset`) must run as that
+	// user or it root-chowns the file and bricks the gateway. Read it
+	// via the ExecUser accessor, which is nil-safe.
+	execUser string
+}
+
+// ExecUser returns the UID:GID (or user name) that file-writing execs
+// should run as for this profile, or "" to run as the image's USER. The
+// receiver is nil-safe so callers holding a possibly-nil profile can call
+// it directly.
+func (p *AgentProfile) ExecUser() string {
+	if p == nil {
+		return ""
+	}
+	return p.execUser
 }
 
 // OpenclawProfile is the default profile. Openclaw stores secrets at
@@ -100,6 +120,9 @@ var HermesProfile = &AgentProfile{
 	Name:           "hermes",
 	EnvFileRelPath: ".hermes/.env",
 	ReloadCmd:      nil,
+	// Hermes' gateway runs as UID 10000; file-writing execs must match
+	// so they don't root-chown hermes-owned files and brick the gateway.
+	execUser: "10000:10000",
 	WireMCPCmd: func(name, url string) []string {
 		return []string{
 			"sh",
diff --git a/internal/container/agent_profile_test.go b/internal/container/agent_profile_test.go
index ee7da70..4108d94 100644
--- a/internal/container/agent_profile_test.go
+++ b/internal/container/agent_profile_test.go
@@ -348,3 +348,17 @@ func TestResolveProfile_NilDefaultsToOpenclaw(t *testing.T) {
 		t.Error("non-nil profile should be returned as-is")
 	}
 }
+
+func TestProfileExecUser(t *testing.T) {
+	if got := HermesProfile.ExecUser(); got != "10000:10000" {
+		t.Errorf("HermesProfile.ExecUser() = %q, want 10000:10000", got)
+	}
+	if got := OpenclawProfile.ExecUser(); got != "" {
+		t.Errorf("OpenclawProfile.ExecUser() = %q, want empty", got)
+	}
+	// Nil-safe accessor.
+	var p *AgentProfile
+	if got := p.ExecUser(); got != "" {
+		t.Errorf("(*AgentProfile)(nil).ExecUser() = %q, want empty", got)
+	}
+}
diff --git a/internal/container/docker.go b/internal/container/docker.go
index 5c10ef0..208b4a8 100644
--- a/internal/container/docker.go
+++ b/internal/container/docker.go
@@ -15,7 +15,11 @@ type ContainerClient interface { //nolint:revive // stuttering accepted for clar
 	RemoveContainer(ctx context.Context, name string) error
 	CreateContainer(ctx context.Context, spec ContainerSpec) (string, error)
 	StartContainer(ctx context.Context, id string) error
-	ExecInContainer(ctx context.Context, name string, cmd []string) error
+	// ExecInContainer runs cmd inside the named container. user is the
+	// optional UID:GID (or user name) the exec runs as; an empty string
+	// preserves the historical behavior of running as the image's USER
+	// (typically root for agent containers).
+	ExecInContainer(ctx context.Context, name, user string, cmd []string) error
 }
 
 // ContainerState holds the result of inspecting a container.
@@ -94,7 +98,10 @@ func (m *DockerManager) InjectEnvVars(ctx context.Context, envMap map[string]str
 		return fmt.Errorf("build env injection script: %w", err)
 	}
 
-	if execErr := m.client.ExecInContainer(ctx, m.containerName,
+	// Pass the empty user so the env-injection exec keeps its historical
+	// root behavior; the generated script chowns the file back to the
+	// runtime user itself (see BuildEnvInjectionScriptForProfile).
+	if execErr := m.client.ExecInContainer(ctx, m.containerName, "",
 		[]string{"sh", "-c", script}); execErr != nil {
 		return fmt.Errorf("inject env vars: %w", execErr)
 	}
@@ -117,7 +124,7 @@ func (m *DockerManager) ReloadSecrets(ctx context.Context) error {
 		log.Printf("agent profile %q has no in-place reload; new secrets take effect on next agent run", m.profile.Name)
 		return nil
 	}
-	return m.client.ExecInContainer(ctx, m.containerName, m.profile.ReloadCmd())
+	return m.client.ExecInContainer(ctx, m.containerName, "", m.profile.ReloadCmd())
 }
 
 // WireMCPGateway registers sluice's MCP gateway URL inside the agent's
@@ -137,7 +144,7 @@ func (m *DockerManager) WireMCPGateway(ctx context.Context, name, sluiceURL stri
 		log.Printf("agent profile %q does not support automatic MCP wiring; configure %s manually", m.profile.Name, sluiceURL)
 		return nil
 	}
-	err := m.client.ExecInContainer(ctx, m.containerName, m.profile.WireMCPCmd(name, sluiceURL))
+	err := m.client.ExecInContainer(ctx, m.containerName, "", m.profile.WireMCPCmd(name, sluiceURL))
 	if err != nil && m.profile.Name == OpenclawProfile.Name &&
 		strings.Contains(err.Error(), "exit") && strings.Contains(err.Error(), "137") {
 		return nil
diff --git a/internal/container/docker_socket.go b/internal/container/docker_socket.go
index 7809c5c..80a023b 100644
--- a/internal/container/docker_socket.go
+++ b/internal/container/docker_socket.go
@@ -241,11 +241,15 @@ func (c *SocketClient) StartContainer(ctx context.Context, id string) error {
 	return nil
 }
 
-// ExecInContainer runs a command inside a running container.
-func (c *SocketClient) ExecInContainer(ctx context.Context, name string, cmd []string) error {
+// ExecInContainer runs a command inside a running container. user is the
+// optional UID:GID (or user name) the exec runs as; an empty string omits
+// the User field from the exec-create request, preserving Docker's default
+// of running as the image's USER (typically root for agent containers).
+func (c *SocketClient) ExecInContainer(ctx context.Context, name, user string, cmd []string) error {
 	// Step 1: Create exec instance.
 	createBody := execCreateRequest{
 		Cmd:          cmd,
+		User:         user,
 		AttachStdout: true,
 		AttachStderr: true,
 	}
@@ -395,6 +399,7 @@ type createResponseBody struct {
 
 type execCreateRequest struct {
 	Cmd          []string `json:"Cmd"`
+	User         string   `json:"User,omitempty"`
 	AttachStdout bool     `json:"AttachStdout"`
 	AttachStderr bool     `json:"AttachStderr"`
 }
diff --git a/internal/container/docker_socket_test.go b/internal/container/docker_socket_test.go
index 56cc9e0..2463fa0 100644
--- a/internal/container/docker_socket_test.go
+++ b/internal/container/docker_socket_test.go
@@ -316,6 +316,7 @@ func TestSocketClientExecInContainer(t *testing.T) {
 	defer cleanup()
 
 	var execCreateBody execCreateRequest
+	var rawCreateBody map[string]any
 	mux.HandleFunc("/v1.25/containers/mycontainer/exec", func(w http.ResponseWriter, r *http.Request) {
 		if r.Method != "POST" {
 			http.Error(w, "want POST", http.StatusMethodNotAllowed)
@@ -323,6 +324,7 @@ func TestSocketClientExecInContainer(t *testing.T) {
 		}
 		data, _ := io.ReadAll(r.Body)
 		_ = json.Unmarshal(data, &execCreateBody)
+		_ = json.Unmarshal(data, &rawCreateBody)
 		w.WriteHeader(http.StatusCreated)
 		_ = json.NewEncoder(w).Encode(map[string]string{"Id": "exec123"})
 	})
@@ -339,7 +341,7 @@ func TestSocketClientExecInContainer(t *testing.T) {
 		_ = json.NewEncoder(w).Encode(map[string]interface{}{"ExitCode": 0})
 	})
 
-	err := client.ExecInContainer(context.Background(), "mycontainer", []string{"openclaw", "secrets", "reload"})
+	err := client.ExecInContainer(context.Background(), "mycontainer", "", []string{"openclaw", "secrets", "reload"})
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
@@ -347,6 +349,49 @@ func TestSocketClientExecInContainer(t *testing.T) {
 	if len(execCreateBody.Cmd) != 3 || execCreateBody.Cmd[0] != "openclaw" {
 		t.Errorf("exec Cmd = %v, want [openclaw secrets reload]", execCreateBody.Cmd)
 	}
+	if execCreateBody.User != "" {
+		t.Errorf("exec User = %q, want empty for default-root exec", execCreateBody.User)
+	}
+	if _, ok := rawCreateBody["User"]; ok {
+		t.Error("exec-create body should omit the User field when no user is set (omitempty)")
+	}
+}
+
+// TestSocketClientExecInContainerWithUser verifies that a non-empty user is
+// carried into the exec-create request body (Docker runs the exec as that
+// UID:GID); TestSocketClientExecInContainer pins the omitted-when-empty case.
+func TestSocketClientExecInContainerWithUser(t *testing.T) {
+	client, mux, cleanup := newTestServer(t)
+	defer cleanup()
+
+	var rawCreateBody map[string]any
+	var execCreateBody execCreateRequest
+	mux.HandleFunc("/v1.25/containers/mycontainer/exec", func(w http.ResponseWriter, r *http.Request) {
+		data, _ := io.ReadAll(r.Body)
+		_ = json.Unmarshal(data, &execCreateBody)
+		_ = json.Unmarshal(data, &rawCreateBody)
+		w.WriteHeader(http.StatusCreated)
+		_ = json.NewEncoder(w).Encode(map[string]string{"Id": "exec123"})
+	})
+	mux.HandleFunc("/v1.25/exec/exec123/start", func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	})
+	mux.HandleFunc("/v1.25/exec/exec123/json", func(w http.ResponseWriter, _ *http.Request) {
+		_ = json.NewEncoder(w).Encode(map[string]interface{}{"ExitCode": 0})
+	})
+
+	err := client.ExecInContainer(context.Background(), "mycontainer", "10000:10000",
+		[]string{"hermes", "auth", "reset", "openai-codex"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if execCreateBody.User != "10000:10000" {
+		t.Errorf("exec User = %q, want 10000:10000", execCreateBody.User)
+	}
+	if _, ok := rawCreateBody["User"]; !ok {
+		t.Error("exec-create body should contain a User field when a user is set")
+	}
 }
 
 func TestSocketClientExecNonZeroExit(t *testing.T) {
@@ -366,7 +411,7 @@ func TestSocketClientExecNonZeroExit(t *testing.T) {
 		_ = json.NewEncoder(w).Encode(map[string]interface{}{"ExitCode": 127})
 	})
 
-	err := client.ExecInContainer(context.Background(), "mycontainer", []string{"openclaw", "secrets", "reload"})
+	err := client.ExecInContainer(context.Background(), "mycontainer", "", []string{"openclaw", "secrets", "reload"})
 	if err == nil {
 		t.Fatal("expected error for non-zero exit code")
 	}
diff --git a/internal/container/docker_test.go b/internal/container/docker_test.go
index 078853a..d91932a 100644
--- a/internal/container/docker_test.go
+++ b/internal/container/docker_test.go
@@ -25,6 +25,8 @@ type mockClient struct {
 	execCalled  bool
 	execCmd     []string
 	execCalls   [][]string // all exec calls recorded
+	execUser    string     // user from the most recent exec call
+	execUsers   []string   // user from every exec call, in order
 	// Track container names passed to each method.
 	inspectedName string
 	stoppedName   string
@@ -61,11 +63,13 @@ func (m *mockClient) StartContainer(_ context.Context, id string) error {
 	return m.startErr
 }
 
-func (m *mockClient) ExecInContainer(_ context.Context, name string, cmd []string) error {
+func (m *mockClient) ExecInContainer(_ context.Context, name, user string, cmd []string) error {
 	m.execCalled = true
 	m.execName = name
 	m.execCmd = cmd
+	m.execUser = user
 	m.execCalls = append(m.execCalls, cmd)
+	m.execUsers = append(m.execUsers, user)
 
 	// Use per-call error if available.
 	if len(m.execErrs) > 0 {
@@ -471,6 +475,48 @@ func TestInjectEnvVarsHermesProfile(t *testing.T) {
 	}
 }
 
+// TestExistingExecCallersUseDefaultUser pins the no-behavior-change
+// guarantee for Task 6: the existing DockerManager exec call sites
+// (InjectEnvVars, ReloadSecrets, WireMCPGateway) keep running as the
+// image's USER (empty user string), regardless of the profile's
+// ExecUser. Only the future auth-reset path (Task 7) opts into a user.
+func TestExistingExecCallersUseDefaultUser(t *testing.T) {
+	// Hermes profile (ExecUser "10000:10000") exercises both the env
+	// write and the MCP wire exec; neither should adopt the profile user.
+	mc := &mockClient{}
+	mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile)
+
+	if err := mgr.InjectEnvVars(context.Background(),
+		map[string]string{"OPENAI_API_KEY": "sk-phantom"}, false); err != nil {
+		t.Fatalf("InjectEnvVars: %v", err)
+	}
+	if err := mgr.WireMCPGateway(context.Background(), "sluice", "http://sluice:3000/mcp"); err != nil {
+		t.Fatalf("WireMCPGateway: %v", err)
+	}
+
+	if len(mc.execUsers) == 0 {
+		t.Fatal("expected at least one exec call recorded")
+	}
+	for i, u := range mc.execUsers {
+		if u != "" {
+			t.Errorf("exec call %d ran as user %q, want empty (no behavior change)", i, u)
+		}
+	}
+
+	// Openclaw profile runs the env write + reload exec; both default too.
+	mcO := &mockClient{}
+	mgrO := NewDockerManagerForProfile(mcO, "openclaw", OpenclawProfile)
+	if err := mgrO.InjectEnvVars(context.Background(),
+		map[string]string{"OPENAI_API_KEY": "sk-phantom"}, false); err != nil {
+		t.Fatalf("InjectEnvVars (openclaw): %v", err)
+	}
+	for i, u := range mcO.execUsers {
+		if u != "" {
+			t.Errorf("openclaw exec call %d ran as user %q, want empty", i, u)
+		}
+	}
+}
+
 func TestWireMCPGateway_Exit137GatedToOpenclaw(t *testing.T) {
 	// Openclaw profile: exit 137 from the gateway restart is swallowed.
 	mc := &mockClient{execErr: fmt.Errorf("exec failed: exit 137")}

From deb7d88fd7f350363070e8eeb6c709f1ad444dd3 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:29:29 +0800
Subject: [PATCH 08/19] feat(container): add ResetAuth + profile ResetAuthCmd
 for agent auth un-latch

---
 cmd/sluice/main_test.go                       |  4 +
 ...22-pool-exhaustion-and-agent-auth-reset.md | 15 ++--
 internal/api/server_test.go                   |  4 +
 internal/container/agent_profile.go           | 26 +++++++
 internal/container/agent_profile_test.go      | 27 +++++++
 internal/container/apple.go                   | 15 ++++
 internal/container/docker.go                  | 17 ++++
 internal/container/docker_test.go             | 77 +++++++++++++++++++
 internal/container/tart.go                    | 15 ++++
 internal/container/types.go                   | 34 ++++++++
 internal/telegram/approval_test.go            |  4 +
 11 files changed, 231 insertions(+), 7 deletions(-)

diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go
index 53e1bb1..7b2c2bc 100644
--- a/cmd/sluice/main_test.go
+++ b/cmd/sluice/main_test.go
@@ -1453,6 +1453,10 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
 	return nil
 }
 
+func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error {
+	return nil
+}
+
 func (m *mockContainerMgr) Runtime() container.Runtime {
 	return container.RuntimeDocker
 }
diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index bb9f32e..acde716 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -350,16 +350,17 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/container/docker.go`, `apple.go`, `tart.go`, standalone (`none`)
 - Modify: `internal/container/agent_profile_test.go`, `docker_test.go`
 
-- [ ] add `ResetAuthCmd func(target string) []string` to `AgentProfile`; `HermesProfile` →
+- [x] add `ResetAuthCmd func(target string) []string` to `AgentProfile`; `HermesProfile` →
   pure-argv `["/opt/hermes/.venv/bin/hermes","auth","reset",target]`;
   `OpenclawProfile.ResetAuthCmd` nil (documented; Post-Completion verification)
-- [ ] add `ResetAuth(ctx, target string) error` to `ContainerManager`; implement per backend;
-  nil `ResetAuthCmd` → log notice, return nil (mirror `ReloadSecrets`); docker exec passes
-  `profile.ExecUser()` (Task 6) so hermes runs as 10000:10000
-- [ ] validate `target` (non-empty, no NUL, allowlisted charset) before exec
-- [ ] write tests: argv per profile; nil-cmd no-ops with notice; empty/invalid target
+- [x] add `ResetAuth(ctx, target string) error` to `ContainerManager`; implement per backend
+  (docker/apple/tart; `none`/standalone has no manager struct — `containerMgr` is nil and
+  handled by the caller); nil `ResetAuthCmd` → log notice, return nil (mirror `ReloadSecrets`);
+  docker exec passes `profile.ExecUser()` (Task 6) so hermes runs as 10000:10000
+- [x] validate `target` (non-empty, no NUL, allowlisted charset `[A-Za-z0-9_.:-]+`) before exec
+- [x] write tests: argv per profile; nil-cmd no-ops with notice; empty/invalid target
   rejected; **docker exec uses the runtime UID from `ExecUser` (now passable, Task 6)**
-- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 8
+- [x] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 8
 
 ### Task 8: Wire auto-reset on the recovery edge (opt-in, per pool)
 
diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index f43092f..fa069fb 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -3779,6 +3779,10 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
 	return nil
 }
 
+func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error {
+	return nil
+}
+
 func (m *mockContainerMgr) Runtime() container.Runtime {
 	return container.RuntimeDocker
 }
diff --git a/internal/container/agent_profile.go b/internal/container/agent_profile.go
index 6783d03..66423cf 100644
--- a/internal/container/agent_profile.go
+++ b/internal/container/agent_profile.go
@@ -35,6 +35,17 @@ type AgentProfile struct {
 	// manually before starting the agent.
 	WireMCPCmd func(name, url string) []string
 
+	// ResetAuthCmd returns the argv to exec inside the agent container in
+	// order to clear ("un-latch") the agent's local auth state for the
+	// given target so it retries after a pool quota window passes. It is
+	// pure argv (no `sh -c` wrapper), so there is no shell-metacharacter
+	// threat from the interpolated target; the target is still validated
+	// (validateResetAuthTarget) before exec as defense in depth.
+	// Returning nil means the profile has no auth-reset mechanism; the
+	// caller should log a notice and rely on the agent recovering on its
+	// own (mirrors a nil ReloadCmd).
+	ResetAuthCmd func(target string) []string
+
 	// execUser is the UID:GID (or user name) that file-writing execs
 	// should run as inside the agent container. Empty means run as the
 	// image's USER (typically root). Hermes runs its gateway as a
@@ -69,6 +80,12 @@ var OpenclawProfile = &AgentProfile{
 	WireMCPCmd: func(name, url string) []string {
 		return GatewayRPCNodeCommand("wire-mcp", name, url)
 	},
+	// ResetAuthCmd is nil pending verification of whether openclaw
+	// latches its auth state on a usage-limit exhaustion the way hermes
+	// does. If it turns out openclaw needs an explicit un-latch, this is
+	// the place to add the gateway-RPC command (see Post-Completion in
+	// docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md).
+	ResetAuthCmd: nil,
 }
 
 // hermesMCPWireScript is a small Python script that registers an MCP
@@ -133,6 +150,15 @@ var HermesProfile = &AgentProfile{
 			url,
 		}
 	},
+	// ResetAuthCmd un-latches hermes' local auth state for the given
+	// target via `hermes auth reset <target>`. It is pure argv (no
+	// `sh -c`), so the target cannot smuggle shell metacharacters — but
+	// the caller still validates it (validateResetAuthTarget) before
+	// exec. The exec must run as the runtime UID (see execUser above) or
+	// it root-chowns hermes-owned auth files and bricks the gateway.
+	ResetAuthCmd: func(target string) []string {
+		return []string{"/opt/hermes/.venv/bin/hermes", "auth", "reset", target}
+	},
 }
 
 // builtinProfiles is the registry consulted by ProfileFromName.
diff --git a/internal/container/agent_profile_test.go b/internal/container/agent_profile_test.go
index 4108d94..8386298 100644
--- a/internal/container/agent_profile_test.go
+++ b/internal/container/agent_profile_test.go
@@ -362,3 +362,30 @@ func TestProfileExecUser(t *testing.T) {
 		t.Errorf("(*AgentProfile)(nil).ExecUser() = %q, want empty", got)
 	}
 }
+
+func TestHermesProfile_ResetAuthCmd(t *testing.T) {
+	if HermesProfile.ResetAuthCmd == nil {
+		t.Fatal("hermes profile should have a ResetAuthCmd")
+	}
+	got := HermesProfile.ResetAuthCmd("openai-codex")
+	want := []string{"/opt/hermes/.venv/bin/hermes", "auth", "reset", "openai-codex"}
+	if len(got) != len(want) {
+		t.Fatalf("ResetAuthCmd argv = %v, want %v", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Errorf("ResetAuthCmd[%d] = %q, want %q", i, got[i], want[i])
+		}
+	}
+	// Pure argv: no sh -c wrapper, so the target lands as its own
+	// element and cannot smuggle shell metacharacters.
+	if got[0] == "sh" {
+		t.Errorf("ResetAuthCmd should be pure argv, not an sh -c wrapper: %v", got)
+	}
+}
+
+func TestOpenclawProfile_ResetAuthCmdIsNil(t *testing.T) {
+	if OpenclawProfile.ResetAuthCmd != nil {
+		t.Error("openclaw profile should have nil ResetAuthCmd (pending verification it latches)")
+	}
+}
diff --git a/internal/container/apple.go b/internal/container/apple.go
index 50a7e0b..65bb86a 100644
--- a/internal/container/apple.go
+++ b/internal/container/apple.go
@@ -252,6 +252,21 @@ func (m *AppleManager) ReloadSecrets(ctx context.Context) error {
 	return err
 }
 
+// ResetAuth clears the agent's local auth state for target. The mechanism
+// is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors
+// ReloadSecrets). The target is validated before exec.
+func (m *AppleManager) ResetAuth(ctx context.Context, target string) error {
+	if m.profile.ResetAuthCmd == nil {
+		log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name)
+		return nil
+	}
+	if err := validateResetAuthTarget(target); err != nil {
+		return err
+	}
+	_, err := m.cli.Exec(ctx, m.containerName, m.profile.ResetAuthCmd(target))
+	return err
+}
+
 // WireMCPGateway registers sluice's MCP gateway URL in the agent's
 // config. The exact storage format depends on the profile.
 func (m *AppleManager) WireMCPGateway(ctx context.Context, name, sluiceURL string) error {
diff --git a/internal/container/docker.go b/internal/container/docker.go
index 208b4a8..b29566c 100644
--- a/internal/container/docker.go
+++ b/internal/container/docker.go
@@ -127,6 +127,23 @@ func (m *DockerManager) ReloadSecrets(ctx context.Context) error {
 	return m.client.ExecInContainer(ctx, m.containerName, "", m.profile.ReloadCmd())
 }
 
+// ResetAuth clears the agent's local auth state for target so it retries
+// after a credential-pool quota window passes. The mechanism is
+// profile-specific (hermes runs `hermes auth reset <target>`); a nil
+// ResetAuthCmd is a logged no-op (mirrors ReloadSecrets). The exec runs
+// as the profile's ExecUser (UID 10000:10000 for hermes) so it does not
+// root-chown the agent's auth files and brick the gateway.
+func (m *DockerManager) ResetAuth(ctx context.Context, target string) error {
+	if m.profile.ResetAuthCmd == nil {
+		log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name)
+		return nil
+	}
+	if err := validateResetAuthTarget(target); err != nil {
+		return err
+	}
+	return m.client.ExecInContainer(ctx, m.containerName, m.profile.ExecUser(), m.profile.ResetAuthCmd(target))
+}
+
 // WireMCPGateway registers sluice's MCP gateway URL inside the agent's
 // config so that its embedded runtime discovers sluice as an MCP server.
 // The exact storage format depends on the profile (openclaw patches its
diff --git a/internal/container/docker_test.go b/internal/container/docker_test.go
index d91932a..13dcaa9 100644
--- a/internal/container/docker_test.go
+++ b/internal/container/docker_test.go
@@ -766,3 +766,80 @@ func TestDockerManagerInjectCACertNoop(t *testing.T) {
 		t.Error("Docker InjectCACert should not exec anything")
 	}
 }
+
+func TestResetAuthHermesRunsAsRuntimeUID(t *testing.T) {
+	mc := &mockClient{}
+	mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile)
+
+	if err := mgr.ResetAuth(context.Background(), "openai-codex"); err != nil {
+		t.Fatalf("ResetAuth: %v", err)
+	}
+	if len(mc.execCalls) != 1 {
+		t.Fatalf("expected 1 exec call, got %d", len(mc.execCalls))
+	}
+	// The hermes reset exec must run as the runtime UID so it does not
+	// root-chown hermes-owned auth files and brick the gateway.
+	if mc.execUsers[0] != "10000:10000" {
+		t.Errorf("ResetAuth exec user = %q, want 10000:10000", mc.execUsers[0])
+	}
+	want := []string{"/opt/hermes/.venv/bin/hermes", "auth", "reset", "openai-codex"}
+	cmd := mc.execCalls[0]
+	if len(cmd) != len(want) {
+		t.Fatalf("ResetAuth argv = %v, want %v", cmd, want)
+	}
+	for i := range want {
+		if cmd[i] != want[i] {
+			t.Errorf("ResetAuth argv[%d] = %q, want %q", i, cmd[i], want[i])
+		}
+	}
+}
+
+func TestResetAuthNilCmdNoOps(t *testing.T) {
+	// Openclaw has a nil ResetAuthCmd: ResetAuth logs a notice and
+	// returns nil without exec'ing anything (mirrors ReloadSecrets).
+	mc := &mockClient{}
+	mgr := NewDockerManagerForProfile(mc, "openclaw", OpenclawProfile)
+
+	if err := mgr.ResetAuth(context.Background(), "openai-codex"); err != nil {
+		t.Fatalf("ResetAuth with nil cmd should return nil, got: %v", err)
+	}
+	if mc.execCalled {
+		t.Error("ResetAuth with nil ResetAuthCmd should not exec anything")
+	}
+}
+
+func TestResetAuthRejectsInvalidTarget(t *testing.T) {
+	cases := []struct {
+		name   string
+		target string
+	}{
+		{"empty", ""},
+		{"nul byte", "openai\x00codex"},
+		{"shell metachar", "openai;rm -rf /"},
+		{"space", "openai codex"},
+		{"slash", "openai/codex"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			mc := &mockClient{}
+			mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile)
+			if err := mgr.ResetAuth(context.Background(), tc.target); err == nil {
+				t.Errorf("ResetAuth(%q) should reject invalid target, got nil error", tc.target)
+			}
+			if mc.execCalled {
+				t.Errorf("ResetAuth(%q) should not exec on invalid target", tc.target)
+			}
+		})
+	}
+}
+
+func TestResetAuthValidTargetWithColonAndDot(t *testing.T) {
+	mc := &mockClient{}
+	mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile)
+	if err := mgr.ResetAuth(context.Background(), "provider:account.v2"); err != nil {
+		t.Fatalf("ResetAuth with allowlisted target should pass: %v", err)
+	}
+	if !mc.execCalled {
+		t.Error("ResetAuth with valid target should exec")
+	}
+}
diff --git a/internal/container/tart.go b/internal/container/tart.go
index 4d389b1..f4ad5a8 100644
--- a/internal/container/tart.go
+++ b/internal/container/tart.go
@@ -291,6 +291,21 @@ func (m *TartManager) ReloadSecrets(ctx context.Context) error {
 	return err
 }
 
+// ResetAuth clears the agent's local auth state for target. The mechanism
+// is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors
+// ReloadSecrets). The target is validated before exec.
+func (m *TartManager) ResetAuth(ctx context.Context, target string) error {
+	if m.profile.ResetAuthCmd == nil {
+		log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name)
+		return nil
+	}
+	if err := validateResetAuthTarget(target); err != nil {
+		return err
+	}
+	_, err := m.cli.Exec(ctx, m.vmName, m.profile.ResetAuthCmd(target))
+	return err
+}
+
 // WireMCPGateway registers sluice's MCP gateway URL in the agent's config.
 // The exact storage format depends on the profile.
 func (m *TartManager) WireMCPGateway(ctx context.Context, name, sluiceURL string) error {
diff --git a/internal/container/types.go b/internal/container/types.go
index 7c1766c..39bf169 100644
--- a/internal/container/types.go
+++ b/internal/container/types.go
@@ -76,6 +76,15 @@ type ContainerManager interface { //nolint:revive // stuttering accepted for cla
 	// ReloadSecrets signals the agent to re-read secrets from the env file.
 	ReloadSecrets(ctx context.Context) error
 
+	// ResetAuth clears ("un-latches") the agent's local auth state for the
+	// given target so the agent retries after a credential-pool quota
+	// window passes (the agent will not self-recover otherwise). The
+	// mechanism is profile-specific (hermes runs `hermes auth reset
+	// <target>`, as its runtime UID on Docker; openclaw has none yet). With
+	// no reset command this is a logged no-op, so callers can treat it as
+	// best-effort. The target is validated before exec.
+	ResetAuth(ctx context.Context, target string) error
+
 	// WireMCPGateway registers sluice's MCP gateway URL under
 	// mcp.servers.<name> in the agent's config so the embedded runtime
 	// discovers sluice as an MCP server. Idempotent: a second call with
@@ -112,6 +121,31 @@ func ValidateEnvVarKey(key string) error {
 	return nil
 }
 
+// resetAuthTargetRe matches an allowlisted auth-reset target: one or more
+// letters, digits, dash, underscore, dot, or colon. Targets are agent
+// identifiers like "openai-codex"; the colon allows scoped names such as
+// "provider:account". The target is exec'd as a pure argv element (no
+// shell), so this allowlist is defense in depth, not a shell-escape.
+var resetAuthTargetRe = regexp.MustCompile(`^[A-Za-z0-9_.:-]+$`)
+
+// validateResetAuthTarget checks that an auth-reset target is non-empty,
+// free of NUL bytes, and within the allowlisted charset. The ResetAuth
+// implementations call it before building and exec'ing the profile's
+// reset command. Mirrors ValidateEnvVarKey's fail-closed style: an
+// invalid target is rejected (no exec) rather than sanitized.
+func validateResetAuthTarget(target string) error {
+	if target == "" {
+		return fmt.Errorf("auth-reset target is empty")
+	}
+	if strings.ContainsRune(target, 0) {
+		return fmt.Errorf("auth-reset target contains NUL byte")
+	}
+	if !resetAuthTargetRe.MatchString(target) {
+		return fmt.Errorf("invalid auth-reset target %q: must match [A-Za-z0-9_.:-]+", target)
+	}
+	return nil
+}
+
 // validateEnvVarValue rejects values that cannot be safely written as a
 // single line of a dotenv-style file. Newlines would split one logical
 // entry into multiple file lines (the second line would either be a
diff --git a/internal/telegram/approval_test.go b/internal/telegram/approval_test.go
index d8cc48c..081ed23 100644
--- a/internal/telegram/approval_test.go
+++ b/internal/telegram/approval_test.go
@@ -2092,6 +2092,10 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
 	return nil
 }
 
+func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error {
+	return nil
+}
+
 func (m *mockContainerMgr) Runtime() container.Runtime {
 	return container.RuntimeDocker
 }

From 8f4a12a2b5dfeb8fef61e6480f325e42b582bad1 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:36:59 +0800
Subject: [PATCH 09/19] feat(pools): auto-reset agent auth on pool recovery
 edge (opt-in per pool)

---
 cmd/sluice/main.go                            | 104 ++++++++
 cmd/sluice/main_test.go                       | 250 +++++++++++++++++-
 ...22-pool-exhaustion-and-agent-auth-reset.md |  18 +-
 3 files changed, 361 insertions(+), 11 deletions(-)

diff --git a/cmd/sluice/main.go b/cmd/sluice/main.go
index ef6094a..dd61bb5 100644
--- a/cmd/sluice/main.go
+++ b/cmd/sluice/main.go
@@ -528,6 +528,17 @@ func main() {
 		}()
 	})
 
+	// Wire pool recovery side effects (Task 8): the always-fires operator
+	// notice (fanned across broker channels, independent of any reset target)
+	// and the opt-in, per-pool agent auth auto-reset. Registered
+	// UNCONDITIONALLY, mirroring SetOnFailover: the notice is internally gated
+	// on a broker being present, and the reset is internally gated on the
+	// recovered pool having a non-empty auth_reset_target AND a container
+	// manager. Both side effects run in detached goroutines so the recovery
+	// monitor is never blocked by a SQLite read, a Telegram round-trip, or a
+	// container exec.
+	wirePoolRecovery(srv, db, containerMgr, failoverBroker, logger)
+
 	if len(allChannels) > 0 {
 		// Start all channels.
 		if tgChannel != nil {
@@ -955,6 +966,99 @@ func injectEnvVarsFromStore(db *store.Store, mgr container.ContainerManager) err
 	return nil
 }
 
+// auditLogger is the minimal audit-write surface wirePoolRecovery needs.
+// *audit.FileLogger satisfies it; tests can supply a fake.
+type auditLogger interface {
+	Log(evt audit.Event) error
+}
+
+// wirePoolRecovery registers the two pool-recovery side effects on the proxy
+// server (Task 8), mirroring the SetOnFailover wiring:
+//
+//   - SetOnPoolRecoveredNotice: ALWAYS fires on a recovery edge, independent of
+//     any reset target. Builds the "pool recovered" notice via
+//     proxy.FormatPoolRecoveredNotice and fans it across the broker's channels.
+//     Internally gated only on a broker being present.
+//   - SetOnPoolRecovered: the opt-in, per-pool agent auth auto-reset. Looks up
+//     the recovered pool's auth_reset_target in the store; when non-empty AND a
+//     container manager exists, calls mgr.ResetAuth in a detached goroutine with
+//     a fresh bounded context (never a wake-scoped ctx) and emits an
+//     agent_auth_reset audit event on success. An empty target is the opt-out
+//     default (no reset). A ResetAuth error is logged, not fatal.
+//
+// Both callbacks run their work in detached goroutines so the recovery monitor
+// is never blocked by a SQLite read, a Telegram round-trip, or a container exec.
+// The two hooks are independent: the notice fires even when no reset target is
+// configured, and the reset never suppresses the notice.
+func wirePoolRecovery(srv *proxy.Server, db *store.Store, mgr container.ContainerManager, broker *channel.Broker, logger auditLogger) {
+	srv.SetOnPoolRecoveredNotice(poolRecoveredNoticeFunc(broker))
+	srv.SetOnPoolRecovered(poolAuthResetFunc(db, mgr, logger))
+}
+
+// poolRecoveredNoticeFunc builds the always-fires recovery-notice callback. It
+// fans proxy.FormatPoolRecoveredNotice across the broker's channels in a
+// detached goroutine; a nil broker yields a callback that does nothing (the
+// notice is internally gated on a broker being present, like SetOnFailover).
+func poolRecoveredNoticeFunc(broker *channel.Broker) func(pool string) {
+	return func(pool string) {
+		if broker == nil {
+			return
+		}
+		go func() {
+			msg := proxy.FormatPoolRecoveredNotice(pool)
+			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+			defer cancel()
+			for _, ch := range broker.Channels() {
+				if nerr := ch.Notify(ctx, msg); nerr != nil {
+					log.Printf("[POOL-RECOVERY] notice via %s failed: %v", ch.Type(), nerr)
+				}
+			}
+		}()
+	}
+}
+
+// poolAuthResetFunc builds the opt-in, per-pool agent auth auto-reset callback.
+// It looks up the recovered pool's auth_reset_target; when non-empty AND a
+// container manager exists, it calls mgr.ResetAuth in a detached goroutine with
+// a fresh bounded context and emits an agent_auth_reset audit event on success.
+// An empty target is the opt-out default. A ResetAuth error is logged, not fatal.
+func poolAuthResetFunc(db *store.Store, mgr container.ContainerManager, logger auditLogger) func(pool string) {
+	return func(pool string) {
+		if db == nil || mgr == nil {
+			return
+		}
+		go func() {
+			p, err := db.GetPool(pool)
+			if err != nil {
+				log.Printf("[POOL-RECOVERY] auth-reset lookup for pool %q failed: %v", pool, err)
+				return
+			}
+			if p == nil || p.AuthResetTarget == "" {
+				// Opt-out default: no reset target configured for this pool.
+				return
+			}
+			target := p.AuthResetTarget
+			ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+			defer cancel()
+			if rerr := mgr.ResetAuth(ctx, target); rerr != nil {
+				log.Printf("[POOL-RECOVERY] auth reset for pool %q (target %q) failed: %v", pool, target, rerr)
+				return
+			}
+			log.Printf("[POOL-RECOVERY] agent auth reset for pool %q (target %q)", pool, target)
+			if logger != nil {
+				if lerr := logger.Log(audit.Event{
+					Action:     "agent_auth_reset",
+					Verdict:    "recover",
+					Credential: pool,
+					Reason:     target,
+				}); lerr != nil {
+					log.Printf("[POOL-RECOVERY] audit log for agent_auth_reset (pool %q) failed: %v", pool, lerr)
+				}
+			}
+		}()
+	}
+}
+
 // envDefault returns the environment variable value if set, otherwise the fallback.
 func envDefault(key, fallback string) string {
 	if v := os.Getenv(key); v != "" {
diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go
index 7b2c2bc..c244bd1 100644
--- a/cmd/sluice/main_test.go
+++ b/cmd/sluice/main_test.go
@@ -13,6 +13,8 @@ import (
 	"time"
 
 	"github.com/nemirovsky/sluice/internal/api"
+	"github.com/nemirovsky/sluice/internal/audit"
+	"github.com/nemirovsky/sluice/internal/channel"
 	"github.com/nemirovsky/sluice/internal/container"
 	"github.com/nemirovsky/sluice/internal/policy"
 	"github.com/nemirovsky/sluice/internal/proxy"
@@ -1420,8 +1422,19 @@ func TestStandaloneModeCredentialInjection(t *testing.T) {
 
 // mockContainerMgr implements container.ContainerManager for testing.
 type mockContainerMgr struct {
-	injectedEnv map[string]string
-	injectErr   error
+	mu             sync.Mutex
+	injectedEnv    map[string]string
+	injectErr      error
+	resetAuthCalls []string
+	resetAuthErr   error
+}
+
+func (m *mockContainerMgr) resetAuthTargets() []string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	out := make([]string, len(m.resetAuthCalls))
+	copy(out, m.resetAuthCalls)
+	return out
 }
 
 func (m *mockContainerMgr) InjectEnvVars(_ context.Context, envMap map[string]string, _ bool) error {
@@ -1453,8 +1466,12 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error {
 	return nil
 }
 
-func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error {
-	return nil
+func (m *mockContainerMgr) ResetAuth(_ context.Context, target string) error {
+	m.mu.Lock()
+	m.resetAuthCalls = append(m.resetAuthCalls, target)
+	err := m.resetAuthErr
+	m.mu.Unlock()
+	return err
 }
 
 func (m *mockContainerMgr) Runtime() container.Runtime {
@@ -1672,3 +1689,228 @@ func TestDeriveMCPBaseURL(t *testing.T) {
 		})
 	}
 }
+
+// recoveryMockChannel implements channel.Channel for wirePoolRecovery tests.
+// It records every Notify message so the test can assert the recovered notice
+// fired across the broker's channels.
+type recoveryMockChannel struct {
+	mu       sync.Mutex
+	notified []string
+}
+
+func (c *recoveryMockChannel) RequestApproval(_ context.Context, _ channel.ApprovalRequest) error {
+	return nil
+}
+func (c *recoveryMockChannel) CancelApproval(_ string) error { return nil }
+func (c *recoveryMockChannel) Commands() <-chan channel.Command {
+	return nil
+}
+
+func (c *recoveryMockChannel) Notify(_ context.Context, msg string) error {
+	c.mu.Lock()
+	c.notified = append(c.notified, msg)
+	c.mu.Unlock()
+	return nil
+}
+func (c *recoveryMockChannel) Start() error { return nil }
+func (c *recoveryMockChannel) Stop()        {}
+func (c *recoveryMockChannel) Type() channel.ChannelType {
+	return channel.ChannelTelegram
+}
+
+func (c *recoveryMockChannel) messages() []string {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	out := make([]string, len(c.notified))
+	copy(out, c.notified)
+	return out
+}
+
+// fakeAuditLogger captures audit events for wirePoolRecovery tests.
+type fakeAuditLogger struct {
+	mu     sync.Mutex
+	events []audit.Event
+}
+
+func (f *fakeAuditLogger) Log(evt audit.Event) error {
+	f.mu.Lock()
+	f.events = append(f.events, evt)
+	f.mu.Unlock()
+	return nil
+}
+
+func (f *fakeAuditLogger) snapshot() []audit.Event {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	out := make([]audit.Event, len(f.events))
+	copy(out, f.events)
+	return out
+}
+
+// waitFor polls cond up to ~2s; fails the test if it never becomes true.
+func waitFor(t *testing.T, cond func() bool, msg string) {
+	t.Helper()
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if cond() {
+			return
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+	t.Fatalf("condition never met: %s", msg)
+}
+
+func newRecoveryTestStore(t *testing.T) *store.Store {
+	t.Helper()
+	db, err := store.New(":memory:")
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+	t.Cleanup(func() { _ = db.Close() })
+	return db
+}
+
+func seedRecoveryPool(t *testing.T, db *store.Store, pool string, members []string) {
+	t.Helper()
+	for _, m := range members {
+		if err := db.AddCredentialMeta(m, "oauth", "https://auth.example.com/token"); err != nil {
+			t.Fatalf("add credential meta %q: %v", m, err)
+		}
+	}
+	if err := db.CreatePoolWithMembers(pool, "", members); err != nil {
+		t.Fatalf("create pool %q: %v", pool, err)
+	}
+}
+
+// TestPoolAuthResetFuncTargetSet asserts a pool with a non-empty
+// auth_reset_target triggers exactly one ResetAuth(target) call and one
+// agent_auth_reset audit event on recovery.
+func TestPoolAuthResetFuncTargetSet(t *testing.T) {
+	db := newRecoveryTestStore(t)
+	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
+		t.Fatalf("SetPoolAuthResetTarget: %v", err)
+	}
+
+	mgr := &mockContainerMgr{}
+	logger := &fakeAuditLogger{}
+	fn := poolAuthResetFunc(db, mgr, logger)
+
+	fn("codex")
+
+	waitFor(t, func() bool { return len(mgr.resetAuthTargets()) == 1 }, "ResetAuth called once")
+	targets := mgr.resetAuthTargets()
+	if targets[0] != "openai-codex" {
+		t.Errorf("ResetAuth target = %q, want %q", targets[0], "openai-codex")
+	}
+
+	waitFor(t, func() bool { return len(logger.snapshot()) == 1 }, "one audit event")
+	evts := logger.snapshot()
+	e := evts[0]
+	if e.Action != "agent_auth_reset" || e.Verdict != "recover" || e.Credential != "codex" || e.Reason != "openai-codex" {
+		t.Errorf("audit event = %+v, want action=agent_auth_reset verdict=recover credential=codex reason=openai-codex", e)
+	}
+}
+
+// TestPoolAuthResetFuncNoTarget asserts a pool with an empty auth_reset_target
+// triggers no ResetAuth call (the recovered notice path is independent and
+// covered by TestPoolRecoveredNoticeFunc).
+func TestPoolAuthResetFuncNoTarget(t *testing.T) {
+	db := newRecoveryTestStore(t)
+	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+
+	mgr := &mockContainerMgr{}
+	logger := &fakeAuditLogger{}
+	fn := poolAuthResetFunc(db, mgr, logger)
+
+	fn("codex")
+
+	// Give the detached goroutine time to run; it must not call ResetAuth.
+	time.Sleep(100 * time.Millisecond)
+	if got := mgr.resetAuthTargets(); len(got) != 0 {
+		t.Errorf("ResetAuth calls = %v, want none for a pool with empty auth_reset_target", got)
+	}
+	if got := logger.snapshot(); len(got) != 0 {
+		t.Errorf("audit events = %+v, want none", got)
+	}
+}
+
+// TestPoolAuthResetFuncErrorNotFatal asserts a ResetAuth error is logged (no
+// panic, no audit event) and does not crash.
+func TestPoolAuthResetFuncErrorNotFatal(t *testing.T) {
+	db := newRecoveryTestStore(t)
+	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
+		t.Fatalf("SetPoolAuthResetTarget: %v", err)
+	}
+
+	mgr := &mockContainerMgr{resetAuthErr: fmt.Errorf("reset boom")}
+	logger := &fakeAuditLogger{}
+	fn := poolAuthResetFunc(db, mgr, logger)
+
+	fn("codex")
+
+	waitFor(t, func() bool { return len(mgr.resetAuthTargets()) == 1 }, "ResetAuth attempted once")
+	// On error, no audit event is emitted; give the goroutine time to settle.
+	time.Sleep(100 * time.Millisecond)
+	if got := logger.snapshot(); len(got) != 0 {
+		t.Errorf("audit events = %+v, want none on ResetAuth error", got)
+	}
+}
+
+// TestPoolAuthResetFuncNilManager asserts a nil container manager is a no-op
+// (no panic), covering standalone deployments.
+func TestPoolAuthResetFuncNilManager(t *testing.T) {
+	db := newRecoveryTestStore(t)
+	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
+		t.Fatalf("SetPoolAuthResetTarget: %v", err)
+	}
+	fn := poolAuthResetFunc(db, nil, &fakeAuditLogger{})
+	fn("codex") // must not panic
+	time.Sleep(50 * time.Millisecond)
+}
+
+// TestPoolRecoveredNoticeFunc asserts the recovered notice fires across the
+// broker's channels even when no auth_reset_target is configured.
+func TestPoolRecoveredNoticeFunc(t *testing.T) {
+	ch := &recoveryMockChannel{}
+	broker := channel.NewBroker([]channel.Channel{ch})
+	fn := poolRecoveredNoticeFunc(broker)
+
+	fn("codex")
+
+	waitFor(t, func() bool { return len(ch.messages()) == 1 }, "one recovered notice")
+	want := proxy.FormatPoolRecoveredNotice("codex")
+	if got := ch.messages(); got[0] != want {
+		t.Errorf("notice = %q, want %q", got[0], want)
+	}
+}
+
+// TestPoolRecoveredNoticeFuncNoBroker asserts a nil broker yields a no-op
+// callback (no panic).
+func TestPoolRecoveredNoticeFuncNoBroker(t *testing.T) {
+	fn := poolRecoveredNoticeFunc(nil)
+	fn("codex") // must not panic
+	time.Sleep(20 * time.Millisecond)
+}
+
+// TestWirePoolRecoveryRegisters asserts wirePoolRecovery wires both callbacks
+// onto the server without panicking (smoke test of the registration path).
+func TestWirePoolRecoveryRegisters(t *testing.T) {
+	db := newRecoveryTestStore(t)
+	eng, err := policy.LoadFromStore(db)
+	if err != nil {
+		t.Fatalf("LoadFromStore: %v", err)
+	}
+	srv, err := proxy.New(proxy.Config{ListenAddr: "127.0.0.1:0", Policy: eng})
+	if err != nil {
+		t.Fatalf("proxy.New: %v", err)
+	}
+	defer func() { _ = srv.Close() }()
+
+	ch := &recoveryMockChannel{}
+	broker := channel.NewBroker([]channel.Channel{ch})
+	mgr := &mockContainerMgr{}
+	wirePoolRecovery(srv, db, mgr, broker, &fakeAuditLogger{})
+}
diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index acde716..6e23903 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -369,16 +369,20 @@ precedent) and emits an `agent_auth_reset` audit event.
 - Modify: `internal/proxy/pool_failover.go` / audit usage (`agent_auth_reset` action)
 - Modify: relevant `_test.go`
 
-- [ ] in main.go register `SetOnPoolRecovered`: look up the recovered pool's
+- [x] in main.go register `SetOnPoolRecovered`: look up the recovered pool's
   `auth_reset_target`; if non-empty and `containerMgr != nil`, call
   `containerMgr.ResetAuth(ctx, target)` in a detached goroutine using a fresh
   `context.WithTimeout(context.Background(), …)` (never block; never reuse a wake-scoped ctx);
-  log + emit `agent_auth_reset`; empty target → no reset (opt-out default)
-- [ ] keep the recovered Telegram notice (Task 3) and the reset independent (notice always;
-  reset only when target set); a `ResetAuth` error is logged, not fatal
-- [ ] write tests: target set → recovery triggers `ResetAuth(target)` once; no target → no
-  reset, notice still emitted; `ResetAuth` error logged not fatal
-- [ ] run `go test ./... -race` for touched packages; gofumpt; vet — pass before Task 9
+  log + emit `agent_auth_reset`; empty target → no reset (opt-out default). Implemented via
+  `wirePoolRecovery` (registered unconditionally near `SetOnFailover`), with
+  `poolAuthResetFunc`/`poolRecoveredNoticeFunc` factored out for direct testing.
+- [x] keep the recovered Telegram notice (Task 3) and the reset independent (notice always
+  via `SetOnPoolRecoveredNotice` fanned across `failoverBroker.Channels()`; reset only when
+  target set via `SetOnPoolRecovered`); a `ResetAuth` error is logged, not fatal
+- [x] write tests: target set → recovery triggers `ResetAuth(target)` once + one
+  `agent_auth_reset` audit event; no target → no reset; notice fires across broker channels
+  regardless of target; `ResetAuth` error logged not fatal; nil manager/nil broker no-op
+- [x] run `go test ./... -race` for touched packages; gofumpt; vet — pass before Task 9
 
 ### Task 9: Verify acceptance criteria
 - [ ] both members exhausted → exactly one "pool exhausted" notice (no flap stream)

From bcc876a34ab0c1267a1f743b80118caef9cf25bf Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:44:51 +0800
Subject: [PATCH 10/19] test: verify acceptance criteria for pool exhaustion +
 auth reset

---
 cmd/sluice/main_test.go                       | 13 ++++----
 ...22-pool-exhaustion-and-agent-auth-reset.md | 31 +++++++++++++++----
 internal/proxy/pool_recovery_monitor_test.go  | 14 ++++-----
 internal/vault/pool_test.go                   |  8 ++---
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go
index c244bd1..bcc7740 100644
--- a/cmd/sluice/main_test.go
+++ b/cmd/sluice/main_test.go
@@ -1770,8 +1770,9 @@ func newRecoveryTestStore(t *testing.T) *store.Store {
 	return db
 }
 
-func seedRecoveryPool(t *testing.T, db *store.Store, pool string, members []string) {
+func seedRecoveryPool(t *testing.T, db *store.Store, members []string) {
 	t.Helper()
+	const pool = "codex"
 	for _, m := range members {
 		if err := db.AddCredentialMeta(m, "oauth", "https://auth.example.com/token"); err != nil {
 			t.Fatalf("add credential meta %q: %v", m, err)
@@ -1787,7 +1788,7 @@ func seedRecoveryPool(t *testing.T, db *store.Store, pool string, members []stri
 // agent_auth_reset audit event on recovery.
 func TestPoolAuthResetFuncTargetSet(t *testing.T) {
 	db := newRecoveryTestStore(t)
-	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	seedRecoveryPool(t, db, []string{"acct_a", "acct_b"})
 	if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
 		t.Fatalf("SetPoolAuthResetTarget: %v", err)
 	}
@@ -1817,7 +1818,7 @@ func TestPoolAuthResetFuncTargetSet(t *testing.T) {
 // covered by TestPoolRecoveredNoticeFunc).
 func TestPoolAuthResetFuncNoTarget(t *testing.T) {
 	db := newRecoveryTestStore(t)
-	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	seedRecoveryPool(t, db, []string{"acct_a", "acct_b"})
 
 	mgr := &mockContainerMgr{}
 	logger := &fakeAuditLogger{}
@@ -1839,7 +1840,7 @@ func TestPoolAuthResetFuncNoTarget(t *testing.T) {
 // panic, no audit event) and does not crash.
 func TestPoolAuthResetFuncErrorNotFatal(t *testing.T) {
 	db := newRecoveryTestStore(t)
-	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	seedRecoveryPool(t, db, []string{"acct_a", "acct_b"})
 	if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
 		t.Fatalf("SetPoolAuthResetTarget: %v", err)
 	}
@@ -1862,7 +1863,7 @@ func TestPoolAuthResetFuncErrorNotFatal(t *testing.T) {
 // (no panic), covering standalone deployments.
 func TestPoolAuthResetFuncNilManager(t *testing.T) {
 	db := newRecoveryTestStore(t)
-	seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"})
+	seedRecoveryPool(t, db, []string{"acct_a", "acct_b"})
 	if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
 		t.Fatalf("SetPoolAuthResetTarget: %v", err)
 	}
@@ -1889,7 +1890,7 @@ func TestPoolRecoveredNoticeFunc(t *testing.T) {
 
 // TestPoolRecoveredNoticeFuncNoBroker asserts a nil broker yields a no-op
 // callback (no panic).
-func TestPoolRecoveredNoticeFuncNoBroker(t *testing.T) {
+func TestPoolRecoveredNoticeFuncNoBroker(_ *testing.T) {
 	fn := poolRecoveredNoticeFunc(nil)
 	fn("codex") // must not panic
 	time.Sleep(20 * time.Millisecond)
diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
index 6e23903..854324d 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -385,15 +385,34 @@ precedent) and emits an `agent_auth_reset` audit event.
 - [x] run `go test ./... -race` for touched packages; gofumpt; vet — pass before Task 9
 
 ### Task 9: Verify acceptance criteria
-- [ ] both members exhausted → exactly one "pool exhausted" notice (no flap stream)
-- [ ] B1 cooldown reflects the upstream window (member not re-probed every 60s)
-- [ ] recovery edge → one "pool recovered" notice; auth reset fires only for pools with a
+- [x] both members exhausted → exactly one "pool exhausted" notice (no flap stream)
+  (covered: `TestExhaustedNoticeEdgeAcrossManyResponses`, `TestMarkPoolExhaustedEdge` in
+  `internal/proxy/pool_recovery_monitor_test.go`)
+- [x] B1 cooldown reflects the upstream window (member not re-probed every 60s)
+  (covered: `TestCooldownFromResponse` + `TestCooldownFromResponseNilSafe` in
+  `internal/proxy/pool_failover_test.go`; `cooldownFromResponse` wired into `handlePoolFailover`
+  at `pool_failover.go:629`)
+- [x] recovery edge → one "pool recovered" notice; auth reset fires only for pools with a
   target; hermes reset runs as 10000:10000 (no root-chown of auth.json)
-- [ ] channel parity verified: `auth_reset_target` settable+readable via CLI, REST, Telegram
-- [ ] full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`; `gofumpt -l` clean;
+  (covered: `TestRecoveryMonitorEdgeNoticeAndCallback`, `TestRecoveryMonitorUnequalCooldowns`,
+  `TestRecoveryMonitorPoolRemovedFiresNoNotice`; `TestPoolAuthResetFuncTargetSet`/`...NoTarget`
+  in `cmd/sluice/main_test.go` assert target-gating + the `agent_auth_reset` audit event;
+  `TestResetAuthHermesRunsAsRuntimeUID` asserts 10000:10000)
+- [x] channel parity verified: `auth_reset_target` settable+readable via CLI, REST, Telegram
+  (CLI `TestHandlePoolAuthResetTarget` in `cmd/sluice/pool_test.go`; REST
+  `TestPostApiPoolsNameAuthResetTarget` + `TestPostApiPools_WithAuthResetTarget` in
+  `internal/api/server_test.go`; Telegram `TestHandlePoolAuthResetTarget` in
+  `internal/telegram/commands_test.go`; channel-agnostic `internal/poolops` tests
+  `TestCreateWithAuthResetTarget`/`TestSetAuthResetTargetSetAndClear`)
+- [x] full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`; `gofumpt -l` clean;
   `golangci-lint run ./...` 0 issues; `make generate` then `git diff --exit-code
   internal/api/api.gen.go` clean
-- [ ] independently verify committed HEAD builds + tests pass (do not trust subagent green)
+  (results: build OK; `go test ./...` 2826 pass / 0 fail across 14 pkgs; vet clean; e2e vet
+  clean; gofumpt clean on tracked source — `internal/api/api.gen.go` is the raw oapi-codegen
+  output, byte-stable under `make generate`, NOT gofumpt-formatted by project convention and
+  CI-green on main; golangci-lint v2.9.0 0 issues; `make generate` produces no api.gen.go diff)
+- [x] independently verify committed HEAD builds + tests pass (do not trust subagent green)
+  (verified post-commit: clean `git status`, build + full suite re-run green at HEAD)
 
 ### Task 10: [Final] Documentation
 - [ ] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2
diff --git a/internal/proxy/pool_recovery_monitor_test.go b/internal/proxy/pool_recovery_monitor_test.go
index 4e298e4..01b6a27 100644
--- a/internal/proxy/pool_recovery_monitor_test.go
+++ b/internal/proxy/pool_recovery_monitor_test.go
@@ -36,9 +36,9 @@ default = "deny"
 
 // twoMemberPool builds a 2-member failover pool resolver and stores it on the
 // server. Returns the live resolver.
-func twoMemberPool(t *testing.T, srv *Server, name, a, b string) *vault.PoolResolver {
+func twoMemberPool(t *testing.T, srv *Server, a, b string) *vault.PoolResolver {
 	t.Helper()
-	pool := store.Pool{Name: name, Strategy: store.PoolStrategyFailover}
+	pool := store.Pool{Name: "p", Strategy: store.PoolStrategyFailover}
 	pool.Members = []store.PoolMember{
 		{Credential: a, Position: 0},
 		{Credential: b, Position: 1},
@@ -70,7 +70,7 @@ func TestMarkPoolExhaustedEdge(t *testing.T) {
 	// but here we only test the edge bookkeeping, so use a pool that stays
 	// exhausted (no resolver -> scanRecovery clears it). Store a resolver whose
 	// members are all cooling so it stays exhausted.
-	pr := twoMemberPool(t, srv, "p", "a", "b")
+	pr := twoMemberPool(t, srv, "a", "b")
 	far := time.Now().Add(time.Hour)
 	pr.MarkCooldown("a", far, "429")
 	pr.MarkCooldown("b", far, "429")
@@ -139,7 +139,7 @@ func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) {
 // exactly one onPoolRecovered call.
 func TestRecoveryMonitorEdgeNoticeAndCallback(t *testing.T) {
 	srv := newMonitorTestServer(t)
-	pr := twoMemberPool(t, srv, "p", "a", "b")
+	pr := twoMemberPool(t, srv, "a", "b")
 
 	var notices, resets int32
 	var mu sync.Mutex
@@ -195,7 +195,7 @@ func TestRecoveryMonitorEdgeNoticeAndCallback(t *testing.T) {
 // is still cooling. Uses short injected durations (no 60s sleep).
 func TestRecoveryMonitorUnequalCooldowns(t *testing.T) {
 	srv := newMonitorTestServer(t)
-	pr := twoMemberPool(t, srv, "p", "memA", "memB")
+	pr := twoMemberPool(t, srv, "memA", "memB")
 
 	var notices int32
 	srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) })
@@ -225,7 +225,7 @@ func TestRecoveryMonitorUnequalCooldowns(t *testing.T) {
 // exhausted has its state dropped and fires no recovered notice.
 func TestRecoveryMonitorPoolRemovedFiresNoNotice(t *testing.T) {
 	srv := newMonitorTestServer(t)
-	pr := twoMemberPool(t, srv, "p", "a", "b")
+	pr := twoMemberPool(t, srv, "a", "b")
 	pr.MarkCooldown("a", time.Now().Add(time.Hour), "429")
 	pr.MarkCooldown("b", time.Now().Add(time.Hour), "429")
 
@@ -291,7 +291,7 @@ func TestRecoveryMonitorStopsCleanly(t *testing.T) {
 	// stop must not be serviced (no recovered notice ever fires).
 	var notices int32
 	srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) })
-	pr := twoMemberPool(t, srv, "p", "a", "b")
+	pr := twoMemberPool(t, srv, "a", "b")
 	pr.MarkCooldown("a", time.Now().Add(20*time.Millisecond), "429")
 	pr.MarkCooldown("b", time.Now().Add(20*time.Millisecond), "429")
 	srv.markPoolExhausted("p")
diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go
index 4060e8b..7dc0668 100644
--- a/internal/vault/pool_test.go
+++ b/internal/vault/pool_test.go
@@ -955,7 +955,7 @@ func TestHasHealthyMember(t *testing.T) {
 	}{
 		{
 			name:  "both healthy",
-			setup: func(pr *PoolResolver) {},
+			setup: func(_ *PoolResolver) {},
 			pool:  "pool",
 			want:  true,
 		},
@@ -1001,7 +1001,7 @@ func TestHasHealthyMember(t *testing.T) {
 		},
 		{
 			name:  "unknown pool",
-			setup: func(pr *PoolResolver) {},
+			setup: func(_ *PoolResolver) {},
 			pool:  "nope",
 			want:  false,
 		},
@@ -1039,7 +1039,7 @@ func TestSoonestCooldown(t *testing.T) {
 	}{
 		{
 			name:   "no member cooling",
-			setup:  func(pr *PoolResolver, base time.Time) {},
+			setup:  func(_ *PoolResolver, _ time.Time) {},
 			pool:   "pool",
 			wantOK: false,
 		},
@@ -1085,7 +1085,7 @@ func TestSoonestCooldown(t *testing.T) {
 		},
 		{
 			name:   "unknown pool",
-			setup:  func(pr *PoolResolver, base time.Time) {},
+			setup:  func(_ *PoolResolver, _ time.Time) {},
 			pool:   "nope",
 			wantOK: false,
 		},

From 889e9e62c0e1775b34e5480781a856b76a677b8e Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 16:49:06 +0800
Subject: [PATCH 11/19] docs: document pool exhaustion handling + agent auth
 auto-reset

---
 CLAUDE.md                                     | 35 +++++++++++--------
 ...22-pool-exhaustion-and-agent-auth-reset.md |  6 ++--
 2 files changed, 23 insertions(+), 18 deletions(-)
 rename docs/plans/{ => completed}/20260522-pool-exhaustion-and-agent-auth-reset.md (99%)

diff --git a/CLAUDE.md b/CLAUDE.md
index eec0716..853c084 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -105,14 +105,16 @@ The only acceptable single-channel features have a documented rationale making t
 
 ## Agent Profiles
 
-Profiles abstract per-agent runtime conventions so container managers stay agent-agnostic. Each carries `EnvFileRelPath` (phantom env-var path), `ReloadCmd` (argv for in-place secret reload, or nil), `WireMCPCmd` (argv to register sluice as an MCP server).
+Profiles abstract per-agent runtime conventions so container managers stay agent-agnostic. Each carries `EnvFileRelPath` (phantom env-var path), `ReloadCmd` (argv for in-place secret reload, or nil), `WireMCPCmd` (argv to register sluice as an MCP server), `ResetAuthCmd(target)` (argv to un-latch the agent's local auth state on pool recovery, or nil), and `ExecUser()` (UID:GID for file-writing execs, "" = image USER).
 
-| Profile | Env file | Reload | MCP wiring |
-|---------|----------|--------|------------|
-| `openclaw` (default) | `~/.openclaw/.env` | `node -e <gateway_rpc.js> secrets.reload` over the agent's WebSocket gateway | `node -e <gateway_rpc.js> wire-mcp <name> <url>` patches `mcp.servers.<name>` |
-| `hermes` | `~/.hermes/.env` | None (see caveats) | sh wrapper patches `mcp_servers.<name>.url` in `~/.hermes/config.yaml` (see caveats) |
+| Profile | Env file | Reload | MCP wiring | Auth reset | Exec user |
+|---------|----------|--------|------------|------------|-----------|
+| `openclaw` (default) | `~/.openclaw/.env` | `node -e <gateway_rpc.js> secrets.reload` over the agent's WebSocket gateway | `node -e <gateway_rpc.js> wire-mcp <name> <url>` patches `mcp.servers.<name>` | none (nil; openclaw latch unverified) | "" (root) |
+| `hermes` | `~/.hermes/.env` | None (see caveats) | sh wrapper patches `mcp_servers.<name>.url` in `~/.hermes/config.yaml` (see caveats) | `hermes auth reset <target>` (pure argv `/opt/hermes/.venv/bin/hermes auth reset <target>`) | `10000:10000` |
 
-Adding a profile is a single edit to `internal/container/agent_profile.go` (register a struct in `builtinProfiles`); all three backends consume it via `BuildEnvInjectionScriptForProfile`, `profile.ReloadCmd()`, `profile.WireMCPCmd()`.
+`ExecUser` exists because hermes runs its gateway as the non-root runtime UID 10000; a default-root `docker exec` that writes a hermes-owned file (`auth.json`) root-chowns it and bricks the gateway, so `ResetAuth` execs as `profile.ExecUser()`. The target is validated (non-empty, no NUL, charset `[A-Za-z0-9_.:-]+`) before exec; `ResetAuthCmd` is pure argv (no `sh -c`) so there is no shell-metachar threat.
+
+Adding a profile is a single edit to `internal/container/agent_profile.go` (register a struct in `builtinProfiles`); all three backends consume it via `BuildEnvInjectionScriptForProfile`, `profile.ReloadCmd()`, `profile.WireMCPCmd()`, and `profile.ResetAuthCmd(target)`. `profile.ExecUser()` is threaded into the exec only by the Docker backend; the Apple and tart CLIs have no per-exec user flag.
 
 Hermes caveats:
 - `ReloadCmd` nil; `ReloadSecrets` logs a notice, returns nil. New phantom tokens take effect on next Hermes message or `/reload-mcp`.
@@ -174,18 +176,19 @@ A **pool** backs one phantom identity with **N real OAuth credentials**. The age
 **CLI:**
 
 ```
-sluice pool create <name> --members credA,credB[,credC]   # ordered; rejects static; namespace must not collide with a credential name
+sluice pool create <name> --members credA,credB[,credC] [--auth-reset-target <target>]   # ordered; rejects static; namespace must not collide with a credential name
 sluice pool list
-sluice pool status <name>     # active member, per-member health (healthy / cooldown + until + reason)
+sluice pool status <name>     # active member, per-member health (healthy / cooldown + until + reason), auth-reset target
 sluice pool rotate <name>     # operator override: advance active member
+sluice pool set-auth-reset <name> <target|->   # set/clear the recovery auth-reset target (a single - clears)
 sluice pool remove <name>
 ```
 
-Pools are reachable from all channels — CLI `sluice pool`, REST `/api/pools` (`GET`/`POST`, `GET`/`DELETE /api/pools/{name}`, `POST /api/pools/{name}/rotate`), and Telegram `/pool` — all via the channel-agnostic `internal/poolops`.
+Pools are reachable from all channels — CLI `sluice pool`, REST `/api/pools` (`GET`/`POST`, `GET`/`DELETE /api/pools/{name}`, `POST /api/pools/{name}/rotate`, `POST /api/pools/{name}/auth-reset-target`), and Telegram `/pool` (incl. `/pool set-auth-reset <name> <target|->` and an optional 3rd `/pool create` arg) — all via the channel-agnostic `internal/poolops`.
 
 Auto-failover on 429/401 is primary; `pool rotate` is an override.
 
-**Data model (migration `000006_credential_pools`):** `credential_pools` (name, strategy reserved `failover`), `credential_pool_members` (ordered, pool->credential FK), `credential_health` (`healthy|cooldown`, `cooldown_until`, `last_failure_reason`), all CHECK-constrained. Store API in `internal/store/pools.go`. `reloadAll` loads pool+health into an atomic-pointer-swapped `PoolResolver` (`internal/vault/pool.go`), rewired via `srv.StorePool`/`SetPoolResolver` on SIGHUP and the 2s data-version watcher.
+**Data model (migration `000006_credential_pools`):** `credential_pools` (name, strategy reserved `failover`, `auth_reset_target` added by migration `000008_pool_auth_reset` — empty default = opt-out), `credential_pool_members` (ordered, pool->credential FK), `credential_health` (`healthy|cooldown`, `cooldown_until`, `last_failure_reason`), all CHECK-constrained. Store API in `internal/store/pools.go` (`SetPoolAuthResetTarget`). `reloadAll` loads pool+health into an atomic-pointer-swapped `PoolResolver` (`internal/vault/pool.go`), rewired via `srv.StorePool`/`SetPoolResolver` on SIGHUP and the 2s data-version watcher.
 
 **Phase 1 — phantom indirection (pool phantom -> active member):**
 
@@ -199,13 +202,15 @@ Auto-failover on 429/401 is primary; `pool rotate` is an override.
 
 - **Classification** (`classifyFailover`, `internal/proxy/pool_failover.go`, from `SluiceAddon.Response` for pooled destinations): `429`/`403 + insufficient_quota` -> rate-limited; `401`/token-body `invalid_grant`/`invalid_token` -> auth-failure; `5xx`/other -> no-op. Token-endpoint body trusted only when the request URL matched the OAuth index.
 - **Pool attribution** (`poolForResponse`): a response is pool-attributed either (a) the flow's CONNECT host has a pooled binding (API-host 429/403), or (b) the request URL matches the OAuth token-URL index for a member (token-endpoint 401/`invalid_grant`). (b) is essential — an OAuth refresh hits `auth.openai.com` (no pool binding; only `api.openai.com` has one), so without it the token-endpoint classification is dead code for Codex. Member recovery + fail-closed are the R1 mechanism above (`OAuthIndex.MatchAll` + the refresh-token join key, never `OAuthIndex.Match`).
-- **Synchronous in-memory failover (I1):** health is updated in-process before the response returns (`MarkCooldown` write lock, `ResolveActive` read lock) so the switch never waits on the 2s watcher (which only reconciles); a detached `onFailover` also writes `SetCredentialHealth(member,'cooldown',now+ttl,reason)` for durability. TTLs: `vault.RateLimitCooldown`=60s, `vault.AuthFailCooldown`=300s. **Cooldown extension is monotonic on both layers:** a member parked 300s for auth that then trips a 60s rate-limit keeps the LATER expiry — `MarkCooldown` and `SetCredentialHealth`'s `cooldown` upsert (CASE-compared against the stored future `cooldown_until`) both keep `max(existing-future, new)`. Only extend is monotonic: an explicit clear (zero/past `until`) and any transition to `healthy` still shorten/clear, and lazy expiry still wins over an expired stored cooldown. No in-flight retry — next request uses the new member.
+- **Synchronous in-memory failover (I1):** health is updated in-process before the response returns (`MarkCooldown` write lock, `ResolveActive` read lock) so the switch never waits on the 2s watcher (which only reconciles); a detached `onFailover` also writes `SetCredentialHealth(member,'cooldown',now+ttl,reason)` for durability. **Cooldown window (B1):** `cooldownFromResponse(class, f.Response.Header)` (`internal/proxy/pool_failover.go`) derives the TTL from the upstream recovery hints — `Retry-After` (delta-seconds or HTTP-date), then `x-ratelimit-reset` / `x-ratelimit-reset-requests` / `x-ratelimit-reset-tokens` (delta or epoch) — clamped to `[floor(class), vault.MaxCooldown=6h]`; no hint falls back to the class default (`vault.RateLimitCooldown`=60s / `vault.AuthFailCooldown`=300s). Floors: rate-limit `vault.MinRateLimitFloor`=10s (a short parsed window is honored, not floored up to 60s), auth-failure `AuthFailCooldown` (a revoked/expired token is never re-probed in seconds). This honors the real multi-hour quota window so a usage-limited member is not re-probed every 60s (the degrade-flap root cause). No body parsing yet. **Cooldown extension is monotonic on both layers:** a member parked 300s for auth that then trips a 60s rate-limit keeps the LATER expiry — `MarkCooldown` and `SetCredentialHealth`'s `cooldown` upsert (CASE-compared against the stored future `cooldown_until`) both keep `max(existing-future, new)`. Only extend is monotonic: an explicit clear (zero/past `until`) and any transition to `healthy` still shorten/clear, and lazy expiry still wins over an expired stored cooldown. No in-flight retry — next request uses the new member.
 - **Reload doesn't resurrect a cooled member:** the durable write is detached/best-effort, so any reload (SIGHUP or the 2s watcher on any unrelated DB write) rebuilds the resolver from store rows via `NewPoolResolver`; `Server.StorePool` calls `PoolResolver.MergeLiveCooldowns(prev)` to carry forward still-active in-memory cooldowns before the atomic swap (monotonic; drops cooldowns for credentials no longer in any pool).
-- **Audit:** `cred_failover` (Verdict `failover`, Credential = cooled member), `Reason = "<pool>:<from>-><to>:<429|403|401|invalid_grant>"`, emitted synchronously in `handlePoolFailover`.
-- **Telegram:** best-effort non-blocking notice "pool <name> failed over <a> -> <b> (<reason>)" (plain text); store write + every channel `Notify` detached into their own goroutine so the response path never blocks.
+- **Exhaustion + edge-triggered notices (A1/A2):** a pool is exhausted iff `PoolResolver.HasHealthyMember(pool)` (RLock, single `now`, mirrors `cooling()` lazy-expiry) is false — NOT `to == from`. Per-pool exhaustion state lives on the long-lived `Server` (`poolExhausted` map, NOT `PoolHealth`, so it survives resolver swaps and is not pruned on membership change). `handlePoolFailover` emits one "pool exhausted" notice on the `false->true` edge and wakes a dedicated recovery monitor goroutine (cap-1 `recoveryWake`). The monitor (`internal/proxy/server.go`, started in `New`, stopped idempotently from both `Close` and `GracefulShutdown`) sleeps until `SoonestCooldown(pool)` (clamped to a ~1s floor), `Load()`s the current resolver each wake, and on `HasHealthyMember -> true` flips `true->false`, emits one "pool recovered" notice (`FormatPoolRecoveredNotice`), and invokes `onPoolRecovered`. This replaces the old per-cooldown-window flap that respammed `cred_failover` + a Telegram notice every ~30/60s.
+- **Recovery auto-reset (opt-in, per pool):** if the recovered pool has a non-empty `auth_reset_target`, `onPoolRecovered` (wired in `cmd/sluice/main.go` via `wirePoolRecovery`) calls `containerMgr.ResetAuth(ctx, target)` in a detached goroutine with a fresh bounded context and emits an `agent_auth_reset` audit event (`Verdict "recover"`, `Credential` = pool, `Reason` = target). Empty target = no reset (opt-out default); a `ResetAuth` error is logged, not fatal. This un-latches an agent (hermes) that latched "usage limit reached" so it resumes without a manual `auth reset`.
+- **Audit:** `cred_failover` (Verdict `failover`, Credential = cooled member), `Reason = "<pool>:<from>-><to>:<429|403|401|invalid_grant>"`, emitted synchronously in `handlePoolFailover`; `agent_auth_reset` (Verdict `recover`) on a successful recovery auto-reset.
+- **Telegram:** best-effort non-blocking notice "pool <name> failed over <a> -> <b> (<reason>)" on a real transition, plus the edge-triggered exhausted/recovered notices (`SetOnPoolRecoveredNotice` fans the recovered notice across `failoverBroker.Channels()` independent of the auth-reset); store write + every channel `Notify` detached into their own goroutine so the response path never blocks.
 - **Known limitation:** streaming responses bypass failover (`handlePoolFailover` runs only from the buffered `Response` addon; SSE / `StreamLargeBodies`-exceeding bodies set `f.Stream=true` and skip it). Impact low (quota/auth bodies are tiny JSON); the next non-streamed request fails over normally.
 
-**Key files:** `internal/store/migrations/000006_credential_pools.{up,down}.sql`, `internal/store/pools.go`, `internal/vault/pool.go`, `internal/proxy/pool_failover.go`, `cmd/sluice/pool.go`, plus pool routing in `internal/proxy/addon.go` / `oauth_response.go`.
+**Key files:** `internal/store/migrations/000006_credential_pools.{up,down}.sql`, `000008_pool_auth_reset.{up,down}.sql`, `internal/store/pools.go`, `internal/vault/pool.go`, `internal/proxy/pool_failover.go`, `internal/proxy/server.go` (recovery monitor), `cmd/sluice/pool.go` / `main.go` (`wirePoolRecovery`), `internal/container/agent_profile.go` (`ResetAuthCmd`/`ExecUser`), plus pool routing in `internal/proxy/addon.go` / `oauth_response.go`.
 
 ### Protocol-specific handling
 
@@ -272,7 +277,7 @@ Two-phase: port-based guess first (standard ports 443/22/25/… route on it), by
 
 ### Audit logger
 
-Optional. JSON lines, blake3 hash chain (`prev_hash`, genesis = blake3("")); recovers across restarts by reading the last line. `sluice audit verify` walks the log and reports broken links. Common action names: `tool_call` (MCP verdict), `inspect_block` (ContentInspector arg block), `exec_block` (ExecInspector block), `response_dlp_redact` (HTTPS response/header redacted), `inject` (phantom injected outbound), `deny` (connection denied at SOCKS5/SNI).
+Optional. JSON lines, blake3 hash chain (`prev_hash`, genesis = blake3("")); recovers across restarts by reading the last line. `sluice audit verify` walks the log and reports broken links. Common action names: `tool_call` (MCP verdict), `inspect_block` (ContentInspector arg block), `exec_block` (ExecInspector block), `response_dlp_redact` (HTTPS response/header redacted), `inject` (phantom injected outbound), `deny` (connection denied at SOCKS5/SNI), `cred_failover` (pool member cooled, Verdict `failover`), `agent_auth_reset` (recovery auto-reset run, Verdict `recover`).
 
 ### MCP gateway
 
diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/completed/20260522-pool-exhaustion-and-agent-auth-reset.md
similarity index 99%
rename from docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
rename to docs/plans/completed/20260522-pool-exhaustion-and-agent-auth-reset.md
index 854324d..7be9f25 100644
--- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md
+++ b/docs/plans/completed/20260522-pool-exhaustion-and-agent-auth-reset.md
@@ -415,12 +415,12 @@ precedent) and emits an `agent_auth_reset` audit event.
   (verified post-commit: clean `git status`, build + full suite re-run green at HEAD)
 
 ### Task 10: [Final] Documentation
-- [ ] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2
+- [x] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2
   exhaustion+edge notices (replacing the per-window dedup wording), per-pool
   `auth_reset_target` + recovery auto-reset, the `agent_auth_reset` audit action, and the
   `ResetAuthCmd`/`ExecUser` profile hooks in the Agent Profiles table
-- [ ] cross-check the CLI subcommand list + the auto-generated `api/openapi.yaml` in CLAUDE.md
-- [ ] move this plan to `docs/plans/completed/`
+- [x] cross-check the CLI subcommand list + the auto-generated `api/openapi.yaml` in CLAUDE.md
+- [x] move this plan to `docs/plans/completed/`
 
 ## Post-Completion
 *Items requiring manual intervention or external systems — informational only*

From be34498de0c878b829e84744b306319ab72190d9 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Fri, 22 May 2026 19:51:09 +0800
Subject: [PATCH 12/19] fix(pools): enforce auth_reset_target charset parity at
 set time across channels

---
 cmd/sluice/pool_test.go             |  9 ++++
 internal/api/server_test.go         | 20 ++++++--
 internal/container/agent_profile.go |  4 +-
 internal/container/apple.go         | 10 +++-
 internal/container/docker.go        |  2 +-
 internal/container/tart.go          | 10 +++-
 internal/container/types.go         | 11 ++++-
 internal/poolops/poolops.go         | 27 ++++++-----
 internal/poolops/poolops_test.go    | 72 +++++++++++++++++++++--------
 internal/proxy/server.go            |  9 ++++
 internal/telegram/commands_test.go  | 11 +++++
 11 files changed, 145 insertions(+), 40 deletions(-)

diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go
index f3189c9..d306170 100644
--- a/cmd/sluice/pool_test.go
+++ b/cmd/sluice/pool_test.go
@@ -165,6 +165,15 @@ func TestHandlePoolAuthResetTarget(t *testing.T) {
 	if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex"}); err == nil {
 		t.Error("expected usage error for set-auth-reset with too few args")
 	}
+
+	// A target outside the allowlist (space, slash) must be rejected at set time
+	// rather than stored successfully and then silently failing at recovery (F1).
+	for name, target := range map[string]string{"space": "openai codex", "slash": "openai/codex"} {
+		if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", target}); err == nil {
+			t.Errorf("%s target: expected rejection, got nil error", name)
+		}
+		assertStoredAuthResetTarget(t, dbPath, "codex", "")
+	}
 }
 
 func assertStoredAuthResetTarget(t *testing.T, dbPath, pool, want string) {
diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index fa069fb..60cb452 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -3098,10 +3098,22 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
 		t.Fatalf("after clear stored AuthResetTarget = %q, want empty", got.AuthResetTarget)
 	}
 
-	// Invalid target (newline) -> 400.
-	rec = post(`{"auth_reset_target": "bad\ntarget"}`)
-	if rec.Code != http.StatusBadRequest {
-		t.Fatalf("invalid target: expected 400, got %d: %s", rec.Code, rec.Body.String())
+	// Invalid targets -> 400 and no store mutation. Newline was caught by the
+	// old looser rule; a space and a slash were NOT (F1) and are the cases that
+	// would otherwise store with 200 and fail silently at recovery.
+	for name, target := range map[string]string{
+		"newline": `bad\ntarget`,
+		"space":   `openai codex`,
+		"slash":   `openai/codex`,
+	} {
+		rec = post(`{"auth_reset_target": "` + target + `"}`)
+		if rec.Code != http.StatusBadRequest {
+			t.Fatalf("%s target: expected 400, got %d: %s", name, rec.Code, rec.Body.String())
+		}
+		got, _ = st.GetPool("pool1")
+		if got.AuthResetTarget != "" {
+			t.Fatalf("%s target: store mutated to %q despite 400", name, got.AuthResetTarget)
+		}
 	}
 
 	// Unknown pool -> 404.
diff --git a/internal/container/agent_profile.go b/internal/container/agent_profile.go
index 66423cf..34335f0 100644
--- a/internal/container/agent_profile.go
+++ b/internal/container/agent_profile.go
@@ -40,7 +40,7 @@ type AgentProfile struct {
 	// given target so it retries after a pool quota window passes. It is
 	// pure argv (no `sh -c` wrapper), so there is no shell-metacharacter
 	// threat from the interpolated target; the target is still validated
-	// (validateResetAuthTarget) before exec as defense in depth.
+	// (ValidateResetAuthTarget) before exec as defense in depth.
 	// Returning nil means the profile has no auth-reset mechanism; the
 	// caller should log a notice and rely on the agent recovering on its
 	// own (mirrors a nil ReloadCmd).
@@ -153,7 +153,7 @@ var HermesProfile = &AgentProfile{
 	// ResetAuthCmd un-latches hermes' local auth state for the given
 	// target via `hermes auth reset <target>`. It is pure argv (no
 	// `sh -c`), so the target cannot smuggle shell metacharacters — but
-	// the caller still validates it (validateResetAuthTarget) before
+	// the caller still validates it (ValidateResetAuthTarget) before
 	// exec. The exec must run as the runtime UID (see execUser above) or
 	// it root-chowns hermes-owned auth files and bricks the gateway.
 	ResetAuthCmd: func(target string) []string {
diff --git a/internal/container/apple.go b/internal/container/apple.go
index 65bb86a..c4556ed 100644
--- a/internal/container/apple.go
+++ b/internal/container/apple.go
@@ -255,12 +255,20 @@ func (m *AppleManager) ReloadSecrets(ctx context.Context) error {
 // ResetAuth clears the agent's local auth state for target. The mechanism
 // is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors
 // ReloadSecrets). The target is validated before exec.
+//
+// Unlike DockerManager, this does not thread profile.ExecUser() into the exec:
+// cli.Exec (the apple `container` CLI) has no per-exec user flag. That is
+// acceptable because the only profile needing a non-root exec UID is hermes,
+// which is docker/local-only (per CLAUDE.md the Apple/tart backends do not run
+// hermes), so the root-chown hazard ExecUser guards against on docker cannot
+// arise here. If a future profile needs a runtime UID on this backend, the CLI
+// invocation must be extended to pass it.
 func (m *AppleManager) ResetAuth(ctx context.Context, target string) error {
 	if m.profile.ResetAuthCmd == nil {
 		log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name)
 		return nil
 	}
-	if err := validateResetAuthTarget(target); err != nil {
+	if err := ValidateResetAuthTarget(target); err != nil {
 		return err
 	}
 	_, err := m.cli.Exec(ctx, m.containerName, m.profile.ResetAuthCmd(target))
diff --git a/internal/container/docker.go b/internal/container/docker.go
index b29566c..5f85c2c 100644
--- a/internal/container/docker.go
+++ b/internal/container/docker.go
@@ -138,7 +138,7 @@ func (m *DockerManager) ResetAuth(ctx context.Context, target string) error {
 		log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name)
 		return nil
 	}
-	if err := validateResetAuthTarget(target); err != nil {
+	if err := ValidateResetAuthTarget(target); err != nil {
 		return err
 	}
 	return m.client.ExecInContainer(ctx, m.containerName, m.profile.ExecUser(), m.profile.ResetAuthCmd(target))
diff --git a/internal/container/tart.go b/internal/container/tart.go
index f4ad5a8..f4d8c6c 100644
--- a/internal/container/tart.go
+++ b/internal/container/tart.go
@@ -294,12 +294,20 @@ func (m *TartManager) ReloadSecrets(ctx context.Context) error {
 // ResetAuth clears the agent's local auth state for target. The mechanism
 // is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors
 // ReloadSecrets). The target is validated before exec.
+//
+// Unlike DockerManager, this does not thread profile.ExecUser() into the exec:
+// cli.Exec (the tart CLI) has no per-exec user flag. That is acceptable because
+// the only profile needing a non-root exec UID is hermes, which is
+// docker/local-only (per CLAUDE.md the Apple/tart backends do not run hermes),
+// so the root-chown hazard ExecUser guards against on docker cannot arise here.
+// If a future profile needs a runtime UID on this backend, the CLI invocation
+// must be extended to pass it.
 func (m *TartManager) ResetAuth(ctx context.Context, target string) error {
 	if m.profile.ResetAuthCmd == nil {
 		log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name)
 		return nil
 	}
-	if err := validateResetAuthTarget(target); err != nil {
+	if err := ValidateResetAuthTarget(target); err != nil {
 		return err
 	}
 	_, err := m.cli.Exec(ctx, m.vmName, m.profile.ResetAuthCmd(target))
diff --git a/internal/container/types.go b/internal/container/types.go
index 39bf169..4c06237 100644
--- a/internal/container/types.go
+++ b/internal/container/types.go
@@ -128,12 +128,19 @@ func ValidateEnvVarKey(key string) error {
 // shell), so this allowlist is defense in depth, not a shell-escape.
 var resetAuthTargetRe = regexp.MustCompile(`^[A-Za-z0-9_.:-]+$`)
 
-// validateResetAuthTarget checks that an auth-reset target is non-empty,
+// ValidateResetAuthTarget checks that an auth-reset target is non-empty,
 // free of NUL bytes, and within the allowlisted charset before it is
 // passed to ContainerManager.ResetAuth. Mirrors ValidateEnvVarKey's
 // fail-closed style: an invalid target is rejected (no exec) rather than
 // sanitized.
-func validateResetAuthTarget(target string) error {
+//
+// This is the single canonical auth-reset-target validator. It is shared by
+// the channel-agnostic poolops layer (set-time validation on CLI/REST/
+// Telegram) and by every container backend (exec-time defense in depth) so
+// the two surfaces cannot drift: a target that the store accepts is always a
+// target ResetAuth can exec. An empty target ("") means "clear / no reset"
+// and is rejected here; poolops treats "" as valid and never passes it in.
+func ValidateResetAuthTarget(target string) error {
 	if target == "" {
 		return fmt.Errorf("auth-reset target is empty")
 	}
diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go
index 49a9b3c..53a403d 100644
--- a/internal/poolops/poolops.go
+++ b/internal/poolops/poolops.go
@@ -15,6 +15,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/nemirovsky/sluice/internal/container"
 	"github.com/nemirovsky/sluice/internal/store"
 	"github.com/nemirovsky/sluice/internal/vault"
 )
@@ -110,22 +111,26 @@ func ParseMembers(membersStr string) ([]string, error) {
 	return members, nil
 }
 
-// ErrInvalidAuthResetTarget is returned when a non-empty auth-reset target
-// contains a NUL byte or newline. The target is consumed as argv (never
-// shell-interpolated), so this is a minimal structural guard, not a
-// shell-metachar check; channels that exec the target apply any stricter
-// allowlist at exec time.
-var ErrInvalidAuthResetTarget = errors.New("invalid auth_reset_target: must not contain NUL or newline characters")
+// ErrInvalidAuthResetTarget is the sentinel returned (wrapped together with the
+// specific reason) when a non-empty auth-reset target fails the canonical
+// allowlist (container.ValidateResetAuthTarget: [A-Za-z0-9_.:-]+). Channels can
+// errors.Is against it for a uniform "bad target" mapping. Set-time validation
+// here matches the exec-time validation exactly so a target the store accepts
+// is always one ResetAuth can exec — there is no longer a looser set-time rule
+// that lets an unexecutable target (a space, a slash) store with success and
+// then fail silently in the detached recovery goroutine.
+var ErrInvalidAuthResetTarget = errors.New("invalid auth_reset_target")
 
-// validateAuthResetTarget rejects a non-empty target containing a NUL byte or
-// a newline. An empty target ("") is always valid and means "clear / no
-// reset", so callers normalize the clear sentinel before validating.
+// validateAuthResetTarget validates a non-empty target against the single
+// canonical allowlist used at exec time (container.ValidateResetAuthTarget).
+// An empty target ("") is always valid and means "clear / no reset", so
+// callers normalize the clear sentinel before validating.
 func validateAuthResetTarget(target string) error {
 	if target == "" {
 		return nil
 	}
-	if strings.ContainsAny(target, "\x00\n\r") {
-		return ErrInvalidAuthResetTarget
+	if err := container.ValidateResetAuthTarget(target); err != nil {
+		return fmt.Errorf("%w: %w", ErrInvalidAuthResetTarget, err)
 	}
 	return nil
 }
diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go
index 0f63dbc..7ea10ad 100644
--- a/internal/poolops/poolops_test.go
+++ b/internal/poolops/poolops_test.go
@@ -147,19 +147,35 @@ func TestCreateWithEmptyAuthResetTargetDefaultsEmpty(t *testing.T) {
 	}
 }
 
+// TestCreateWithInvalidAuthResetTarget asserts a target that the exec-time
+// allowlist (container.ValidateResetAuthTarget: [A-Za-z0-9_.:-]+) would reject
+// is also rejected at create time, on the channel-agnostic path, so it never
+// stores with success and then fails silently in the detached recovery
+// goroutine. A newline/NUL was caught by the old looser rule; a space and a
+// slash were NOT (the F1 silent-failure bug) and are the load-bearing cases.
 func TestCreateWithInvalidAuthResetTarget(t *testing.T) {
-	db := newTestStore(t, "acct_a")
-	err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, "bad\ntarget")
-	if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
-		t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err)
-	}
-	// The pool must not have been created when the target is invalid.
-	p, err := db.GetPool("p")
-	if err != nil {
-		t.Fatalf("GetPool: %v", err)
-	}
-	if p != nil {
-		t.Fatalf("pool created despite invalid target: %+v", p)
+	cases := map[string]string{
+		"newline": "bad\ntarget",
+		"nul":     "bad\x00target",
+		"space":   "openai codex",
+		"slash":   "openai/codex",
+	}
+	for name, target := range cases {
+		t.Run(name, func(t *testing.T) {
+			db := newTestStore(t, "acct_a")
+			err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, target)
+			if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+				t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err)
+			}
+			// The pool must not have been created when the target is invalid.
+			p, err := db.GetPool("p")
+			if err != nil {
+				t.Fatalf("GetPool: %v", err)
+			}
+			if p != nil {
+				t.Fatalf("pool created despite invalid target: %+v", p)
+			}
+		})
 	}
 }
 
@@ -202,12 +218,32 @@ func TestSetAuthResetTargetUnknownPool(t *testing.T) {
 }
 
 func TestSetAuthResetTargetInvalid(t *testing.T) {
-	db := newTestStore(t, "acct_a")
-	if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil {
-		t.Fatalf("Create: %v", err)
-	}
-	if err := poolops.SetAuthResetTarget(db, "p", "bad\x00target"); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
-		t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err)
+	// Set-time validation mirrors the exec-time allowlist exactly: a space or
+	// slash (not just NUL/newline) is rejected before the store write, so an
+	// unexecutable target can never be persisted.
+	for name, target := range map[string]string{
+		"nul":     "bad\x00target",
+		"newline": "bad\ntarget",
+		"space":   "openai codex",
+		"slash":   "openai/codex",
+	} {
+		t.Run(name, func(t *testing.T) {
+			db := newTestStore(t, "acct_a")
+			if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil {
+				t.Fatalf("Create: %v", err)
+			}
+			if err := poolops.SetAuthResetTarget(db, "p", target); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+				t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err)
+			}
+			// Store must not have been mutated.
+			p, err := db.GetPool("p")
+			if err != nil {
+				t.Fatalf("GetPool: %v", err)
+			}
+			if p.AuthResetTarget != "" {
+				t.Fatalf("target persisted despite invalid value: %q", p.AuthResetTarget)
+			}
+		})
 	}
 }
 
diff --git a/internal/proxy/server.go b/internal/proxy/server.go
index fa2c3fa..1f3ae47 100644
--- a/internal/proxy/server.go
+++ b/internal/proxy/server.go
@@ -3082,6 +3082,15 @@ func (s *Server) clearPoolExhausted(pool string) {
 // "pool recovered" notice, and invokes onPoolRecovered once. The state flip and
 // callback snapshot happen under poolExhaustMu so two concurrent passes cannot
 // both recover the same pool.
+//
+// Concurrency with a re-exhaustion: a 429 arriving between this scan's
+// HasHealthyMember==true and the delete below races markPoolExhausted, but both
+// transitions are guarded by poolExhaustMu (no torn state) and the machine
+// self-corrects. If markPoolExhausted re-sets the flag after this delete, it
+// returns true (a fresh edge), re-emits the exhausted notice, and wakes the
+// monitor, which re-evaluates HasHealthyMember on the next scan. The worst case
+// is one extra recovered/exhausted notice pair during a genuine flap — correct
+// behavior, not a stuck state.
 func (s *Server) recoverPool(pool string) {
 	s.poolExhaustMu.Lock()
 	if !s.poolExhausted[pool] {
diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go
index 65f6d51..06523ff 100644
--- a/internal/telegram/commands_test.go
+++ b/internal/telegram/commands_test.go
@@ -2244,6 +2244,17 @@ func TestHandlePoolAuthResetTarget(t *testing.T) {
 	if !strings.Contains(got, "Usage:") {
 		t.Errorf("set-auth-reset too few args = %q", got)
 	}
+
+	// A target outside the allowlist (a slash; a space can't survive Telegram's
+	// space-split arg parsing) is rejected, not stored with a success message
+	// and then silently un-executable at recovery (F1).
+	got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex", "openai/codex"}})
+	if !strings.Contains(got, "Failed to set auth-reset target") {
+		t.Errorf("slash target: expected failure message, got %q", got)
+	}
+	if p, _ := s.GetPool("codex"); p.AuthResetTarget != "" {
+		t.Errorf("slash target: store mutated to %q despite rejection", p.AuthResetTarget)
+	}
 }
 
 func TestHandlePoolCreateNoMembers(t *testing.T) {

From d26a3f0d49b168794c8c7e8c6f736bdc011e9197 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 00:16:36 +0800
Subject: [PATCH 13/19] test(pools): drop always-constant param from
 assertStoredAuthResetTarget (unparam)

---
 cmd/sluice/pool_test.go | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go
index d306170..1185141 100644
--- a/cmd/sluice/pool_test.go
+++ b/cmd/sluice/pool_test.go
@@ -120,7 +120,7 @@ func TestHandlePoolAuthResetTarget(t *testing.T) {
 	}
 
 	// Value reached the store.
-	assertStoredAuthResetTarget(t, dbPath, "codex", "openai-codex")
+	assertStoredAuthResetTarget(t, dbPath, "openai-codex")
 
 	// Surfaced in list and status.
 	out = captureStdout(t, func() {
@@ -145,7 +145,7 @@ func TestHandlePoolAuthResetTarget(t *testing.T) {
 	if !strings.Contains(out, "set to \"other-target\"") {
 		t.Errorf("set-auth-reset output = %q", out)
 	}
-	assertStoredAuthResetTarget(t, dbPath, "codex", "other-target")
+	assertStoredAuthResetTarget(t, dbPath, "other-target")
 
 	// Clear with the "-" sentinel.
 	out = captureStdout(t, func() {
@@ -156,7 +156,7 @@ func TestHandlePoolAuthResetTarget(t *testing.T) {
 	if !strings.Contains(out, "cleared") {
 		t.Errorf("clear output = %q", out)
 	}
-	assertStoredAuthResetTarget(t, dbPath, "codex", "")
+	assertStoredAuthResetTarget(t, dbPath, "")
 
 	// Unknown pool and bad usage.
 	if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "missing", "x"}); err == nil {
@@ -172,17 +172,18 @@ func TestHandlePoolAuthResetTarget(t *testing.T) {
 		if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", target}); err == nil {
 			t.Errorf("%s target: expected rejection, got nil error", name)
 		}
-		assertStoredAuthResetTarget(t, dbPath, "codex", "")
+		assertStoredAuthResetTarget(t, dbPath, "")
 	}
 }
 
-func assertStoredAuthResetTarget(t *testing.T, dbPath, pool, want string) {
+func assertStoredAuthResetTarget(t *testing.T, dbPath, want string) {
 	t.Helper()
 	db, err := store.New(dbPath)
 	if err != nil {
 		t.Fatalf("open db: %v", err)
 	}
 	defer func() { _ = db.Close() }()
+	const pool = "codex"
 	p, err := db.GetPool(pool)
 	if err != nil {
 		t.Fatalf("GetPool: %v", err)

From 408920268d093cf3e7070b17a3d80ef7c844d2a6 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 00:31:01 +0800
Subject: [PATCH 14/19] fix(pools): atomic create-with-target and schema-valid
 set-auth-reset response

---
 cmd/sluice/cred_test.go            |  4 +-
 cmd/sluice/main_test.go            |  2 +-
 cmd/sluice/pool_test.go            |  6 +--
 internal/api/server.go             | 22 +++++----
 internal/api/server_test.go        | 43 +++++++++++++----
 internal/poolops/poolops.go        | 23 ++++-----
 internal/poolops/poolops_test.go   |  5 ++
 internal/store/pools.go            |  7 ++-
 internal/store/pools_test.go       | 76 +++++++++++++++---------------
 internal/telegram/commands_test.go |  6 +--
 10 files changed, 114 insertions(+), 80 deletions(-)

diff --git a/cmd/sluice/cred_test.go b/cmd/sluice/cred_test.go
index d3195de..8b36ad1 100644
--- a/cmd/sluice/cred_test.go
+++ b/cmd/sluice/cred_test.go
@@ -2573,7 +2573,7 @@ func TestFinding3Round9_StoreGatedVaultDeleteOnLivePoolMember(t *testing.T) {
 	if err := db.AddCredentialMeta("pool_mem", "oauth", "https://auth.example.com/token"); err != nil {
 		t.Fatalf("AddCredentialMeta: %v", err)
 	}
-	if err := db.CreatePoolWithMembers("codex_pool", "failover", []string{"pool_mem"}); err != nil {
+	if err := db.CreatePoolWithMembers("codex_pool", "failover", []string{"pool_mem"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 	_ = db.Close()
@@ -2692,7 +2692,7 @@ func TestFinding3Round9_TOCTOUInterleaveStoreGatesVaultDelete(t *testing.T) {
 			if e != nil {
 				return
 			}
-			_ = pdb.CreatePoolWithMembers("codex_pool", "failover", []string{"racer"})
+			_ = pdb.CreatePoolWithMembers("codex_pool", "failover", []string{"racer"}, "")
 			_ = pdb.Close()
 		}()
 
diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go
index bcc7740..16ebfea 100644
--- a/cmd/sluice/main_test.go
+++ b/cmd/sluice/main_test.go
@@ -1778,7 +1778,7 @@ func seedRecoveryPool(t *testing.T, db *store.Store, members []string) {
 			t.Fatalf("add credential meta %q: %v", m, err)
 		}
 	}
-	if err := db.CreatePoolWithMembers(pool, "", members); err != nil {
+	if err := db.CreatePoolWithMembers(pool, "", members, ""); err != nil {
 		t.Fatalf("create pool %q: %v", pool, err)
 	}
 }
diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go
index 1185141..050f49e 100644
--- a/cmd/sluice/pool_test.go
+++ b/cmd/sluice/pool_test.go
@@ -502,7 +502,7 @@ func TestPoolRotateGuardedAgainstConcurrentRemoval(t *testing.T) {
 		_ = db.Close()
 		t.Fatalf("RemovePool: %v", rerr)
 	}
-	if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a"}); cerr != nil {
+	if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a"}, ""); cerr != nil {
 		_ = db.Close()
 		t.Fatalf("recreate pool: %v", cerr)
 	}
@@ -534,7 +534,7 @@ func TestPoolRotateGuardedAgainstConcurrentRemoval(t *testing.T) {
 	if _, rerr := db.RemovePool("codex"); rerr != nil {
 		t.Fatalf("final RemovePool: %v", rerr)
 	}
-	if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); cerr != nil {
+	if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}, ""); cerr != nil {
 		t.Fatalf("final recreate pool: %v", cerr)
 	}
 	rows, lerr := db.ListCredentialHealth()
@@ -607,7 +607,7 @@ func TestPoolRotateEpochScopedRejectsCrossPoolReAdd(t *testing.T) {
 		_ = db.Close()
 		t.Fatalf("RemovePool(P): %v", rerr)
 	}
-	if cerr := db.CreatePoolWithMembers("Q", "failover", []string{"c", "d"}); cerr != nil {
+	if cerr := db.CreatePoolWithMembers("Q", "failover", []string{"c", "d"}, ""); cerr != nil {
 		_ = db.Close()
 		t.Fatalf("recreate c,d into Q: %v", cerr)
 	}
diff --git a/internal/api/server.go b/internal/api/server.go
index f1066ba..32efa7d 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -1856,9 +1856,10 @@ func (s *Server) PostApiPools(w http.ResponseWriter, r *http.Request) { //nolint
 		effectiveStrategy = store.PoolStrategyFailover
 	}
 	out := storePoolToAPI(store.Pool{
-		Name:     req.Name,
-		Strategy: effectiveStrategy,
-		Members:  membersToStorePoolMembers(req.Members),
+		Name:            req.Name,
+		Strategy:        effectiveStrategy,
+		Members:         membersToStorePoolMembers(req.Members),
+		AuthResetTarget: authResetTarget,
 	})
 	if p, err := s.store.GetPool(req.Name); err == nil && p != nil {
 		out = storePoolToAPI(*p)
@@ -1922,7 +1923,11 @@ func (s *Server) PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request,
 // routing through poolops.SetAuthResetTarget so the three surfaces cannot
 // drift (channel feature-parity principle). A NUL/newline in the target is a
 // 400 (poolops.ErrInvalidAuthResetTarget); an unknown pool is 404. On success
-// the updated pool is returned so the caller sees the persisted value.
+// the updated pool is returned (200). If the post-write read-back fails the
+// set still succeeded, so 204 No Content is returned rather than a partial
+// Pool object: the OpenAPI Pool schema requires name+strategy+members, and the
+// request body alone cannot reconstruct strategy/members, so echoing it would
+// emit a schema-invalid response.
 func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { //nolint:revive // generated interface name
 	var req SetPoolAuthResetTargetRequest
 	if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil {
@@ -1937,14 +1942,15 @@ func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.
 		writeError(w, status, err.Error(), "")
 		return
 	}
-	w.Header().Set("Content-Type", "application/json")
 	if p, err := s.store.GetPool(name); err == nil && p != nil {
+		w.Header().Set("Content-Type", "application/json")
 		_ = json.NewEncoder(w).Encode(storePoolToAPI(*p))
 		return
 	}
-	// The set succeeded; a read-back failure must not report failure. Echo
-	// the persisted value from the request instead.
-	_ = json.NewEncoder(w).Encode(Pool{Name: name, AuthResetTarget: &req.AuthResetTarget})
+	// The set succeeded; only the post-write read-back failed. Returning a
+	// partial Pool (name + target, missing the required strategy/members)
+	// would violate the OpenAPI schema, so report success with no body.
+	w.WriteHeader(http.StatusNoContent)
 }
 
 // DeleteApiPoolsName removes a pool. It refuses (409) while any binding still
diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index 60cb452..ede39a2 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -1561,7 +1561,7 @@ func TestDeleteApiCredentials_PoolGuardVsStoreFault(t *testing.T) {
 				t.Fatalf("seed oauth cred %q: %v", n, err)
 			}
 		}
-		if err := st.CreatePoolWithMembers("p", "failover", []string{"m", "n"}); err != nil {
+		if err := st.CreatePoolWithMembers("p", "failover", []string{"m", "n"}, ""); err != nil {
 			t.Fatalf("create pool: %v", err)
 		}
 
@@ -2801,7 +2801,7 @@ func TestPostApiPools_DuplicateName(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("dup_pool", store.PoolStrategyFailover, []string{"credA"}); err != nil {
+	if err := st.CreatePoolWithMembers("dup_pool", store.PoolStrategyFailover, []string{"credA"}, ""); err != nil {
 		t.Fatalf("seed pool: %v", err)
 	}
 	srv := api.NewServer(st, nil, nil, "")
@@ -2851,7 +2851,7 @@ func TestPostApiPools_MemberAlreadyPooled(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("pool_one", store.PoolStrategyFailover, []string{"credA"}); err != nil {
+	if err := st.CreatePoolWithMembers("pool_one", store.PoolStrategyFailover, []string{"credA"}, ""); err != nil {
 		t.Fatalf("seed pool: %v", err)
 	}
 	srv := api.NewServer(st, nil, nil, "")
@@ -2955,7 +2955,7 @@ func TestGetApiPoolsName_Status(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil {
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	srv := api.NewServer(st, nil, nil, "")
@@ -3011,7 +3011,7 @@ func TestPostApiPoolsNameRotate_Success(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil {
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	srv := api.NewServer(st, nil, nil, "")
@@ -3051,7 +3051,7 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil {
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	srv := api.NewServer(st, nil, nil, "")
@@ -3067,7 +3067,12 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
 		return rec
 	}
 
-	// Set a target.
+	// Set a target. The 200 path returns the full, schema-complete pool via a
+	// read-back: name+strategy+members are all required by the OpenAPI Pool
+	// schema. (The read-back-failure fallback returns 204 No Content rather
+	// than a partial Pool that would violate that schema — Copilot #2 — but
+	// that path is not reachable here without failing the store mid-handler,
+	// which the concrete *store.Store gives no seam for.)
 	rec := post(`{"auth_reset_target": "openai-codex"}`)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("set: expected 200, got %d: %s", rec.Code, rec.Body.String())
@@ -3079,6 +3084,16 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
 	if p.AuthResetTarget == nil || *p.AuthResetTarget != "openai-codex" {
 		t.Fatalf("response AuthResetTarget = %v, want openai-codex", p.AuthResetTarget)
 	}
+	// Schema completeness: the OpenAPI Pool schema requires name+strategy+members.
+	if p.Name != "pool1" {
+		t.Errorf("200 body name = %q, want pool1", p.Name)
+	}
+	if p.Strategy != store.PoolStrategyFailover {
+		t.Errorf("200 body strategy = %q, want %q", p.Strategy, store.PoolStrategyFailover)
+	}
+	if len(p.Members) != 2 {
+		t.Fatalf("200 body members = %+v, want 2 (credA, credB)", p.Members)
+	}
 	// Reached the store (no inline logic; routed through poolops).
 	got, err := st.GetPool("pool1")
 	if err != nil {
@@ -3147,6 +3162,16 @@ func TestPostApiPools_WithAuthResetTarget(t *testing.T) {
 	if rec.Code != http.StatusCreated {
 		t.Fatalf("expected 201, got %d: %s", rec.Code, rec.Body.String())
 	}
+	// The 201 response body must reflect the configured target. The synthetic
+	// store.Pool the handler builds when the read-back is skipped/fails used to
+	// omit it (Copilot #1), so assert it round-trips through the JSON body.
+	var p api.Pool
+	if err := json.NewDecoder(rec.Body).Decode(&p); err != nil {
+		t.Fatalf("decode 201 body: %v", err)
+	}
+	if p.AuthResetTarget == nil || *p.AuthResetTarget != "openai-codex" {
+		t.Fatalf("201 body AuthResetTarget = %v, want openai-codex", p.AuthResetTarget)
+	}
 	got, err := st.GetPool("codex")
 	if err != nil {
 		t.Fatalf("GetPool: %v", err)
@@ -3178,7 +3203,7 @@ func TestDeleteApiPoolsName_Success(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil {
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	srv := api.NewServer(st, nil, nil, "")
@@ -3225,7 +3250,7 @@ func TestDeleteApiPoolsName_ReferencedByBinding(t *testing.T) {
 	st := newTestStore(t)
 	enableHTTPChannel(t, st)
 	seedOAuthCred(t, st, "credA", "credB")
-	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil {
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	// A binding referencing the pool by name keeps it from being removed.
diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go
index 53a403d..754accf 100644
--- a/internal/poolops/poolops.go
+++ b/internal/poolops/poolops.go
@@ -24,7 +24,7 @@ import (
 // interface lets each channel pass its own already-open store and lets the
 // tests substitute a fake.
 type Store interface {
-	CreatePoolWithMembers(name, strategy string, members []string) error
+	CreatePoolWithMembers(name, strategy string, members []string, authResetTarget string) error
 	GetPool(name string) (*store.Pool, error)
 	ListPools() ([]store.Pool, error)
 	RemovePoolIfUnreferenced(name string) (bool, error)
@@ -144,10 +144,13 @@ func Create(s Store, name, strategy string, members []string) error {
 }
 
 // CreateWithAuthResetTarget is Create plus an optional per-pool
-// auth_reset_target (empty = no reset). The target is set in a follow-up
-// SetPoolAuthResetTarget call after the pool exists; channels that don't
-// accept a target call Create. Used by every channel's create adapter so the
-// create-with-target path has a single source of truth.
+// auth_reset_target (empty = no reset). The target is bound in the same store
+// transaction that creates the pool and its members, so create-with-target is
+// atomic: a partial state where the pool exists without its target can never
+// be observed, and there is no second write whose failure would leave a
+// created pool plus an error (which a retry would then 409 on). Channels that
+// don't accept a target call Create, which threads "". Used by every channel's
+// create adapter so the create-with-target path has a single source of truth.
 func CreateWithAuthResetTarget(s Store, name, strategy string, members []string, authResetTarget string) error {
 	if strategy == "" {
 		strategy = store.PoolStrategyFailover
@@ -158,15 +161,7 @@ func CreateWithAuthResetTarget(s Store, name, strategy string, members []string,
 	if err := validateAuthResetTarget(authResetTarget); err != nil {
 		return err
 	}
-	if err := s.CreatePoolWithMembers(name, strategy, members); err != nil {
-		return err
-	}
-	if authResetTarget != "" {
-		if err := s.SetPoolAuthResetTarget(name, authResetTarget); err != nil {
-			return err
-		}
-	}
-	return nil
+	return s.CreatePoolWithMembers(name, strategy, members, authResetTarget)
 }
 
 // SetAuthResetTarget sets (target != "") or clears (target == "") the
diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go
index 7ea10ad..1f9b2fa 100644
--- a/internal/poolops/poolops_test.go
+++ b/internal/poolops/poolops_test.go
@@ -109,6 +109,11 @@ func TestCreateListStatusRotateRemove(t *testing.T) {
 	}
 }
 
+// TestCreateWithAuthResetTarget asserts create-with-target persists the target
+// in the single CreatePoolWithMembers call (the target is bound in the same
+// store transaction that creates the pool, not a separate follow-up
+// SetPoolAuthResetTarget write — Copilot #5). Both the GetPool read-back and
+// the derived Status reflect the configured target.
 func TestCreateWithAuthResetTarget(t *testing.T) {
 	db := newTestStore(t, "acct_a", "acct_b")
 
diff --git a/internal/store/pools.go b/internal/store/pools.go
index e71a927..0c0fb8e 100644
--- a/internal/store/pools.go
+++ b/internal/store/pools.go
@@ -196,7 +196,10 @@ func assertCredentialNotInAnotherPoolTx(tx *sql.Tx, credential, newPool string)
 // existing oauth credential with a token_url. At least two members are
 // required for failover to be meaningful, but a single-member pool is
 // permitted (it degrades to a plain indirection with no failover target).
-func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) error {
+// authResetTarget is stored verbatim in the same transaction (empty = none),
+// so create-with-target is atomic: there is no window where the pool exists
+// without its configured target.
+func (s *Store) CreatePoolWithMembers(name, strategy string, members []string, authResetTarget string) error {
 	if name == "" {
 		return fmt.Errorf("%w: pool name is required", ErrPoolNoMembers)
 	}
@@ -253,7 +256,7 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e
 	}
 
 	if _, err := tx.Exec(
-		"INSERT INTO credential_pools (name, strategy, auth_reset_target) VALUES (?, ?, '')", name, strategy,
+		"INSERT INTO credential_pools (name, strategy, auth_reset_target) VALUES (?, ?, ?)", name, strategy, authResetTarget,
 	); err != nil {
 		return fmt.Errorf("insert pool %q: %w", name, err)
 	}
diff --git a/internal/store/pools_test.go b/internal/store/pools_test.go
index cdf2291..19a6c60 100644
--- a/internal/store/pools_test.go
+++ b/internal/store/pools_test.go
@@ -27,7 +27,7 @@ func TestCreatePoolWithMembersAndGet(t *testing.T) {
 	seedOAuthCred(t, s, "acct_a")
 	seedOAuthCred(t, s, "acct_b")
 
-	if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}); err != nil {
+	if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 
@@ -66,7 +66,7 @@ func TestCreatePoolRejectsStaticMember(t *testing.T) {
 	if err := s.AddCredentialMeta("static_key", "static", ""); err != nil {
 		t.Fatalf("AddCredentialMeta: %v", err)
 	}
-	err := s.CreatePoolWithMembers("p", "failover", []string{"static_key"})
+	err := s.CreatePoolWithMembers("p", "failover", []string{"static_key"}, "")
 	if err == nil {
 		t.Fatal("expected error creating pool with static member")
 	}
@@ -78,7 +78,7 @@ func TestCreatePoolRejectsStaticMember(t *testing.T) {
 
 func TestCreatePoolRejectsMissingMember(t *testing.T) {
 	s := newTestStore(t)
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"nope"}); err == nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"nope"}, ""); err == nil {
 		t.Fatal("expected error for non-existent member credential")
 	}
 }
@@ -86,13 +86,13 @@ func TestCreatePoolRejectsMissingMember(t *testing.T) {
 func TestCreatePoolRejectsBadStrategyAndDupes(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "a")
-	if err := s.CreatePoolWithMembers("p", "roundrobin", []string{"a"}); err == nil {
+	if err := s.CreatePoolWithMembers("p", "roundrobin", []string{"a"}, ""); err == nil {
 		t.Error("expected error for unsupported strategy")
 	}
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"a", "a"}); err == nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"a", "a"}, ""); err == nil {
 		t.Error("expected error for duplicate member")
 	}
-	if err := s.CreatePoolWithMembers("p", "failover", nil); err == nil {
+	if err := s.CreatePoolWithMembers("p", "failover", nil, ""); err == nil {
 		t.Error("expected error for empty member list")
 	}
 }
@@ -101,7 +101,7 @@ func TestPoolCredentialNamespaceMutualExclusion(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "acct_a")
 	// "acct_a" is a credential; a pool may not shadow it.
-	if err := s.CreatePoolWithMembers("acct_a", "failover", []string{"acct_a"}); err == nil {
+	if err := s.CreatePoolWithMembers("acct_a", "failover", []string{"acct_a"}, ""); err == nil {
 		t.Fatal("expected namespace collision error (pool name == credential name)")
 	}
 }
@@ -118,12 +118,12 @@ func TestCreatePoolRejectsMemberAlreadyInAnotherPool(t *testing.T) {
 	seedOAuthCred(t, s, "shared")
 	seedOAuthCred(t, s, "solo")
 
-	if err := s.CreatePoolWithMembers("pool_one", "failover", []string{"shared"}); err != nil {
+	if err := s.CreatePoolWithMembers("pool_one", "failover", []string{"shared"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers(pool_one): %v", err)
 	}
 
 	// "shared" already belongs to pool_one; adding it to pool_two must fail.
-	err := s.CreatePoolWithMembers("pool_two", "failover", []string{"solo", "shared"})
+	err := s.CreatePoolWithMembers("pool_two", "failover", []string{"solo", "shared"}, "")
 	if err == nil {
 		t.Fatal("expected error: credential already a member of another pool (Finding 5)")
 	}
@@ -149,7 +149,7 @@ func TestCreatePoolRejectsMemberAlreadyInAnotherPool(t *testing.T) {
 	if _, err := s.RemovePool("pool_one"); err != nil {
 		t.Fatalf("RemovePool: %v", err)
 	}
-	if err := s.CreatePoolWithMembers("pool_three", "failover", []string{"shared"}); err != nil {
+	if err := s.CreatePoolWithMembers("pool_three", "failover", []string{"shared"}, ""); err != nil {
 		t.Fatalf("after removing pool_one, re-adding shared to a new pool must succeed: %v", err)
 	}
 }
@@ -159,10 +159,10 @@ func TestListPoolsOrdersMembers(t *testing.T) {
 	for _, n := range []string{"a", "b", "c"} {
 		seedOAuthCred(t, s, n)
 	}
-	if err := s.CreatePoolWithMembers("p1", "failover", []string{"c", "a"}); err != nil {
+	if err := s.CreatePoolWithMembers("p1", "failover", []string{"c", "a"}, ""); err != nil {
 		t.Fatalf("create p1: %v", err)
 	}
-	if err := s.CreatePoolWithMembers("p2", "failover", []string{"b"}); err != nil {
+	if err := s.CreatePoolWithMembers("p2", "failover", []string{"b"}, ""); err != nil {
 		t.Fatalf("create p2: %v", err)
 	}
 	pools, err := s.ListPools()
@@ -185,7 +185,7 @@ func TestListPoolsOrdersMembers(t *testing.T) {
 func TestRemovePoolCascadesMembers(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "a")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 	removed, err := s.RemovePool("p")
@@ -205,7 +205,7 @@ func TestRemovePoolCascadesMembers(t *testing.T) {
 func TestRemovePoolIfUnreferenced_Unreferenced(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "a")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 	if err := s.SetCredentialHealth("a", "cooldown", time.Now().Add(time.Hour), "429"); err != nil {
@@ -241,7 +241,7 @@ func TestRemovePoolIfUnreferenced_Unreferenced(t *testing.T) {
 func TestRemovePoolIfUnreferenced_RefusedWhenBound(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "a")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 	// A binding NAMES THE POOL (pool shares the credential namespace).
@@ -288,7 +288,7 @@ func TestRemovePoolIfUnreferenced_RefusedWhenBound(t *testing.T) {
 func TestRemovePoolIfUnreferenced_BindingBeforeRemovalRefuses(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "a")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 
@@ -327,7 +327,7 @@ func TestRemovePoolIfUnreferenced_BindingBeforeRemovalRefuses(t *testing.T) {
 func TestAddBinding_AfterPoolRemovedRefuses(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "a")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 
@@ -418,7 +418,7 @@ func TestRemovePoolIfUnreferenced_ConcurrentIsInternallyConsistent(t *testing.T)
 			t.Fatalf("iter %d: new store: %v", iter, err)
 		}
 		seedOAuthCred(t, s, "a")
-		if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil {
+		if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil {
 			t.Fatalf("iter %d: create: %v", iter, err)
 		}
 
@@ -502,12 +502,12 @@ func TestPoolsForMember(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "shared")
 	seedOAuthCred(t, s, "x")
-	if err := s.CreatePoolWithMembers("p1", "failover", []string{"shared", "x"}); err != nil {
+	if err := s.CreatePoolWithMembers("p1", "failover", []string{"shared", "x"}, ""); err != nil {
 		t.Fatalf("create p1: %v", err)
 	}
 	// A credential belongs to at most one pool (Finding 5): adding "shared"
 	// to a second pool must be rejected.
-	if err := s.CreatePoolWithMembers("p2", "failover", []string{"shared"}); err == nil {
+	if err := s.CreatePoolWithMembers("p2", "failover", []string{"shared"}, ""); err == nil {
 		t.Fatal("expected p2 creation to fail: shared already belongs to p1")
 	}
 
@@ -770,7 +770,7 @@ func TestRemoveCredentialMetaBlocksLivePoolMember(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "member")
 	seedOAuthCred(t, s, "other")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"member", "other"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"member", "other"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 
@@ -831,7 +831,7 @@ func TestRemoveCredentialMetaCleansHealthRow(t *testing.T) {
 	// NOT inherit the old cooldown — GetCredentialHealth is nil (= healthy).
 	seedOAuthCred(t, s, "x")
 	seedOAuthCred(t, s, "y")
-	if err := s.CreatePoolWithMembers("fresh", "failover", []string{"x", "y"}); err != nil {
+	if err := s.CreatePoolWithMembers("fresh", "failover", []string{"x", "y"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers(fresh): %v", err)
 	}
 	if h, herr := s.GetCredentialHealth("x"); herr != nil || h != nil {
@@ -847,7 +847,7 @@ func TestAddCredentialMetaRejectsPoolNameCollision(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "acct_a")
 	seedOAuthCred(t, s, "acct_b")
-	if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); err != nil {
+	if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 
@@ -867,7 +867,7 @@ func TestAddCredentialMetaRejectsPoolNameCollision(t *testing.T) {
 
 	// The reverse direction still holds: CreatePoolWithMembers rejects a
 	// name that already exists as a credential.
-	if err := s.CreatePoolWithMembers("not_a_pool", "failover", []string{"acct_a"}); err == nil {
+	if err := s.CreatePoolWithMembers("not_a_pool", "failover", []string{"acct_a"}, ""); err == nil {
 		t.Fatal("expected CreatePoolWithMembers to reject a name that is already a credential")
 	}
 }
@@ -890,7 +890,7 @@ func TestRemoveCredentialMetaCASGuardsLivePoolMember(t *testing.T) {
 	seedOAuthCred(t, s, "sibling")
 
 	// Concurrent pool-create claims "c" between the insert and the rollback.
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"c", "sibling"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"c", "sibling"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 
@@ -967,7 +967,7 @@ func TestRemoveCredentialMetaCASGuardsLivePoolMember(t *testing.T) {
 func TestRemovePoolDeletesMemberHealth(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "m")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	until := time.Now().Add(10 * time.Minute).UTC().Truncate(time.Second)
@@ -1001,7 +1001,7 @@ func TestRemovePoolDeletesMemberHealth(t *testing.T) {
 func TestRemovePoolSparesStillPooledMemberHealth(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "m")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}, ""); err != nil {
 		t.Fatalf("create pool p: %v", err)
 	}
 	// "m" also belongs to pool q (legacy/pre-invariant row injected directly).
@@ -1044,7 +1044,7 @@ func TestRemovePoolSparesStillPooledMemberHealth(t *testing.T) {
 func TestAddCredentialMetaRejectsLivePoolMemberDowngrade(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "poolcred")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"poolcred"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"poolcred"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 
@@ -1101,7 +1101,7 @@ func TestAddCredentialMetaRejectsLivePoolMemberDowngrade(t *testing.T) {
 func TestSetCredentialHealthIfPoolMemberLiveMemberPersists(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "live")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"live"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"live"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	until := time.Now().Add(10 * time.Minute).UTC().Truncate(time.Second)
@@ -1139,7 +1139,7 @@ func TestSetCredentialHealthIfPoolMemberLiveMemberPersists(t *testing.T) {
 func TestSetCredentialHealthIfPoolMemberSkipsRemoved(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "gone")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"gone"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"gone"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 
@@ -1167,7 +1167,7 @@ func TestSetCredentialHealthIfPoolMemberSkipsRemoved(t *testing.T) {
 	// cooldown: ListCredentialHealth (what loadPoolResolver seeds from) must
 	// carry no row for "gone".
 	seedOAuthCred(t, s, "gone")
-	if err := s.CreatePoolWithMembers("p2", "failover", []string{"gone"}); err != nil {
+	if err := s.CreatePoolWithMembers("p2", "failover", []string{"gone"}, ""); err != nil {
 		t.Fatalf("recreate pool: %v", err)
 	}
 	rows, err := s.ListCredentialHealth()
@@ -1197,7 +1197,7 @@ func TestSetCredentialHealthIfPoolMemberSkipsRemoved(t *testing.T) {
 func TestSetCredentialHealthIfPoolMemberEpochRejectsReAddedSuccessor(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "c")
-	if err := s.CreatePoolWithMembers("P", "failover", []string{"c"}); err != nil {
+	if err := s.CreatePoolWithMembers("P", "failover", []string{"c"}, ""); err != nil {
 		t.Fatalf("create pool P: %v", err)
 	}
 	pP, err := s.GetPool("P")
@@ -1216,7 +1216,7 @@ func TestSetCredentialHealthIfPoolMemberEpochRejectsReAddedSuccessor(t *testing.
 		t.Fatalf("RemovePool(P) = %v, %v", removed, rerr)
 	}
 	seedOAuthCred(t, s, "c")
-	if err := s.CreatePoolWithMembers("Q", "failover", []string{"c"}); err != nil {
+	if err := s.CreatePoolWithMembers("Q", "failover", []string{"c"}, ""); err != nil {
 		t.Fatalf("recreate c into Q: %v", err)
 	}
 	pQ, err := s.GetPool("Q")
@@ -1264,7 +1264,7 @@ func TestSetCredentialHealthIfPoolMemberEpochRejectsReAddedSuccessor(t *testing.
 func TestSetCredentialHealthIfPoolMemberEpochLiveMemberSamePool(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "m")
-	if err := s.CreatePoolWithMembers("pool", "failover", []string{"m"}); err != nil {
+	if err := s.CreatePoolWithMembers("pool", "failover", []string{"m"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	p, _ := s.GetPool("pool")
@@ -1410,7 +1410,7 @@ func TestRemoveCredentialFullyRefusesLivePoolMember(t *testing.T) {
 	s := newTestStore(t)
 	seedOAuthCred(t, s, "m")
 	seedOAuthCred(t, s, "n")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"m", "n"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"m", "n"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 
@@ -1517,7 +1517,7 @@ func TestRemoveCredentialFullyCleansHealthOnPartialCleanupFinish(t *testing.T) {
 	// carries no row for "x".
 	seedOAuthCred(t, s, "x")
 	seedOAuthCred(t, s, "y")
-	if err := s.CreatePoolWithMembers("p", "failover", []string{"x", "y"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", "failover", []string{"x", "y"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 	hrows, err := s.ListCredentialHealth()
@@ -1586,7 +1586,7 @@ func TestMigration000008DownUpPopulated(t *testing.T) {
 	// credential_health row for one member.
 	seedOAuthCred(t, s, "acct_a")
 	seedOAuthCred(t, s, "acct_b")
-	if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); err != nil {
+	if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 	if err := s.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil {
@@ -1692,7 +1692,7 @@ func TestPoolAuthResetTargetCRUD(t *testing.T) {
 	seedOAuthCred(t, s, "acct_a")
 	seedOAuthCred(t, s, "acct_b")
 
-	if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}); err != nil {
+	if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}, ""); err != nil {
 		t.Fatalf("CreatePoolWithMembers: %v", err)
 	}
 
diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go
index 06523ff..9594bee 100644
--- a/internal/telegram/commands_test.go
+++ b/internal/telegram/commands_test.go
@@ -2308,7 +2308,7 @@ func TestHandlePoolRemoveUnknown(t *testing.T) {
 func TestHandlePoolRemoveReferencedByBinding(t *testing.T) {
 	s := newTestStore(t)
 	seedPoolOAuthMeta(t, s, "acct_a", "acct_b")
-	if err := s.CreatePoolWithMembers("codex", store.PoolStrategyFailover, []string{"acct_a", "acct_b"}); err != nil {
+	if err := s.CreatePoolWithMembers("codex", store.PoolStrategyFailover, []string{"acct_a", "acct_b"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	// A binding referencing the pool by name keeps it from being removed.
@@ -2328,7 +2328,7 @@ func TestPoolStatusFormatMatchesCLI(t *testing.T) {
 	// doesn't drift from cmd/sluice/pool.go.
 	s := newTestStore(t)
 	seedPoolOAuthMeta(t, s, "m0", "m1")
-	if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	// Park m0 so it shows a cooldown line with the reason.
@@ -2352,7 +2352,7 @@ func TestPoolStatusEscapesLastFailureReason(t *testing.T) {
 	// < > & must be HTML-escaped or the Bot API rejects/garbles the message.
 	s := newTestStore(t)
 	seedPoolOAuthMeta(t, s, "m0", "m1")
-	if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}); err != nil {
+	if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}, ""); err != nil {
 		t.Fatalf("create pool: %v", err)
 	}
 	rawReason := `429 <too many> & "retry"`

From 7008ec53a18bfc746fbb7a3ef921a9a4c51eac20 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 00:45:03 +0800
Subject: [PATCH 15/19] fix(pools): scope Retry-After date parsing, require
 auth_reset_target field, document 204

---
 api/openapi.yaml                     |   4 +
 internal/api/api.gen.go              | 161 ++++++++++++++-------------
 internal/api/server.go               |  20 +++-
 internal/api/server_test.go          |  49 ++++++++
 internal/proxy/pool_failover.go      |  15 ++-
 internal/proxy/pool_failover_test.go |  37 ++++++
 6 files changed, 199 insertions(+), 87 deletions(-)

diff --git a/api/openapi.yaml b/api/openapi.yaml
index 7437d37..9d8396b 100644
--- a/api/openapi.yaml
+++ b/api/openapi.yaml
@@ -678,6 +678,10 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/Pool"
+        "204":
+          description: >-
+            Target updated; pool representation unavailable because the
+            post-update read-back failed
         "400":
           description: Invalid target
           content:
diff --git a/internal/api/api.gen.go b/internal/api/api.gen.go
index e88e9f8..1beadf8 100644
--- a/internal/api/api.gen.go
+++ b/internal/api/api.gen.go
@@ -2029,86 +2029,87 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl
 // Base64 encoded, gzipped, json marshaled Swagger object
 var swaggerSpec = []string{
 
-	"H4sIAAAAAAAC/+R96XIbuXbwq6D6+6pGqkuRnDu+SY2m8kOWlWtV7LFKkpMfpouEug9FXHUDPQCaEuNS",
-	"VR4iT5gnSeEAvRK90JYoO/NrLDaWs299Ts+XIBRJKjhwrYLjL4EKV5BQ/OdJmkqxpvEl/JGB0uanVIoU",
-	"pGaAC0IJVEM0p/hsKWRi/hVEVMORZgkEo0BvUgiOA6Ul47fB4yiIQGnGqWaCm01bz1nk/TkVUlceMK7h",
-	"FmTw+DgKJPyRMQlRcPzJ7K5f4XaOqrB+LsASN/+AUJvzT7KI6TOu5WYbTRq2QhtKiIBrRuN5psAPeh/G",
-	"baiNglTCer6iauXdJ4GqliMN8ZWmSTqcL1qI2HvWGmTEQu151qB9eWm5yUfq14xHZv+TiFPJgK+iPfD1",
-	"fE2leRaBCiVL7dLgjK+ZFDwBrsmaSkZvYiCcJkCWQpJ0RbkWCdHiDjhh3KBmpW3rhhXQCGSHqBd4Mq7/",
-	"6VV5RFUOhLTKyTQkyi8r7hcqJd1Y2RFahCL27quwvbFNQ5LGVEM/uz2qVmFGB+M/ppG7oE7yCyrNVpLh",
-	"c6QzJTd2z5icaBIDVZoIDmTJII5IkilNboAo0IRxolcw49LaKnIjos2ICL0Cec8UmIdEgVyDJBJ0Jrki",
-	"r6ZT8ppGxJk3cpBQHa4Yv51xs/r03fnhmHzg8cZep4heUU2oBJJKUEYwzL8TEbElg2hMrkBr3L2oUGVB",
-	"tCCUcMGPIEn1hqxpnAGhsRIOUYWwpdTQdcZppsWRUwRC41jcE5nFQA40vb2FiNwzvSILR5UjGkXHs2w6",
-	"/SUsSY9/w4IIOeML83PXqkOiBN7P4Z5UwCZMES40EdKIusHufGnJYgG1QK2oIjcAnEhIxBoiklCe0Tje",
-	"jPBMB2X13BlniijN4pzNEaE8IpTcU8nNWqZILBDTAy7IksbxDQ3v7HVMzbgjzeGYXFClzA7KiaUsyjCK",
-	"zQIVxpCALAo9WJAwBioVopF49ls5twdYrV2MyCJXiMUIj3MGYzHj9jREFOVjTFCwzREeAbBsRwkKRRyz",
-	"CNSMIzMpRyktqCW4FVZja0p+VaT2V3Iq+DJmoR7P0MHVjOh3bfG+G0O2bZukQewd43de/5/RuN0Rw0MK",
-	"oXFcrStixmHOs+SmRpe2IKa6unn6qAaNz8qerijnED+NewVuxKAa19wIEQPlO/kv+8uXAHiWYKQAMdxK",
-	"mgSjYKV1WkGjvPkeblZC3M0VhBL0tri+fX9ySuxDFFDFbtGAuH0kpZtY0EgFHWdnMt4++D/cAR8v3+HB",
-	"b6+vL0hoieo5zecVcUlJvA4uXWmqM7XNq066707OZqQ2GL7SV+8A3zbr+jgwQEGNzWOeoDGCJc1iPa+E",
-	"qTld0H1ikMI3RnHUnVfSjOyLTBtw/RZpjRdEzG/T7FOTKTG/2WvHpo26z46TFyQ0Dy5Gq6R8de2wq1QZ",
-	"mZEb0PcmBqBVb2WdesUXjcl1EYGZKM3cqkiUpTELzYHHlROrHvCgPHRUPe9wxk0kQhK6IcLEaPDAMDgM",
-	"YUwuIbUB1MWHq2vlXGev5/yRU4nvMkPYKTmwgnVaLGotOtAwBKXmSK9tYn84yUxMhWscTQ9ymMj9Cjgx",
-	"N5swU9BMrw4HlCjqF5wvTb4xshF8mGsDr8bqRvidLPt96lMKCmFc2xCe3pqNoeCaMo6hwy4SZO55MtGS",
-	"sJSgVt1McotyLgl8atTcYGt/lEJbbfdWLNoFcRTgfr+Dv8ajgUepYFyjn99ZRHIX7Cw1PqOahTYlrtvL",
-	"3CaaPWPyxu4wwknsnjG6Ymvei1Pwaq9px0zCg9Wqli64yMimHQdKC4Mc8FBuUps+XeYYY/CE11oIe0Mc",
-	"lJR2FX5/evExVVoCTdp1WN7uaHVCkSSUR22WFw+NImZF6KJ2WdvZJeCtst8bGmhJucoLeA2G5I+QrMdE",
-	"6YgJcuAk5nBETKyGSeU93CgR3oEeRvqSGO1MuBCiw4JmejWXoEDPNZW3vuj6g1NGZ1TMjiPcQewOIjOe",
-	"+2l4WNFMaYiObFFBQijWgPIW3QI5wOx6IhKmjUv+F8IFwaO8ipWASXuUByIZ4Zl2QVXUDVEUOVhSFpt7",
-	"iTArzeHDpatVAJSWVMPtxlOtEiIm+ePfSFRR659yUH6y5sShPpS7OQnauXuZxdDK3b44pd3QU61B8peO",
-	"LySkMQ3BeMEWy95fqO6Ils0FEa2VplvY0VXDPq3FinUGtFuT3XwShrXGOqMvQCvS44s6HI/xAWjj8/MO",
-	"d/E6O3gAd2lbkfd3uHdOCeu73MbtJuovNXpM/rX0SeXPamQirxlf4P6FXWVDieYisqhGiQuMyPL4It7M",
-	"+KIWoCxsclJAsigYtTBRAFZ75Rqi8YzP+MXJ9elboiChXLNQHVvtxm31Q3MEmxDOeH6gLR6GmZTAdbwh",
-	"zkvX4qIxuRYEHkyaxMwarDraSmxtnUHb5FyNYubBYlYPxmbBMZkFs2BxOJ7x65VFbw1cKxLGDP+LRUqM",
-	"vMCmVhhcViPqpRTJjCsWW7iNxZFiY+7DvM2Hhiff6g7kjaD4gvlSISpM9+pFTxBaXtAIRP03jMmHhGmi",
-	"Rck/5/2c1DTYduVjh9mNHCTMpaBDwzsDbT2mW3p15NDrZbYU9UxKIS9BpYIrTwEiFJHfhIHZ15/52WU+",
-	"C/EWaKxX7TeroiAGDzRJY9x91+s63TbfjeeJcV6XoDBOb97n8jQ1Z1yB1LWSVsW3FcvUHUvTtlUhVnXm",
-	"Clre55rssO8mu6bzmsyF131Hles6jmsQsgFiE56Rh2Ae6nhh9AFUo5mPe5Vs4rnTiK+rj39j6jG4hr6/",
-	"HOWrMhOsew9ITy5c9LZzRnLyRIlIXwLyNVJQSVoKUfz/EpbBcfD/JmV7zcT11kwMEd7jnudIRlzkWMlC",
-	"mCIqSw2fITocmoYUt3VnJBVcfG+cuiqqqVCskaa0maXKSZV93fC0vVuhoWZr8L+6CIWII3HP5xnXLH6y",
-	"NpSYKj03/MgkzDv6drooMkIn5wkNVuhSNyOSA4+q634lB/mvJoo05OwXAD+xRznhckDaqH8JS5DAQ4i2",
-	"oow64K+mv2KfBkbKb87enV2fkQlN2SQVIlaTL0YQH11FbgXE/EqK9gFZ3EJuNtgUIiRJhCzaDtSYnFIp",
-	"mQuzb4GDZCHB0GRiAhySxplyEavMQp0ZSxHjW4QluYlFeIevN9xp5IBF5C+1VxC/EdcngvnFtXsLR1Qm",
-	"lzQERSTwCGT5LgPPPnCvEklKJdObQ5N7MGXwoiSCCN+GRMRairw5I4e9Rk8CfA2xsJVKLjQJRZbGEBEt",
-	"kBx5MjneCrxzlHayVzlXGb/NG7i8/vUpAscyougVsgo4O1dCBvrevr6nVhAxhWoLPE0Ota0S1nLZFOye",
-	"KmJVDuULuXxvxObOV0saBfbJrpYrbSunaOHPQhxIrg5Hl9qJeF6r7zUveOPIEgCvaSNgvwFvlDyKTLoO",
-	"Y6NW6KPCdxSAfH0w4ej1tSHFrlFB4Q66woNLUCJet1cr24p2c8FDczaN7+lGzWuFvG+q3BUAtWWgLQ3I",
-	"g9tg0T50QpDFsM9e6m9JLn6QmrASmQzhZcvFvXy/Am109STD+gfoa7Qf3/KWpsUk/bZVd3K9ikxjV2Wb",
-	"7WkgtA2AFys0Oh2lpLxzaqg9q3dG+WQI0N/PqRtPUG3N8+JhMzcxF3AXHjTj/aZjauzwXTUqEfJR499B",
-	"suWmtdiETYbzmPG74fSodCZ6iBHDLQ0N0BzaVFBoGnctWNOYRSVQPeFP9bj63lEdvwZs29QyegthZqLg",
-	"K4OroxFQCdIoiUfeL85dkdYI+5pRcvXu4/np2fzk4nx+/eHfzn43QTFZUxNCIv2Q3XhiKe7YI/dormd8",
-	"6QlyLs+urom5yqQlCeX0Nq9sX8UZC6uv2Y9cGwa/JbmIEJQiE3THLASnEta+Bu/Pr23FRmNd0x13KriW",
-	"IiYXMeVgLrZ2RFlofh5Px1OzS6TAacqC4+CX8XT8S4DGeYU0w6Sppg7OXBjhQ/dwHgXHwd9Bn6TspCLL",
-	"0qkt7vnrdGprv1w7O0tT257FBJ/8w+WsViwHS29zhmhLhB8fmy0TF1bpSoq6nn5lRSZLEio3wXHwzqRT",
-	"aeviUaCpyXE+BSVlPpsT6tSafGHR40TaqABVVigP7S6EqhHvPHKBBPJB0gQ0Bm2fvgTMIGF4k1fEjq1v",
-	"KLVIywxGFUo27fBnuxiUfi2izU5M6eJFIxZ7rKu2AerxG0Vi0O3OU3g4f1IyEddGRlxeTV89GQz1gkQX",
-	"BCahXoqMOxB+fQEQaCyBRpuCGERIYiLBiIhMN3TBEZfQVoXo04csYnoiIXQIdhkQs/TSrvQL/x8ZyE0p",
-	"/TFLmA6qAl+0Tf1tOgoS+sASE4D9PJ2aPxl3f3pS8c97sVnlLOAAc2UpQZCABLiWxtk3uVNZEovbYlmF",
-	"J+bZFj/WGE4M4YcNPIJn1N9aaOOhAz53RxuZtcuqZLAnVMiwompFwhVlvIMS1WpVBxVe58v2ISGtFbBt",
-	"slT6MApMPH4s9CwraVKWw2yRuN1B1cjw9C7E2yQ+yJH8/GQwFMTfJrZ75Ppyne2e7s92n3MMiAubu2/f",
-	"keOfuw5sTlC2oXbFVLNDv1o/qMvkSRTVG/or/czbMtlUVYyprJ2Pwb6yqAvrG/y9Iq7n0VPEUf2l3G3/",
-	"8Wo7Acip6KYa9x6E5PdXYpCGOzFw7cAfTBXClcdomJ9fig1Pb53qA8Z7jm8HmCU38vp9mKWXFeg9G8Y3",
-	"ldHmfP6WuPHbsoUrH0Cy9nJrBrehhlbMvsJMVktiHRHNaTlz+PwRTT41OiCi+V3oMtArcPHENNy7sCRN",
-	"WUnbIk3hQXoMV06jH9xw1act92y4CtZ7glf3rrpmuPZoOPL72zxhoYI+UeuRtGKetEsF7aLnJL69wYe7",
-	"fZtJwnxFFfG/g84bh92CTBZvX3Ok7cYB7r+C5jPIdnXUdd+i3U5dfPL9eGSfWPcytpDlsv+3T6ArK/fi",
-	"VirOc6dc2Q31YK+zwg+tcFiDJPCQCgXRYU8OjdurJKvg3ZtDN4n0XGn09rDrnjPpKnc6uUGj6E+YTVcJ",
-	"UEuoe5LlVrnz6KvrchuULFfk8nfbjdEf8bi2jR3fPfRlyKfV79K8TJJctRXD8+Quk9DjI/dG/GcxN/Wx",
-	"rEF2ppvvf9ZcdpDcYbcKppB2WEcsW0bdek1FEqaTYmyix7m/D9OPxdJ9ePfqcMYA9/7+9IKUuHgceH1B",
-	"SZokTPu99hb2z+W2PQPue/bbNbpv0zl/9qctgxcE6HfbvCZ0WzLnVcNdfHZVKl/YaRdUeSmXXQDQ57AH",
-	"MgXHBHps4gWu2YcxxMmm3ZIci0B3JpM6BHIS2L97rWGJ93OZweonJvZs/yypPZ1DQsR/WqOHyOPXgooC",
-	"N35B1TXBM1VYQyNClkB/2y+BNEhOYzuCY/mUd9bhbM8BTirary1P3rwmbmCqmetb8avX3d1MQVNJapZi",
-	"F7ON6vPC9hoZ+lK22krTc72v6RoTawNm+NjXNX4cwU1J3Yho85MqPpmrFu7jwTG+GcfP1A4Y9RoPeAXb",
-	"IoOjXve0TzmbPikP8ybtVpblPdwvKrs1zlUAIwf1OaG/kBTkkfvDzk8eDjQqk7IJ/6js1u/30Ib1jYGA",
-	"H6qy0D3TsOdqf1tY8NF9aTp1EdqLhAVOKr4rXbgCbcyn/SIJumGQR+iKv/mzX0P1xn5nZriy2KHKH9Na",
-	"1gZCfZ2kboKy6J78P+Xze6+35CGSGudOSSh4/obTDRiuWIrvdm+hIccfUGSEJEYAJYuA/M9//Teh0Zpy",
-	"V4Zzhh5l257WIaD4yZOevPIS1wxqf85nwqqi97UTZ/4b3Pect4/noO+FxM/n2OAkn+fznL2X1mqcfRwy",
-	"AyJiFm6I5YRv7KP6vOSk/bs3Oc6Z93zJcfULfXtOji2JPeqVxfD9JMeeF2gVnnpYWlPOCTzkH7Tp09Gz",
-	"B/c/ANpBuLVI4jryTWXZQvD6w/t3xEFVR84CYEWVUEXMyl78WJLj1y/I9gNXneKcZLFmKZV6shQyOYqo",
-	"pnX8Gt8mYLHvC7MGRdt7QHDFqGyYumGcokXqHinFbZ5JwL2GibUPgvlkNbEMqzjhF9AUFJO6JOWAoSQt",
-	"pUiGydLgZmwrTd9TJzbarJcqgeDlve+Wh1mt8rN2HfbKZdPPKPqNYe2OrjOcYi0S+HoGXXmEYwTmn1Uv",
-	"nH+MD7G3efR/dqH+1i15RsQbnx30RRyIFVP5h5NsefSXPUbfOQBG5LBQWxuRDo4/fa5ywWJEwhWEdxXa",
-	"W+gN7et764PVnz4b/bP/Nwar4nVY3omQxo4Sk5OLc/c/bghGAX69FkeojyeTn//6z+PpeDr++fiX6XQa",
-	"PH5+/N8AAAD//4Q38/5OcAAA",
+	"H4sIAAAAAAAC/+R9a28buXrwXyHmfYG1cWRJezanxXrRD47jnhhNNobttB+iQKJnHlk8niFnSY5sNTDQ",
+	"H9Ff2F9S8CHnKs5FiS0n3U8ba3h57jc+5H4JQpGkggPXKjj+EqhwBQnFf56kqRRrGl/CHxkobX5KpUhB",
+	"agY4IJRANURzit+WQibmX0FENRxplkAwCvQmheA4UFoyfhs8joIIlGacaia4mbT1nUXen1MhdeUD4xpu",
+	"QQaPj6NAwh8ZkxAFx5/M7PoWbuaoCuvnAixx8w8ItVn/JIuYPuNabrbRpGErtKGECLhmNJ5nCvyg92Hc",
+	"htooSCWs5yuqVt55EqhqWdIQX2mapMP5ooWIvWutQUYs1J5vDdqXm5aTfKR+zXhk5j+JOJUM+CraA1/P",
+	"11SabxGoULLUDg3O+JpJwRPgmqypZPQmBsJpAmQpJElXlGuREC3ugBPGDWpW2rZ2WAGNQHaIeoEn4/qf",
+	"XpVLVOVASKucTEOi/LLifqFS0o2VHaFFKGLvvArbG9M0JGlMNfSz26NqFWZ0MP5jGrkN6iS/oNJMJRl+",
+	"RzpTcmPnjMmJJjFQpYngQJYM4ogkmdLkBogCTRgnegUzLq2tIjci2oyI0CuQ90yB+UgUyDVIIkFnkivy",
+	"ajolr2lEnHkjBwnV4Yrx2xk3o0/fnR+OyQceb+x2iugV1YRKIKkEZQTD/DsREVsyiMbkCrTG2YsKVRZE",
+	"C0IJF/wIklRvyJrGGRAaK+EQVQhbSg1dZ5xmWhw5RSA0jsU9kVkM5EDT21uIyD3TK7JwVDmiUXQ8y6bT",
+	"X8KS9Pg3LIiQM74wP3eNOiRK4P4c7kkFbMIU4UITIY2oG+zOl5YsFlAL1IoqcgPAiYRErCEiCeUZjePN",
+	"CNd0UFbXnXGmiNIsztkcEcojQsk9ldyMZYrEAjE94IIsaRzf0PDObsfUjDvSHI7JBVXKzKCcWMqiDKPY",
+	"LFBhDAnIotCDBQljoFIhGolnvpVzu4DV2sWILHKFWIxwOWcwFjNuV0NEUT7GBAXbLOERAMt2lKBQxDGL",
+	"QM04MpNylNKCWoJbYTW2puRXRWp/JaeCL2MW6vEMHVzNiH7XFu+7MWTbtkkaxN4xfuf1/xmN2x0xPKQQ",
+	"GsfVOiJmHOY8S25qdGkLYqqjm6uPatD4rOzpinIO8dO4V+BGDKpxzY0QMVC+k/+yv3wJgGcJRgoQw62k",
+	"STAKVlqnFTTKne/hZiXE3VxBKEFvi+vb9yenxH5EAVXsFg2Im0dSuokFjVTQsXYm4+2F/8Mt8PHyHS78",
+	"9vr6goSWqJ7VfF4Rh5TE6+DSlaY6U9u86qT77uRsRmqD4St99Q7wbbOujwMDFNTYPOYJGiNY0izW80qY",
+	"mtMF3ScGKXxjFEfdeSXNyL7ItAHXb5HWuEHE/DbNfjWZEvObvXZs2qj77Dh5QULz4GK0SspX1w47SpWR",
+	"GbkBfW9iAFr1VtapV3zRmFwXEZiJ0syuikRZGrPQLHhcWbHqAQ/KRUfV9Q5n3EQiJKEbIkyMBg8Mg8MQ",
+	"xuQSUhtAXXy4ulbOdfZ6zh85lfguM4SdkgMrWKfFoNaiAw1DUGqO9Nom9oeTzMRUOMbR9CCHidyvgBOz",
+	"swkzBc306nBAiaK+wfnS5BsjG8GHuTbwaqxuhN/Jst+nPqWgEMa1DeHprZkYCq4p4xg67CJBZp8nEy0J",
+	"Swlq1c0kNyjnksCvRs0NtvZHKbTVdm/Fol0QRwHO9zv4a1waeJQKxjX6+Z1FJHfBzlLjN6pZaFPiur3M",
+	"baKZMyZv7AwjnMTOGaMrtua9WAW39pp2zCQ8WK1q6YKLjGzacaC0MMgBD+UmtenTZY4xBk+4rYWwN8RB",
+	"SWlX4fenFx9TpSXQpF2H5e2OVicUSUJ51GZ5cdEoYlaELmqbta1dAt4q+72hgZaUq7yA12BI/gnJekyU",
+	"jpggB05iDkfExGqYVN7DjRLhHehhpC+J0c6ECyE6LGimV3MJCvRcU3nri64/OGV0RsXMOMIZxM4gMuO5",
+	"n4aHFc2UhujIFhUkhGINKG/RLZADzK4nImHauOR/IVwQXMqrWAmYtEd5IJIRrmkHVEXdEEWRgyVlsdmX",
+	"CDPSLD5culoFQGlJNdxuPNUqIWKSf/6NRBW1/ikH5SdrThzqQ7mbk6Cdu5dZDK3c7YtT2g091Rokf+n4",
+	"QkIa0xCMF2yx7P2F6o5o2WwQ0VppuoUdXTXs01qsWGdAuzXZzSdhWGusM/oCtCI9vqjD8RgfgDY+X+9w",
+	"F6+zgwdwm7YVeX+He+eUsL7Lbdxuov5So8fkX0ufVP6sRibymvEFzl/YUTaUaA4ii2qUuMCILI8v4s2M",
+	"L2oBysImJwUki4JRCxMFYLVXriEaz/iMX5xcn74lChLKNQvVsdVunFZfNEewCeGM5wva4mGYSQlcxxvi",
+	"vHQtLhqTa0HgwaRJzIzBqqOtxNbGGbRNztUoZh4sZvVgbBYck1kwCxaH4xm/Xln01sC1ImHM8L9YpMTI",
+	"C2xqhcFlNaJeSpHMuGKxhdtYHCk2Zj/M23xoePKt7kDeCIovmC8VosJ0r170BKHlBo1A1L/DmHxImCZa",
+	"lPxz3s9JTYNtVz52mNnIQcJcCjo0vDPQ1mO6pVdHDr1eZktRz6QU8hJUKrjyFCBCEflNGJh5/ZmfHeaz",
+	"EG+BxnrVvrMqCmLwQJM0xtl3va7TTfPteJ4Y53UJCuP05n4uT1NzxhVIXStpVXxbMUzdsTRtGxViVWeu",
+	"oOU812SHfTvZMZ3bZC687luqHNexXIOQDRCb8Iw8BPNQxwujD6AazXzcq2QTz51GfF19/BtTj8E19P3l",
+	"KF+VmWDde0B6cuGit50zkpMnSkT6EpCvkYJK0lKI4v+XsAyOg/83KdtrJq63ZmKI8B7nPEcy4iLHShbC",
+	"FFFZavgM0eHQNKTYrTsjqeDiO3HqqqimQrFGmtJmliorVeZ1w9N2tkJDzdbgP7oIhYgjcc/nGdcsfrI2",
+	"lJgqPTf8yCTMO/p2uigyQifnCQ1W6FI3I5IDj6rrfiUH+a8mijTk7BcAP7FHOeFyQNqofwlLkMBDiLai",
+	"jDrgr6a/Yp8GRspvzt6dXZ+RCU3ZJBUiVpMvRhAfXUVuBcT8Sor2AVnsQm422BQiJEmELNoO1JicUimZ",
+	"C7NvgYNkIcHQZGICHJLGmXIRq8xCnRlLEeMpwpLcxCK8w+MNtxo5YBH5S+0I4jfi+kQwv7h2p3BEZXJJ",
+	"Q1BEAo9AlmcZuPaBO0okKZVMbw5N7sGUwYuSCCI8DYmItRR5c0YOe42eBPgaYmErlVxoEoosjSEiWiA5",
+	"8mRyvBV45yjtZK9yrjJ+mzdwef3rUwSOZUTRK2QVcHauhAz0vX19T60gYgrVFniaHGpbJazlsinYPVXE",
+	"qhzKF3L53ojNna+WNArsl10tV9pWTtHCn4U4kFwdji61E/G8Vt9rXnDHkSUAbtNGwH4D3ih5FJl0HcZG",
+	"rdBHhe8oAPn6YMLR62tDil2jgsIddIUHl6BEvG6vVrYV7eaCh2ZtGt/TjZrXCnnfVLkrAGrLQFsakAe3",
+	"waJ96IQgi2GfvdTfklz8IDVhJTIZwsuWi3v5fgXa6OpJhvUP0NdoP77llKbFJP22VXdyvYpMY1dlm+1p",
+	"ILQNgBcrNDodpaS8c2qoPat3RvlkCNDfz6m7nqDamufFw2ZuYi7gLjxoxvtNx9SY4dtqVCLko8a/g2TL",
+	"TWuxCZsM5zHjd8PpUelM9BAjhlsaGqA5tKmg0DTuGrCmMYtKoHrCn+py9bmjOn4N2LapZfQWwsxEwVcG",
+	"V0cjoBKkURKPvF+cuyKtEfY1o+Tq3cfz07P5ycX5/PrDv539boJisqYmhET6IbtxxVLcsUfu0WzP+NIT",
+	"5FyeXV0Ts5VJSxLK6W1e2b6KMxZWj9mPXBsGvyW5iBCUIhN0xywEpxLWvgbvz69txUZjXdMtdyq4liIm",
+	"FzHlYDa2dkRZaH4eT8dTM0ukwGnKguPgl/F0/EuAxnmFNMOkqaYOzlwY4UP3cB4Fx8HfQZ+k7KQiy9Kp",
+	"Lc7563Rqa79cOztLU9uexQSf/MPlrFYsB0tv8w7Rlgg/PjZbJi6s0pUUdT39yopMliRUboLj4J1Jp9LW",
+	"waNAU5PjfApKynw2K9SpNfnCoseJtFEBqqxQHtpdCFUj3nnkAgnkg6QJaAzaPn0JmEHC8CaviB1b31Bq",
+	"kZYZjCqUbNrhz3YwKP1aRJudmNLFi0Ys9lhXbQPU4zeKxKDdnafwcP6kZCKOjYy4vJq+ejIY6gWJLghM",
+	"Qr0UGXcg/PoCINBYAo02BTGIkMREghERmW7ogiMuoa0K0acPWcT0RELoEOwyIGbopR3pF/4/MpCbUvpj",
+	"ljAdVAW+aJv623QUJPSBJSYA+3k6NX8y7v70pOKf92KzyruAA8yVpQRBAhLgWhpn3+ROZUgsbothFZ6Y",
+	"b1v8WGM4MYQfNvAInlF/a6GNhw743S1tZNYOq5LBrlAhw4qqFQlXlPEOSlSrVR1UeJ0P24eEtFbAtslS",
+	"6cMoMPH4sdAzrKRJWQ6zReJ2B1Ujw9O7EG+T+CBH8vOTwVAQf5vY7pPry3W2e7o/233OMSAubO6+fUeO",
+	"f+46sDlB2YbaFVPNDv1q/aAukydRVG/or/Qzb8tkU1UxprJ2PgZ7ZFEX1jf4e0Vcz6OniKP6S7nb/uPV",
+	"dgKQU9Hdatx7EJLvX4lBGu7EwLUDfzBVCFceo2F+fik2PL11ql8w3nN8O8AsuSuv34dZelmB3rNhfFO5",
+	"2pzfvyXu+m3ZwpVfQLL2cusObkMNrZh9hZmslsQ6IprT8s7h80c0+a3RARHN70KXgV6Biyem4d6BJWnK",
+	"StoWaQoP0mO4chr94Iarfttyz4arYL0neHVn1TXDtUfDke/f5gkLFfSJWo+kFfdJu1TQDnpO4tsdfLjb",
+	"00wS5iOqiP8ddN447AZksjh9zZG2Ewe4/wqazyDb1auu+xbtduril+/HI/vEupexhSyX/b99Al0ZuRe3",
+	"UnGeO+XK7lIP9jorfGiFwxokgYdUKIgOe3JonF4lWQXv3hy6SaTnSqO3L7vuOZOucqeTGzSK/oTZdJUA",
+	"tYS6J1lulTuPvrout0HJckUuf7fdGP0Rj2vb2PHsoS9DPq2+S/MySXLVVgzPk7tMQo+P3Bvxn8Xc1K9l",
+	"DbIz3Xz/s+ayg+QOu1UwhbSXdcSy5apbr6lIwnRSXJvoce7vw/RjMXQf3r16OWOAe39/ekFKXDwOvD6g",
+	"JE0Spv1eewv753Lbngvue/bbNbpv0zn/9qctgxcE6HfbvCZ0WzLnVcNdfHZVKl/YaRdUeSmXXQDQ57AH",
+	"MgWvCfTYxAscsw9jiDebdktyLALdmUzqEMhJYP/utYYl3s9lBqtPTOzZ/llSezqHhIj/tEYPkcfXgooC",
+	"N76g6prgmSqsoREhS6C/7ZdAGiSnsb2CY/mUd9bh3Z4DvKloX1uevHlN3IWpZq5vxa9ed3d3CppKUrMU",
+	"u5htVJ8XttfI0Jey1Vaanuu8puuaWBsww699XePjCO6W1I2INj+p4slctXCPB8d4Mo7P1A646jUecATb",
+	"IoOjXve0TzmbPikP8ybtVpblPdwvKrs1zlUAIwf1e0J/ISnII/eHvT95ONCoTMom/KOyW7/fQxvWNy4E",
+	"/FCVhe47DXuu9reFBR/dS9Opi9C81tZCnpc1frMeSYJ76tseLGWcrimL8V2+Gwhp5h4WN3w+cs+WGxd7",
+	"hG9WG/f1gnGIE8PvSvmuQBt7bZ9AQcqBPEJKf/M7Y0MV1T5sM1w77S3OH9M8126g+lpX3ZXNol3z/1SQ",
+	"0bu9JQ+R1EQTlISC50eq7kbjiqV4mHwLDTn+gCIjJDECKFkE5H/+678JjdaUu7qf8ywo23a1DgHFN1Z6",
+	"EtlLHDOo3zq/hFYVva+94ubfwT0gvb08B30vJL7XY6Oh/AKhZ+299HLjZcshl05EzMINsZzw3TOpfi85",
+	"af/uzcZz5j1fNl59EnDP2bglsUe9shi+n2zcc2JX4amHpTXlnMBD/oJOn46ePbj/49AOwq1FEteRbyrL",
+	"FoLXH96/Iw6qOnIWACuqhCpiRvbix5Icv35Bti9qdYpzksWapVTqyVLI5CiimtbxazyGwGLfk7YGRdvs",
+	"QHDEqOzQumGcokXqvsOK0zxXD/cal9ZeIPPJamIZVnHCL6ApKCZ1ScoBQ0laSpEMk6XB3d9Wmr6n1m+0",
+	"WS9Vc8HNew+zh1mt8h29Dnvl0vdnFP3G7fCONje8NltUDOope+UT3lsw/6x64fz1P8TeJu7/2YX6Wzfk",
+	"GRFvvHPoizgQK6byl5psPfaXPUbfOQBG5LAyXLuTHRx/+lzlgsWIhCsI7yq0t9Ab2tfn1m9yf/ps9M/+",
+	"7x+sitdheSdCGjtKTE4uzt3/KSIYBfhcLt7ZPp5Mfv7rP4+n4+n45+NfptNp8Pj58X8DAAD//yYn3mC/",
+	"cAAA",
 }
 
 // GetSwagger returns the content of the embedded swagger specification file
diff --git a/internal/api/server.go b/internal/api/server.go
index 32efa7d..0e55d5e 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -1928,13 +1928,29 @@ func (s *Server) PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request,
 // Pool object: the OpenAPI Pool schema requires name+strategy+members, and the
 // request body alone cannot reconstruct strategy/members, so echoing it would
 // emit a schema-invalid response.
+//
+// auth_reset_target is REQUIRED per the OpenAPI schema: omitting it is a 400
+// (the generated request type's plain-string field can't tell an absent field
+// from an explicit "", which would silently clear the target). An explicit
+// empty string ("") is accepted and clears the target — that is the documented
+// clear path.
 func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { //nolint:revive // generated interface name
-	var req SetPoolAuthResetTargetRequest
+	// Decode into a presence-detecting type: the generated
+	// SetPoolAuthResetTargetRequest carries a plain (non-pointer) string, so a
+	// "{}" body would decode to "" and silently CLEAR the target. A pointer
+	// distinguishes absent (nil -> 400) from an explicit "" (clear).
+	var req struct {
+		AuthResetTarget *string `json:"auth_reset_target"`
+	}
 	if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil {
 		writeError(w, http.StatusBadRequest, "invalid request body", "")
 		return
 	}
-	if err := poolops.SetAuthResetTarget(s.store, name, req.AuthResetTarget); err != nil {
+	if req.AuthResetTarget == nil {
+		writeError(w, http.StatusBadRequest, "auth_reset_target is required", "")
+		return
+	}
+	if err := poolops.SetAuthResetTarget(s.store, name, *req.AuthResetTarget); err != nil {
 		status := poolStatusError(err)
 		if errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
 			status = http.StatusBadRequest
diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index ede39a2..f018244 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -3142,6 +3142,55 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
 	}
 }
 
+// TestPostApiPoolsNameAuthResetTarget_RequiredField asserts auth_reset_target is
+// a REQUIRED field on the set route (Copilot re-review): an empty "{}" body that
+// OMITS the field is a 400 and must NOT silently clear the stored target, while
+// an EXPLICIT empty string ("") is still accepted and clears it (the documented
+// clear path).
+func TestPostApiPoolsNameAuthResetTarget_RequiredField(t *testing.T) {
+	st := newTestStore(t)
+	enableHTTPChannel(t, st)
+	seedOAuthCred(t, st, "credA", "credB")
+	if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, "openai-codex"); err != nil {
+		t.Fatalf("create pool: %v", err)
+	}
+	srv := api.NewServer(st, nil, nil, "")
+	t.Setenv("SLUICE_API_TOKEN", "tok")
+	handler := newTestHandler(t, srv, st)
+
+	post := func(body string) *httptest.ResponseRecorder {
+		req := httptest.NewRequest("POST", "/api/pools/pool1/auth-reset-target", strings.NewReader(body))
+		req.Header.Set("Authorization", "Bearer tok")
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		handler.ServeHTTP(rec, req)
+		return rec
+	}
+
+	// Omitting the field -> 400, and the pre-existing target is untouched.
+	rec := post(`{}`)
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("omitted field: expected 400, got %d: %s", rec.Code, rec.Body.String())
+	}
+	got, err := st.GetPool("pool1")
+	if err != nil {
+		t.Fatalf("GetPool: %v", err)
+	}
+	if got.AuthResetTarget != "openai-codex" {
+		t.Fatalf("omitted field cleared the target to %q; expected it untouched (openai-codex)", got.AuthResetTarget)
+	}
+
+	// Explicit empty string -> still accepted and clears the target.
+	rec = post(`{"auth_reset_target": ""}`)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("explicit clear: expected 200, got %d: %s", rec.Code, rec.Body.String())
+	}
+	got, _ = st.GetPool("pool1")
+	if got.AuthResetTarget != "" {
+		t.Fatalf("explicit clear: stored AuthResetTarget = %q, want empty", got.AuthResetTarget)
+	}
+}
+
 // TestPostApiPools_WithAuthResetTarget asserts the create body accepts
 // auth_reset_target and persists it via poolops.
 func TestPostApiPools_WithAuthResetTarget(t *testing.T) {
diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go
index e719d84..d1e1f9b 100644
--- a/internal/proxy/pool_failover.go
+++ b/internal/proxy/pool_failover.go
@@ -238,12 +238,17 @@ func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool)
 		}
 		return time.Duration(secs * float64(time.Second)), true
 	}
-	// HTTP-date (Retry-After absolute form).
-	if t, err := http.ParseTime(raw); err == nil {
-		if d := t.Sub(now); d > 0 {
-			return d, true
+	// HTTP-date (Retry-After absolute form, per RFC 9110). Only Retry-After is
+	// permitted to carry an absolute HTTP-date; the x-ratelimit-reset* family is
+	// numeric (delta-seconds / unix epoch / unit-suffixed duration) and must not
+	// be coerced through HTTP-date parsing.
+	if header == "Retry-After" {
+		if t, err := http.ParseTime(raw); err == nil {
+			if d := t.Sub(now); d > 0 {
+				return d, true
+			}
+			return 0, false
 		}
-		return 0, false
 	}
 	// Unit-suffixed duration (e.g. OpenAI "1.5s", "60ms").
 	if d, err := time.ParseDuration(raw); err == nil && d > 0 {
diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index 8551be8..3c595e8 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -1408,3 +1408,40 @@ func TestCooldownFromResponseNilSafe(t *testing.T) {
 		t.Fatalf("nil header (auth): got %v, want %v", got, vault.AuthFailCooldown)
 	}
 }
+
+// TestParseRecoveryHintHTTPDateGatedToRetryAfter verifies the HTTP-date branch
+// is reachable only for Retry-After (RFC 9110). A reset-family header carrying
+// an HTTP-date string must NOT be coerced through http.ParseTime — those
+// headers are numeric (delta-seconds / epoch / unit-suffixed duration), so a
+// date value is simply not a usable hint and parseRecoveryHint reports ok=false.
+func TestParseRecoveryHintHTTPDateGatedToRetryAfter(t *testing.T) {
+	now := time.Now()
+	httpDate := now.Add(90 * time.Second).UTC().Format(http.TimeFormat)
+
+	// Retry-After with an HTTP-date still yields a positive duration.
+	d, ok := parseRecoveryHint("Retry-After", httpDate, now)
+	if !ok || d <= 0 {
+		t.Fatalf("Retry-After HTTP-date: got (%v, %v), want a positive duration", d, ok)
+	}
+
+	// The reset family must not parse an HTTP-date as a date: no usable hint.
+	for _, h := range []string{"x-ratelimit-reset", "x-ratelimit-reset-requests", "x-ratelimit-reset-tokens"} {
+		if d, ok := parseRecoveryHint(h, httpDate, now); ok || d != 0 {
+			t.Fatalf("%s HTTP-date: got (%v, %v), want (0, false) — date form must be Retry-After only", h, d, ok)
+		}
+	}
+
+	// And end-to-end through cooldownFromResponse: a reset-family HTTP-date is
+	// ignored and the class default applies, while the same date on Retry-After
+	// is honored.
+	resetHdr := make(http.Header)
+	resetHdr.Set("x-ratelimit-reset-requests", httpDate)
+	if got := cooldownFromResponse(failoverRateLimited, resetHdr); got != vault.RateLimitCooldown {
+		t.Fatalf("reset-family HTTP-date: got %v, want fallback %v", got, vault.RateLimitCooldown)
+	}
+	retryHdr := make(http.Header)
+	retryHdr.Set("Retry-After", httpDate)
+	if got := cooldownFromResponse(failoverRateLimited, retryHdr); got <= 0 || got > vault.MaxCooldown {
+		t.Fatalf("Retry-After HTTP-date: got %v, want a clamped positive duration", got)
+	}
+}

From 4f8fe14616f31ba741c2c31757c090da7cfd40ad Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 10:22:44 +0800
Subject: [PATCH 16/19] fix(pools): reserve "-" clear sentinel and de-flake
 TestSoonestCooldown

---
 internal/api/server_test.go      | 12 +++++---
 internal/poolops/poolops.go      |  9 ++++++
 internal/poolops/poolops_test.go | 41 +++++++++++++++++++++++++
 internal/vault/pool_test.go      | 51 +++++++++++++++++---------------
 4 files changed, 85 insertions(+), 28 deletions(-)

diff --git a/internal/api/server_test.go b/internal/api/server_test.go
index f018244..b69eb25 100644
--- a/internal/api/server_test.go
+++ b/internal/api/server_test.go
@@ -3115,11 +3115,15 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) {
 
 	// Invalid targets -> 400 and no store mutation. Newline was caught by the
 	// old looser rule; a space and a slash were NOT (F1) and are the cases that
-	// would otherwise store with 200 and fail silently at recovery.
+	// would otherwise store with 200 and fail silently at recovery. A literal
+	// "-" is the CLI/Telegram clear sentinel: CLI/Telegram translate it to ""
+	// before poolops, but REST forwards it verbatim, so it must be a 400 here
+	// (cross-channel parity) rather than a nonsensical stored target.
 	for name, target := range map[string]string{
-		"newline": `bad\ntarget`,
-		"space":   `openai codex`,
-		"slash":   `openai/codex`,
+		"newline":  `bad\ntarget`,
+		"space":    `openai codex`,
+		"slash":    `openai/codex`,
+		"sentinel": `-`,
 	} {
 		rec = post(`{"auth_reset_target": "` + target + `"}`)
 		if rec.Code != http.StatusBadRequest {
diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go
index 754accf..7057722 100644
--- a/internal/poolops/poolops.go
+++ b/internal/poolops/poolops.go
@@ -129,6 +129,15 @@ func validateAuthResetTarget(target string) error {
 	if target == "" {
 		return nil
 	}
+	// "-" is the CLI/Telegram clear sentinel (converted to "" before reaching
+	// here), so a literal "-" is never a valid stored target on any channel.
+	// container.ValidateResetAuthTarget permits "-" ([A-Za-z0-9_.:-]+), so
+	// without this guard REST — which forwards the value verbatim instead of
+	// translating the sentinel — could store auth_reset_target="-" while
+	// CLI/Telegram cannot. Reject it here so the gap is closed cross-channel.
+	if target == "-" {
+		return fmt.Errorf("%w: %q is reserved as the clear sentinel", ErrInvalidAuthResetTarget, "-")
+	}
 	if err := container.ValidateResetAuthTarget(target); err != nil {
 		return fmt.Errorf("%w: %w", ErrInvalidAuthResetTarget, err)
 	}
diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go
index 1f9b2fa..7db7f3d 100644
--- a/internal/poolops/poolops_test.go
+++ b/internal/poolops/poolops_test.go
@@ -184,6 +184,47 @@ func TestCreateWithInvalidAuthResetTarget(t *testing.T) {
 	}
 }
 
+// TestAuthResetTargetDashSentinelRejected pins the cross-channel gap fix: a
+// literal "-" is the CLI/Telegram clear sentinel (converted to "" before
+// reaching poolops), so it must never be a valid STORED target on any channel.
+// REST forwards the value verbatim, so without this guard it could persist
+// auth_reset_target="-" while CLI/Telegram cannot. Both the create path and
+// the set path must reject a literal "-".
+func TestAuthResetTargetDashSentinelRejected(t *testing.T) {
+	t.Run("set", func(t *testing.T) {
+		db := newTestStore(t, "acct_a")
+		if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil {
+			t.Fatalf("Create: %v", err)
+		}
+		if err := poolops.SetAuthResetTarget(db, "p", "-"); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+			t.Fatalf("SetAuthResetTarget(\"-\") err = %v, want ErrInvalidAuthResetTarget", err)
+		}
+		p, err := db.GetPool("p")
+		if err != nil {
+			t.Fatalf("GetPool: %v", err)
+		}
+		if p.AuthResetTarget != "" {
+			t.Fatalf("sentinel persisted despite rejection: %q", p.AuthResetTarget)
+		}
+	})
+
+	t.Run("create", func(t *testing.T) {
+		db := newTestStore(t, "acct_a")
+		err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, "-")
+		if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) {
+			t.Fatalf("create-with-target \"-\" err = %v, want ErrInvalidAuthResetTarget", err)
+		}
+		// The pool must not have been created when the target is invalid.
+		p, err := db.GetPool("p")
+		if err != nil {
+			t.Fatalf("GetPool: %v", err)
+		}
+		if p != nil {
+			t.Fatalf("pool created despite sentinel target: %+v", p)
+		}
+	})
+}
+
 func TestSetAuthResetTargetSetAndClear(t *testing.T) {
 	db := newTestStore(t, "acct_a", "acct_b")
 	if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil {
diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go
index 7dc0668..56401e5 100644
--- a/internal/vault/pool_test.go
+++ b/internal/vault/pool_test.go
@@ -1029,13 +1029,18 @@ func TestHasHealthyMemberNil(t *testing.T) {
 // STRICTLY-FUTURE member cooldown, skipping already-passed entries (lazy
 // expiry), and reports ok=false when no member is currently cooling.
 func TestSoonestCooldown(t *testing.T) {
-	now := time.Now()
 	tests := []struct {
-		name     string
-		setup    func(pr *PoolResolver, base time.Time)
-		pool     string
-		wantOK   bool
-		wantBase time.Time // expected soonest when wantOK (exact, set in setup)
+		name   string
+		setup  func(pr *PoolResolver, base time.Time)
+		pool   string
+		wantOK bool
+		// wantOffset is the intended soonest cooldown expressed as an offset
+		// from the per-subtest `base` (the same `base` passed into setup), so
+		// the expected value derives from the SAME clock sample the cooldowns
+		// were set against. Comparing against the outer `now` instead let slow
+		// CI / -race / GC widen the now-vs-base gap past the tolerance and
+		// flake; deriving both sides from `base` makes it exact.
+		wantOffset time.Duration
 	}{
 		{
 			name:   "no member cooling",
@@ -1048,9 +1053,9 @@ func TestSoonestCooldown(t *testing.T) {
 			setup: func(pr *PoolResolver, base time.Time) {
 				pr.MarkCooldown("a", base.Add(60*time.Second), "429")
 			},
-			pool:     "pool",
-			wantOK:   true,
-			wantBase: now.Add(60 * time.Second),
+			pool:       "pool",
+			wantOK:     true,
+			wantOffset: 60 * time.Second,
 		},
 		{
 			name: "two cooling -> min wins",
@@ -1058,9 +1063,9 @@ func TestSoonestCooldown(t *testing.T) {
 				pr.MarkCooldown("a", base.Add(10*time.Minute), "401")
 				pr.MarkCooldown("b", base.Add(60*time.Second), "429")
 			},
-			pool:     "pool",
-			wantOK:   true,
-			wantBase: now.Add(60 * time.Second),
+			pool:       "pool",
+			wantOK:     true,
+			wantOffset: 60 * time.Second,
 		},
 		{
 			name: "already-passed entry skipped",
@@ -1070,9 +1075,9 @@ func TestSoonestCooldown(t *testing.T) {
 				pr.MarkCooldown("a", base.Add(-1*time.Second), "429")
 				pr.MarkCooldown("b", base.Add(120*time.Second), "401")
 			},
-			pool:     "pool",
-			wantOK:   true,
-			wantBase: now.Add(120 * time.Second),
+			pool:       "pool",
+			wantOK:     true,
+			wantOffset: 120 * time.Second,
 		},
 		{
 			name: "all passed -> not cooling",
@@ -1093,9 +1098,6 @@ func TestSoonestCooldown(t *testing.T) {
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil)
-			// Use a base far enough in the future that the "already-passed"
-			// negative offsets are still safely in the past relative to the
-			// SoonestCooldown's own time.Now() sample.
 			base := time.Now()
 			tc.setup(pr, base)
 			got, ok := pr.SoonestCooldown(tc.pool)
@@ -1105,12 +1107,13 @@ func TestSoonestCooldown(t *testing.T) {
 			if !tc.wantOK {
 				return
 			}
-			// The stored cooldown is exact (MarkCooldown stores `until`
-			// verbatim); SoonestCooldown returns it unchanged, so compare
-			// against the value set in setup within a small tolerance for the
-			// base/now skew.
-			if d := got.Sub(tc.wantBase); d < -2*time.Second || d > 2*time.Second {
-				t.Fatalf("SoonestCooldown(%q) = %v, want ~%v (delta %v)", tc.pool, got, tc.wantBase, d)
+			// MarkCooldown stores `until` verbatim and SoonestCooldown returns
+			// it unchanged, so the expected value is exactly base+offset (both
+			// derived from the same `base`). Assert exact equality — there is
+			// no clock skew between the stored value and the expectation.
+			want := base.Add(tc.wantOffset)
+			if !got.Equal(want) {
+				t.Fatalf("SoonestCooldown(%q) = %v, want %v (delta %v)", tc.pool, got, want, got.Sub(want))
 			}
 		})
 	}

From a0aca1be79b69e19f441383358ec3fe9fcea1db6 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 10:33:14 +0800
Subject: [PATCH 17/19] fix(proxy): reject NaN/Inf/overflow recovery hints;
 correct migration test comments

---
 internal/proxy/pool_failover.go      | 35 ++++++++++++++++++--
 internal/proxy/pool_failover_test.go | 49 ++++++++++++++++++++++++++++
 internal/store/pools_test.go         | 14 +++++---
 3 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go
index d1e1f9b..ab93d7c 100644
--- a/internal/proxy/pool_failover.go
+++ b/internal/proxy/pool_failover.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"fmt"
 	"log"
+	"math"
 	"net/http"
 	"strconv"
 	"strings"
@@ -219,13 +220,35 @@ func cooldownFromResponse(class failoverClass, header http.Header) time.Duration
 // Returns ok=false when the value parses to nothing usable (negative,
 // zero-after-now, unparseable). A delta value carrying a unit suffix (OpenAI's
 // "1.5s" / "60ms" form) is handled via time.ParseDuration as a fallback.
+//
+// Non-finite (NaN, +Inf, -Inf) and out-of-range bare-numeric values are
+// rejected up front. strconv.ParseFloat accepts all of them, and because
+// NaN<0 and +Inf<0 are both false the old `secs < 0` guard let NaN, +Inf and
+// huge finite values through; the subsequent float->int64 conversion (delta
+// `time.Duration(secs*1e9)` and epoch `int64(secs)`) is implementation-defined
+// for those inputs and can wrap to a wrong (even negative) duration before
+// clampCooldown runs. Any finite magnitude above maxRecoveryHintSeconds is
+// rejected too: multiplied into nanoseconds it would overflow int64.
+//
+// maxRecoveryHintSeconds is the upper bound for a bare-numeric recovery hint
+// (~31,000 years). Any larger value is not a real Retry-After / reset / unix
+// epoch and is rejected: it is comfortably above a present-day epoch (~1.7e9)
+// yet small enough that int64(secs) cannot overflow on the epoch branch.
+const maxRecoveryHintSeconds = 1e12
+
 func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool) {
 	// Bare numeric: delta-seconds for Retry-After; for the reset family it may
 	// be either delta-seconds OR a unix epoch. Disambiguate by magnitude — a
 	// value large enough to be a plausible epoch (>= ~ year 2001) is treated
 	// as absolute, otherwise as a delta.
 	if secs, err := strconv.ParseFloat(raw, 64); err == nil {
-		if secs < 0 {
+		// Reject negative, non-finite (NaN/±Inf), and absurdly large
+		// magnitudes before any float->int64 conversion. maxRecoveryHintSeconds
+		// (~31,000 years) comfortably exceeds any real epoch (~1.7e9 today) or
+		// recovery delta, while staying well within int64 for the epoch
+		// branch's int64(secs); the delta branch additionally caps in float
+		// space (below) so secs*1e9 can never overflow int64 (~9.2e18).
+		if secs < 0 || math.IsNaN(secs) || math.IsInf(secs, 0) || secs > maxRecoveryHintSeconds {
 			return 0, false
 		}
 		const epochThreshold = 1_000_000_000 // ~2001-09; below this, treat as delta-seconds
@@ -236,7 +259,15 @@ func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool)
 			}
 			return 0, false
 		}
-		return time.Duration(secs * float64(time.Second)), true
+		// Delta path: cap in float space before the *float64(time.Second)
+		// multiply so the int64(time.Duration) conversion can never overflow.
+		// clampCooldown applies the real ceiling (MaxCooldown) afterward; this
+		// only prevents the conversion itself from wrapping.
+		deltaSec := secs
+		if maxSec := vault.MaxCooldown.Seconds(); deltaSec > maxSec {
+			deltaSec = maxSec
+		}
+		return time.Duration(deltaSec * float64(time.Second)), true
 	}
 	// HTTP-date (Retry-After absolute form, per RFC 9110). Only Retry-After is
 	// permitted to carry an absolute HTTP-date; the x-ratelimit-reset* family is
diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index 3c595e8..0e0667a 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -1367,6 +1367,36 @@ func TestCooldownFromResponse(t *testing.T) {
 			setup: func(h http.Header) { h.Set("Retry-After", "-5") },
 			want:  vault.RateLimitCooldown,
 		},
+		{
+			name:  "NaN delta ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "NaN") },
+			want:  vault.RateLimitCooldown,
+		},
+		{
+			name:  "Inf delta ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "Inf") },
+			want:  vault.RateLimitCooldown,
+		},
+		{
+			name:  "+Inf delta ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "+Inf") },
+			want:  vault.RateLimitCooldown,
+		},
+		{
+			name:  "-Inf delta ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "-Inf") },
+			want:  vault.RateLimitCooldown,
+		},
+		{
+			name:  "overflow-magnitude delta ignored falls back to default",
+			class: failoverRateLimited,
+			setup: func(h http.Header) { h.Set("Retry-After", "1e308") },
+			want:  vault.RateLimitCooldown,
+		},
 	}
 
 	for _, tt := range tests {
@@ -1409,6 +1439,25 @@ func TestCooldownFromResponseNilSafe(t *testing.T) {
 	}
 }
 
+// TestParseRecoveryHintRejectsNonFinite verifies the bare-numeric branch
+// rejects NaN/±Inf and absurdly large finite magnitudes up front (ok=false,
+// d=0) instead of letting strconv.ParseFloat's acceptance of those tokens slip
+// past the `secs < 0` guard into an implementation-defined float->int64
+// conversion that could wrap to a wrong/negative duration. Exercised on both
+// the delta header (Retry-After) and the epoch-branch reset family (where the
+// int64(secs) conversion lives).
+func TestParseRecoveryHintRejectsNonFinite(t *testing.T) {
+	now := time.Now()
+	headers := []string{"Retry-After", "x-ratelimit-reset"}
+	for _, h := range headers {
+		for _, raw := range []string{"NaN", "Inf", "+Inf", "-Inf", "1e308"} {
+			if d, ok := parseRecoveryHint(h, raw, now); ok || d != 0 {
+				t.Errorf("parseRecoveryHint(%q, %q) = (%v, %v), want (0, false)", h, raw, d, ok)
+			}
+		}
+	}
+}
+
 // TestParseRecoveryHintHTTPDateGatedToRetryAfter verifies the HTTP-date branch
 // is reachable only for Retry-After (RFC 9110). A reset-family header carrying
 // an HTTP-date string must NOT be coerced through http.ParseTime — those
diff --git a/internal/store/pools_test.go b/internal/store/pools_test.go
index 19a6c60..d32f873 100644
--- a/internal/store/pools_test.go
+++ b/internal/store/pools_test.go
@@ -1571,8 +1571,12 @@ func TestRemoveCredentialMetaCASNoOpLeavesHealthIntact(t *testing.T) {
 // migration (000008) up -> down -> up against a POPULATED schema: a pool with
 // two members and a credential_health row must survive the down (which
 // rebuilds credential_pools while preserving the credential_pool_members FK)
-// and the re-up round-trip. The down migration disables foreign_keys for the
-// 12-step rebuild, so the FK-referencing member rows must NOT be cascade-wiped.
+// and the re-up round-trip. golang-migrate runs the down migration inside a
+// transaction where SQLite ignores PRAGMA foreign_keys=OFF, so dropping
+// credential_pools during the 12-step rebuild would CASCADE-wipe the member
+// rows. The down migration instead snapshots credential_pool_members into a
+// temp table before the drop and restores it once the rebuilt parent rows
+// exist, so the FK-referencing member rows survive.
 func TestMigration000008DownUpPopulated(t *testing.T) {
 	dir := t.TempDir()
 	dbPath := filepath.Join(dir, "m.db")
@@ -1636,8 +1640,10 @@ func TestMigration000008DownUpPopulated(t *testing.T) {
 
 	// Down one step (000008 -> 000007): the column goes; the rebuilt
 	// credential_pools keeps its row; the FK-referencing member rows and the
-	// health row survive (foreign_keys=OFF during the rebuild prevents a
-	// cascade wipe).
+	// health row survive (PRAGMA foreign_keys=OFF is ignored inside the
+	// migration transaction, so the down migration snapshots the member rows
+	// into a temp table and restores them after the rebuild rather than letting
+	// the credential_pools drop cascade-wipe them).
 	if err := m.Steps(-1); err != nil {
 		t.Fatalf("down 1 (000008): %v", err)
 	}

From 47a5c21b2f2e00149a62ef0ddcc54ca899fabe74 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 10:41:56 +0800
Subject: [PATCH 18/19] fix(proxy): use [POOL-RECOVERY] log prefix and drop
 name-keyed test assertion

---
 internal/proxy/pool_failover_test.go | 12 +++---------
 internal/proxy/server.go             |  2 +-
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index 0e0667a..9b234a9 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -1319,8 +1319,9 @@ func TestCooldownFromResponse(t *testing.T) {
 			name:  "x-ratelimit-reset-tokens unit-suffixed duration honored",
 			class: failoverRateLimited,
 			setup: func(h http.Header) { h.Set("x-ratelimit-reset-tokens", "1500ms") },
-			want:  1500 * time.Millisecond,
-			// 1.5s is below MinRateLimitFloor so it clamps up; assert clamp below.
+			// 1500ms is below MinRateLimitFloor, so the parsed hint clamps up to
+			// the floor; want is the post-clamp value the generic check asserts.
+			want: vault.MinRateLimitFloor,
 		},
 		{
 			name:  "absurd value capped at MaxCooldown",
@@ -1414,13 +1415,6 @@ func TestCooldownFromResponse(t *testing.T) {
 				}
 				return
 			}
-			// The unit-suffixed 1.5s case clamps up to MinRateLimitFloor.
-			if tt.name == "x-ratelimit-reset-tokens unit-suffixed duration honored" {
-				if got != vault.MinRateLimitFloor {
-					t.Fatalf("cooldownFromResponse = %v, want clamp to %v", got, vault.MinRateLimitFloor)
-				}
-				return
-			}
 			if got != tt.want {
 				t.Fatalf("cooldownFromResponse = %v, want %v", got, tt.want)
 			}
diff --git a/internal/proxy/server.go b/internal/proxy/server.go
index 1f3ae47..66ce041 100644
--- a/internal/proxy/server.go
+++ b/internal/proxy/server.go
@@ -3102,7 +3102,7 @@ func (s *Server) recoverPool(pool string) {
 	noticeCb := s.onPoolRecoveredNotice
 	s.poolExhaustMu.Unlock()
 
-	log.Printf("[POOL-FAILOVER] %s", FormatPoolRecoveredNotice(pool))
+	log.Printf("[POOL-RECOVERY] %s", FormatPoolRecoveredNotice(pool))
 	// Notice always fires on the recovery edge (independent of auth-reset).
 	if noticeCb != nil {
 		noticeCb(pool)

From 3668eff376abfb8f1996217ec4c2fbd0896f2fdd Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Sat, 23 May 2026 10:58:03 +0800
Subject: [PATCH 19/19] fix(proxy): persist pool cooldown durably even when the
 failover notice is deduped

---
 cmd/sluice/main.go                           | 10 ++-
 internal/proxy/pool_failover.go              | 64 ++++++++++-----
 internal/proxy/pool_failover_test.go         | 84 +++++++++++++-------
 internal/proxy/pool_recovery_monitor_test.go | 19 ++++-
 4 files changed, 124 insertions(+), 53 deletions(-)

diff --git a/cmd/sluice/main.go b/cmd/sluice/main.go
index dd61bb5..77a17f4 100644
--- a/cmd/sluice/main.go
+++ b/cmd/sluice/main.go
@@ -510,7 +510,15 @@ func main() {
 					log.Printf("[POOL-FAILOVER] durable health write for %q skipped: no longer a live member of pool %q at epoch %d (removed/re-added before failover landed)", ev.From, ev.Pool, ev.Epoch)
 				}
 			}
-			if failoverBroker != nil {
+			// The durable health write above runs for EVERY failover event
+			// (decoupled from the notice dedup) so the persisted cooldown
+			// stays monotonic through a sustained exhaustion. The operator
+			// notice, by contrast, is gated on ev.Notify, which is true only
+			// when this event passed the dedup gate in handlePoolFailover
+			// (the 30s window for a real transition, or the healthy->exhausted
+			// edge). A suppressed/deduped event therefore still persists but
+			// does not re-spam the operator.
+			if failoverBroker != nil && ev.Notify {
 				// Plain text: TelegramChannel.Notify sends with no parse
 				// mode, so markdown backticks would render literally.
 				// Exhausted: no distinct member to fail over to (every
diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go
index ab93d7c..3624c0a 100644
--- a/internal/proxy/pool_failover.go
+++ b/internal/proxy/pool_failover.go
@@ -324,6 +324,12 @@ type FailoverEvent struct {
 	// so a late callback firing after a remove/re-add cannot persist this
 	// cooldown onto the re-created same-name successor (Cluster A #2).
 	Epoch int64
+	// Notify is true only when the operator notice/audit for this event
+	// passed the dedup gate (a real transition's 30s window, or the
+	// healthy->exhausted edge). The durable store write runs regardless of
+	// Notify; only the Telegram notice is gated by it, so cooldown
+	// persistence stays monotonic even when notices are suppressed.
+	Notify bool
 }
 
 // humanizeFailoverReason maps a short reason tag (the same tag embedded in the
@@ -628,8 +634,13 @@ func (a *SluiceAddon) poolForResponse(f *mitmproxy.Flow) (pool, activeMember, pr
 //     below only reconciles for durability across restarts.
 //  2. Computes the next active member (post-cooldown) for the audit/notice.
 //  3. Hands a FailoverEvent to the onFailover callback (async, best-effort):
-//     the callback persists SetCredentialHealth to the store and fires the
-//     Telegram notice. The callback MUST NOT block the response path.
+//     the callback persists SetCredentialHealth to the store (ALWAYS) and
+//     fires the Telegram notice (only when ev.Notify is true). The callback is
+//     invoked unconditionally — even for a notice-suppressed (deduped) event —
+//     so the durable store write is decoupled from the notice gate and the
+//     persisted cooldown stays monotonic through a sustained exhaustion. The
+//     callback MUST NOT block the response path. Only the spam-prone log line
+//     and audit row stay gated on the dedup decision (ev.Notify).
 //
 // No in-flight retry: the triggering request still returns its own upstream
 // error to the agent unmodified. The agent (or its SDK) retries on its own
@@ -733,6 +744,37 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) {
 	} else {
 		emit = a.shouldEmitPoolNotice(pool, from, to, tag)
 	}
+
+	// (3) Durability + Telegram via the callback. The callback is responsible
+	// for being non-blocking (it runs the store write and the Telegram send in
+	// its own goroutine); we still guard with a nil check. Fired
+	// UNCONDITIONALLY — even when the notice is deduped (emit == false) — so the
+	// durable cooldown write is decoupled from the notice gate: the store stays
+	// monotonically up to date through a sustained exhaustion/retry storm where
+	// every notice after the first edge is suppressed. Only the operator-facing
+	// Telegram notice is gated, via Notify=emit; the callback itself runs the
+	// durable store write regardless of Notify. The member did fail in every
+	// case (the in-memory cooldown above already reflects that); `exhausted`
+	// only changes the operator-facing wording.
+	if a.onFailover != nil {
+		a.onFailover(FailoverEvent{
+			Pool:      pool,
+			From:      from,
+			To:        to,
+			Reason:    tag,
+			Class:     class,
+			Until:     until,
+			Exhausted: exhausted,
+			Epoch:     idEpoch,
+			Notify:    emit,
+		})
+	}
+
+	// The log line + audit row below stay gated on the dedup decision: they are
+	// the spam-prone surfaces a retry storm would flood, so a suppressed notice
+	// (deduped window, or an already-exhausted pool whose edge already fired)
+	// must not append a fresh log/audit entry. The durable store write already
+	// ran above via the unconditional callback.
 	if !emit {
 		return
 	}
@@ -776,24 +818,6 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) {
 			log.Printf("[POOL-FAILOVER] audit log error: %v", err)
 		}
 	}
-
-	// (3) Durability + Telegram via the callback. The callback is
-	// responsible for being non-blocking (it runs the store write and the
-	// Telegram send in its own goroutine); we still guard with a nil check.
-	// The durable cooldown is persisted even when exhausted (the member did
-	// fail); only the operator-facing wording differs.
-	if a.onFailover != nil {
-		a.onFailover(FailoverEvent{
-			Pool:      pool,
-			From:      from,
-			To:        to,
-			Reason:    tag,
-			Class:     class,
-			Until:     until,
-			Exhausted: exhausted,
-			Epoch:     idEpoch,
-		})
-	}
 }
 
 // poolNoticeDedupWindow bounds how often an identical pool failover /
diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index 9b234a9..3e4a005 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -1075,13 +1076,15 @@ func TestFailoverPoolExhaustedNoSelfFailoverSpam(t *testing.T) {
 	// itself -> ResolveActive degrades to memA -> no distinct target.
 	prPtr.Load().MarkCooldown("memB", time.Now().Add(10*time.Minute), "401")
 
-	var calls int32
-	var last FailoverEvent
-	done := make(chan struct{}, 4)
+	// The callback fires synchronously inside addon.Response (the production
+	// detaching happens in main.go's SetOnFailover closure, not here), so the
+	// per-call records are stable to read after the loop.
+	var mu sync.Mutex
+	var events []FailoverEvent
 	addon.SetOnFailover(func(ev FailoverEvent) {
-		atomic.AddInt32(&calls, 1)
-		last = ev
-		done <- struct{}{}
+		mu.Lock()
+		events = append(events, ev)
+		mu.Unlock()
 	})
 
 	// Two back-to-back identical 429s (the agent's retry storm).
@@ -1091,25 +1094,38 @@ func TestFailoverPoolExhaustedNoSelfFailoverSpam(t *testing.T) {
 		addon.Response(f)
 	}
 
-	select {
-	case <-done:
-	case <-time.After(2 * time.Second):
-		t.Fatal("onFailover callback not invoked")
+	mu.Lock()
+	got := append([]FailoverEvent(nil), events...)
+	mu.Unlock()
+
+	// Contract: the durable callback fires for EVERY qualifying failover
+	// response (decoupled from the notice dedup), with Notify reflecting the
+	// dedup decision — true on the first (edge) event, false on the suppressed
+	// retry. This is what keeps the durable store write monotonic across a
+	// sustained exhaustion while the notice/audit stay deduped.
+	if len(got) != 2 {
+		t.Fatalf("onFailover invoked %d times, want exactly 2 (one per qualifying response; durability decoupled from notice dedup)", len(got))
 	}
-	// Dedup: the second identical signal within the window is suppressed.
-	if got := atomic.LoadInt32(&calls); got != 1 {
-		t.Fatalf("onFailover invoked %d times, want exactly 1 (dedup window must collapse the retry storm)", got)
+	if !got[0].Notify {
+		t.Fatalf("first onFailover Notify = false, want true (healthy->exhausted edge passes the dedup gate)")
 	}
-	if !last.Exhausted {
-		t.Fatalf("FailoverEvent.Exhausted = false, want true (no distinct failover target)")
+	if got[1].Notify {
+		t.Fatalf("second onFailover Notify = true, want false (the retry-storm repeat is notice-suppressed but still persisted)")
 	}
-	if last.From != "memA" || last.To != "memA" {
-		t.Fatalf("FailoverEvent from=%q to=%q, want memA/memA (degraded to self)", last.From, last.To)
+	for i, ev := range got {
+		if !ev.Exhausted {
+			t.Fatalf("event %d Exhausted = false, want true (no distinct failover target)", i)
+		}
+		if ev.From != "memA" || ev.To != "memA" {
+			t.Fatalf("event %d from=%q to=%q, want memA/memA (degraded to self)", i, ev.From, ev.To)
+		}
 	}
 
 	if err := logger.Close(); err != nil {
 		t.Fatalf("logger close: %v", err)
 	}
+	// The audit row stays gated on the dedup decision (only the edge event
+	// emits), so exactly one row despite two callback invocations.
 	if n := auditActionCount(t, logPath, "pool_exhausted"); n != 1 {
 		t.Fatalf("pool_exhausted audit rows = %d, want exactly 1 (no per-retry spam)", n)
 	}
@@ -1150,14 +1166,15 @@ func TestFailoverExhaustedDedupCollapsesFlapDirection(t *testing.T) {
 	prPtr.Load().MarkCooldown("memA", time.Now().Add(5*time.Minute), "429")
 	prPtr.Load().MarkCooldown("memB", time.Now().Add(10*time.Minute), "401")
 
-	var calls int32
-	done := make(chan struct{}, 4)
+	var mu sync.Mutex
+	var events []FailoverEvent
 	addon.SetOnFailover(func(ev FailoverEvent) {
 		if !ev.Exhausted {
 			t.Errorf("FailoverEvent.Exhausted = false, want true (pool exhausted)")
 		}
-		atomic.AddInt32(&calls, 1)
-		done <- struct{}{}
+		mu.Lock()
+		events = append(events, ev)
+		mu.Unlock()
 	})
 
 	// Two failing responses, attributed to DIFFERENT members (flap directions).
@@ -1167,15 +1184,24 @@ func TestFailoverExhaustedDedupCollapsesFlapDirection(t *testing.T) {
 		addon.Response(f)
 	}
 
-	select {
-	case <-done:
-	case <-time.After(2 * time.Second):
-		t.Fatal("onFailover callback not invoked")
+	mu.Lock()
+	got := append([]FailoverEvent(nil), events...)
+	mu.Unlock()
+
+	// Contract: the durable callback fires once per qualifying response
+	// (both flap directions), but the COLLAPSED exhausted dedup key suppresses
+	// the notice on the second, so exactly one event carries Notify=true.
+	if len(got) != 2 {
+		t.Fatalf("onFailover invoked %d times, want exactly 2 (one per qualifying response; durability decoupled from notice dedup)", len(got))
+	}
+	notify := 0
+	for _, ev := range got {
+		if ev.Notify {
+			notify++
+		}
 	}
-	// Give any erroneous second call a moment to surface before asserting.
-	time.Sleep(50 * time.Millisecond)
-	if got := atomic.LoadInt32(&calls); got != 1 {
-		t.Fatalf("onFailover invoked %d times, want exactly 1 (collapsed exhausted dedup key must absorb both flap directions)", got)
+	if notify != 1 {
+		t.Fatalf("onFailover Notify=true on %d events, want exactly 1 (collapsed exhausted dedup key must absorb both flap directions)", notify)
 	}
 
 	if err := logger.Close(); err != nil {
diff --git a/internal/proxy/pool_recovery_monitor_test.go b/internal/proxy/pool_recovery_monitor_test.go
index 01b6a27..58ec1ab 100644
--- a/internal/proxy/pool_recovery_monitor_test.go
+++ b/internal/proxy/pool_recovery_monitor_test.go
@@ -105,13 +105,22 @@ func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) {
 	prPtr.Load().MarkCooldown("memA", time.Now().Add(time.Hour), "429")
 	prPtr.Load().MarkCooldown("memB", time.Now().Add(time.Hour), "401")
 
-	var notices int32
+	// Contract: the durable callback fires once per qualifying response
+	// (decoupled from the notice dedup), but exactly one event carries
+	// Notify=true — the healthy->exhausted edge. The operator notice is gated on
+	// Notify, so this still proves "one exhausted notice across many responses"
+	// (edge-gated, not per-response) while the durable store write runs on every
+	// call.
+	var calls, notices int32
 	done := make(chan struct{}, 16)
 	addon.SetOnFailover(func(ev FailoverEvent) {
 		if !ev.Exhausted {
 			t.Errorf("FailoverEvent.Exhausted = false, want true")
 		}
-		atomic.AddInt32(&notices, 1)
+		atomic.AddInt32(&calls, 1)
+		if ev.Notify {
+			atomic.AddInt32(&notices, 1)
+		}
 		done <- struct{}{}
 	})
 
@@ -130,7 +139,11 @@ func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) {
 	// Let any erroneous extra notice surface.
 	time.Sleep(50 * time.Millisecond)
 	if got := atomic.LoadInt32(&notices); got != 1 {
-		t.Fatalf("exhausted notices = %d, want exactly 1 (edge-gated, not per-response)", got)
+		t.Fatalf("exhausted notices (Notify=true) = %d, want exactly 1 (edge-gated, not per-response)", got)
+	}
+	// The durable callback itself ran for every qualifying response.
+	if got := atomic.LoadInt32(&calls); got != 10 {
+		t.Fatalf("onFailover calls = %d, want 10 (durable callback fires per response, decoupled from the notice gate)", got)
 	}
 }