From 0bff90db5d3ce927b66c63d34740dfbeb79d9476 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 15:38:13 +0800 Subject: [PATCH 01/19] docs(plans): add pool exhaustion handling + agent auth auto-reset plan --- ...22-pool-exhaustion-and-agent-auth-reset.md | 403 ++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md new file mode 100644 index 0000000..26087ee --- /dev/null +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -0,0 +1,403 @@ +# Pool Exhaustion Handling + Agent Auth Auto-Reset + +## Overview + +Two coupled problems observed live (knuth, hermes + `openai_pool` of two Codex OAuth accounts): + +1. **Failover flap + Telegram notice spam.** Both pool members hit the OpenAI Codex + *usage-limit* 429 (a multi-hour quota window). sluice cools each member for a flat + `RateLimitCooldown = 60s`, then `ResolveActive`'s all-cooling **degrade** path re-serves + the soonest-recovering member, which 429s again. The result is a perpetual + `openai_oauth ⇄ openai_oauth_2` flap emitting ~2 notices/min forever. The sticky-failover + fix (#48, `docs/plans/completed/.../20260518-sticky-failover.md`) stopped the *snap-to-0* + flap but explicitly deferred the cooldown-window fix on the assumption that sticky made the + 60s cooldown harmless — the **degrade-path** flap proves it did not. + +2. **Agent stuck after pool exhaustion.** When the whole pool is exhausted, hermes latches + "usage limit reached", retries 3×, and gives up — it will not self-recover even after the + quota window passes, because its local auth state is latched. It needs an explicit + `hermes auth reset ` to un-latch and retry. 
+ +This plan delivers three fixes for (1) and an auto-reset for (2): + +- **A1** — correct exhaustion detection: classify the pool as exhausted when *no healthy + member exists*, not only when `to == from`; collapse the exhausted notice dedup key so the + flap direction cannot produce two keys. +- **A2** — edge-triggered notices: emit "pool exhausted" once on the healthy→exhausted edge + and "pool recovered" once on the way back; no periodic spam. +- **B1** — honor the real recovery window: derive the cooldown from the upstream + `Retry-After` / rate-limit-reset hints (clamped), so a quota-exhausted member stays cooled + for the real window instead of being re-probed every 60s. This is what makes the degrade + flap structurally impossible and makes "recovered" mean recovered. +- **Auto-reset (problem 2)** — opt-in, per-pool: when a pool with a configured + `auth_reset_target` transitions exhausted→recovered, sluice runs the agent's auth-reset + command (hermes profile, **as the runtime UID 10000:10000**) so the agent un-latches. + +## Context (from discovery) + +- Language/stack: Go, pure-Go SQLite (`modernc.org/sqlite`), go-mitmproxy addon model. +- Failover logic: `internal/proxy/pool_failover.go` (`handlePoolFailover`, `classifyFailover`, + `FormatFailoverNotice`, `shouldEmitPoolNotice`, dedup key `pool+from+to+tag`, 30s window). +- Resolver/health: `internal/vault/pool.go` (`PoolResolver`, shared `PoolHealth`, + `ResolveActive` sticky pointer + all-cooling **degrade returns a still-cooling member with + ok=true**, `MarkCooldownScoped`, `CooldownUntil`, `cooling()` = `cooldownUntil.After(now)`, + constants `RateLimitCooldown=60s` / `AuthFailCooldown=300s`). +- Addon state: `internal/proxy/addon.go` (`SluiceAddon` incl. `onFailover`, + `poolNoticeMu`/`poolNoticeAt`); `SetOnFailover` on `internal/proxy/server.go`. 
+- Failover side-effects wiring: `cmd/sluice/main.go:489` (`srv.SetOnFailover(...)` → durable + health write + Telegram notice, fresh `context.WithTimeout(Background,10s)` per send). + Container manager var `containerMgr` (main.go:242); `ReloadSecrets` pattern at main.go:673. +- Agent abstraction: `internal/container/agent_profile.go` (`AgentProfile.ReloadCmd`, + `WireMCPCmd`); `ContainerManager` interface `internal/container/types.go`; backends + `docker.go` / `apple.go` / `tart.go` / standalone. **`ExecInContainer(ctx,name,cmd)` takes + NO user** (`docker.go:18`, `docker_socket.go:245`; `execCreateRequest` at + `docker_socket.go:247` has no `User`) → today an exec runs as root, which root-chowns + hermes `auth.json` and bricks the gateway (CLAUDE.local.md). `InjectEnvVars` avoids this by + chowning inside its script (types.go:299-321) — that trick does NOT help a command that + writes files itself, so `hermes auth reset` genuinely must exec as 10000:10000. +- Pools store + channel-agnostic ops: `internal/store/pools.go`, migrations + `internal/store/migrations/000006_credential_pools.*` (down `DROP TABLE`s all three: + `credential_pools`, `credential_pool_members` FK→pools(name), `credential_health`; + `pool_membership_epoch` machinery in pools.go:78-97), `internal/poolops`. +- **REST is generated**: `internal/api/api.gen.go` is `DO NOT EDIT`, produced by + `make generate` → `oapi-codegen --config config.yaml ../../api/openapi.yaml` + (Makefile:64, generate.go). Existing pool routes: `GetApiPools`/`PostApiPools`/ + `DeleteApiPoolsName`/`GetApiPoolsName`/`PostApiPoolsNameRotate` (api.gen.go:911-935). + New surface = edit `openapi.yaml` → `make generate` → implement the new generated + `ServerInterface` method in `internal/api/server.go`. Action-route style + (`POST .../rotate`) is the established mutation pattern. 
+- **No server run loop**: `ListenAndServe` just calls `s.socks.Serve`; shutdown is + `GracefulShutdown`/`Close` flipping `s.closed` (server.go:2864/2876/2891). A new monitor + goroutine needs its own explicit stop signaled from BOTH shutdown paths. +- Pools are **not** TOML-exportable (no `[[pool]]` in `policy export`/`examples/config.toml`), + so the new field needs no TOML surface. +- The data-version watcher fires only on DB writes (`internal/store/watcher.go`, main.go:810) + — it is NOT a timer, so recovery-edge detection needs its own monitor. +- `f.Response.Header.Get(...)` is available on the mitmproxy Flow (used in `addon.go` DLP). + +## Development Approach + +- **Testing approach: Regular** (implement, then table-driven tests in the same task). +- Complete each task fully (code + tests + green) before the next. +- **Every task includes new/updated unit tests** (success + error/edge), as separate checklist + items. All tests pass before moving on. +- Preserve all existing pool concurrency invariants (CRITICAL-1 shared `PoolHealth`, + identity/epoch-scoped writes, sticky-pointer survival across resolver swaps). +- Channel feature parity is mandatory (CLAUDE.md): any new store-backed pool field must be + reachable from CLI **and** REST **and** Telegram via the channel-agnostic `internal/poolops`. +- gofumpt before committing (CI enforces it). `go vet`, `golangci-lint`, `-race` on touched + packages, `make generate` clean tree, and `-tags=e2e ./e2e/` build. + +## Testing Strategy + +- **Unit (vault)**: cooldown-from-hints parsing (each header form, clamp floor/ceiling); + `HasHealthyMember` mirrors `cooling()` lazy-expiry exactly; `SoonestCooldown` strictly-future; + degrade unchanged; exhaustion detection edge. 
+- **Unit (proxy)**: exactly one "exhausted" notice per healthy→exhausted edge regardless of + flap direction; one "recovered" notice on the reverse edge; recovery monitor reschedules on + *unequal* member cooldowns (B1 parks memA 60s, memB 6h → wake at ~60s, recover once); + monitor stops on shutdown; B1 cooldown applied from headers. +- **Unit (store)**: migration up→down→up with a **populated** pool + members + health rows; + `auth_reset_target` round-trips; default empty. +- **Unit (poolops/channels)**: create/update with target through the channel-agnostic layer; + one adapter test per channel asserting it routes through poolops (no inline logic). +- **Unit (container)**: exec-user threading; `ResetAuth` argv per profile; nil-cmd no-ops with + a notice; (docker) exec carries the runtime UID. +- No UI e2e here (backend/CLI only); keep the existing `-tags=e2e` suite building. + +## Progress Tracking +- mark completed items `[x]` immediately; add ➕ for new tasks, ⚠️ for blockers. +- keep this file in sync if scope shifts during implementation. + +## Solution Overview + +**B1 — real recovery window.** Add `cooldownFromResponse(class, resp) time.Duration` in +`pool_failover.go`. Precedence: `Retry-After` (delta-seconds or HTTP-date) → known +rate-limit-reset headers (`x-ratelimit-reset`, OpenAI `x-ratelimit-reset-requests` / +`x-ratelimit-reset-tokens`; delta-seconds or epoch). **No body parsing in v1** (deferred to +Post-Completion until a real Codex 429 is captured). Clamp to `[minFloor(class), maxCooldown]` +where `maxCooldown = 6h`, `minFloor(rate-limit) = 10s` (a parsed short window must be honored — +the whole point is 60s is too *long*-floored, not too short — so do NOT floor up to the 60s +class default), `minFloor(auth-failure) = AuthFailCooldown` (a revoked/expired token must not +be retried in seconds). When **no** hint header is present, fall back to the class default +(60s / 300s). `handlePoolFailover` uses this instead of the flat `ttl`. 
**Task-1 tests assert +parsing/clamp mechanics generically; they must NOT bake in which header OpenAI's usage-limit +429 actually sends** — that is resolved by the Post-Completion capture, and a guessed winner +here would contradict it. + +**A1 — correct exhaustion detection.** Add `PoolResolver.HasHealthyMember(pool) bool` +(RLock, single `now`, true iff some member has `!cooling()` — mirrors `ResolveActive`'s +`cooling()` lazy-expiry exactly, NOT a status-field check). In `handlePoolFailover`, after +`MarkCooldownScoped(from)`, set `exhausted = !pr.HasHealthyMember(pool)` (replaces +`to == from`). When exhausted, collapse the dedup so flap direction yields one key: +`shouldEmitPoolNotice(pool, "*", "*", "exhausted")` (only the exhausted path; real +transitions keep `from/to/tag`). + +**A2 — edge-triggered notices + recovery monitor (authoritative for the exhausted notice).** +Per-pool exhaustion state lives on the long-lived `Server` (NOT on `PoolHealth`): exhaustion +is per-process notification bookkeeping, not per-resolver-generation health, so it must +survive resolver pointer swaps and must NOT be pruned on a membership change — putting it on +`PoolHealth` would wrongly prune it. `handlePoolFailover` flips `false→true` on the first +exhausted classification, emits the one-time "exhausted" notice on that edge, and wakes the +monitor (non-blocking buffered cap-1 send). **Edge-gating in A2 supersedes A1's 30s-window +dedup for the exhausted notice** — the edge owns it; the window dedup remains only for real +`from→to` transitions. A dedicated **recovery monitor** goroutine sleeps until the pool's +soonest *strictly-future* member cooldown (clamped to a `minReschedule = 1s` floor so a +just-expired-but-still-unhealthy state can't spin), `Load()`s the current resolver on every +wake (never caches it), and when `HasHealthyMember(pool)` becomes true flips `true→false`, +emits "pool recovered", and invokes `onPoolRecovered(pool)`. 
Server-driven (time-based, not +traffic-based) because the latched agent will not emit a recovering 2xx on its own. A pool +removed while exhausted has its state entry cleaned up so no recovered-notice fires for a +deleted pool. + +**Auto-reset (problem 2).** Per-pool `auth_reset_target TEXT` (migration 000007); non-empty = +opt-in. New `AgentProfile.ResetAuthCmd(target) []string` + `AgentProfile.ExecUser() string` +(hermes → `"10000:10000"`, openclaw → ""), `ContainerManager.ResetAuth(ctx, target)` parallel +to `ReloadCmd`/`ReloadSecrets`. `HermesProfile.ResetAuthCmd` → pure-argv +`["/opt/hermes/.venv/bin/hermes","auth","reset",target]` (no `sh -c`, so no shell-metachar +threat). `OpenclawProfile.ResetAuthCmd` nil pending verification openclaw latches +(Post-Completion). `onPoolRecovered` (wired in main.go) looks up the recovered pool's +`auth_reset_target`; if set, calls `containerMgr.ResetAuth(ctx, target)` in a detached +goroutine with a fresh `context.WithTimeout(Background, …)` (mirroring the `SetOnFailover` +precedent) and emits an `agent_auth_reset` audit event. + +## Technical Details + +- **New constants** (`internal/vault/pool.go`): `maxCooldown = 6h`, `minRateLimitFloor = 10s`, + `minReschedule = 1s`. +- **New resolver methods** (`internal/vault/pool.go`, both RLock-only, single `now`): + `HasHealthyMember(pool) bool`; `SoonestCooldown(pool) (time.Time, bool)` returning the + minimum `cooldownUntil` **strictly greater than a freshly-sampled `now`** (bool=false when + no member is currently cooling). +- **Exec-user plumbing** (Task 6): add `User string` to `execCreateRequest` + (`docker_socket.go`), thread an optional user through `ExecInContainer` on both + `ContainerClient` and `SocketClient` (default "" = unchanged root behavior), update the + `mockClient` in `docker_test.go` and all existing callers (no-op default). 
+- **Migration `000007_pool_auth_reset.{up,down}.sql`**: up = `ALTER TABLE credential_pools + ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''`. down = the SQLite 12-step rebuild of + `credential_pools` only, wrapped in `PRAGMA foreign_keys=OFF;` … `=ON;` so the + `credential_pool_members` FK→`credential_pools(name)` is not orphaned/failed (golang-migrate + runs each file as a script). +- **Store** (`internal/store/pools.go`): add `AuthResetTarget` to `Pool`; include in + create/list; add `SetPoolAuthResetTarget(name, target) error`. No TOML change (pools aren't + TOML-exportable). +- **Recovery state on `Server`**: `poolExhaustMu sync.Mutex`, `poolExhausted map[string]bool`, + `recoveryWake chan struct{}` (buffered cap-1, non-blocking send), `monitorStop chan struct{}` + + `monitorStopOnce sync.Once`, `onPoolRecovered func(pool string)` via `SetOnPoolRecovered`. + Monitor launched once (in `New`/a `StartMonitors`), stopped idempotently from BOTH `Close` + and `GracefulShutdown`. +- **Audit**: new action `agent_auth_reset` (`Verdict:"recover"`, `Credential:` recovered pool, + `Reason:` target). Keep `pool_exhausted` for the entry edge; add a recovered audit/notice. +- **Target validation**: non-empty, no NUL, allowlisted charset (mirror `ValidateEnvVarKey` + style) — argv form, so shell-metachar checks are the wrong model. + +## What Goes Where +- **Implementation Steps** (`[ ]`): all code, migrations, generated-API regen, tests, in-repo docs. +- **Post-Completion** (no checkboxes): openclaw latch verification + `ResetAuthCmd`; live + knuth validation; capturing a real OpenAI 429 to fix B1's header precedence + `maxCooldown`. 
+ +## Implementation Steps + +### Task 1: B1 — derive cooldown from upstream recovery hints + +**Files:** +- Modify: `internal/vault/pool.go` (constants `maxCooldown`, `minRateLimitFloor`) +- Modify: `internal/proxy/pool_failover.go` +- Modify: `internal/proxy/pool_failover_test.go` + +- [ ] add `maxCooldown = 6*time.Hour` and `minRateLimitFloor = 10*time.Second` near the TTLs +- [ ] add `cooldownFromResponse(class failoverClass, resp *http.Response) time.Duration`: + parse `Retry-After` (delta-seconds + HTTP-date), then `x-ratelimit-reset` / + `x-ratelimit-reset-requests` / `x-ratelimit-reset-tokens` (delta or epoch); clamp to + `[minFloor(class), maxCooldown]` (rate-limit floor `minRateLimitFloor`, auth-failure floor + `AuthFailCooldown`); **no hint → class default**. No body parsing in v1. +- [ ] in `handlePoolFailover` replace the flat `ttl` with `cooldownFromResponse(class, f.Response)` +- [ ] write tests (generic parsing/clamp, NOT a Codex-specific header winner): Retry-After + seconds; Retry-After HTTP-date; epoch reset header; delta reset header; no headers → class + default; absurd value → `maxCooldown`; tiny rate-limit value honored down to + `minRateLimitFloor`; auth-failure floored at `AuthFailCooldown` +- [ ] run `go test ./internal/proxy/ ./internal/vault/ -race`; gofumpt; vet — pass before Task 2 + +### Task 2: A1 — exhaustion = no healthy member, collapse dedup key + +**Files:** +- Modify: `internal/vault/pool.go` (`HasHealthyMember`, `SoonestCooldown`) +- Modify: `internal/proxy/pool_failover.go` +- Modify: `internal/vault/pool_test.go`, `internal/proxy/pool_failover_test.go` + +- [ ] add `HasHealthyMember(pool) bool` (RLock, single `now`, `!cooling()` for some member — + mirror `cooling()`/lazy-expiry, not a status field) and `SoonestCooldown(pool) + (time.Time, bool)` (strictly-future min vs a fresh `now`) +- [ ] in `handlePoolFailover` set `exhausted = !pr.HasHealthyMember(pool)` (drop `to == from`); + keep computing `to` for the real-transition 
notice/audit +- [ ] when `exhausted`, key the dedup as `shouldEmitPoolNotice(pool, "*", "*", "exhausted")` + so flap direction can't create two keys (real transitions unchanged) +- [ ] write tests: 2-member pool both cooling → exhausted true; one healthy (incl. a + lazily-expired cooldown still in the map) → false; `SoonestCooldown` skips already-passed + entries; exhausted dedup collapses both flap directions to one notice within the window + (fail-before/pass-after). NOTE: Task 3 re-gates this to an edge — update this assertion there. +- [ ] run `go test ./internal/vault/ ./internal/proxy/ -race`; gofumpt; vet — pass before Task 3 + +### Task 3: A2 — per-pool exhaustion state machine, edge notices, recovery monitor + +**Files:** +- Modify: `internal/proxy/server.go` (state, monitor goroutine, lifecycle, `SetOnPoolRecovered`) +- Modify: `internal/proxy/addon.go` (exhaustion-edge hook into the server state) +- Modify: `internal/proxy/pool_failover.go` (entry-edge gating; recovered notice formatter) +- Modify: `internal/proxy/server_test.go` (+ focused new test file as needed) + +- [ ] add `poolExhaustMu`/`poolExhausted`/`recoveryWake`(cap-1)/`monitorStop`+`monitorStopOnce`/ + `onPoolRecovered` to `Server`; `SetOnPoolRecovered`; document WHY state is on `Server` not + `PoolHealth` (survives swaps, must not be pruned on membership change) +- [ ] launch the monitor once (in `New`/`StartMonitors`); stop it idempotently from BOTH + `Close` and `GracefulShutdown` (via `monitorStopOnce`) +- [ ] gate the "exhausted" notice on the `false→true` edge (one notice on entry; this edge — + not the 30s window — is authoritative for the exhausted notice); record state; wake the + monitor with a non-blocking `select { case recoveryWake <- struct{}{}: default: }` +- [ ] implement the monitor: each wake `Load()` the current resolver (never cache); compute + `sleep = max(SoonestCooldown(pool).Sub(time.Now()), minReschedule)`; on wake, for each + exhausted pool flip `true→false` when 
`HasHealthyMember` is true, emit "pool recovered", + call `onPoolRecovered(pool)`; reschedule while still exhausted; drop state for pools no + longer present +- [ ] add a recovered-notice formatter (plain text, sentence style) alongside + `FormatFailoverNotice` +- [ ] update the Task-2 exhausted-notice test to assert EDGE behavior (one notice per + healthy→exhausted edge across many failing responses), not 30s-window behavior +- [ ] write tests: entry edge emits exactly one exhausted notice; recovery edge emits exactly + one recovered notice + one `onPoolRecovered` call; **unequal cooldowns** (memA 60s, memB 6h) + → monitor wakes ~60s, recovers once though memB still cools; monitor stops on shutdown; a + pool removed while exhausted fires no recovered notice +- [ ] run `go test ./internal/proxy/ -race`; gofumpt; vet — pass before Task 4 + +### Task 4: Schema + store for per-pool `auth_reset_target` + +**Files:** +- Create: `internal/store/migrations/000007_pool_auth_reset.up.sql` +- Create: `internal/store/migrations/000007_pool_auth_reset.down.sql` +- Modify: `internal/store/pools.go` +- Modify: `internal/store/pools_test.go` + +- [ ] up: `ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''` +- [ ] down: SQLite 12-step rebuild of `credential_pools` only, wrapped + `PRAGMA foreign_keys=OFF;` … `=ON;` so the `credential_pool_members` FK isn't orphaned +- [ ] add `AuthResetTarget` to `Pool`; include in create/list reads; add + `SetPoolAuthResetTarget(name, target) error` +- [ ] write tests: migrate up→down→up against a **populated** table (pool + members + health + rows survive/round-trip); default empty; create with target; set/clear target; list reflects it +- [ ] run `go test ./internal/store/ -race`; gofumpt; vet — pass before Task 5 + +### Task 5: Channel parity — `auth_reset_target` on CLI + REST + Telegram + +**Files:** +- Modify: `internal/poolops/*.go` (set/clear + create-with-target operation logic) +- Modify: 
`cmd/sluice/pool.go` (flag on `pool create`; `pool set-auth-reset`) +- Modify: `api/openapi.yaml`; then `make generate` → implement new method in `internal/api/server.go` +- Modify: Telegram `/pool` handler +- Modify: matching `_test.go` for poolops + each channel adapter + +- [ ] add the operation to `internal/poolops` (channel-agnostic) so all channels are thin + adapters (CLAUDE.md anti-pattern note) +- [ ] CLI: `--auth-reset-target` on `pool create`; `pool set-auth-reset <pool> <target>` + (set/clear); show target in `pool status`/`pool list` +- [ ] REST: accept on `POST /api/pools`; add an **action route** `POST + /api/pools/{name}/auth-reset-target` (mirrors the existing `/rotate` style, not a bespoke + PATCH); edit `api/openapi.yaml`, run `make generate`, implement the generated + `ServerInterface` method in `server.go` +- [ ] Telegram: accept on `/pool create`; add `/pool set-auth-reset <pool> <target>` +- [ ] write tests: poolops set/clear/create-with-target; one adapter test per channel + asserting it routes through poolops (no inline logic) +- [ ] run `go test ./... 
-race` for touched packages; `make generate` clean; gofumpt; vet — + pass before Task 6 + +### Task 6: Add optional exec user to the container exec path (prerequisite for hermes reset) + +**Files:** +- Modify: `internal/container/types.go` (`ExecInContainer` signature / `ContainerClient`) +- Modify: `internal/container/docker.go`, `internal/container/docker_socket.go` + (`execCreateRequest.User`) +- Modify: `internal/container/docker_test.go` (`mockClient`) + any other `ExecInContainer` callers +- Modify: `internal/container/agent_profile.go` (`AgentProfile.ExecUser() string`) + +- [ ] add `User string` to `execCreateRequest`; thread an optional user arg through + `ExecInContainer` on `ContainerClient` + `SocketClient` (empty "" = current root behavior) +- [ ] update `mockClient` and every existing `ExecInContainer` caller (no-op default, no + behavior change) +- [ ] add `ExecUser() string` to `AgentProfile` (hermes `"10000:10000"`, openclaw "") +- [ ] write tests: socket exec body carries `User` when set and omits/empties it otherwise; + profile `ExecUser` values; existing callers unaffected +- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 7 + +### Task 7: Profile `ResetAuthCmd` + `ContainerManager.ResetAuth` + +**Files:** +- Modify: `internal/container/agent_profile.go` +- Modify: `internal/container/types.go` +- Modify: `internal/container/docker.go`, `apple.go`, `tart.go`, standalone (`none`) +- Modify: `internal/container/agent_profile_test.go`, `docker_test.go` + +- [ ] add `ResetAuthCmd func(target string) []string` to `AgentProfile`; `HermesProfile` → + pure-argv `["/opt/hermes/.venv/bin/hermes","auth","reset",target]`; + `OpenclawProfile.ResetAuthCmd` nil (documented; Post-Completion verification) +- [ ] add `ResetAuth(ctx, target string) error` to `ContainerManager`; implement per backend; + nil `ResetAuthCmd` → log notice, return nil (mirror `ReloadSecrets`); docker exec passes + `profile.ExecUser()` (Task 6) so hermes 
runs as 10000:10000 +- [ ] validate `target` (non-empty, no NUL, allowlisted charset) before exec +- [ ] write tests: argv per profile; nil-cmd no-ops with notice; empty/invalid target + rejected; **docker exec uses the runtime UID from `ExecUser` (now passable, Task 6)** +- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 8 + +### Task 8: Wire auto-reset on the recovery edge (opt-in, per pool) + +**Files:** +- Modify: `cmd/sluice/main.go` (`srv.SetOnPoolRecovered(...)`) +- Modify: `internal/proxy/pool_failover.go` / audit usage (`agent_auth_reset` action) +- Modify: relevant `_test.go` + +- [ ] in main.go register `SetOnPoolRecovered`: look up the recovered pool's + `auth_reset_target`; if non-empty and `containerMgr != nil`, call + `containerMgr.ResetAuth(ctx, target)` in a detached goroutine using a fresh + `context.WithTimeout(context.Background(), …)` (never block; never reuse a wake-scoped ctx); + log + emit `agent_auth_reset`; empty target → no reset (opt-out default) +- [ ] keep the recovered Telegram notice (Task 3) and the reset independent (notice always; + reset only when target set); a `ResetAuth` error is logged, not fatal +- [ ] write tests: target set → recovery triggers `ResetAuth(target)` once; no target → no + reset, notice still emitted; `ResetAuth` error logged not fatal +- [ ] run `go test ./... 
-race` for touched packages; gofumpt; vet — pass before Task 9 + +### Task 9: Verify acceptance criteria +- [ ] both members exhausted → exactly one "pool exhausted" notice (no flap stream) +- [ ] B1 cooldown reflects the upstream window (member not re-probed every 60s) +- [ ] recovery edge → one "pool recovered" notice; auth reset fires only for pools with a + target; hermes reset runs as 10000:10000 (no root-chown of auth.json) +- [ ] channel parity verified: `auth_reset_target` settable+readable via CLI, REST, Telegram +- [ ] full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`; `gofumpt -l` clean; + `golangci-lint run ./...` 0 issues; `make generate` then `git diff --exit-code + internal/api/api.gen.go` clean +- [ ] independently verify committed HEAD builds + tests pass (do not trust subagent green) + +### Task 10: [Final] Documentation +- [ ] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2 + exhaustion+edge notices (replacing the per-window dedup wording), per-pool + `auth_reset_target` + recovery auto-reset, the `agent_auth_reset` audit action, and the + `ResetAuthCmd`/`ExecUser` profile hooks in the Agent Profiles table +- [ ] cross-check the CLI subcommand list + the REST surface described in CLAUDE.md against the hand-edited `api/openapi.yaml` (the source of the generated `internal/api/api.gen.go`) +- [ ] move this plan to `docs/plans/completed/` + +## Post-Completion +*Items requiring manual intervention or external systems — informational only* + +**Manual verification:** +- Deploy `sluice:dev` to knuth (build→scp→load loop), set `auth_reset_target = openai-codex` + on `openai_pool`, exhaust both Codex accounts, and confirm: one exhausted notice, no flap; + after the real window, one recovered notice + hermes auth un-latches and resumes without a + manual `hermes auth reset`; `auth.json` stays owned by 10000. 
+- **Capture a real OpenAI Codex usage-limit 429** (headers + body) to confirm which recovery + hint is present (`Retry-After` vs `x-ratelimit-reset*` vs body) and fix B1's header + precedence / `maxCooldown` accordingly. Add the body-hint parser only if the capture proves + no usable header exists. + +**External / follow-up:** +- **openclaw latch**: verify whether openclaw latches on usage-limit and, if so, implement + `OpenclawProfile.ResetAuthCmd` (gateway RPC, like `ReloadCmd`). hermes-only until confirmed. +- Selectable pool strategy (position-priority vs sticky) remains the previously-noted + follow-up; out of scope here. From 96611d2f20e300c5751c25853779d37d12ce0b42 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 15:45:25 +0800 Subject: [PATCH 02/19] feat(proxy): derive pool cooldown from upstream recovery hints (B1) --- ...22-pool-exhaustion-and-agent-auth-reset.md | 19 ++- internal/proxy/pool_failover.go | 146 +++++++++++++++- internal/proxy/pool_failover_test.go | 157 ++++++++++++++++++ internal/vault/pool.go | 15 ++ 4 files changed, 325 insertions(+), 12 deletions(-) diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index 26087ee..537d421 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -205,18 +205,21 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/proxy/pool_failover.go` - Modify: `internal/proxy/pool_failover_test.go` -- [ ] add `maxCooldown = 6*time.Hour` and `minRateLimitFloor = 10*time.Second` near the TTLs -- [ ] add `cooldownFromResponse(class failoverClass, resp *http.Response) time.Duration`: +- [x] add `maxCooldown = 6*time.Hour` and `minRateLimitFloor = 10*time.Second` near the TTLs + (exported as `MaxCooldown`/`MinRateLimitFloor` so the proxy package can clamp against them, + mirroring `RateLimitCooldown`/`AuthFailCooldown`) +- [x] add `cooldownFromResponse(class failoverClass, header http.Header) time.Duration` + (takes `http.Header` directly so it works against the go-mitmproxy Flow's `Response.Header`): parse `Retry-After` (delta-seconds + HTTP-date), then `x-ratelimit-reset` / `x-ratelimit-reset-requests` / `x-ratelimit-reset-tokens` (delta or epoch); clamp to - `[minFloor(class), maxCooldown]` (rate-limit floor `minRateLimitFloor`, auth-failure floor + `[minFloor(class), MaxCooldown]` (rate-limit floor `MinRateLimitFloor`, auth-failure floor `AuthFailCooldown`); **no hint → class default**. No body parsing in v1. 
-- [ ] in `handlePoolFailover` replace the flat `ttl` with `cooldownFromResponse(class, f.Response)` -- [ ] write tests (generic parsing/clamp, NOT a Codex-specific header winner): Retry-After +- [x] in `handlePoolFailover` replace the flat `ttl` with `cooldownFromResponse(class, f.Response.Header)` +- [x] write tests (generic parsing/clamp, NOT a Codex-specific header winner): Retry-After seconds; Retry-After HTTP-date; epoch reset header; delta reset header; no headers → class - default; absurd value → `maxCooldown`; tiny rate-limit value honored down to - `minRateLimitFloor`; auth-failure floored at `AuthFailCooldown` -- [ ] run `go test ./internal/proxy/ ./internal/vault/ -race`; gofumpt; vet — pass before Task 2 + default; absurd value → `MaxCooldown`; tiny rate-limit value honored down to + `MinRateLimitFloor`; auth-failure floored at `AuthFailCooldown` +- [x] run `go test ./internal/proxy/ ./internal/vault/ -race`; gofumpt; vet — pass before Task 2 ### Task 2: A1 — exhaustion = no healthy member, collapse dedup key diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index 567c124..fd232dd 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -4,6 +4,8 @@ import ( "bytes" "fmt" "log" + "net/http" + "strconv" "strings" "time" @@ -130,6 +132,139 @@ func bodyContainsAny(body []byte, subs ...string) bool { return false } +// minFloorForClass returns the lower clamp bound for a derived cooldown. A +// rate-limited member may be honored down to MinRateLimitFloor (a short real +// recovery window must be respected — the whole point of B1). An auth failure +// must never be retried in seconds: a revoked/expired token will not self-heal +// quickly, so the floor stays at AuthFailCooldown regardless of any hint. 
+func minFloorForClass(class failoverClass) time.Duration { + if class == failoverAuthFailure { + return vault.AuthFailCooldown + } + return vault.MinRateLimitFloor +} + +// classDefaultCooldown is the flat fallback applied when the upstream sends no +// usable recovery hint header. +func classDefaultCooldown(class failoverClass) time.Duration { + if class == failoverAuthFailure { + return vault.AuthFailCooldown + } + return vault.RateLimitCooldown +} + +// recoveryHintHeaders are the response headers, in precedence order, from +// which cooldownFromResponse derives the real recovery window. Retry-After is +// the standard signal; the x-ratelimit-reset* family is provider-specific +// (OpenAI emits per-resource reset hints). Each value is parsed as either a +// delta-seconds count, an HTTP-date (Retry-After only), or — for the reset +// family — a unix epoch. +var recoveryHintHeaders = []string{ + "Retry-After", + "x-ratelimit-reset", + "x-ratelimit-reset-requests", + "x-ratelimit-reset-tokens", +} + +// cooldownFromResponse derives the cooldown duration for a failed pool member +// from the upstream's recovery hint headers, clamped to the class bounds. +// +// Precedence: Retry-After (delta-seconds or HTTP-date), then the +// x-ratelimit-reset* family (delta-seconds or unix epoch). The first header +// that yields a positive duration wins. With NO usable hint the flat class +// default is returned (RateLimitCooldown / AuthFailCooldown), so behavior is +// unchanged from before B1 on responses that carry no hint. +// +// The parsed window is clamped to [minFloorForClass(class), MaxCooldown]: a +// rate-limit hint may shrink the cooldown down to MinRateLimitFloor (honoring +// a short real window), an auth failure stays floored at AuthFailCooldown, and +// any absurd/hostile value is capped at MaxCooldown so a member is never +// parked indefinitely. 
+// +// No body parsing in v1 (deferred until a real Codex 429 is captured — see the +// plan's Post-Completion note); only headers are consulted. +// +// Takes the response header directly (not *http.Response) so it works +// uniformly against the go-mitmproxy Flow's Response.Header at the call site +// and a plain http.Header in tests. +func cooldownFromResponse(class failoverClass, header http.Header) time.Duration { + if header == nil { + return classDefaultCooldown(class) + } + now := time.Now() + for _, h := range recoveryHintHeaders { + raw := strings.TrimSpace(header.Get(h)) + if raw == "" { + continue + } + d, ok := parseRecoveryHint(h, raw, now) + if !ok || d <= 0 { + continue + } + return clampCooldown(class, d) + } + return classDefaultCooldown(class) +} + +// parseRecoveryHint parses a single recovery-hint header value into a positive +// duration relative to now. It tries, in order: +// +// - delta-seconds: a bare integer/decimal count of seconds from now (all +// headers). +// - HTTP-date: an absolute time (Retry-After only, per RFC 9110); the +// duration is its distance from now. +// - unix epoch: a large integer treated as seconds-since-epoch (the +// x-ratelimit-reset* family commonly emits an absolute epoch). +// +// Returns ok=false when the value parses to nothing usable (negative, +// zero-after-now, unparseable). A delta value carrying a unit suffix (OpenAI's +// "1.5s" / "60ms" form) is handled via time.ParseDuration as a fallback. +func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool) { + // Bare numeric: delta-seconds for Retry-After; for the reset family it may + // be either delta-seconds OR a unix epoch. Disambiguate by magnitude — a + // value large enough to be a plausible epoch (>= ~ year 2001) is treated + // as absolute, otherwise as a delta. 
+ if secs, err := strconv.ParseFloat(raw, 64); err == nil { + if secs < 0 { + return 0, false + } + const epochThreshold = 1_000_000_000 // ~2001-09; below this, treat as delta-seconds + if header != "Retry-After" && secs >= epochThreshold { + until := time.Unix(int64(secs), 0) + if d := until.Sub(now); d > 0 { + return d, true + } + return 0, false + } + return time.Duration(secs * float64(time.Second)), true + } + // HTTP-date (Retry-After absolute form). + if t, err := http.ParseTime(raw); err == nil { + if d := t.Sub(now); d > 0 { + return d, true + } + return 0, false + } + // Unit-suffixed duration (e.g. OpenAI "1.5s", "60ms"). + if d, err := time.ParseDuration(raw); err == nil && d > 0 { + return d, true + } + return 0, false +} + +// clampCooldown bounds a derived cooldown to [minFloorForClass(class), +// MaxCooldown]. +func clampCooldown(class failoverClass, d time.Duration) time.Duration { + floor := minFloorForClass(class) + if d < floor { + return floor + } + if d > vault.MaxCooldown { + return vault.MaxCooldown + } + return d +} + // FailoverEvent describes a completed pool failover. It is handed to the // optional onFailover callback (store durability write + Telegram notice) // configured via SetOnFailover. @@ -476,10 +611,13 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) { return } - ttl := vault.RateLimitCooldown - if class == failoverAuthFailure { - ttl = vault.AuthFailCooldown - } + // B1: derive the cooldown window from the upstream's recovery hints + // (Retry-After / x-ratelimit-reset*) instead of the flat class TTL, so a + // quota-exhausted member is parked for the REAL window rather than being + // re-probed every RateLimitCooldown (60s). cooldownFromResponse clamps to + // the class bounds and falls back to the flat default when no hint header + // is present. 
+ ttl := cooldownFromResponse(class, f.Response.Header) until := time.Now().Add(ttl) tag := failoverReasonTag(class, f.Response.StatusCode, bodyTag) diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index 652c984..9fb9d60 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -6,6 +6,7 @@ import ( "net/url" "os" "path/filepath" + "strconv" "strings" "sync/atomic" "testing" @@ -1180,3 +1181,159 @@ func TestFailoverToManualRotateParkedPeer(t *testing.T) { t.Fatalf("pool_exhausted audit rows = %d, want 0 (a healthy parked peer exists)", n) } } + +// TestCooldownFromResponse exercises B1's header-derived cooldown parsing and +// clamping. These assertions are intentionally GENERIC about the parsing/clamp +// mechanics and do NOT bake in which header OpenAI's usage-limit 429 actually +// sends — that header winner is resolved by the Post-Completion live capture. +func TestCooldownFromResponse(t *testing.T) { + now := time.Now() + resp := func(class failoverClass, set func(h http.Header)) (failoverClass, http.Header) { + h := make(http.Header) + if set != nil { + set(h) + } + return class, h + } + + tests := []struct { + name string + class failoverClass + setup func(h http.Header) + want time.Duration + // approx: when true, want is treated as a target and the result must + // be within tolerance (HTTP-date / absolute-epoch cases lose sub-second + // precision against a freshly-sampled now inside cooldownFromResponse). 
+ approx bool + }{ + { + name: "no headers falls back to rate-limit class default", + class: failoverRateLimited, + want: vault.RateLimitCooldown, + }, + { + name: "no headers falls back to auth-failure class default", + class: failoverAuthFailure, + want: vault.AuthFailCooldown, + }, + { + name: "Retry-After delta seconds honored", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "120") }, + want: 120 * time.Second, + }, + { + name: "Retry-After HTTP-date honored", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", now.Add(90*time.Second).UTC().Format(http.TimeFormat)) }, + want: 90 * time.Second, + approx: true, + }, + { + name: "x-ratelimit-reset unix epoch honored", + class: failoverRateLimited, + setup: func(h http.Header) { + h.Set("x-ratelimit-reset", strconv.FormatInt(now.Add(300*time.Second).Unix(), 10)) + }, + want: 300 * time.Second, + approx: true, + }, + { + name: "x-ratelimit-reset-requests delta seconds honored", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("x-ratelimit-reset-requests", "45") }, + want: 45 * time.Second, + }, + { + name: "x-ratelimit-reset-tokens unit-suffixed duration honored", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("x-ratelimit-reset-tokens", "1500ms") }, + want: 1500 * time.Millisecond, + // 1.5s is below MinRateLimitFloor so it clamps up; assert clamp below. 
+ }, + { + name: "absurd value capped at MaxCooldown", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "999999999") }, + want: vault.MaxCooldown, + }, + { + name: "tiny rate-limit value floored at MinRateLimitFloor", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "2") }, + want: vault.MinRateLimitFloor, + }, + { + name: "auth-failure hint floored at AuthFailCooldown", + class: failoverAuthFailure, + setup: func(h http.Header) { h.Set("Retry-After", "5") }, + want: vault.AuthFailCooldown, + }, + { + name: "auth-failure long hint honored above its floor", + class: failoverAuthFailure, + setup: func(h http.Header) { h.Set("Retry-After", "1200") }, + want: 1200 * time.Second, + }, + { + name: "Retry-After precedence over reset family", + class: failoverRateLimited, + setup: func(h http.Header) { + h.Set("Retry-After", "120") + h.Set("x-ratelimit-reset", "30") + }, + want: 120 * time.Second, + }, + { + name: "unparseable header ignored falls back to default", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "soon-ish") }, + want: vault.RateLimitCooldown, + }, + { + name: "negative delta ignored falls back to default", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "-5") }, + want: vault.RateLimitCooldown, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + class, h := resp(tt.class, tt.setup) + got := cooldownFromResponse(class, h) + if tt.approx { + const tol = 2 * time.Second + diff := got - tt.want + if diff < 0 { + diff = -diff + } + if diff > tol { + t.Fatalf("cooldownFromResponse = %v, want ~%v (tol %v)", got, tt.want, tol) + } + return + } + // The unit-suffixed 1.5s case clamps up to MinRateLimitFloor. 
+ if tt.name == "x-ratelimit-reset-tokens unit-suffixed duration honored" { + if got != vault.MinRateLimitFloor { + t.Fatalf("cooldownFromResponse = %v, want clamp to %v", got, vault.MinRateLimitFloor) + } + return + } + if got != tt.want { + t.Fatalf("cooldownFromResponse = %v, want %v", got, tt.want) + } + }) + } +} + +// TestCooldownFromResponseNilSafe verifies the nil-header guard returns the +// class default rather than panicking. +func TestCooldownFromResponseNilSafe(t *testing.T) { + if got := cooldownFromResponse(failoverRateLimited, nil); got != vault.RateLimitCooldown { + t.Fatalf("nil header: got %v, want %v", got, vault.RateLimitCooldown) + } + if got := cooldownFromResponse(failoverAuthFailure, nil); got != vault.AuthFailCooldown { + t.Fatalf("nil header (auth): got %v, want %v", got, vault.AuthFailCooldown) + } +} diff --git a/internal/vault/pool.go b/internal/vault/pool.go index d818fc3..93d8ff5 100644 --- a/internal/vault/pool.go +++ b/internal/vault/pool.go @@ -18,6 +18,21 @@ const ( AuthFailCooldown = 300 * time.Second ) +// Cooldown bounds applied when the cooldown window is derived from an +// upstream recovery hint (Retry-After / rate-limit-reset headers) instead of +// the flat class default. MaxCooldown caps an absurd or malicious hint so a +// member can never be parked indefinitely; MinRateLimitFloor is the lower +// bound for a rate-limited member — deliberately well below RateLimitCooldown +// so a SHORT real recovery window from the upstream is honored (B1 fixes the +// flat 60s being too SHORT for a quota-exhausted account, which was re-probed +// every minute; it never required a 60s floor). Exported so the proxy package's +// cooldownFromResponse can clamp against them (mirrors RateLimitCooldown / +// AuthFailCooldown, which are likewise consumed from proxy).
+const ( + MaxCooldown = 6 * time.Hour + MinRateLimitFloor = 10 * time.Second +) + // ManualRotateReason is the cooldown reason stamped by `sluice pool rotate` // when it parks the previously-active member. A member parked for this // reason is operationally deprioritized BY AN OPERATOR, not unhealthy: it From cd4fd871db7d382c828ed920bafbad47c3107dc1 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 15:50:44 +0800 Subject: [PATCH 03/19] feat(proxy): detect pool exhaustion via no-healthy-member and collapse exhausted notice dedup (A1) --- ...22-pool-exhaustion-and-agent-auth-reset.md | 10 +- internal/proxy/pool_failover.go | 36 ++-- internal/proxy/pool_failover_test.go | 71 +++++++ internal/vault/pool.go | 74 +++++++ internal/vault/pool_test.go | 183 ++++++++++++++++++ 5 files changed, 357 insertions(+), 17 deletions(-) diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index 537d421..b24d35e 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -228,18 +228,18 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/proxy/pool_failover.go` - Modify: `internal/vault/pool_test.go`, `internal/proxy/pool_failover_test.go` -- [ ] add `HasHealthyMember(pool) bool` (RLock, single `now`, `!cooling()` for some member — +- [x] add `HasHealthyMember(pool) bool` (RLock, single `now`, `!cooling()` for some member — mirror `cooling()`/lazy-expiry, not a status field) and `SoonestCooldown(pool) (time.Time, bool)` (strictly-future min vs a fresh `now`) -- [ ] in `handlePoolFailover` set `exhausted = !pr.HasHealthyMember(pool)` (drop `to == from`); +- [x] in `handlePoolFailover` set `exhausted = !pr.HasHealthyMember(pool)` (drop `to == from`); keep computing `to` for the real-transition notice/audit -- [ ] when `exhausted`, key the dedup as `shouldEmitPoolNotice(pool, "*", "*", "exhausted")` +- [x] when `exhausted`, key the dedup as `shouldEmitPoolNotice(pool, "*", "*", "exhausted")` so flap direction can't create two keys (real transitions unchanged) -- [ ] write tests: 2-member pool both cooling → exhausted true; one healthy (incl. a +- [x] write tests: 2-member pool both cooling → exhausted true; one healthy (incl. a lazily-expired cooldown still in the map) → false; `SoonestCooldown` skips already-passed entries; exhausted dedup collapses both flap directions to one notice within the window (fail-before/pass-after). NOTE: Task 3 re-gates this to an edge — update this assertion there. 
-- [ ] run `go test ./internal/vault/ ./internal/proxy/ -race`; gofumpt; vet — pass before Task 3 +- [x] run `go test ./internal/vault/ ./internal/proxy/ -race`; gofumpt; vet — pass before Task 3 ### Task 3: A2 — per-pool exhaustion state machine, edge notices, recovery monitor diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index fd232dd..594278a 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -651,23 +651,35 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) { to = next } - // to == from means ResolveActive degraded back to the member that just - // failed: every member is cooling and the soonest-recovering one IS - // `from`. There is NO distinct member to fail over to. Emitting a - // " -> " cred_failover here (and one Telegram notice per - // request) was both meaningless and a notification storm — the agent - // retries N times, each retry re-fails on the still-exhausted member - // and re-entered this path, producing N identical "failed over A -> A" - // notices. Classify it honestly as pool exhaustion instead. - exhausted := to == from + // A1: the pool is exhausted when NO member is healthy, not merely when + // ResolveActive degraded back to `from` (to == from). The old `to == from` + // test missed the flap case where the soonest-recovering degrade target is + // a DIFFERENT still-cooling member: that produced a meaningless + // " -> " cred_failover plus one Telegram notice per agent + // retry even though there was no healthy account to serve. HasHealthyMember + // mirrors ResolveActive's cooling()/lazy-expiry exactly, so it agrees with + // the degrade decision made just above. `to` is still computed for the + // real-transition notice/audit (the not-exhausted branch). + exhausted := !pr.HasHealthyMember(pool) // Deduplicate identical signals within a short window. 
Concurrent // in-flight requests (pipelined agents) and retries that race the // synchronous MarkCooldown above would otherwise each emit one audit - // row + one operator notice. One per (pool,from,to,tag) per window is - // all the operator needs; the cooldown itself was already applied + // row + one operator notice. + // + // A1: for the exhausted path collapse the dedup key to + // (pool, "*", "*", "exhausted") so the flap direction (which degrade + // target ResolveActive happened to pick) cannot mint two distinct keys and + // let the retry storm through twice. A real from->to transition keeps its + // (pool, from, to, tag) key. The cooldown itself was already applied // unconditionally above, so suppressing the notice loses nothing. - if !a.shouldEmitPoolNotice(pool, from, to, tag) { + emit := false + if exhausted { + emit = a.shouldEmitPoolNotice(pool, "*", "*", "exhausted") + } else { + emit = a.shouldEmitPoolNotice(pool, from, to, tag) + } + if !emit { return } diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index 9fb9d60..8551be8 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -1118,6 +1118,77 @@ func TestFailoverPoolExhaustedNoSelfFailoverSpam(t *testing.T) { } } +// TestFailoverExhaustedDedupCollapsesFlapDirection is the A1 dedup-collapse +// regression. When both members are exhausted, the degrade target ResolveActive +// picks can differ depending on which member a given failing response is +// attributed to (the "flap direction"). The OLD per-(pool,from,to,tag) dedup +// key minted a DISTINCT key per direction, so a retry storm hitting both +// members produced two notices even though the pool is in one exhausted state. +// +// A1 collapses the exhausted path to the single key (pool,"*","*","exhausted") +// so the direction can't create two keys. 
+// +// Fail-before (per-tuple key): two responses attributed to memA and memB emit +// two pool_exhausted rows / two onFailover calls. Pass-after (collapsed key): +// exactly one of each within the dedup window. +func TestFailoverExhaustedDedupCollapsesFlapDirection(t *testing.T) { + dir := t.TempDir() + logPath := filepath.Join(dir, "audit.log") + logger, err := audit.NewFileLogger(logPath) + if err != nil { + t.Fatalf("NewFileLogger: %v", err) + } + t.Cleanup(func() { _ = logger.Close() }) + + addon, _, prPtr := setupPoolAddon(t, "memA", "memB") + addon.auditLog = logger + client := setupAddonConn(addon, "auth.example.com:443") + + // Both members already genuinely failure-cooled (the pool is exhausted). + // Unequal future expiries so the degrade target differs by which member + // each request is attributed to (the flap direction the old key split on). + prPtr.Load().MarkCooldown("memA", time.Now().Add(5*time.Minute), "429") + prPtr.Load().MarkCooldown("memB", time.Now().Add(10*time.Minute), "401") + + var calls int32 + done := make(chan struct{}, 4) + addon.SetOnFailover(func(ev FailoverEvent) { + if !ev.Exhausted { + t.Errorf("FailoverEvent.Exhausted = false, want true (pool exhausted)") + } + atomic.AddInt32(&calls, 1) + done <- struct{}{} + }) + + // Two failing responses, attributed to DIFFERENT members (flap directions). + for _, member := range []string{"memA", "memB"} { + f := newPoolRespFlow(client, 429, []byte(`{"error":"rate_limited"}`)) + addon.flowInjected.Tag(f.Id, member) + addon.Response(f) + } + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("onFailover callback not invoked") + } + // Give any erroneous second call a moment to surface before asserting. 
+ time.Sleep(50 * time.Millisecond) + if got := atomic.LoadInt32(&calls); got != 1 { + t.Fatalf("onFailover invoked %d times, want exactly 1 (collapsed exhausted dedup key must absorb both flap directions)", got) + } + + if err := logger.Close(); err != nil { + t.Fatalf("logger close: %v", err) + } + if n := auditActionCount(t, logPath, "pool_exhausted"); n != 1 { + t.Fatalf("pool_exhausted audit rows = %d, want exactly 1 (one collapsed key across both flap directions)", n) + } + if n := auditActionCount(t, logPath, "cred_failover"); n != 0 { + t.Fatalf("cred_failover audit rows = %d, want 0 (exhausted is not a real failover)", n) + } +} + // TestFailoverToManualRotateParkedPeer is the pool-stranding regression that // broke the live agent: `sluice pool rotate` parks the previously-active // member (reason ManualRotateReason). That member is healthy, just operator diff --git a/internal/vault/pool.go b/internal/vault/pool.go index 93d8ff5..75572f2 100644 --- a/internal/vault/pool.go +++ b/internal/vault/pool.go @@ -758,6 +758,80 @@ func (pr *PoolResolver) MergeLiveCooldowns(prev *PoolResolver) { } } +// HasHealthyMember reports whether the pool has at least one SERVABLE member. +// It RLock-reads the shared health map once and evaluates every member against +// a SINGLE freshly-sampled `now`, mirroring ResolveActive's `cooling()` +// lazy-expiry semantics (a member whose cooldown is untracked, zero, or +// lazily-expired vs `now` is healthy — this is NOT a status-field check). It +// is the A1 exhaustion signal: a pool is exhausted iff !HasHealthyMember(pool). +// A non-pool name or unknown pool returns false (no member, nothing to serve). +// +// A member parked for ManualRotateReason is operator-DEPRIORITIZED, not +// unhealthy: ResolveActive's degrade path explicitly treats it as a +// "parked-but-healthy" servable target (preferred over a genuinely failed +// member). 
So such a member counts as healthy here too — otherwise a 429 on +// the rotated-to member while the rotated-from peer is operator-parked would +// be misclassified as pool exhaustion even though the parked peer is a valid +// failover target. This keeps HasHealthyMember in agreement with the degrade +// decision the failover path makes from the same health view. +func (pr *PoolResolver) HasHealthyMember(pool string) bool { + if pr == nil { + return false + } + members, isPool := pr.pools[pool] + if !isPool || len(members) == 0 { + return false + } + now := time.Now() + pr.health.mu.RLock() + defer pr.health.mu.RUnlock() + for _, m := range members { + h, tracked := pr.health.health[m] + if !tracked || h.cooldownUntil.IsZero() || !h.cooldownUntil.After(now) { + // Not cooling (untracked, zero, or lazily expired vs `now`). + return true + } + if h.reason == ManualRotateReason { + // Operator-parked but servable (matches the degrade path). + return true + } + } + return false +} + +// SoonestCooldown returns the minimum member cooldown expiry that is STRICTLY +// GREATER than a freshly-sampled `now` (i.e. only members currently cooling +// are considered; an already-passed cooldown still in the map is skipped, +// mirroring `cooling()`'s lazy-expiry). ok is false when no member of the pool +// is currently cooling (the recovery monitor uses this to decide whether to +// reschedule a wake). A non-pool name or unknown pool returns ok=false. It +// RLock-reads the shared health map once against the single `now`. 
+func (pr *PoolResolver) SoonestCooldown(pool string) (time.Time, bool) { + if pr == nil { + return time.Time{}, false + } + members, isPool := pr.pools[pool] + if !isPool || len(members) == 0 { + return time.Time{}, false + } + now := time.Now() + pr.health.mu.RLock() + defer pr.health.mu.RUnlock() + var soonest time.Time + found := false + for _, m := range members { + h, tracked := pr.health.health[m] + if !tracked || h.cooldownUntil.IsZero() || !h.cooldownUntil.After(now) { + continue // not currently cooling (lazy-expiry skip) + } + if !found || h.cooldownUntil.Before(soonest) { + soonest = h.cooldownUntil + found = true + } + } + return soonest, found +} + // CooldownUntil returns the in-memory cooldown expiry for a credential and // whether it is currently cooling down (future expiry). Exposed as an // introspection surface for tests and potential future `pool status` diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go index 1318223..4060e8b 100644 --- a/internal/vault/pool_test.go +++ b/internal/vault/pool_test.go @@ -940,3 +940,186 @@ func TestSetCurrentMembersPrunesStaleActive(t *testing.T) { t.Fatal("Finding 1: SetCurrentMembers did not prune the stale sticky pointer for a dropped/epoch-bumped member") } } + +// TestHasHealthyMember pins A1's exhaustion signal: a pool is exhausted iff no +// member is healthy. The check mirrors ResolveActive's cooling()/lazy-expiry +// semantics exactly (an already-passed cooldown still in the map counts as +// healthy) and treats a ManualRotateReason park as servable. 
+func TestHasHealthyMember(t *testing.T) { + now := time.Now() + tests := []struct { + name string + setup func(pr *PoolResolver) + pool string + want bool + }{ + { + name: "both healthy", + setup: func(pr *PoolResolver) {}, + pool: "pool", + want: true, + }, + { + name: "one cooling one healthy", + setup: func(pr *PoolResolver) { + pr.MarkCooldown("a", now.Add(60*time.Second), "429") + }, + pool: "pool", + want: true, + }, + { + name: "both cooling -> exhausted", + setup: func(pr *PoolResolver) { + pr.MarkCooldown("a", now.Add(60*time.Second), "429") + pr.MarkCooldown("b", now.Add(10*time.Minute), "401") + }, + pool: "pool", + want: false, + }, + { + name: "lazily-expired cooldown still in map counts as healthy", + setup: func(pr *PoolResolver) { + // b genuinely cooling; a has a stored cooldown already in the + // past (lazy expiry) -> a is healthy, so the pool is not + // exhausted. + pr.MarkCooldown("a", now.Add(-1*time.Second), "429") + pr.MarkCooldown("b", now.Add(10*time.Minute), "401") + }, + pool: "pool", + want: true, + }, + { + name: "manual-rotate park counts as servable", + setup: func(pr *PoolResolver) { + // a genuinely failed; b operator-parked (deprioritized but + // servable) -> the pool is NOT exhausted. + pr.MarkCooldown("a", now.Add(60*time.Second), "429") + pr.MarkCooldown("b", now.Add(5*time.Minute), ManualRotateReason) + }, + pool: "pool", + want: true, + }, + { + name: "unknown pool", + setup: func(pr *PoolResolver) {}, + pool: "nope", + want: false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil) + tc.setup(pr) + if got := pr.HasHealthyMember(tc.pool); got != tc.want { + t.Fatalf("HasHealthyMember(%q) = %v, want %v", tc.pool, got, tc.want) + } + }) + } +} + +// TestHasHealthyMemberNil pins the nil-receiver guard. 
+func TestHasHealthyMemberNil(t *testing.T) { + var pr *PoolResolver + if pr.HasHealthyMember("pool") { + t.Fatal("nil resolver HasHealthyMember = true, want false") + } +} + +// TestSoonestCooldown pins that SoonestCooldown returns the minimum +// STRICTLY-FUTURE member cooldown, skipping already-passed entries (lazy +// expiry), and reports ok=false when no member is currently cooling. +func TestSoonestCooldown(t *testing.T) { + now := time.Now() + tests := []struct { + name string + setup func(pr *PoolResolver, base time.Time) + pool string + wantOK bool + wantBase time.Time // expected soonest when wantOK (exact, set in setup) + }{ + { + name: "no member cooling", + setup: func(pr *PoolResolver, base time.Time) {}, + pool: "pool", + wantOK: false, + }, + { + name: "single cooling member", + setup: func(pr *PoolResolver, base time.Time) { + pr.MarkCooldown("a", base.Add(60*time.Second), "429") + }, + pool: "pool", + wantOK: true, + wantBase: now.Add(60 * time.Second), + }, + { + name: "two cooling -> min wins", + setup: func(pr *PoolResolver, base time.Time) { + pr.MarkCooldown("a", base.Add(10*time.Minute), "401") + pr.MarkCooldown("b", base.Add(60*time.Second), "429") + }, + pool: "pool", + wantOK: true, + wantBase: now.Add(60 * time.Second), + }, + { + name: "already-passed entry skipped", + setup: func(pr *PoolResolver, base time.Time) { + // a is in the past (lazy-expired) -> ignored; b is the only + // currently-cooling member. 
+ pr.MarkCooldown("a", base.Add(-1*time.Second), "429") + pr.MarkCooldown("b", base.Add(120*time.Second), "401") + }, + pool: "pool", + wantOK: true, + wantBase: now.Add(120 * time.Second), + }, + { + name: "all passed -> not cooling", + setup: func(pr *PoolResolver, base time.Time) { + pr.MarkCooldown("a", base.Add(-2*time.Second), "429") + pr.MarkCooldown("b", base.Add(-1*time.Second), "401") + }, + pool: "pool", + wantOK: false, + }, + { + name: "unknown pool", + setup: func(pr *PoolResolver, base time.Time) {}, + pool: "nope", + wantOK: false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil) + // Sample base right before setup: the "already-passed" negative + // offsets are then in the past (and the positive ones still in the + // future) relative to SoonestCooldown's own time.Now() sample. + base := time.Now() + tc.setup(pr, base) + got, ok := pr.SoonestCooldown(tc.pool) + if ok != tc.wantOK { + t.Fatalf("SoonestCooldown(%q) ok = %v, want %v", tc.pool, ok, tc.wantOK) + } + if !tc.wantOK { + return + } + // The stored cooldown is exact (MarkCooldown stores `until` + // verbatim); SoonestCooldown returns it unchanged, so compare + // against the value set in setup within a small tolerance for the + // base/now skew. + if d := got.Sub(tc.wantBase); d < -2*time.Second || d > 2*time.Second { + t.Fatalf("SoonestCooldown(%q) = %v, want ~%v (delta %v)", tc.pool, got, tc.wantBase, d) + } + }) + } +} + +// TestSoonestCooldownNil pins the nil-receiver guard.
+func TestSoonestCooldownNil(t *testing.T) { + var pr *PoolResolver + if _, ok := pr.SoonestCooldown("pool"); ok { + t.Fatal("nil resolver SoonestCooldown ok = true, want false") + } +} From 2c05a5b65e8c87e3288d2e92f64abb7417051a19 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:01:16 +0800 Subject: [PATCH 04/19] feat(proxy): edge-triggered pool exhaustion notices + recovery monitor (A2) --- ...22-pool-exhaustion-and-agent-auth-reset.md | 38 ++- internal/proxy/addon.go | 19 ++ internal/proxy/pool_failover.go | 30 +- internal/proxy/pool_recovery_monitor_test.go | 302 ++++++++++++++++++ internal/proxy/server.go | 275 +++++++++++++++- 5 files changed, 641 insertions(+), 23 deletions(-) create mode 100644 internal/proxy/pool_recovery_monitor_test.go diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index b24d35e..2a5dae4 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -249,28 +249,34 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/proxy/pool_failover.go` (entry-edge gating; recovered notice formatter) - Modify: `internal/proxy/server_test.go` (+ focused new test file as needed) -- [ ] add `poolExhaustMu`/`poolExhausted`/`recoveryWake`(cap-1)/`monitorStop`+`monitorStopOnce`/ +- [x] add `poolExhaustMu`/`poolExhausted`/`recoveryWake`(cap-1)/`monitorStop`+`monitorStopOnce`/ `onPoolRecovered` to `Server`; `SetOnPoolRecovered`; document WHY state is on `Server` not - `PoolHealth` (survives swaps, must not be pruned on membership change) -- [ ] launch the monitor once (in `New`/`StartMonitors`); stop it idempotently from BOTH - `Close` and `GracefulShutdown` (via `monitorStopOnce`) -- [ ] gate the "exhausted" notice on the `false→true` edge (one notice on entry; this edge — + `PoolHealth` (survives swaps, must not be pruned on membership change). Also added a separate + `onPoolRecoveredNotice` callback (notice always fires; auth-reset stays opt-in per Task 8) and + a test-shortenable `recoveryMinReschedule` field. +- [x] launch the monitor once (in `New`); stop it idempotently from BOTH + `Close` and `GracefulShutdown` (via `monitorStopOnce`/`stopMonitors`) +- [x] gate the "exhausted" notice on the `false→true` edge (one notice on entry; this edge — not the 30s window — is authoritative for the exhausted notice); record state; wake the - monitor with a non-blocking `select { case recoveryWake <- struct{}{}: default: }` -- [ ] implement the monitor: each wake `Load()` the current resolver (never cache); compute + monitor with a non-blocking `select { case recoveryWake <- struct{}{}: default: }`. Wired via + addon `onPoolExhausted` hook (`SetOnPoolExhausted`, mirroring `SetOnFailover`); addon-only + tests with no Server fall back to the A1 30s-window collapse. 
+- [x] implement the monitor: each wake `Load()` the current resolver (never cache); compute `sleep = max(SoonestCooldown(pool).Sub(time.Now()), minReschedule)`; on wake, for each exhausted pool flip `true→false` when `HasHealthyMember` is true, emit "pool recovered", call `onPoolRecovered(pool)`; reschedule while still exhausted; drop state for pools no longer present -- [ ] add a recovered-notice formatter (plain text, sentence style) alongside - `FormatFailoverNotice` -- [ ] update the Task-2 exhausted-notice test to assert EDGE behavior (one notice per - healthy→exhausted edge across many failing responses), not 30s-window behavior -- [ ] write tests: entry edge emits exactly one exhausted notice; recovery edge emits exactly - one recovered notice + one `onPoolRecovered` call; **unequal cooldowns** (memA 60s, memB 6h) - → monitor wakes ~60s, recovers once though memB still cools; monitor stops on shutdown; a - pool removed while exhausted fires no recovered notice -- [ ] run `go test ./internal/proxy/ -race`; gofumpt; vet — pass before Task 4 +- [x] add a recovered-notice formatter (plain text, sentence style) alongside + `FormatFailoverNotice` (`FormatPoolRecoveredNotice`) +- [x] update the Task-2 exhausted-notice test to assert EDGE behavior (one notice per + healthy→exhausted edge across many failing responses), not 30s-window behavior. Added + `TestExhaustedNoticeEdgeAcrossManyResponses` (Server-wired edge path); the existing addon-only + Task-2 tests retain the 30s-window assertion for the no-Server fallback path. 
+- [x] write tests: entry edge emits exactly one exhausted notice; recovery edge emits exactly + one recovered notice + one `onPoolRecovered` call; **unequal cooldowns** (memA short, memB 6h) + → monitor wakes ~soonest, recovers once though memB still cools; monitor stops on shutdown + (double-stop no panic); a pool removed while exhausted fires no recovered notice +- [x] run `go test ./internal/proxy/ -race`; gofumpt; vet — pass before Task 4 ### Task 4: Schema + store for per-pool `auth_reset_target` diff --git a/internal/proxy/addon.go b/internal/proxy/addon.go index d067015..1429fa4 100644 --- a/internal/proxy/addon.go +++ b/internal/proxy/addon.go @@ -167,6 +167,16 @@ type SluiceAddon struct { // means failover is in-memory only (no durability, no notice). onFailover func(FailoverEvent) + // onPoolExhausted, when set, is called from handlePoolFailover on every + // response classified as pool-exhausting (no healthy member). It performs + // the healthy->exhausted EDGE bookkeeping on the long-lived Server and + // returns true ONLY on the false->true edge — that edge, not the addon's + // 30s notice window, is authoritative for the "pool exhausted" notice (A2). + // When this hook is nil (addon constructed without a Server, e.g. unit + // tests that exercise the failover path in isolation) the addon falls back + // to its own 30s-window dedup so those tests keep their existing behavior. + onPoolExhausted func(pool string) bool + // persistDone is an optional channel signaled when an async OAuth // token persist goroutine completes. Used by tests to avoid // time.Sleep-based synchronization. Nil in production. @@ -389,6 +399,15 @@ func (a *SluiceAddon) SetOnFailover(fn func(FailoverEvent)) { a.onFailover = fn } +// SetOnPoolExhausted configures the healthy->exhausted edge hook. The Server +// sets this (mirroring SetOnFailover) so handlePoolFailover can flip the +// per-pool exhaustion state and wake the recovery monitor. 
The hook returns +// true only on the edge, gating the one-time "exhausted" notice. Safe to leave +// unset: the addon then uses its own 30s-window dedup (legacy behavior). +func (a *SluiceAddon) SetOnPoolExhausted(fn func(pool string) bool) { + a.onPoolExhausted = fn +} + // UpdateOAuthIndex rebuilds the OAuth token URL index from credential // metadata. Called on startup and after credential metadata changes // (e.g. SIGHUP hot-reload). diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index 594278a..e719d84 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -349,6 +349,15 @@ func FormatFailoverNotice(ev FailoverEvent) string { ev.Pool, ev.From, ev.To, reason) } +// FormatPoolRecoveredNotice builds the plain-text, single-line operator notice +// emitted on the exhausted->recovered edge by the recovery monitor. Kept next +// to FormatFailoverNotice as a pure, directly-testable function. Plain text / +// sentence style for the same reason (the notice path sends with no parse +// mode, so markdown/HTML would render literally). +func FormatPoolRecoveredNotice(pool string) string { + return fmt.Sprintf("Pool %q recovered: a healthy account is available again.", pool) +} + // poolForResponse maps a response's CONNECT destination back to a pooled // binding and returns the pool name + the member that was active for this // request. Returns ok=false when the destination is not bound to a pool. @@ -667,15 +676,24 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) { // synchronous MarkCooldown above would otherwise each emit one audit // row + one operator notice. // - // A1: for the exhausted path collapse the dedup key to - // (pool, "*", "*", "exhausted") so the flap direction (which degrade - // target ResolveActive happened to pick) cannot mint two distinct keys and - // let the retry storm through twice. A real from->to transition keeps its - // (pool, from, to, tag) key. 
The cooldown itself was already applied + // A2: the "exhausted" notice is gated on the healthy->exhausted EDGE, not + // the 30s window. onPoolExhausted (set by the Server) flips the per-pool + // exhaustion state on the long-lived Server and returns true only on the + // false->true edge, so a retry storm against an already-exhausted pool + // emits exactly one notice and the edge also wakes the recovery monitor. + // When the hook is unset (addon-only unit tests) fall back to the A1 + // collapsed 30s-window key so those tests keep their existing behavior. + // + // A real from->to transition is unchanged: it keeps its (pool, from, to, + // tag) 30s-window key. The cooldown itself was already applied // unconditionally above, so suppressing the notice loses nothing. emit := false if exhausted { - emit = a.shouldEmitPoolNotice(pool, "*", "*", "exhausted") + if a.onPoolExhausted != nil { + emit = a.onPoolExhausted(pool) + } else { + emit = a.shouldEmitPoolNotice(pool, "*", "*", "exhausted") + } } else { emit = a.shouldEmitPoolNotice(pool, from, to, tag) } diff --git a/internal/proxy/pool_recovery_monitor_test.go b/internal/proxy/pool_recovery_monitor_test.go new file mode 100644 index 0000000..4e298e4 --- /dev/null +++ b/internal/proxy/pool_recovery_monitor_test.go @@ -0,0 +1,302 @@ +package proxy + +import ( + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/nemirovsky/sluice/internal/policy" + "github.com/nemirovsky/sluice/internal/store" + "github.com/nemirovsky/sluice/internal/vault" +) + +// newMonitorTestServer builds a minimal Server (policy-only, no provider) for +// exercising the A2 recovery monitor in isolation. The monitor goroutine is +// already running (started in New); the test stores a pool resolver and drives +// the monitor via markPoolExhausted. recoveryMinReschedule is shortened so a +// just-expired-but-unhealthy reschedule does not slow the test. 
+func newMonitorTestServer(t *testing.T) *Server { + t.Helper() + eng, err := policy.LoadFromBytes([]byte(` +[policy] +default = "deny" +`)) + if err != nil { + t.Fatalf("policy load: %v", err) + } + srv, err := New(Config{ListenAddr: "127.0.0.1:0", Policy: eng}) + if err != nil { + t.Fatalf("New: %v", err) + } + srv.recoveryMinReschedule = time.Millisecond + t.Cleanup(func() { _ = srv.Close() }) + return srv +} + +// twoMemberPool builds a 2-member failover pool resolver and stores it on the +// server. Returns the live resolver. +func twoMemberPool(t *testing.T, srv *Server, name, a, b string) *vault.PoolResolver { + t.Helper() + pool := store.Pool{Name: name, Strategy: store.PoolStrategyFailover} + pool.Members = []store.PoolMember{ + {Credential: a, Position: 0}, + {Credential: b, Position: 1}, + } + pr := vault.NewPoolResolver([]store.Pool{pool}, nil) + srv.poolResolver.Store(pr) + return pr +} + +// waitFor polls cond up to timeout, returning whether it became true. Avoids a +// fixed Sleep so the monitor tests stay fast and deterministic. +func waitFor(timeout time.Duration, cond func() bool) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if cond() { + return true + } + time.Sleep(time.Millisecond) + } + return cond() +} + +// TestMarkPoolExhaustedEdge asserts the healthy->exhausted edge fires exactly +// once: the first call returns true, subsequent calls (the agent's retry storm) +// return false until the pool recovers. +func TestMarkPoolExhaustedEdge(t *testing.T) { + srv := newMonitorTestServer(t) + // No resolver / no recovery: a removed-but-marked pool would be cleaned up, + // but here we only test the edge bookkeeping, so use a pool that stays + // exhausted (no resolver -> scanRecovery clears it). Store a resolver whose + // members are all cooling so it stays exhausted. 
+ pr := twoMemberPool(t, srv, "p", "a", "b") + far := time.Now().Add(time.Hour) + pr.MarkCooldown("a", far, "429") + pr.MarkCooldown("b", far, "429") + + if !srv.markPoolExhausted("p") { + t.Fatal("first markPoolExhausted = false, want true (healthy->exhausted edge)") + } + for i := 0; i < 5; i++ { + if srv.markPoolExhausted("p") { + t.Fatalf("markPoolExhausted call %d = true, want false (already exhausted, no new edge)", i+2) + } + } +} + +// TestExhaustedNoticeEdgeAcrossManyResponses asserts handlePoolFailover emits +// exactly one "exhausted" notice across many failing responses once the +// Server's edge gate is wired (A2: the edge, not the 30s window, is +// authoritative). +func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) { + srv := newMonitorTestServer(t) + addon, _, prPtr := setupPoolAddon(t, "memA", "memB") + // Wire the addon's exhaustion edge into the server (mirrors setupInjection). + addon.SetOnPoolExhausted(srv.markPoolExhausted) + // Share the same resolver between the addon and the server so the monitor + // observes the cooldowns the failover path applies. + srv.poolResolver.Store(prPtr.Load()) + srv.addon = addon + client := setupAddonConn(addon, "auth.example.com:443") + + // Both members already failure-cooled for a long window: every response is + // pool-exhausting. + prPtr.Load().MarkCooldown("memA", time.Now().Add(time.Hour), "429") + prPtr.Load().MarkCooldown("memB", time.Now().Add(time.Hour), "401") + + var notices int32 + done := make(chan struct{}, 16) + addon.SetOnFailover(func(ev FailoverEvent) { + if !ev.Exhausted { + t.Errorf("FailoverEvent.Exhausted = false, want true") + } + atomic.AddInt32(&notices, 1) + done <- struct{}{} + }) + + // Ten back-to-back failing responses (the agent retry storm).
+ for i := 0; i < 10; i++ { + f := newPoolRespFlow(client, 429, []byte(`{"error":"rate_limited"}`)) + addon.flowInjected.Tag(f.Id, "memA") + addon.Response(f) + } + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("onFailover not invoked for the exhausted edge") + } + // Let any erroneous extra notice surface. + time.Sleep(50 * time.Millisecond) + if got := atomic.LoadInt32(&notices); got != 1 { + t.Fatalf("exhausted notices = %d, want exactly 1 (edge-gated, not per-response)", got) + } +} + +// TestRecoveryMonitorEdgeNoticeAndCallback asserts that on the +// exhausted->recovered edge the monitor fires exactly one recovered notice AND +// exactly one onPoolRecovered call. +func TestRecoveryMonitorEdgeNoticeAndCallback(t *testing.T) { + srv := newMonitorTestServer(t) + pr := twoMemberPool(t, srv, "p", "a", "b") + + var notices, resets int32 + var mu sync.Mutex + var recoveredNotice, recoveredCb string + srv.SetOnPoolRecoveredNotice(func(pool string) { + mu.Lock() + recoveredNotice = pool + mu.Unlock() + atomic.AddInt32(&notices, 1) + }) + srv.SetOnPoolRecovered(func(pool string) { + mu.Lock() + recoveredCb = pool + mu.Unlock() + atomic.AddInt32(&resets, 1) + }) + + // Both members cooling briefly: the pool is exhausted, then recovers. + until := time.Now().Add(40 * time.Millisecond) + pr.MarkCooldown("a", until, "429") + pr.MarkCooldown("b", until, "429") + + if !srv.markPoolExhausted("p") { + t.Fatal("markPoolExhausted = false, want true (edge)") + } + + if !waitFor(2*time.Second, func() bool { return atomic.LoadInt32(&notices) == 1 }) { + t.Fatalf("recovered notices = %d, want 1", atomic.LoadInt32(&notices)) + } + // Let a wrongful second notice surface.
+ time.Sleep(50 * time.Millisecond) + if got := atomic.LoadInt32(&notices); got != 1 { + t.Fatalf("recovered notices = %d, want exactly 1", got) + } + if got := atomic.LoadInt32(&resets); got != 1 { + t.Fatalf("onPoolRecovered calls = %d, want exactly 1", got) + } + mu.Lock() + gotNotice, gotCb := recoveredNotice, recoveredCb + mu.Unlock() + if gotNotice != "p" || gotCb != "p" { + t.Fatalf("recovered pool = notice:%q cb:%q, want p/p", gotNotice, gotCb) + } + // State cleared after recovery: a fresh edge can re-fire. + if !srv.markPoolExhausted("p") { + t.Fatal("post-recovery markPoolExhausted = false, want true (state was cleared)") + } +} + +// TestRecoveryMonitorUnequalCooldowns is the B1-shaped case: memA cools for a +// short window, memB for a far longer one. The monitor must wake at the SOONEST +// cooldown (~memA) and recover once when memA becomes healthy, even though memB +// is still cooling. Uses short injected durations (no 60s sleep). +func TestRecoveryMonitorUnequalCooldowns(t *testing.T) { + srv := newMonitorTestServer(t) + pr := twoMemberPool(t, srv, "p", "memA", "memB") + + var notices int32 + srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) }) + + // memA: short (recovers soon). memB: 6h (still cooling at recovery). + pr.MarkCooldown("memA", time.Now().Add(40*time.Millisecond), "429") + pr.MarkCooldown("memB", time.Now().Add(6*time.Hour), "429") + + if !srv.markPoolExhausted("p") { + t.Fatal("markPoolExhausted = false, want true") + } + + if !waitFor(2*time.Second, func() bool { return atomic.LoadInt32(&notices) == 1 }) { + t.Fatalf("recovered notices = %d, want 1 (wake at soonest cooldown, recover on memA)", atomic.LoadInt32(&notices)) + } + time.Sleep(50 * time.Millisecond) + if got := atomic.LoadInt32(&notices); got != 1 { + t.Fatalf("recovered notices = %d, want exactly 1 (memB still cooling must not re-fire)", got) + } + // memB must still be cooling (the recovery was driven solely by memA).
+ if _, cooling := pr.CooldownUntil("memB"); !cooling { + t.Fatal("memB no longer cooling, expected its 6h cooldown to persist through memA's recovery") + } +} + +// TestRecoveryMonitorPoolRemovedFiresNoNotice asserts a pool removed while +// exhausted has its state dropped and fires no recovered notice. +func TestRecoveryMonitorPoolRemovedFiresNoNotice(t *testing.T) { + srv := newMonitorTestServer(t) + pr := twoMemberPool(t, srv, "p", "a", "b") + pr.MarkCooldown("a", time.Now().Add(time.Hour), "429") + pr.MarkCooldown("b", time.Now().Add(time.Hour), "429") + + var notices, resets int32 + srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) }) + srv.SetOnPoolRecovered(func(string) { atomic.AddInt32(&resets, 1) }) + + if !srv.markPoolExhausted("p") { + t.Fatal("markPoolExhausted = false, want true") + } + + // Remove the pool from the resolver (membership change rebuilds the + // resolver with no pools). + srv.poolResolver.Store(vault.NewPoolResolver(nil, nil)) + // Wake the monitor so it observes the removal. + srv.markPoolExhausted("p") // already-exhausted -> false, but harmless + select { + case srv.recoveryWake <- struct{}{}: + default: + } + + // The state entry must be dropped without any recovered notice/reset. + if !waitFor(2*time.Second, func() bool { + srv.poolExhaustMu.Lock() + _, present := srv.poolExhausted["p"] + srv.poolExhaustMu.Unlock() + return !present + }) { + t.Fatal("exhaustion state for removed pool was not dropped") + } + time.Sleep(50 * time.Millisecond) + if got := atomic.LoadInt32(&notices); got != 0 { + t.Fatalf("recovered notices = %d, want 0 (removed pool fires no notice)", got) + } + if got := atomic.LoadInt32(&resets); got != 0 { + t.Fatalf("onPoolRecovered calls = %d, want 0 (removed pool triggers no reset)", got) + } +} + +// TestRecoveryMonitorStopsCleanly asserts the monitor stops on shutdown and that +// stopping twice (Close after GracefulShutdown, or vice versa) does not panic.
+func TestRecoveryMonitorStopsCleanly(t *testing.T) { + eng, err := policy.LoadFromBytes([]byte("[policy]\ndefault = \"deny\"\n")) + if err != nil { + t.Fatalf("policy load: %v", err) + } + srv, err := New(Config{ListenAddr: "127.0.0.1:0", Policy: eng}) + if err != nil { + t.Fatalf("New: %v", err) + } + // Double-stop must be idempotent (monitorStopOnce). Calling both shutdown + // paths must not panic on a double close of monitorStop. (The second close + // of the already-closed listener returns a benign "use of closed network + // connection" error, which is unrelated to the monitor and ignored here.) + if err := srv.GracefulShutdown(time.Second); err != nil { + t.Fatalf("GracefulShutdown: %v", err) + } + _ = srv.Close() // must not panic on the monitorStop double-close + // stopMonitors directly a third time also must not panic. + srv.stopMonitors() + + // The monitor goroutine must have returned: a markPoolExhausted edge after + // stop must not be serviced (no recovered notice ever fires). + var notices int32 + srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) }) + pr := twoMemberPool(t, srv, "p", "a", "b") + pr.MarkCooldown("a", time.Now().Add(20*time.Millisecond), "429") + pr.MarkCooldown("b", time.Now().Add(20*time.Millisecond), "429") + srv.markPoolExhausted("p") + time.Sleep(100 * time.Millisecond) + if got := atomic.LoadInt32(&notices); got != 0 { + t.Fatalf("recovered notices = %d, want 0 (monitor stopped, must not service wakes)", got) + } +} diff --git a/internal/proxy/server.go b/internal/proxy/server.go index 4a2adbd..fa2c3fa 100644 --- a/internal/proxy/server.go +++ b/internal/proxy/server.go @@ -92,6 +92,44 @@ type Server struct { serving atomic.Bool activeConns sync.WaitGroup + // Per-pool exhaustion notification state for the A2 edge-triggered + // notices + recovery monitor. This deliberately lives on the long-lived + // Server, NOT on PoolHealth, for two reasons: + // + // 1.
It is per-PROCESS notification bookkeeping (have we already told the + // operator this pool is exhausted?), not per-resolver-generation health. + // PoolHealth is rebuilt/merged on every resolver pointer swap (SIGHUP, + // the 2s data-version watcher); the exhaustion edge must survive those + // swaps so a swap mid-exhaustion does not re-fire the "exhausted" notice. + // 2. PoolHealth entries are pruned on a membership change (MergeLiveCooldowns + // drops cooldowns for credentials no longer in any pool). Exhaustion state + // keyed by POOL must NOT be pruned that way — it is the recovery monitor, + // not membership churn, that clears it (and only when the pool genuinely + // recovers or is removed). Putting it on PoolHealth would wrongly prune it. + // + // poolExhausted[pool] is true between the healthy->exhausted edge (set in the + // addon's handlePoolFailover via onPoolExhausted) and the recovery monitor + // observing HasHealthyMember again. recoveryWake (buffered cap-1) is a + // non-blocking edge signal that wakes the monitor. monitorStop is closed once + // (monitorStopOnce) from BOTH Close and GracefulShutdown to stop the monitor + // goroutine idempotently. onPoolRecovered is the optional recovery-edge + // callback (auto-reset wiring, Task 8) set via SetOnPoolRecovered. + // onPoolRecoveredNotice is the operator-notice callback, fired on every + // recovery edge independent of any auth-reset target (wired to the broker + // channels in main.go) — kept separate so the notice always fires while the + // reset is opt-in (Task 8). + poolExhaustMu sync.Mutex + poolExhausted map[string]bool + recoveryWake chan struct{} + monitorStop chan struct{} + monitorStopOnce sync.Once + onPoolRecovered func(pool string) + onPoolRecoveredNotice func(pool string) + // recoveryMinReschedule is the floor for the monitor's per-pool sleep so a + // just-expired-but-still-unhealthy state can't spin the loop. 
Defaults to + // minRecoveryReschedule; tests may shorten it for determinism. + recoveryMinReschedule time.Duration + // oauthMetasCache holds the latest credential_meta slice the // server saw via UpdateOAuthIndex. Cached so a later // quicProxy initialization (or re-init) can re-apply it. @@ -615,7 +653,13 @@ func New(cfg Config) (*Server, error) { return nil, fmt.Errorf("listen: %w", err) } - srv := &Server{listener: ln} + srv := &Server{ + listener: ln, + poolExhausted: make(map[string]bool), + recoveryWake: make(chan struct{}, 1), + monitorStop: make(chan struct{}), + recoveryMinReschedule: minRecoveryReschedule, + } // Initialize credential injection handlers when a vault provider is // configured. The resolver may be nil at startup (no bindings yet) and @@ -668,6 +712,12 @@ func New(cfg Config) (*Server, error) { socks5.WithAssociateHandle(srv.handleAssociate), ) + // Launch the pool recovery monitor exactly once for the lifetime of this + // Server. It is started here (not in ListenAndServe) so it is running for + // any constructed Server; both Close and GracefulShutdown stop it + // idempotently via monitorStopOnce. + go srv.runRecoveryMonitor() + return srv, nil } @@ -718,6 +768,13 @@ func (s *Server) setupInjection(cfg Config, _ net.Listener) error { addonOpts = append(addonOpts, WithAuditLogger(cfg.Audit)) } s.addon = NewSluiceAddon(addonOpts...) + // Wire the addon's exhaustion-edge hook back into the Server's per-pool + // notification state. The addon owns the failover classification but the + // edge bookkeeping (and the recovery monitor it wakes) live on the + // long-lived Server (see the poolExhausted field doc). markPoolExhausted + // returns true only on the healthy->exhausted edge, so the addon emits the + // "exhausted" notice exactly once per edge. 
+ s.addon.SetOnPoolExhausted(s.markPoolExhausted) // Load credential metadata once and stash it; we cannot mirror it // into the QUIC proxy yet because that proxy is initialized later @@ -2833,6 +2890,220 @@ func (s *Server) SetOnFailover(fn func(FailoverEvent)) { } } +// SetOnPoolRecovered configures the callback the recovery monitor invokes once, +// on the exhausted->recovered edge, after it has emitted the "pool recovered" +// notice. Task 8 wires this in main.go to look up the recovered pool's +// auth_reset_target and, when set, run the agent's auth-reset command in a +// detached goroutine. Safe to leave unset (recovery is then notice-only). +func (s *Server) SetOnPoolRecovered(fn func(pool string)) { + s.poolExhaustMu.Lock() + s.onPoolRecovered = fn + s.poolExhaustMu.Unlock() +} + +// SetOnPoolRecoveredNotice configures the operator-notice callback the recovery +// monitor fires on every exhausted->recovered edge, independent of whether the +// pool has an auth-reset target. main.go wires this to fan the +// FormatPoolRecoveredNotice text out across the broker channels (mirroring the +// onFailover notice path). Kept separate from onPoolRecovered so the notice +// always fires while the auth-reset stays opt-in. +func (s *Server) SetOnPoolRecoveredNotice(fn func(pool string)) { + s.poolExhaustMu.Lock() + s.onPoolRecoveredNotice = fn + s.poolExhaustMu.Unlock() +} + +// minRecoveryReschedule floors the recovery monitor's per-pool wake interval so +// a member whose cooldown has just expired but whose upstream is still +// unhealthy (no HasHealthyMember yet) cannot spin the monitor in a tight loop. +const minRecoveryReschedule = 1 * time.Second + +// markPoolExhausted records the healthy->exhausted edge for a pool and returns +// true ONLY on that edge (false->true). 
It is the addon's onPoolExhausted hook: +// the addon calls it on every exhausted classification, but the "exhausted" +// operator notice is emitted only when this returns true, so a retry storm +// against an already-exhausted pool produces exactly one notice. On the edge it +// also wakes the recovery monitor with a non-blocking buffered send. +// +// This edge — not the addon's 30s window — is authoritative for the exhausted +// notice (A2). The window dedup remains only for real from->to transitions. +func (s *Server) markPoolExhausted(pool string) bool { + s.poolExhaustMu.Lock() + if s.poolExhausted == nil { + s.poolExhausted = make(map[string]bool) + } + already := s.poolExhausted[pool] + if already { + s.poolExhaustMu.Unlock() + return false + } + s.poolExhausted[pool] = true + s.poolExhaustMu.Unlock() + + // Wake the monitor. Non-blocking: the channel is cap-1, so a wake already + // pending coalesces with this one (the monitor re-Loads the resolver and + // scans every exhausted pool on each wake regardless of which pool woke it). + select { + case s.recoveryWake <- struct{}{}: + default: + } + return true +} + +// stopMonitors stops the recovery monitor goroutine idempotently. Closing +// monitorStop is guarded by monitorStopOnce so calling it from BOTH Close and +// GracefulShutdown (or twice) cannot panic on a double close. +func (s *Server) stopMonitors() { + s.monitorStopOnce.Do(func() { + close(s.monitorStop) + }) +} + +// runRecoveryMonitor is the A2 recovery monitor. It is server-driven +// (time-based) because a latched agent will not emit a recovering 2xx on its +// own — nothing on the traffic path signals recovery, so sluice must poll the +// in-memory cooldowns itself. +// +// On every wake (an exhaustion edge from markPoolExhausted, or its own +// reschedule timer) it Load()s the CURRENT pool resolver — never caching it +// across wakes, so a resolver pointer swap (SIGHUP / data-version watcher) is +// observed immediately. 
For each pool currently marked exhausted it: +// +// - drops the state entry if the pool no longer exists in the resolver (a +// removed pool fires no recovered notice); +// - on HasHealthyMember becoming true, flips true->false, emits the "pool +// recovered" notice, and invokes onPoolRecovered(pool) once; +// - otherwise computes the next wake as max(SoonestCooldown-now, +// recoveryMinReschedule) and keeps the soonest such wake across all still- +// exhausted pools. +// +// A single timer is reused across iterations; the loop selects on the +// reschedule timer, the wake channel, and monitorStop. +func (s *Server) runRecoveryMonitor() { + timer := time.NewTimer(time.Hour) + if !timer.Stop() { + <-timer.C + } + timerArmed := false + + for { + next, anyExhausted := s.scanRecovery() + if !timer.Stop() && timerArmed { + // Drain a fired-but-unread timer so the reset below is clean. + select { + case <-timer.C: + default: + } + } + timerArmed = false + if anyExhausted { + if next < s.recoveryMinReschedule { + next = s.recoveryMinReschedule + } + timer.Reset(next) + timerArmed = true + } + + select { + case <-s.monitorStop: + if timerArmed { + timer.Stop() + } + return + case <-s.recoveryWake: + // New exhaustion edge (or coalesced edges): re-scan immediately. + case <-timer.C: + timerArmed = false + // Reschedule fired: re-scan to check for recovery. + } + } +} + +// scanRecovery performs one pass over the pools currently marked exhausted. It +// returns the soonest reschedule interval among the pools still exhausted and +// whether any pool remains exhausted (so the caller knows whether to arm the +// timer). It Load()s the current resolver fresh. 
+func (s *Server) scanRecovery() (next time.Duration, anyExhausted bool) { + pr := s.poolResolver.Load() + + // Snapshot the exhausted pool names under the lock, then evaluate health + // outside it (HasHealthyMember/SoonestCooldown take their own RLock on the + // shared PoolHealth and must not be called while holding poolExhaustMu). + s.poolExhaustMu.Lock() + pools := make([]string, 0, len(s.poolExhausted)) + for p, ex := range s.poolExhausted { + if ex { + pools = append(pools, p) + } + } + s.poolExhaustMu.Unlock() + + now := time.Now() + for _, pool := range pools { + // A removed pool: drop the state, fire no recovered notice. + if pr == nil || !pr.IsPool(pool) { + s.clearPoolExhausted(pool) + continue + } + if pr.HasHealthyMember(pool) { + s.recoverPool(pool) + continue + } + // Still exhausted: schedule the next wake at the soonest cooldown. + if until, ok := pr.SoonestCooldown(pool); ok { + d := until.Sub(now) + if !anyExhausted || d < next { + next = d + } + anyExhausted = true + } else { + // No member is currently cooling yet HasHealthyMember is false + // (e.g. an empty/raced pool). Reschedule at the floor so we + // re-evaluate rather than block forever on the wake channel. + if !anyExhausted || s.recoveryMinReschedule < next { + next = s.recoveryMinReschedule + } + anyExhausted = true + } + } + return next, anyExhausted +} + +// clearPoolExhausted drops a pool's exhaustion state without firing a recovered +// notice (used when the pool was removed while exhausted). +func (s *Server) clearPoolExhausted(pool string) { + s.poolExhaustMu.Lock() + delete(s.poolExhausted, pool) + s.poolExhaustMu.Unlock() +} + +// recoverPool fires the exhausted->recovered edge for a pool: it flips the +// state false (returning early if a concurrent pass already did so), emits the +// "pool recovered" notice, and invokes onPoolRecovered once. The state flip and +// callback snapshot happen under poolExhaustMu so two concurrent passes cannot +// both recover the same pool. 
+func (s *Server) recoverPool(pool string) { + s.poolExhaustMu.Lock() + if !s.poolExhausted[pool] { + s.poolExhaustMu.Unlock() + return + } + delete(s.poolExhausted, pool) + cb := s.onPoolRecovered + noticeCb := s.onPoolRecoveredNotice + s.poolExhaustMu.Unlock() + + log.Printf("[POOL-FAILOVER] %s", FormatPoolRecoveredNotice(pool)) + // Notice always fires on the recovery edge (independent of auth-reset). + if noticeCb != nil { + noticeCb(pool) + } + // Auth-reset (opt-in, only when a target is configured) is wired here. + if cb != nil { + cb(pool) + } +} + // EnginePtr returns the shared atomic engine pointer. The Telegram command // handler uses this to read and mutate the same engine as the proxy, avoiding // split-brain windows during SIGHUP reloads. @@ -2875,6 +3146,7 @@ func (s *Server) IsListening() bool { // Close stops the server by closing the listener and any internal resources. func (s *Server) Close() error { s.closed.Store(true) + s.stopMonitors() if s.mitmProxy != nil { _ = s.mitmProxy.Close() } @@ -2890,6 +3162,7 @@ func (s *Server) Close() error { // the timeout, or an error if the timeout was exceeded. func (s *Server) GracefulShutdown(timeout time.Duration) error { s.closed.Store(true) + s.stopMonitors() // Stop accepting new connections. 
_ = s.listener.Close() if s.mitmProxy != nil { From 0a6745387954896ebab7806d84aabfbf48a2159d Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:07:28 +0800 Subject: [PATCH 05/19] feat(store): add per-pool auth_reset_target column + accessor (migration 000008) --- ...22-pool-exhaustion-and-agent-auth-reset.md | 15 +- .../000008_pool_auth_reset.down.sql | 41 ++++ .../migrations/000008_pool_auth_reset.up.sql | 15 ++ internal/store/pools.go | 46 ++++- internal/store/pools_test.go | 193 ++++++++++++++++++ 5 files changed, 295 insertions(+), 15 deletions(-) create mode 100644 internal/store/migrations/000008_pool_auth_reset.down.sql create mode 100644 internal/store/migrations/000008_pool_auth_reset.up.sql diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index 2a5dae4..b7e9d47 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -286,14 +286,17 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/store/pools.go` - Modify: `internal/store/pools_test.go` -- [ ] up: `ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''` -- [ ] down: SQLite 12-step rebuild of `credential_pools` only, wrapped - `PRAGMA foreign_keys=OFF;` … `=ON;` so the `credential_pool_members` FK isn't orphaned -- [ ] add `AuthResetTarget` to `Pool`; include in create/list reads; add +- [x] up: `ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''` + (migration `000008_pool_auth_reset` — `000007` was already taken by `pool_membership_epoch`) +- [x] down: SQLite 12-step rebuild of `credential_pools` only; because golang-migrate runs each + script inside a transaction and SQLite ignores `PRAGMA foreign_keys=OFF` while a transaction is + open, the FK-referencing `credential_pool_members` rows are snapshotted to a temp table and + restored after the rebuild instead (cascade-safe, FK preserved) +- [x] add `AuthResetTarget` to `Pool`; include in create/list reads; add `SetPoolAuthResetTarget(name, target) error` -- [ ] write tests: migrate up→down→up against a **populated** table (pool + members + health +- [x] write tests: migrate up→down→up against a **populated** table (pool + members + health rows survive/round-trip); default empty; create with target; set/clear target; list reflects it -- [ ] run `go test ./internal/store/ -race`; gofumpt; vet — pass before Task 5 +- [x] run `go test ./internal/store/ -race`; gofumpt; vet — pass before Task 5 ### Task 5: Channel parity — `auth_reset_target` on CLI + REST + Telegram diff --git a/internal/store/migrations/000008_pool_auth_reset.down.sql b/internal/store/migrations/000008_pool_auth_reset.down.sql new file mode 100644 index 0000000..353ceaa --- /dev/null +++ b/internal/store/migrations/000008_pool_auth_reset.down.sql @@ -0,0 +1,41 @@ +-- Revert the per-pool auth_reset_target column. 
+-- +-- This is the SQLite 12-step table rebuild +-- (https://www.sqlite.org/lang_altertable.html#otheralter) for +-- credential_pools: recreate it WITHOUT auth_reset_target, copy the data, drop +-- the old table, rename. The column set + constraints recreated here match +-- credential_pools as created by migration 000006 (name PK, strategy CHECK, +-- created_at default). +-- +-- credential_pool_members.pool has a FK -> credential_pools(name) with ON +-- DELETE CASCADE. golang-migrate runs each migration inside a transaction, and +-- SQLite ignores `PRAGMA foreign_keys=OFF` while a transaction is open, so the +-- pragma trick used outside migrations cannot disable the cascade here. +-- Dropping credential_pools would therefore CASCADE-delete every member row. +-- +-- To keep the FK-referencing member rows (and their epoch column from 000007) +-- intact across the rebuild, the member rows are snapshotted into a temp table +-- before the parent table is dropped and restored afterward, once the rebuilt +-- credential_pools rows exist again to satisfy the FK. credential_health is +-- not FK-tied to credential_pools, so it is untouched. 
+ +CREATE TEMP TABLE _cpm_backup AS + SELECT pool, credential, position, epoch FROM credential_pool_members; + +CREATE TABLE credential_pools_new ( + name TEXT PRIMARY KEY, + strategy TEXT NOT NULL DEFAULT 'failover' CHECK(strategy IN ('failover')), + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); + +INSERT INTO credential_pools_new (name, strategy, created_at) + SELECT name, strategy, created_at FROM credential_pools; + +DROP TABLE credential_pools; + +ALTER TABLE credential_pools_new RENAME TO credential_pools; + +INSERT INTO credential_pool_members (pool, credential, position, epoch) + SELECT pool, credential, position, epoch FROM _cpm_backup; + +DROP TABLE _cpm_backup; diff --git a/internal/store/migrations/000008_pool_auth_reset.up.sql b/internal/store/migrations/000008_pool_auth_reset.up.sql new file mode 100644 index 0000000..a126b59 --- /dev/null +++ b/internal/store/migrations/000008_pool_auth_reset.up.sql @@ -0,0 +1,15 @@ +-- Per-pool agent auth-reset target. +-- +-- auth_reset_target is an opt-in, per-pool string naming the agent auth +-- target sluice resets when the pool transitions exhausted -> recovered (e.g. +-- the hermes "openai-codex" auth entry). Empty (the default) = opt-out: no +-- reset runs. Non-empty = sluice invokes the agent profile's auth-reset +-- command on the recovery edge so a latched agent un-latches and resumes +-- without a manual operator reset. +-- +-- A plain ADD COLUMN with a NOT NULL DEFAULT '' is sufficient for SQLite; the +-- down migration does the 12-step table rebuild because ALTER TABLE ... DROP +-- COLUMN is not portably available on older SQLite engines, and the column must be +-- removed while preserving the credential_pool_members FK -> credential_pools. 
+ +ALTER TABLE credential_pools ADD COLUMN auth_reset_target TEXT NOT NULL DEFAULT ''; diff --git a/internal/store/pools.go b/internal/store/pools.go index 2ee18e2..e71a927 100644 --- a/internal/store/pools.go +++ b/internal/store/pools.go @@ -56,10 +56,14 @@ var ( // Pool is a named group of OAuth credentials backing a single phantom // identity. Members are returned ordered by position (failover order). type Pool struct { - Name string - Strategy string - CreatedAt string - Members []PoolMember + Name string + Strategy string + // AuthResetTarget is the opt-in, per-pool agent auth-reset target sluice + // resets on the exhausted -> recovered edge (e.g. the hermes + // "openai-codex" auth entry). Empty (the default) means no reset runs. + AuthResetTarget string + CreatedAt string + Members []PoolMember } // PoolMember is one credential entry in a pool. Position determines the @@ -249,7 +253,7 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e } if _, err := tx.Exec( - "INSERT INTO credential_pools (name, strategy) VALUES (?, ?)", name, strategy, + "INSERT INTO credential_pools (name, strategy, auth_reset_target) VALUES (?, ?, '')", name, strategy, ); err != nil { return fmt.Errorf("insert pool %q: %w", name, err) } @@ -284,13 +288,37 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e return nil } +// SetPoolAuthResetTarget sets (or clears, when target is "") the per-pool +// auth_reset_target. It returns an error wrapping ErrPoolNoMembers when the +// pool name is empty. A missing pool is reported as a plain "does not exist" +// error (not a wrapped sql.ErrNoRows) so every management channel can surface a +// "no such pool" message. The target value itself is stored verbatim; +// charset/validation of the target is the channel/ops layer's concern (the +// stored string is consumed as argv, never shell-interpolated). 
+func (s *Store) SetPoolAuthResetTarget(name, target string) error { + if name == "" { + return fmt.Errorf("%w: pool name is required", ErrPoolNoMembers) + } + res, err := s.db.Exec( + "UPDATE credential_pools SET auth_reset_target = ? WHERE name = ?", target, name, + ) + if err != nil { + return fmt.Errorf("set auth_reset_target for pool %q: %w", name, err) + } + n, _ := res.RowsAffected() + if n == 0 { + return fmt.Errorf("pool %q does not exist", name) + } + return nil +} + // GetPool returns a pool by name with members ordered by position, or nil if // the pool does not exist. func (s *Store) GetPool(name string) (*Pool, error) { var p Pool err := s.db.QueryRow( - "SELECT name, strategy, created_at FROM credential_pools WHERE name = ?", name, - ).Scan(&p.Name, &p.Strategy, &p.CreatedAt) + "SELECT name, strategy, auth_reset_target, created_at FROM credential_pools WHERE name = ?", name, + ).Scan(&p.Name, &p.Strategy, &p.AuthResetTarget, &p.CreatedAt) if errors.Is(err, sql.ErrNoRows) { return nil, nil } @@ -320,7 +348,7 @@ func (s *Store) GetPool(name string) (*Pool, error) { // ListPools returns all pools with their members ordered by position. 
func (s *Store) ListPools() ([]Pool, error) { - rows, err := s.db.Query("SELECT name, strategy, created_at FROM credential_pools ORDER BY name") + rows, err := s.db.Query("SELECT name, strategy, auth_reset_target, created_at FROM credential_pools ORDER BY name") if err != nil { return nil, fmt.Errorf("list pools: %w", err) } @@ -328,7 +356,7 @@ func (s *Store) ListPools() ([]Pool, error) { pools := make(map[string]*Pool) for rows.Next() { var p Pool - if err := rows.Scan(&p.Name, &p.Strategy, &p.CreatedAt); err != nil { + if err := rows.Scan(&p.Name, &p.Strategy, &p.AuthResetTarget, &p.CreatedAt); err != nil { _ = rows.Close() return nil, fmt.Errorf("scan pool: %w", err) } diff --git a/internal/store/pools_test.go b/internal/store/pools_test.go index 3016170..cdf2291 100644 --- a/internal/store/pools_test.go +++ b/internal/store/pools_test.go @@ -682,6 +682,15 @@ func TestMigration000006DownUp(t *testing.T) { t.Fatalf("migrator: %v", err) } + // Migrations newer than 000007 (e.g. 000008_pool_auth_reset) may sit on top + // of the schema after New(). This test exercises the 000006/000007 down/up + // boundary with relative Steps, so pin the starting version to 000007 + // first; otherwise a relative Steps(-1) would only undo the newest + // migration instead of 000007. 
+ if err := m.Migrate(7); err != nil && !errors.Is(err, migrate.ErrNoChange) { + t.Fatalf("pin to 000007: %v", err) + } + columnExists := func(table, col string) bool { rows, qerr := s.db.Query("PRAGMA table_info(" + table + ")") if qerr != nil { @@ -1557,3 +1566,187 @@ func TestRemoveCredentialMetaCASNoOpLeavesHealthIntact(t *testing.T) { t.Error("credential_health wrongly deleted by a CAS no-op (round-11 invariant regressed)") } } + +// TestMigration000008DownUpPopulated exercises the auth_reset_target column +// migration (000008) up -> down -> up against a POPULATED schema: a pool with +// two members and a credential_health row must survive the down (which +// rebuilds credential_pools while preserving the credential_pool_members FK) +// and the re-up round-trip. The down migration snapshots and restores the member +// rows around the 12-step rebuild, so they must NOT end up cascade-wiped. +func TestMigration000008DownUpPopulated(t *testing.T) { + dir := t.TempDir() + dbPath := filepath.Join(dir, "m.db") + s, err := New(dbPath) + if err != nil { + t.Fatalf("New: %v", err) + } + defer func() { _ = s.Close() }() + + // Populate: oauth members, a pool with a non-empty auth_reset_target, and a + // credential_health row for one member. 
+ seedOAuthCred(t, s, "acct_a") + seedOAuthCred(t, s, "acct_b") + if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("CreatePoolWithMembers: %v", err) + } + if err := s.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { + t.Fatalf("SetPoolAuthResetTarget: %v", err) + } + if err := s.SetCredentialHealth("acct_a", "cooldown", time.Now().Add(time.Hour), "429"); err != nil { + t.Fatalf("SetCredentialHealth: %v", err) + } + + columnExists := func(table, col string) bool { + rows, qerr := s.db.Query("PRAGMA table_info(" + table + ")") + if qerr != nil { + return false + } + defer func() { _ = rows.Close() }() + for rows.Next() { + var cid int + var name, ctype string + var notnull, pk int + var dflt interface{} + if scanErr := rows.Scan(&cid, &name, &ctype, &notnull, &dflt, &pk); scanErr != nil { + return false + } + if name == col { + return true + } + } + return false + } + + if !columnExists("credential_pools", "auth_reset_target") { + t.Fatal("credential_pools.auth_reset_target missing after up migration (000008)") + } + + src, err := iofs.New(migrationsFS, "migrations") + if err != nil { + t.Fatalf("iofs: %v", err) + } + drv, err := migsqlite.WithInstance(s.db, &migsqlite.Config{}) + if err != nil { + t.Fatalf("driver: %v", err) + } + m, err := migrate.NewWithInstance("iofs", src, "sqlite", drv) + if err != nil { + t.Fatalf("migrator: %v", err) + } + + // Down one step (000008 -> 000007): the column goes; the rebuilt + // credential_pools keeps its row; the FK-referencing member rows and the + // health row survive (the member rows are snapshotted and restored + // around the rebuild). 
+ if err := m.Steps(-1); err != nil { + t.Fatalf("down 1 (000008): %v", err) + } + if columnExists("credential_pools", "auth_reset_target") { + t.Error("credential_pools.auth_reset_target still present after 000008 down") + } + var poolCount, memberCount, healthCount int + if err := s.db.QueryRow("SELECT COUNT(*) FROM credential_pools").Scan(&poolCount); err != nil { + t.Fatalf("count pools: %v", err) + } + if poolCount != 1 { + t.Errorf("pool row lost in 000008 down rebuild: got %d, want 1", poolCount) + } + if err := s.db.QueryRow("SELECT COUNT(*) FROM credential_pool_members WHERE pool = 'codex'").Scan(&memberCount); err != nil { + t.Fatalf("count members: %v", err) + } + if memberCount != 2 { + t.Errorf("member rows cascade-wiped by 000008 down rebuild: got %d, want 2", memberCount) + } + if err := s.db.QueryRow("SELECT COUNT(*) FROM credential_health WHERE credential = 'acct_a'").Scan(&healthCount); err != nil { + t.Fatalf("count health: %v", err) + } + if healthCount != 1 { + t.Errorf("health row lost in 000008 down rebuild: got %d, want 1", healthCount) + } + + // Re-up (000007 -> 000008): the column returns, defaulting to '' for the + // existing row (the target value is not preserved across a down, which is + // expected — the column was dropped). All populated rows still present. 
+ if err := m.Steps(1); err != nil { + t.Fatalf("up 1 (re-000008): %v", err) + } + if !columnExists("credential_pools", "auth_reset_target") { + t.Fatal("credential_pools.auth_reset_target missing after re-up migration (000008)") + } + p, err := s.GetPool("codex") + if err != nil || p == nil { + t.Fatalf("GetPool after re-up: %+v, %v", p, err) + } + if p.AuthResetTarget != "" { + t.Errorf("auth_reset_target = %q after re-up; want '' (column dropped on down)", p.AuthResetTarget) + } + if len(p.Members) != 2 { + t.Errorf("members not preserved across down/up: got %d, want 2", len(p.Members)) + } +} + +// TestPoolAuthResetTargetCRUD covers the store accessor: default empty on +// create, create-then-set, clear, and that list/get reflect the value. +func TestPoolAuthResetTargetCRUD(t *testing.T) { + s := newTestStore(t) + seedOAuthCred(t, s, "acct_a") + seedOAuthCred(t, s, "acct_b") + + if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("CreatePoolWithMembers: %v", err) + } + + // Default empty on create. + p, err := s.GetPool("codex") + if err != nil || p == nil { + t.Fatalf("GetPool: %+v, %v", p, err) + } + if p.AuthResetTarget != "" { + t.Errorf("auth_reset_target = %q on fresh pool; want empty", p.AuthResetTarget) + } + + // Set a target. + if err := s.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { + t.Fatalf("SetPoolAuthResetTarget set: %v", err) + } + p, _ = s.GetPool("codex") + if p.AuthResetTarget != "openai-codex" { + t.Errorf("auth_reset_target = %q after set; want openai-codex", p.AuthResetTarget) + } + + // List reflects the value. 
+ pools, err := s.ListPools() + if err != nil { + t.Fatalf("ListPools: %v", err) + } + var found bool + for _, lp := range pools { + if lp.Name == "codex" { + found = true + if lp.AuthResetTarget != "openai-codex" { + t.Errorf("ListPools auth_reset_target = %q; want openai-codex", lp.AuthResetTarget) + } + } + } + if !found { + t.Fatal("pool codex not found in ListPools") + } + + // Clear it. + if err := s.SetPoolAuthResetTarget("codex", ""); err != nil { + t.Fatalf("SetPoolAuthResetTarget clear: %v", err) + } + p, _ = s.GetPool("codex") + if p.AuthResetTarget != "" { + t.Errorf("auth_reset_target = %q after clear; want empty", p.AuthResetTarget) + } + + // Unknown pool errors. + if err := s.SetPoolAuthResetTarget("nope", "x"); err == nil { + t.Error("SetPoolAuthResetTarget on a missing pool should error") + } + // Empty pool name errors. + if err := s.SetPoolAuthResetTarget("", "x"); err == nil { + t.Error("SetPoolAuthResetTarget with empty name should error") + } +} From eefe85df68007e513d30bcaee9b4348bf7a39375 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:18:30 +0800 Subject: [PATCH 06/19] feat(pools): expose auth_reset_target on CLI, REST, and Telegram --- api/openapi.yaml | 63 +++++ cmd/sluice/pool.go | 57 ++++- cmd/sluice/pool_test.go | 91 +++++++ ...22-pool-exhaustion-and-agent-auth-reset.md | 29 ++- internal/api/api.gen.go | 231 +++++++++++------- internal/api/server.go | 51 +++- internal/api/server_test.go | 102 ++++++++ internal/poolops/poolops.go | 78 +++++- internal/poolops/poolops_test.go | 102 ++++++++ internal/telegram/commands.go | 47 +++- internal/telegram/commands_test.go | 60 +++++ 11 files changed, 800 insertions(+), 111 deletions(-) diff --git a/api/openapi.yaml b/api/openapi.yaml index 6ea9098..7437d37 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -652,6 +652,45 @@ paths: schema: $ref: "#/components/schemas/ErrorResponse" + /api/pools/{name}/auth-reset-target: + post: + operationId: 
postApiPoolsNameAuthResetTarget + summary: >- + Set or clear the per-pool agent auth-reset target run on the + exhausted->recovered edge + tags: [pools] + parameters: + - name: name + in: path + required: true + schema: + type: string + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SetPoolAuthResetTargetRequest" + responses: + "200": + description: Updated pool + content: + application/json: + schema: + $ref: "#/components/schemas/Pool" + "400": + description: Invalid target + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + "404": + description: Pool not found + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /api/audit/recent: get: operationId: getApiAuditRecent @@ -1181,6 +1220,11 @@ components: strategy: type: string description: "Pool strategy (only 'failover' is supported)" + auth_reset_target: + type: string + description: >- + Agent auth-reset target run on the exhausted->recovered edge + (empty = no reset) created_at: type: string format: date-time @@ -1207,12 +1251,26 @@ components: strategy: type: string description: "Pool strategy; defaults to 'failover' when omitted" + auth_reset_target: + type: string + description: >- + Optional agent auth-reset target run on the exhausted->recovered + edge (empty/omitted = no reset) members: type: array description: "Ordered member credential names (failover order)" items: type: string + SetPoolAuthResetTargetRequest: + type: object + required: [auth_reset_target] + properties: + auth_reset_target: + type: string + description: >- + Agent auth-reset target; an empty string clears it (no reset) + PoolStatus: type: object required: [name, strategy, active, members] @@ -1224,6 +1282,11 @@ components: active: type: string description: "Currently active member credential name" + auth_reset_target: + type: string + description: >- + Agent auth-reset target run on the exhausted->recovered edge + 
(empty = no reset) members: type: array items: diff --git a/cmd/sluice/pool.go b/cmd/sluice/pool.go index 924856b..3418393 100644 --- a/cmd/sluice/pool.go +++ b/cmd/sluice/pool.go @@ -13,7 +13,7 @@ import ( func handlePoolCommand(args []string) error { if len(args) == 0 { - return fmt.Errorf("usage: sluice pool [create|list|status|rotate|remove]") + return fmt.Errorf("usage: sluice pool [create|list|status|rotate|set-auth-reset|remove]") } switch args[0] { @@ -25,10 +25,12 @@ func handlePoolCommand(args []string) error { return handlePoolStatus(args[1:]) case "rotate": return handlePoolRotate(args[1:]) + case "set-auth-reset": + return handlePoolSetAuthReset(args[1:]) case "remove": return handlePoolRemove(args[1:]) default: - return fmt.Errorf("unknown pool command: %s (usage: sluice pool [create|list|status|rotate|remove] ...)", args[0]) + return fmt.Errorf("unknown pool command: %s (usage: sluice pool [create|list|status|rotate|set-auth-reset|remove] ...)", args[0]) } } @@ -37,12 +39,13 @@ func handlePoolCreate(args []string) error { dbPath := fs.String("db", "data/sluice.db", "path to SQLite database") membersStr := fs.String("members", "", "comma-separated ordered list of oauth credential names (failover order)") strategy := fs.String("strategy", store.PoolStrategyFailover, "pool strategy (only 'failover' is supported)") + authResetTarget := fs.String("auth-reset-target", "", "agent auth-reset target run on the exhausted->recovered edge (e.g. 
openai-codex); empty = no reset") if err := fs.Parse(reorderFlagsBeforePositional(args, fs)); err != nil { return err } if fs.NArg() == 0 { - return fmt.Errorf("usage: sluice pool create --members a,b[,c] [--strategy failover]") + return fmt.Errorf("usage: sluice pool create --members a,b[,c] [--strategy failover] [--auth-reset-target ]") } name := fs.Arg(0) @@ -60,7 +63,7 @@ func handlePoolCreate(args []string) error { } defer func() { _ = db.Close() }() - if err := poolops.Create(db, name, *strategy, members); err != nil { + if err := poolops.CreateWithAuthResetTarget(db, name, *strategy, members, *authResetTarget); err != nil { return err } @@ -68,6 +71,9 @@ func handlePoolCreate(args []string) error { for i, m := range members { fmt.Printf(" [%d] %s\n", i, m) } + if *authResetTarget != "" { + fmt.Printf("auth-reset target: %s\n", *authResetTarget) + } fmt.Printf("bind it with: sluice binding add %s --destination [--ports 443]\n", name) return nil } @@ -99,6 +105,9 @@ func handlePoolList(args []string) error { names = append(names, m.Credential) } fmt.Printf("%s (strategy: %s): %s\n", p.Name, p.Strategy, strings.Join(names, ", ")) + if p.AuthResetTarget != "" { + fmt.Printf(" auth-reset target: %s\n", p.AuthResetTarget) + } } return nil } @@ -151,6 +160,46 @@ func handlePoolStatus(args []string) error { fmt.Printf("%s[%d] %s %s\n", marker, m.Position, m.Credential, status) } fmt.Printf("active: %s\n", res.Active) + if res.AuthResetTarget != "" { + fmt.Printf("auth-reset target: %s\n", res.AuthResetTarget) + } + return nil +} + +func handlePoolSetAuthReset(args []string) error { + fs := flag.NewFlagSet("pool set-auth-reset", flag.ContinueOnError) + dbPath := fs.String("db", "data/sluice.db", "path to SQLite database") + if err := fs.Parse(reorderFlagsBeforePositional(args, fs)); err != nil { + return err + } + if fs.NArg() < 2 { + return fmt.Errorf("usage: sluice pool set-auth-reset (a single - clears the target)") + } + name := fs.Arg(0) + target := fs.Arg(1) + 
// A single "-" is the channel-uniform clear sentinel. + if target == "-" { + target = "" + } + + db, err := store.New(*dbPath) + if err != nil { + return fmt.Errorf("open store: %w", err) + } + defer func() { _ = db.Close() }() + + if err := poolops.SetAuthResetTarget(db, name, target); err != nil { + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Errorf("pool %q not found", name) + } + return err + } + if target == "" { + fmt.Printf("pool %q auth-reset target cleared\n", name) + } else { + fmt.Printf("pool %q auth-reset target set to %q\n", name, target) + } return nil } diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go index 6625e56..f3189c9 100644 --- a/cmd/sluice/pool_test.go +++ b/cmd/sluice/pool_test.go @@ -95,6 +95,97 @@ func TestHandlePoolCreateListStatusRemove(t *testing.T) { } } +// TestHandlePoolAuthResetTarget exercises the CLI adapter for the per-pool +// auth_reset_target: --auth-reset-target on create, the set-auth-reset +// subcommand (set and clear via "-"), and that the value reaches the store +// and is surfaced in list/status output. The CLI is a thin poolops adapter, +// so this asserts the value round-trips through the store the same way the +// REST and Telegram adapters do. +func TestHandlePoolAuthResetTarget(t *testing.T) { + dir := t.TempDir() + dbPath := setupVaultDB(t, dir) + seedPoolCred(t, dbPath, dir, "acct_a") + seedPoolCred(t, dbPath, dir, "acct_b") + + out := captureStdout(t, func() { + if err := handlePoolCommand([]string{ + "create", "--db", dbPath, "--members", "acct_a,acct_b", + "--auth-reset-target", "openai-codex", "codex", + }); err != nil { + t.Fatalf("pool create: %v", err) + } + }) + if !strings.Contains(out, "auth-reset target: openai-codex") { + t.Errorf("create output missing target: %q", out) + } + + // Value reached the store. + assertStoredAuthResetTarget(t, dbPath, "codex", "openai-codex") + + // Surfaced in list and status. 
+ out = captureStdout(t, func() { + _ = handlePoolCommand([]string{"list", "--db", dbPath}) + }) + if !strings.Contains(out, "auth-reset target: openai-codex") { + t.Errorf("list output missing target: %q", out) + } + out = captureStdout(t, func() { + _ = handlePoolCommand([]string{"status", "--db", dbPath, "codex"}) + }) + if !strings.Contains(out, "auth-reset target: openai-codex") { + t.Errorf("status output missing target: %q", out) + } + + // set-auth-reset to a new value. + out = captureStdout(t, func() { + if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", "other-target"}); err != nil { + t.Fatalf("set-auth-reset: %v", err) + } + }) + if !strings.Contains(out, "set to \"other-target\"") { + t.Errorf("set-auth-reset output = %q", out) + } + assertStoredAuthResetTarget(t, dbPath, "codex", "other-target") + + // Clear with the "-" sentinel. + out = captureStdout(t, func() { + if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", "-"}); err != nil { + t.Fatalf("set-auth-reset clear: %v", err) + } + }) + if !strings.Contains(out, "cleared") { + t.Errorf("clear output = %q", out) + } + assertStoredAuthResetTarget(t, dbPath, "codex", "") + + // Unknown pool and bad usage. 
+ if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "missing", "x"}); err == nil { + t.Error("expected error for set-auth-reset on missing pool") + } + if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex"}); err == nil { + t.Error("expected usage error for set-auth-reset with too few args") + } +} + +func assertStoredAuthResetTarget(t *testing.T, dbPath, pool, want string) { + t.Helper() + db, err := store.New(dbPath) + if err != nil { + t.Fatalf("open db: %v", err) + } + defer func() { _ = db.Close() }() + p, err := db.GetPool(pool) + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p == nil { + t.Fatalf("pool %q not found", pool) + } + if p.AuthResetTarget != want { + t.Fatalf("stored AuthResetTarget = %q, want %q", p.AuthResetTarget, want) + } +} + func TestHandlePoolErrorPaths(t *testing.T) { dir := t.TempDir() dbPath := setupVaultDB(t, dir) diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index b7e9d47..5197770 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -307,19 +307,22 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: Telegram `/pool` handler - Modify: matching `_test.go` for poolops + each channel adapter -- [ ] add the operation to `internal/poolops` (channel-agnostic) so all channels are thin - adapters (CLAUDE.md anti-pattern note) -- [ ] CLI: `--auth-reset-target` on `pool create`; `pool set-auth-reset ` - (set/clear); show target in `pool status`/`pool list` -- [ ] REST: accept on `POST /api/pools`; add an **action route** `POST - /api/pools/{name}/auth-reset-target` (mirrors the existing `/rotate` style, not a bespoke - PATCH); edit `api/openapi.yaml`, run `make generate`, implement the generated - `ServerInterface` method in `server.go` -- [ ] Telegram: accept on `/pool create`; add `/pool set-auth-reset ` -- [ ] write tests: poolops set/clear/create-with-target; one adapter test per channel - asserting it routes through poolops (no inline logic) -- [ ] run `go test ./... -race` for touched packages; `make generate` clean; gofumpt; vet — - pass before Task 6 +- [x] add the operation to `internal/poolops` (channel-agnostic) so all channels are thin + adapters (CLAUDE.md anti-pattern note) — `CreateWithAuthResetTarget`, `SetAuthResetTarget`, + `validateAuthResetTarget`/`ErrInvalidAuthResetTarget`, target on `StatusResult` +- [x] CLI: `--auth-reset-target` on `pool create`; `pool set-auth-reset ` + (set/clear, `-` clears); show target in `pool status`/`pool list` +- [x] REST: accept `auth_reset_target` on `POST /api/pools`; action route `POST + /api/pools/{name}/auth-reset-target` (mirrors `/rotate`, not a PATCH); edited + `api/openapi.yaml`, ran `make generate`, implemented `PostApiPoolsNameAuthResetTarget` in + `server.go` +- [x] Telegram: accept target as the optional 3rd `/pool create` arg; added `/pool + set-auth-reset ` +- [x] write tests: poolops set/clear/create-with-target unit tests; one adapter test per + channel (CLI, REST, Telegram) asserting it routes through poolops and the value reaches the + store +- [x] run `go test ./... 
-race` for touched packages; `make generate` clean (byte-stable, + raw output committed); gofumpt; vet — pass before Task 6 ### Task 6: Add optional exec user to the container exec path (prerequisite for hermes reset) diff --git a/internal/api/api.gen.go b/internal/api/api.gen.go index 5477fa9..e88e9f8 100644 --- a/internal/api/api.gen.go +++ b/internal/api/api.gen.go @@ -432,6 +432,9 @@ type CreateMCPUpstreamRequest struct { // CreatePoolRequest defines model for CreatePoolRequest. type CreatePoolRequest struct { + // AuthResetTarget Optional agent auth-reset target run on the exhausted->recovered edge (empty/omitted = no reset) + AuthResetTarget *string `json:"auth_reset_target,omitempty"` + // Members Ordered member credential names (failover order) Members []string `json:"members"` Name string `json:"name"` @@ -529,9 +532,11 @@ type MCPUpstream struct { // Pool defines model for Pool. type Pool struct { - CreatedAt *time.Time `json:"created_at,omitempty"` - Members []PoolMember `json:"members"` - Name string `json:"name"` + // AuthResetTarget Agent auth-reset target run on the exhausted->recovered edge (empty = no reset) + AuthResetTarget *string `json:"auth_reset_target,omitempty"` + CreatedAt *time.Time `json:"created_at,omitempty"` + Members []PoolMember `json:"members"` + Name string `json:"name"` // Strategy Pool strategy (only 'failover' is supported) Strategy string `json:"strategy"` @@ -582,10 +587,13 @@ type PoolRotateResult struct { // PoolStatus defines model for PoolStatus. 
type PoolStatus struct { // Active Currently active member credential name - Active string `json:"active"` - Members []PoolMemberStatus `json:"members"` - Name string `json:"name"` - Strategy string `json:"strategy"` + Active string `json:"active"` + + // AuthResetTarget Agent auth-reset target run on the exhausted->recovered edge (empty = no reset) + AuthResetTarget *string `json:"auth_reset_target,omitempty"` + Members []PoolMemberStatus `json:"members"` + Name string `json:"name"` + Strategy string `json:"strategy"` } // ResolveRequest defines model for ResolveRequest. @@ -620,6 +628,12 @@ type Rule struct { // RuleVerdict defines model for Rule.Verdict. type RuleVerdict string +// SetPoolAuthResetTargetRequest defines model for SetPoolAuthResetTargetRequest. +type SetPoolAuthResetTargetRequest struct { + // AuthResetTarget Agent auth-reset target; an empty string clears it (no reset) + AuthResetTarget string `json:"auth_reset_target"` +} + // StatusResponse defines model for StatusResponse. type StatusResponse struct { Channels []ChannelStatus `json:"channels"` @@ -685,6 +699,9 @@ type PostApiMcpUpstreamsJSONRequestBody = CreateMCPUpstreamRequest // PostApiPoolsJSONRequestBody defines body for PostApiPools for application/json ContentType. type PostApiPoolsJSONRequestBody = CreatePoolRequest +// PostApiPoolsNameAuthResetTargetJSONRequestBody defines body for PostApiPoolsNameAuthResetTarget for application/json ContentType. +type PostApiPoolsNameAuthResetTargetJSONRequestBody = SetPoolAuthResetTargetRequest + // PostApiRulesJSONRequestBody defines body for PostApiRules for application/json ContentType. 
type PostApiRulesJSONRequestBody = CreateRuleRequest @@ -762,6 +779,9 @@ type ServerInterface interface { // Pool status (active member + per-member health) // (GET /api/pools/{name}) GetApiPoolsName(w http.ResponseWriter, r *http.Request, name string) + // Set or clear the per-pool agent auth-reset target run on the exhausted->recovered edge + // (POST /api/pools/{name}/auth-reset-target) + PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) // Operator override — advance the active pool member // (POST /api/pools/{name}/rotate) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request, name string) @@ -930,6 +950,12 @@ func (_ Unimplemented) GetApiPoolsName(w http.ResponseWriter, r *http.Request, n w.WriteHeader(http.StatusNotImplemented) } +// Set or clear the per-pool agent auth-reset target run on the exhausted->recovered edge +// (POST /api/pools/{name}/auth-reset-target) +func (_ Unimplemented) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { + w.WriteHeader(http.StatusNotImplemented) +} + // Operator override — advance the active pool member // (POST /api/pools/{name}/rotate) func (_ Unimplemented) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request, name string) { @@ -1559,6 +1585,37 @@ func (siw *ServerInterfaceWrapper) GetApiPoolsName(w http.ResponseWriter, r *htt handler.ServeHTTP(w, r) } +// PostApiPoolsNameAuthResetTarget operation middleware +func (siw *ServerInterfaceWrapper) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request) { + + var err error + + // ------------- Path parameter "name" ------------- + var name string + + err = runtime.BindStyledParameterWithOptions("simple", "name", chi.URLParam(r, "name"), &name, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true, Type: "string", Format: ""}) + if err != nil { + siw.ErrorHandlerFunc(w, r, &InvalidParamFormatError{ParamName: "name", Err: 
err}) + return + } + + ctx := r.Context() + + ctx = context.WithValue(ctx, BearerAuthScopes, []string{}) + + r = r.WithContext(ctx) + + handler := http.Handler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + siw.Handler.PostApiPoolsNameAuthResetTarget(w, r, name) + })) + + for _, middleware := range siw.HandlerMiddlewares { + handler = middleware(handler) + } + + handler.ServeHTTP(w, r) +} + // PostApiPoolsNameRotate operation middleware func (siw *ServerInterfaceWrapper) PostApiPoolsNameRotate(w http.ResponseWriter, r *http.Request) { @@ -1938,6 +1995,9 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl r.Group(func(r chi.Router) { r.Get(options.BaseURL+"/api/pools/{name}", wrapper.GetApiPoolsName) }) + r.Group(func(r chi.Router) { + r.Post(options.BaseURL+"/api/pools/{name}/auth-reset-target", wrapper.PostApiPoolsNameAuthResetTarget) + }) r.Group(func(r chi.Router) { r.Post(options.BaseURL+"/api/pools/{name}/rotate", wrapper.PostApiPoolsNameRotate) }) @@ -1969,83 +2029,86 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+Rd73LbuHZ/FQzbmWvPVSTt3dx21veT47g3niYbj+O0H6KMBJNHFq5JgAuActSMZ/oQ", - "fcI+SQcHIAlSICkltpztftpYxJ+D8/d3Dg+4X6NYZLngwLWKTr5GKl5BRvGfp3kuxZqmV/BbAUqbn3Ip", - "cpCaAQ6IJVANyZzis6WQmflXlFANLzTLIBpFepNDdBIpLRm/jR5GUQJKM041E9xM2nrOkuDPuZDae8C4", - "hluQ0cPDKJLwW8EkJNHJJzO7uYWbOfJp/VyRJW7+AbE2658WCdPnXMvN9jFp3EltLCEBrhlN54WCMOlD", - "J+462ijKJaznK6pWwXkSqOpY0jBfaZrlu8tFC5EG11qDTFisA89avK83rSeFWP2K8cTMfxR1qgXwTbwH", - "vp6vqTTPElCxZLkdGp3zNZOCZ8A1WVPJ6E0KhNMMyFJIkq8o1yIjWtwBJ4ybo1lt29phBTQB2aPq1TkZ", - "1//ysl7C1wMhrXEyDZkK64r7hUpJN1Z3hBaxSIPzPLG3pmnI8pRqGBZ3wNQ8YfQI/mOeuA2aLL+k0kwl", - "BT5HPlNyY+eMyakmKVClieBAlgzShGSF0uQGiAJNGCd6BTMura8iNyLZjIjQK5D3TIF5SBTINUgiQReS", - "K/JyOiWvaEKceyNHGdXxivHbGTejz95eHI/Je55u7HaK6BXVhEoguQRlFMP8OxMJWzJIxuQDaI2zFx5X", - 
"FkQLQgkX/AVkud6QNU0LIDRVwh1UIW05NXydcVpo8cIZAqFpKu6JLFIgR5re3kJC7plekYXjyguaJCez", - "Yjr9Oa5Zj3/Dggg54wvzc9+oY6IE7s/hnnhkE6YIF5oIaVTdnO5iadliCbVEragiNwCcSMjEGhKSUV7Q", - "NN2McE1Hpb/ujDNFlGZpKeaEUJ4QSu6p5GYsUyQVeNIjLsiSpukNje/sdkzNuGPN8ZhcUqXMDMqJ5Szq", - "MKrNAg3GsIAsKjtYkDgFKhUeIwvMt3puF7BWuxiRRWkQixEu5xzGYsbtanhQ1I8xQcU2SwQUwIodNSgW", - "acoSUDOOwqQctbTiluBWWY2vqeXlae0v5EzwZcpiPZ5hgGs40R/a4/0wjmzbN0lzsLeM3wXjf0HT7kAM", - "X3KITeDqHJEyDnNeZDcNvnSBGH90e/VRg5qQlz1bUc4hfZzwCtyogY9rboRIgfK94pf95WsEvMgQKUAK", - "t5Jm0ShaaZ17x6h3voeblRB3cwWxBL2trm/enZ4R+xAVVLFbdCBuHsnpJhU0UVHP2oVMtxf+T7fAx6u3", - "uPCb6+tLElumBlYLRUUcUjOvR0ofNNWF2pZVL9/3Z2cbqe1MXx2r96BvW3RDEtjBQI3PYwHQmMCSFqme", - "ezC15AuGTwQpfGMMR90FNc3ovii0ITfskda4QcLCPs0+NZkSC7u97tN0cffJzxQkCd2Dw2heyte0DjtK", - "1ciM3IC+NxiA+tHKBnUvFo3JdYXADEozuyqSFHnKYrPgibeiHwGP6kVH/nrHM26QCMnohgiD0eALQ3AY", - "w5hcQW4B1OX7D9fKhc7ByPl7TiV+yAxhr+TAKtZZNaiz6EDjGJSaI7+2mf3+tDCYCsc4nh6VNJH7FXBi", - "djYwU9BCr453KFE0N7hYmnxjZBF8XFoD97G6UX6ny+GY+piKQhjXFsLTWzMxFlxTxhE67KNBZp9HUy0J", - "Swlq1S8kN6iUksCnxszNae2PUmhr7cGKRbcijiKcHw7w17g08CQXjGuM83urSBmCnafGZ1Sz2KbETX9Z", - "+kQzZ0xe2xlGOYmdM8ZQbN17tQpuHXTtmEkETrVqpAsOGdm040hpYQ4HPJab3KZPV+WJETzhtpbCQYiD", - "mtJtwu/OLj/mSkugWbcNy9s9vU4ssozypMvz4qJJwqwKXTY261q7JrxT9wehgZaUq7KA1xJI+QjZekKU", - "TpggR05jjkfEYDVMKu/hRon4DvRurK+Z0S2ESyG6PWgGJrVQAbuUCRiVsAN8dTIbK3K0pCwVa5BEmJHG", - "MnaXYCeTlZZUw+0mUBESIiXl47+RxDOdP5Wk/MmarMiY1pDsysGSBd0cvCpS6OTgEBbodqZUa5D8uWO4", - "hDylMZhI0+E9h4vBPYjUbJDQRvm3Qxx9deKzBh5rCqDbYvfz+wgdjQdEf4uWOuDve5y78bPoR8v1jvfx", - "7Ht4WbdpVyH1V7h3jh9rqNxiY4Osa4sek3+r/X79sxoZdDPjC5y/sKNsuG4PIgsfiS0Q9ZQxPN3M+KIB", - "AhY2AagoWVSCWphIixVVuYZkPOMzfnl6ffaGKMgo1yxWJ9a6cVpz0fKAbQpnvFzQFujiQkrgOt0QFwkb", - "2GNMrgWBLyYVYWYMVvZstbMxzhzb5DWtguHRYtYEPLPohMyiWbQ4Hs/49coebw1cKxKnDP+LhUBEN2DT", - "FwRwPmpdSpHNuGKppdt4HCk2Zj/MjULHCOQ0/WDZKEoIMNcG4Qk9aBcDQK/eoAX2wjuMyfuMaaJFLT88", - "baU1LbF9CInDzEYJEubSvF0hlKG2iZuWQRs5DkaZLUM9l1LIK1C54CqQ5MciCbswMPOGsys7LOQh3gBN", - 
"9ap7Z1UVneALzfIUZ98Nhk43LbTjRWaC1xUoxMLt/VwupOaMK5C6UTbyYls1TN2xPO8aFWPlZK6g452p", - "ycCGdrJjercpHIQdWqoe17Nci5EtEtv0jAIMC3AnSGOIoAbPQtLzEPtTQ/Vvq0F/J7zfuU59uDzgm9A/", - "1pZ3SAEuHXr7/vK/lylU8v9nCcvoJPqnSd03MnFNIxOz8zuc8xQZgINrHvRniqgiN8yF5HhX7F/t1p8G", - "eGcJ8bKvVJgLxVq5QZcv8Fby5vXT0/XSgMaarSFck4+FSBNxz+cF1yx9tP6KlCo9N/IoJMx7GlL6ODLC", - "yBKIxyuMY5sRKYlHe3G/kqPyVwPdDDuHFSDM7FHJuJKQLu5fwRIk8BiSrdDeJPzl9BdsQEB4+vr87fn1", - "OZnQnE1yIVI1+WoU8cGVmlZAzK+kei8uq13IzQa7HYQkmZDV+3Q1JmdUSuaw7S1wkCwmiAcmBlWQPC2U", - "g4myiHVhoGKK5fEluUlFfId1e7caOWIJ+XOjtv434hogENRfu9dLRBVySWNQRAJPQNZFelz7yL0jIzmV", - "TG+ODeBnypyLkgQSLPMnxHqKsuugpL3BTwJ8DamwJTguNIlFkaeQmLTfsKPM4MZbaLc80l7+qpQq47dl", - "Z1IwqD0GWqvD+KCSeeTsXX7YMeANNfR0koh5SxfaM4nLtklYz2XznnuqiDU51C+U8r1Rm7tQAWcU2Sf7", - "eq68q4ahRRj6O5Jc8YsutVPxsgg96F5wx5FlAG7TxcBhB96qM1Tpa5PGVoHucSO4I/Jb4/i+objywX0x", - "+QqUSNfddbmu8tRc8NisTdN7ulHzRsnqu2pUFUFduVZHO+vOTZVolL0UFCkcsjP3e2D076T6qUQhY3je", - "wuig3K159pQXyo6VXS2/2ZES4jZgOJpT1xauupqWxZfN3EAC4C56teFo22+2ZoS2GtUHCnHjP0Cy5aaz", - "AIHNXfOU8bvd+eF1hAWYkcItjQ3RHLqUVWia9g1Y05QlNVED0dlfrjl31Dxfi7ZtbhkNh7gwIO2DOavj", - "EVAJ8rTQq+34c3p54Qp3CjRZM0o+vP14cXY+P728mF+///fzXw1mI2tqEA7yD8WNK9YeAnuTHsz2jC8D", - "Mfjq/MM1MVsZ1JxRTm/LaueHtGCx/3rzhXv9zW9JqSIEtchgwpTF4EzCeqLo3cW1zeI11rrccmeCaylS", - "cplSDmZja3HKUvPTeDqemlkiB05zFp1EP4+n458jdGMr5Bli+oY53NqylFE+dKQXSXQS/R30ac5OPV2W", - "zmxxzl+mU1sP5Np5JJrbthgm+OQfLqWyarmz9rbvbmyp8MND+1X1pTW6mqOul1pZlSmyjMpNdBK9NWg/", - "7xw8ijQ1EPxTVHPms1mhya3JV5Y8TKSNn2iyQgV4dylUg3kXiQu5KAdJM9AIbz59jZg5hJFNWSU5sV60", - "tiItCxh5nGy74M92MCj9SiSbvYTSJ4sWanlomrYh6uE7VWKn3V2kCEj+tBYijk2Murycvnw0Gpr5ch8F", - "Jt9bioI7En55BhJoKoEmm4oZJv82mCkhotAtW3DMJbTTIIbsoUiYnkiI3QH7HIgZemVHhpX/twLkptb+", - "lGVMR77CV+0qf52Ooox+YZmBKj9Np+ZPxt2fgUzx80F8Vn0Hawd3ZTlBkIEEuJYm2Lel4w1JxW01zJOJ", - "ebYljzXCiV3kYYFH9IT224A2AT7gc7e00Vk7zGeDXcFjw4qqFYlXlPEeTvjFlB4uvCqHHUJDOgs022zx", - "3s1XJwnEsTgwrOZJXa2xNczuANVgw+OHkGBz7k6B5KdHo6Fi/jaz3SPXD+l89/RwvvuCIyCufO6hY0d5", - 
"/jJ04AtrZRsZV0y1O6P9TLupk6dJ0myk9vpIt3WybaqIqayfT8FW1JvK+hp/99T1InkMHDVcadyOHy+3", - "E4CSi+422cFBSLm/h0Fa4cTQtYd8MFWIVwGnYX5+LjE8vndqXuw8ML7dwS25q4Y/hlt6XoU+sGN87V0p", - "Le89EnftsW7rKS9+WH+5dfexZYZWzb7BTfolsR5Ec1bf9Xp6RFPe1tsB0fwqdA30qrMEMA0PDqxZU1fS", - "tlhTRZABx1Xy6HfuuJq33A7suCrRB8Cre5XacFwHdBzl/l2RsDLBkKoNaFp1j6/PBO2gp2S+3SF0dvuy", - "jcTlCP/gfwddNpO6AYWsXg6Wh7YTdwj/3jGfQLf9K4aHVu1u7uKTHycih9R6ULCVLtc9oUMK7Y08SFjx", - "gudeubK76IH9rwo/cMFhDZLAl1woSI4Hcmic7rPMO/dgDt1m0lOl0duXDA+cSfvS6ZUGTZI/YDbtM6CR", - "UA8ky516F7BX14S1U7Ls6eWvtm9hGPG4Boc93z0MZchn/vdAnidJ9n3F7nlyn0sYiJEHY/6TuJvmVZ2d", - "/Ey/3P+ouexOeod9HZhC2gscYtlx/WnQVWRxPqla6QeC+7s4/1gNPUR09xv2dwjv784uSX2WQABvDqhZ", - "k8X5cNTeOv1The3AxeIDx+0G37f5XD77w5bBKwYMh23eULotnQua4T4x29fKZw7aFVeeK2RXBAwF7B2F", - "gl3sAz7xEsccwhnibZf9khx7gP5MJncHKFlg/x70hvW5n8oN+lf7D+z/LKsDnUNCpH9Yp4eHx6+0VAVu", - "/HKl69FmqvKGRoUsg/56WAZpkJym9oaIlVPZWYdXT47w9pr9yu3k9Svi7vO0c32rfs26u2t5bxtJw1Ps", - "47bRfJ7ZX6NAn8tXW216qvc1fbeYuojZ/VbSNV6Yd5d4bkSy+ZOqPlWqFu6jrSm+GcfPg+5wE2m8wyvY", - "Dh0cDYanQ+rZ9FFlWDZpd4qs7OF+Vt1tSM4jjBw1r7H8meQgX7g/7PW+4x2dysR+Q2Gwf7SSt7279PuU", - "euPeVagjzl1UqrrA/l/5rsHtLXuIpMZJURILXr6pcVeKVizHd1S30NLN96gyQhKxBilZAuR///t/CE3W", - "lLtyglNYDJd2tR4Fxev8A/j4Csfs1MZZ3gLxVe9b75iEd3DfA91enoO+FxI/DWGdbHmDJ7D2QVpE8bbT", - "Lr3sImXxhlhJhNrX/ee1JO3fgyC/FN7TgXz/61MHBvmWxQHzKlL4cUB+4EWAJ9OASBvGOYEv5ccahmz0", - "/Iv7H0jsodxaZGnz8G1j2Trg9ft3b4mjqnk4S4BVVUIVMSMHz8ey8nzDimw/3tKrzlmRapZTqSdLIbMX", - "CdW0eb7WFWCWhr5QaI5o36ESHDGqGz9uGKfokfpvxeG0wI2mg77cbnzsJqSrmRWYF4SfwVJQTZqaVBKG", - "mrSUIttNl3ZuKrXa9CN1lKLPeq5UDjcffEe2m9eqP9nU469cVvCEqt+6dNrTPYO38apEpJkJeI+wHdr8", - "04/C5Yem8PQ2H/ivvqO/cUOe8OCtT2qFEAeeiqny+yS2zPPzAdF3SYBROSw4Na56RiefPvtSsCci8Qri", - "O4/3lnrD++bc5gXRT5+N/dmveVsTb9LyVsQ0dZyYnF5euA9/R6MIv8yIV0FPJpOf/vKv4+l4Ov7p5Ofp", - "dBo9fH74vwAAAP//eOrONI5qAAA=", + "H4sIAAAAAAAC/+R96XIbuXbwq6D6+6pGqkuRnDu+SY2m8kOWlWtV7LFKkpMfpouEug9FXHUDPQCaEuNS", + 
"VR4iT5gnSeEAvRK90JYoO/NrLDaWs299Ts+XIBRJKjhwrYLjL4EKV5BQ/OdJmkqxpvEl/JGB0uanVIoU", + "pGaAC0IJVEM0p/hsKWRi/hVEVMORZgkEo0BvUgiOA6Ul47fB4yiIQGnGqWaCm01bz1nk/TkVUlceMK7h", + "FmTw+DgKJPyRMQlRcPzJ7K5f4XaOqrB+LsASN/+AUJvzT7KI6TOu5WYbTRq2QhtKiIBrRuN5psAPeh/G", + "baiNglTCer6iauXdJ4GqliMN8ZWmSTqcL1qI2HvWGmTEQu151qB9eWm5yUfq14xHZv+TiFPJgK+iPfD1", + "fE2leRaBCiVL7dLgjK+ZFDwBrsmaSkZvYiCcJkCWQpJ0RbkWCdHiDjhh3KBmpW3rhhXQCGSHqBd4Mq7/", + "6VV5RFUOhLTKyTQkyi8r7hcqJd1Y2RFahCL27quwvbFNQ5LGVEM/uz2qVmFGB+M/ppG7oE7yCyrNVpLh", + "c6QzJTd2z5icaBIDVZoIDmTJII5IkilNboAo0IRxolcw49LaKnIjos2ICL0Cec8UmIdEgVyDJBJ0Jrki", + "r6ZT8ppGxJk3cpBQHa4Yv51xs/r03fnhmHzg8cZep4heUU2oBJJKUEYwzL8TEbElg2hMrkBr3L2oUGVB", + "tCCUcMGPIEn1hqxpnAGhsRIOUYWwpdTQdcZppsWRUwRC41jcE5nFQA40vb2FiNwzvSILR5UjGkXHs2w6", + "/SUsSY9/w4IIOeML83PXqkOiBN7P4Z5UwCZMES40EdKIusHufGnJYgG1QK2oIjcAnEhIxBoiklCe0Tje", + "jPBMB2X13BlniijN4pzNEaE8IpTcU8nNWqZILBDTAy7IksbxDQ3v7HVMzbgjzeGYXFClzA7KiaUsyjCK", + "zQIVxpCALAo9WJAwBioVopF49ls5twdYrV2MyCJXiMUIj3MGYzHj9jREFOVjTFCwzREeAbBsRwkKRRyz", + "CNSMIzMpRyktqCW4FVZja0p+VaT2V3Iq+DJmoR7P0MHVjOh3bfG+G0O2bZukQewd43de/5/RuN0Rw0MK", + "oXFcrStixmHOs+SmRpe2IKa6unn6qAaNz8qerijnED+NewVuxKAa19wIEQPlO/kv+8uXAHiWYKQAMdxK", + "mgSjYKV1WkGjvPkeblZC3M0VhBL0tri+fX9ySuxDFFDFbtGAuH0kpZtY0EgFHWdnMt4++D/cAR8v3+HB", + "b6+vL0hoieo5zecVcUlJvA4uXWmqM7XNq066707OZqQ2GL7SV+8A3zbr+jgwQEGNzWOeoDGCJc1iPa+E", + "qTld0H1ikMI3RnHUnVfSjOyLTBtw/RZpjRdEzG/T7FOTKTG/2WvHpo26z46TFyQ0Dy5Gq6R8de2wq1QZ", + "mZEb0PcmBqBVb2WdesUXjcl1EYGZKM3cqkiUpTELzYHHlROrHvCgPHRUPe9wxk0kQhK6IcLEaPDAMDgM", + "YUwuIbUB1MWHq2vlXGev5/yRU4nvMkPYKTmwgnVaLGotOtAwBKXmSK9tYn84yUxMhWscTQ9ymMj9Cjgx", + "N5swU9BMrw4HlCjqF5wvTb4xshF8mGsDr8bqRvidLPt96lMKCmFc2xCe3pqNoeCaMo6hwy4SZO55MtGS", + "sJSgVt1McotyLgl8atTcYGt/lEJbbfdWLNoFcRTgfr+Dv8ajgUepYFyjn99ZRHIX7Cw1PqOahTYlrtvL", + "3CaaPWPyxu4wwknsnjG6Ymvei1Pwaq9px0zCg9Wqli64yMimHQdKC4Mc8FBuUps+XeYYY/CE11oIe0Mc", + "lJR2FX5/evExVVoCTdp1WN7uaHVCkSSUR22WFw+NImZF6KJ2WdvZJeCtst8bGmhJucoLeA2G5I+QrMdE", + 
"6YgJcuAk5nBETKyGSeU93CgR3oEeRvqSGO1MuBCiw4JmejWXoEDPNZW3vuj6g1NGZ1TMjiPcQewOIjOe", + "+2l4WNFMaYiObFFBQijWgPIW3QI5wOx6IhKmjUv+F8IFwaO8ipWASXuUByIZ4Zl2QVXUDVEUOVhSFpt7", + "iTArzeHDpatVAJSWVMPtxlOtEiIm+ePfSFRR659yUH6y5sShPpS7OQnauXuZxdDK3b44pd3QU61B8peO", + "LySkMQ3BeMEWy95fqO6Ils0FEa2VplvY0VXDPq3FinUGtFuT3XwShrXGOqMvQCvS44s6HI/xAWjj8/MO", + "d/E6O3gAd2lbkfd3uHdOCeu73MbtJuovNXpM/rX0SeXPamQirxlf4P6FXWVDieYisqhGiQuMyPL4It7M", + "+KIWoCxsclJAsigYtTBRAFZ75Rqi8YzP+MXJ9elboiChXLNQHVvtxm31Q3MEmxDOeH6gLR6GmZTAdbwh", + "zkvX4qIxuRYEHkyaxMwarDraSmxtnUHb5FyNYubBYlYPxmbBMZkFs2BxOJ7x65VFbw1cKxLGDP+LRUqM", + "vMCmVhhcViPqpRTJjCsWW7iNxZFiY+7DvM2Hhiff6g7kjaD4gvlSISpM9+pFTxBaXtAIRP03jMmHhGmi", + "Rck/5/2c1DTYduVjh9mNHCTMpaBDwzsDbT2mW3p15NDrZbYU9UxKIS9BpYIrTwEiFJHfhIHZ15/52WU+", + "C/EWaKxX7TeroiAGDzRJY9x91+s63TbfjeeJcV6XoDBOb97n8jQ1Z1yB1LWSVsW3FcvUHUvTtlUhVnXm", + "Clre55rssO8mu6bzmsyF131Hles6jmsQsgFiE56Rh2Ae6nhh9AFUo5mPe5Vs4rnTiK+rj39j6jG4hr6/", + "HOWrMhOsew9ITy5c9LZzRnLyRIlIXwLyNVJQSVoKUfz/EpbBcfD/JmV7zcT11kwMEd7jnudIRlzkWMlC", + "mCIqSw2fITocmoYUt3VnJBVcfG+cuiqqqVCskaa0maXKSZV93fC0vVuhoWZr8L+6CIWII3HP5xnXLH6y", + "NpSYKj03/MgkzDv6drooMkIn5wkNVuhSNyOSA4+q634lB/mvJoo05OwXAD+xRznhckDaqH8JS5DAQ4i2", + "oow64K+mv2KfBkbKb87enV2fkQlN2SQVIlaTL0YQH11FbgXE/EqK9gFZ3EJuNtgUIiRJhCzaDtSYnFIp", + "mQuzb4GDZCHB0GRiAhySxplyEavMQp0ZSxHjW4QluYlFeIevN9xp5IBF5C+1VxC/EdcngvnFtXsLR1Qm", + "lzQERSTwCGT5LgPPPnCvEklKJdObQ5N7MGXwoiSCCN+GRMRairw5I4e9Rk8CfA2xsJVKLjQJRZbGEBEt", + "kBx5MjneCrxzlHayVzlXGb/NG7i8/vUpAscyougVsgo4O1dCBvrevr6nVhAxhWoLPE0Ota0S1nLZFOye", + "KmJVDuULuXxvxObOV0saBfbJrpYrbSunaOHPQhxIrg5Hl9qJeF6r7zUveOPIEgCvaSNgvwFvlDyKTLoO", + "Y6NW6KPCdxSAfH0w4ej1tSHFrlFB4Q66woNLUCJet1cr24p2c8FDczaN7+lGzWuFvG+q3BUAtWWgLQ3I", + "g9tg0T50QpDFsM9e6m9JLn6QmrASmQzhZcvFvXy/Am109STD+gfoa7Qf3/KWpsUk/bZVd3K9ikxjV2Wb", + "7WkgtA2AFys0Oh2lpLxzaqg9q3dG+WQI0N/PqRtPUG3N8+JhMzcxF3AXHjTj/aZjauzwXTUqEfJR499B", + "suWmtdiETYbzmPG74fSodCZ6iBHDLQ0N0BzaVFBoGnctWNOYRSVQPeFP9bj63lEdvwZs29QyegthZqLg", + 
"K4OroxFQCdIoiUfeL85dkdYI+5pRcvXu4/np2fzk4nx+/eHfzn43QTFZUxNCIv2Q3XhiKe7YI/dormd8", + "6QlyLs+urom5yqQlCeX0Nq9sX8UZC6uv2Y9cGwa/JbmIEJQiE3THLASnEta+Bu/Pr23FRmNd0x13KriW", + "IiYXMeVgLrZ2RFlofh5Px1OzS6TAacqC4+CX8XT8S4DGeYU0w6Sppg7OXBjhQ/dwHgXHwd9Bn6TspCLL", + "0qkt7vnrdGprv1w7O0tT257FBJ/8w+WsViwHS29zhmhLhB8fmy0TF1bpSoq6nn5lRSZLEio3wXHwzqRT", + "aeviUaCpyXE+BSVlPpsT6tSafGHR40TaqABVVigP7S6EqhHvPHKBBPJB0gQ0Bm2fvgTMIGF4k1fEjq1v", + "KLVIywxGFUo27fBnuxiUfi2izU5M6eJFIxZ7rKu2AerxG0Vi0O3OU3g4f1IyEddGRlxeTV89GQz1gkQX", + "BCahXoqMOxB+fQEQaCyBRpuCGERIYiLBiIhMN3TBEZfQVoXo04csYnoiIXQIdhkQs/TSrvQL/x8ZyE0p", + "/TFLmA6qAl+0Tf1tOgoS+sASE4D9PJ2aPxl3f3pS8c97sVnlLOAAc2UpQZCABLiWxtk3uVNZEovbYlmF", + "J+bZFj/WGE4M4YcNPIJn1N9aaOOhAz53RxuZtcuqZLAnVMiwompFwhVlvIMS1WpVBxVe58v2ISGtFbBt", + "slT6MApMPH4s9CwraVKWw2yRuN1B1cjw9C7E2yQ+yJH8/GQwFMTfJrZ75Ppyne2e7s92n3MMiAubu2/f", + "keOfuw5sTlC2oXbFVLNDv1o/qMvkSRTVG/or/czbMtlUVYyprJ2Pwb6yqAvrG/y9Iq7n0VPEUf2l3G3/", + "8Wo7Acip6KYa9x6E5PdXYpCGOzFw7cAfTBXClcdomJ9fig1Pb53qA8Z7jm8HmCU38vp9mKWXFeg9G8Y3", + "ldHmfP6WuPHbsoUrH0Cy9nJrBrehhlbMvsJMVktiHRHNaTlz+PwRTT41OiCi+V3oMtArcPHENNy7sCRN", + "WUnbIk3hQXoMV06jH9xw1act92y4CtZ7glf3rrpmuPZoOPL72zxhoYI+UeuRtGKetEsF7aLnJL69wYe7", + "fZtJwnxFFfG/g84bh92CTBZvX3Ok7cYB7r+C5jPIdnXUdd+i3U5dfPL9eGSfWPcytpDlsv+3T6ArK/fi", + "VirOc6dc2Q31YK+zwg+tcFiDJPCQCgXRYU8OjdurJKvg3ZtDN4n0XGn09rDrnjPpKnc6uUGj6E+YTVcJ", + "UEuoe5LlVrnz6KvrchuULFfk8nfbjdEf8bi2jR3fPfRlyKfV79K8TJJctRXD8+Quk9DjI/dG/GcxN/Wx", + "rEF2ppvvf9ZcdpDcYbcKppB2WEcsW0bdek1FEqaTYmyix7m/D9OPxdJ9ePfqcMYA9/7+9IKUuHgceH1B", + "SZokTPu99hb2z+W2PQPue/bbNbpv0zl/9qctgxcE6HfbvCZ0WzLnVcNdfHZVKl/YaRdUeSmXXQDQ57AH", + "MgXHBHps4gWu2YcxxMmm3ZIci0B3JpM6BHIS2L97rWGJ93OZweonJvZs/yypPZ1DQsR/WqOHyOPXgooC", + "N35B1TXBM1VYQyNClkB/2y+BNEhOYzuCY/mUd9bhbM8BTirary1P3rwmbmCqmetb8avX3d1MQVNJapZi", + "F7ON6vPC9hoZ+lK22krTc72v6RoTawNm+NjXNX4cwU1J3Yho85MqPpmrFu7jwTG+GcfP1A4Y9RoPeAXb", + "IoOjXve0TzmbPikP8ybtVpblPdwvKrs1zlUAIwf1OaG/kBTkkfvDzk8eDjQqk7IJ/6js1u/30Ib1jYGA", + 
"H6qy0D3TsOdqf1tY8NF9aTp1EdqLhAVOKr4rXbgCbcyn/SIJumGQR+iKv/mzX0P1xn5nZriy2KHKH9Na", + "1gZCfZ2kboKy6J78P+Xze6+35CGSGudOSSh4/obTDRiuWIrvdm+hIccfUGSEJEYAJYuA/M9//Teh0Zpy", + "V4Zzhh5l257WIaD4yZOevPIS1wxqf85nwqqi97UTZ/4b3Pect4/noO+FxM/n2OAkn+fznL2X1mqcfRwy", + "AyJiFm6I5YRv7KP6vOSk/bs3Oc6Z93zJcfULfXtOji2JPeqVxfD9JMeeF2gVnnpYWlPOCTzkH7Tp09Gz", + "B/c/ANpBuLVI4jryTWXZQvD6w/t3xEFVR84CYEWVUEXMyl78WJLj1y/I9gNXneKcZLFmKZV6shQyOYqo", + "pnX8Gt8mYLHvC7MGRdt7QHDFqGyYumGcokXqHinFbZ5JwL2GibUPgvlkNbEMqzjhF9AUFJO6JOWAoSQt", + "pUiGydLgZmwrTd9TJzbarJcqgeDlve+Wh1mt8rN2HfbKZdPPKPqNYe2OrjOcYi0S+HoGXXmEYwTmn1Uv", + "nH+MD7G3efR/dqH+1i15RsQbnx30RRyIFVP5h5NsefSXPUbfOQBG5LBQWxuRDo4/fa5ywWJEwhWEdxXa", + "W+gN7et764PVnz4b/bP/Nwar4nVY3omQxo4Sk5OLc/c/bghGAX69FkeojyeTn//6z+PpeDr++fiX6XQa", + "PH5+/N8AAAD//4Q38/5OcAAA", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/internal/api/server.go b/internal/api/server.go index 9740b41..f1066ba 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -1829,8 +1829,16 @@ func (s *Server) PostApiPools(w http.ResponseWriter, r *http.Request) { //nolint if req.Strategy != nil { strategy = *req.Strategy } - if err := poolops.Create(s.store, req.Name, strategy, req.Members); err != nil { - writeError(w, poolCreateError(err), err.Error(), "") + authResetTarget := "" + if req.AuthResetTarget != nil { + authResetTarget = *req.AuthResetTarget + } + if err := poolops.CreateWithAuthResetTarget(s.store, req.Name, strategy, req.Members, authResetTarget); err != nil { + status := poolCreateError(err) + if errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + status = http.StatusBadRequest + } + writeError(w, status, err.Error(), "") return } @@ -1908,6 +1916,37 @@ func (s *Server) PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request, _ = json.NewEncoder(w).Encode(out) } +// PostApiPoolsNameAuthResetTarget sets (or clears, with an empty string) the +// per-pool agent auth-reset target. 
This is the action-route REST counterpart +// of the CLI `pool set-auth-reset` and Telegram `/pool set-auth-reset`, all +// routing through poolops.SetAuthResetTarget so the three surfaces cannot +// drift (channel feature-parity principle). A NUL/newline in the target is a +// 400 (poolops.ErrInvalidAuthResetTarget); an unknown pool is 404. On success +// the updated pool is returned so the caller sees the persisted value. +func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { //nolint:revive // generated interface name + var req SetPoolAuthResetTargetRequest + if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body", "") + return + } + if err := poolops.SetAuthResetTarget(s.store, name, req.AuthResetTarget); err != nil { + status := poolStatusError(err) + if errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + status = http.StatusBadRequest + } + writeError(w, status, err.Error(), "") + return + } + w.Header().Set("Content-Type", "application/json") + if p, err := s.store.GetPool(name); err == nil && p != nil { + _ = json.NewEncoder(w).Encode(storePoolToAPI(*p)) + return + } + // The set succeeded; a read-back failure must not report failure. Echo + // the persisted value from the request instead. + _ = json.NewEncoder(w).Encode(Pool{Name: name, AuthResetTarget: &req.AuthResetTarget}) +} + // DeleteApiPoolsName removes a pool. It refuses (409) while any binding still // references it by name; an unknown pool is 404. 
On the 409 the structured // list of referencing bindings (id + destination) is included in the response @@ -2228,6 +2267,10 @@ func storePoolToAPI(p store.Pool) Pool { Strategy: p.Strategy, Members: make([]PoolMember, len(p.Members)), } + if p.AuthResetTarget != "" { + t := p.AuthResetTarget + pool.AuthResetTarget = &t + } for i, m := range p.Members { pool.Members[i] = PoolMember{Credential: m.Credential, Position: m.Position} } @@ -2247,6 +2290,10 @@ func poolStatusToAPI(res *poolops.StatusResult) PoolStatus { Active: res.Active, Members: make([]PoolMemberStatus, len(res.Members)), } + if res.AuthResetTarget != "" { + t := res.AuthResetTarget + out.AuthResetTarget = &t + } for i, m := range res.Members { ms := PoolMemberStatus{ Credential: m.Credential, diff --git a/internal/api/server_test.go b/internal/api/server_test.go index e80b349..f43092f 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -3042,6 +3042,108 @@ func TestPostApiPoolsNameRotate_Success(t *testing.T) { } } +// TestPostApiPoolsNameAuthResetTarget exercises the REST adapter for the +// per-pool auth_reset_target action route. It asserts the value reaches the +// store (the REST handler is a thin poolops adapter), that clearing with an +// empty string works, and that an invalid target is a 400 and an unknown +// pool a 404. 
+func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + t.Fatalf("create pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + post := func(body string) *httptest.ResponseRecorder { + req := httptest.NewRequest("POST", "/api/pools/pool1/auth-reset-target", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + return rec + } + + // Set a target. + rec := post(`{"auth_reset_target": "openai-codex"}`) + if rec.Code != http.StatusOK { + t.Fatalf("set: expected 200, got %d: %s", rec.Code, rec.Body.String()) + } + var p api.Pool + if err := json.NewDecoder(rec.Body).Decode(&p); err != nil { + t.Fatalf("decode: %v", err) + } + if p.AuthResetTarget == nil || *p.AuthResetTarget != "openai-codex" { + t.Fatalf("response AuthResetTarget = %v, want openai-codex", p.AuthResetTarget) + } + // Reached the store (no inline logic; routed through poolops). + got, err := st.GetPool("pool1") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if got.AuthResetTarget != "openai-codex" { + t.Fatalf("stored AuthResetTarget = %q, want openai-codex", got.AuthResetTarget) + } + + // Clear with an empty string. + rec = post(`{"auth_reset_target": ""}`) + if rec.Code != http.StatusOK { + t.Fatalf("clear: expected 200, got %d: %s", rec.Code, rec.Body.String()) + } + got, _ = st.GetPool("pool1") + if got.AuthResetTarget != "" { + t.Fatalf("after clear stored AuthResetTarget = %q, want empty", got.AuthResetTarget) + } + + // Invalid target (newline) -> 400. 
+ rec = post(`{"auth_reset_target": "bad\ntarget"}`) + if rec.Code != http.StatusBadRequest { + t.Fatalf("invalid target: expected 400, got %d: %s", rec.Code, rec.Body.String()) + } + + // Unknown pool -> 404. + req := httptest.NewRequest("POST", "/api/pools/nope/auth-reset-target", strings.NewReader(`{"auth_reset_target": "x"}`)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec = httptest.NewRecorder() + handler.ServeHTTP(rec, req) + if rec.Code != http.StatusNotFound { + t.Fatalf("unknown pool: expected 404, got %d: %s", rec.Code, rec.Body.String()) + } +} + +// TestPostApiPools_WithAuthResetTarget asserts the create body accepts +// auth_reset_target and persists it via poolops. +func TestPostApiPools_WithAuthResetTarget(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + srv := api.NewServer(st, nil, nil, "") + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + body := `{"name": "codex", "members": ["credA", "credB"], "auth_reset_target": "openai-codex"}` + req := httptest.NewRequest("POST", "/api/pools", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", rec.Code, rec.Body.String()) + } + got, err := st.GetPool("codex") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if got.AuthResetTarget != "openai-codex" { + t.Fatalf("stored AuthResetTarget = %q, want openai-codex", got.AuthResetTarget) + } +} + func TestPostApiPoolsNameRotate_NotFound(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go index 7e9f691..49a9b3c 100644 --- a/internal/poolops/poolops.go +++ b/internal/poolops/poolops.go @@ -29,6 +29,7 @@ type Store 
interface { RemovePoolIfUnreferenced(name string) (bool, error) ListCredentialHealth() ([]store.CredentialHealth, error) SetCredentialHealthIfPoolMemberEpoch(credential, pool string, epoch int64, status string, cooldownUntil time.Time, reason string) (bool, error) + SetPoolAuthResetTarget(name, target string) error } // ErrNoMembers is returned by Create when the member list is empty. @@ -78,7 +79,9 @@ type StatusResult struct { Name string Strategy string Active string - Members []MemberStatus + // AuthResetTarget is the per-pool agent auth-reset target (empty = none). + AuthResetTarget string + Members []MemberStatus } // RotateResult is the outcome of a successful Rotate. @@ -107,18 +110,87 @@ func ParseMembers(membersStr string) ([]string, error) { return members, nil } +// ErrInvalidAuthResetTarget is returned when a non-empty auth-reset target +// contains a NUL byte or newline. The target is consumed as argv (never +// shell-interpolated), so this is a minimal structural guard, not a +// shell-metachar check; channels that exec the target apply any stricter +// allowlist at exec time. +var ErrInvalidAuthResetTarget = errors.New("invalid auth_reset_target: must not contain NUL or newline characters") + +// validateAuthResetTarget rejects a non-empty target containing a NUL byte or +// a newline (LF or CR). An empty target ("") is always valid and means "clear / no +// reset", so callers normalize the clear sentinel before validating. +func validateAuthResetTarget(target string) error { + if target == "" { + return nil + } + if strings.ContainsAny(target, "\x00\n\r") { + return ErrInvalidAuthResetTarget + } + return nil +} + // Create creates a pool with the given ordered members. An empty strategy // defaults to the only supported strategy (failover). Sentinel errors from // the store (namespace collision, static member, unknown member) propagate // unchanged so channels can map them. 
func Create(s Store, name, strategy string, members []string) error { + return CreateWithAuthResetTarget(s, name, strategy, members, "") +} + +// CreateWithAuthResetTarget is Create plus an optional per-pool +// auth_reset_target (empty = no reset). The target is set in a follow-up +// SetPoolAuthResetTarget call after the pool exists; channels that don't +// accept a target call Create. Used by every channel's create adapter so the +// create-with-target path has a single source of truth. +func CreateWithAuthResetTarget(s Store, name, strategy string, members []string, authResetTarget string) error { if strategy == "" { strategy = store.PoolStrategyFailover } if len(members) == 0 { return ErrNoMembers } - return s.CreatePoolWithMembers(name, strategy, members) + if err := validateAuthResetTarget(authResetTarget); err != nil { + return err + } + if err := s.CreatePoolWithMembers(name, strategy, members); err != nil { + return err + } + if authResetTarget != "" { + if err := s.SetPoolAuthResetTarget(name, authResetTarget); err != nil { + return err + } + } + return nil +} + +// SetAuthResetTarget sets (target != "") or clears (target == "") the +// per-pool auth_reset_target. Channels normalize their clear sentinel (CLI / +// Telegram use a single "-") to "" before calling. The pool's existence is +// checked before the store write, so a missing pool surfaces as a typed +// *PoolNotFoundError; a GetPool failure is returned unchanged. +func SetAuthResetTarget(s Store, name, target string) error { + if err := validateAuthResetTarget(target); err != nil { + return err + } + if _, err := mustExist(s, name); err != nil { + return err + } + return s.SetPoolAuthResetTarget(name, target) +} + +// mustExist returns the pool or a *PoolNotFoundError, so channels get the same +// typed not-found error from SetAuthResetTarget that Status/Rotate/Remove +// already return. 
+func mustExist(s Store, name string) (*store.Pool, error) { + p, err := s.GetPool(name) + if err != nil { + return nil, err + } + if p == nil { + return nil, &PoolNotFoundError{Name: name} + } + return p, nil } // List returns every configured pool, ordered as the store returns them. @@ -151,7 +223,7 @@ func Status(s Store, name string) (*StatusResult, error) { } now := time.Now() - res := &StatusResult{Name: p.Name, Strategy: p.Strategy, Active: active} + res := &StatusResult{Name: p.Name, Strategy: p.Strategy, Active: active, AuthResetTarget: p.AuthResetTarget} for _, m := range p.Members { ms := MemberStatus{ Credential: m.Credential, diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go index 673f867..0f63dbc 100644 --- a/internal/poolops/poolops_test.go +++ b/internal/poolops/poolops_test.go @@ -109,6 +109,108 @@ func TestCreateListStatusRotateRemove(t *testing.T) { } } +func TestCreateWithAuthResetTarget(t *testing.T) { + db := newTestStore(t, "acct_a", "acct_b") + + if err := poolops.CreateWithAuthResetTarget(db, "codex", "", []string{"acct_a", "acct_b"}, "openai-codex"); err != nil { + t.Fatalf("CreateWithAuthResetTarget: %v", err) + } + + p, err := db.GetPool("codex") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p.AuthResetTarget != "openai-codex" { + t.Fatalf("AuthResetTarget = %q, want openai-codex", p.AuthResetTarget) + } + + st, err := poolops.Status(db, "codex") + if err != nil { + t.Fatalf("Status: %v", err) + } + if st.AuthResetTarget != "openai-codex" { + t.Fatalf("Status.AuthResetTarget = %q, want openai-codex", st.AuthResetTarget) + } +} + +func TestCreateWithEmptyAuthResetTargetDefaultsEmpty(t *testing.T) { + db := newTestStore(t, "acct_a") + if err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, ""); err != nil { + t.Fatalf("CreateWithAuthResetTarget: %v", err) + } + p, err := db.GetPool("p") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p.AuthResetTarget != "" { + 
t.Fatalf("AuthResetTarget = %q, want empty", p.AuthResetTarget) + } +} + +func TestCreateWithInvalidAuthResetTarget(t *testing.T) { + db := newTestStore(t, "acct_a") + err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, "bad\ntarget") + if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err) + } + // The pool must not have been created when the target is invalid. + p, err := db.GetPool("p") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p != nil { + t.Fatalf("pool created despite invalid target: %+v", p) + } +} + +func TestSetAuthResetTargetSetAndClear(t *testing.T) { + db := newTestStore(t, "acct_a", "acct_b") + if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil { + t.Fatalf("Create: %v", err) + } + + if err := poolops.SetAuthResetTarget(db, "codex", "openai-codex"); err != nil { + t.Fatalf("SetAuthResetTarget set: %v", err) + } + st, err := poolops.Status(db, "codex") + if err != nil { + t.Fatalf("Status: %v", err) + } + if st.AuthResetTarget != "openai-codex" { + t.Fatalf("after set AuthResetTarget = %q, want openai-codex", st.AuthResetTarget) + } + + if err := poolops.SetAuthResetTarget(db, "codex", ""); err != nil { + t.Fatalf("SetAuthResetTarget clear: %v", err) + } + st, err = poolops.Status(db, "codex") + if err != nil { + t.Fatalf("Status post-clear: %v", err) + } + if st.AuthResetTarget != "" { + t.Fatalf("after clear AuthResetTarget = %q, want empty", st.AuthResetTarget) + } +} + +func TestSetAuthResetTargetUnknownPool(t *testing.T) { + db := newTestStore(t) + err := poolops.SetAuthResetTarget(db, "missing", "x") + var nf *poolops.PoolNotFoundError + if !errors.As(err, &nf) { + t.Fatalf("SetAuthResetTarget unknown pool err = %v, want PoolNotFoundError", err) + } +} + +func TestSetAuthResetTargetInvalid(t *testing.T) { + db := newTestStore(t, "acct_a") + if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil { + 
t.Fatalf("Create: %v", err) + } + if err := poolops.SetAuthResetTarget(db, "p", "bad\x00target"); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err) + } +} + func TestCreateErrors(t *testing.T) { db := newTestStore(t, "acct_a") diff --git a/internal/telegram/commands.go b/internal/telegram/commands.go index c5c2af5..22f5419 100644 --- a/internal/telegram/commands.go +++ b/internal/telegram/commands.go @@ -1187,7 +1187,7 @@ func (h *CommandHandler) handleAudit(args []string) string { } // poolUsage is the usage banner for /pool. -const poolUsage = "Usage: /pool create | /pool list | /pool status | /pool rotate | /pool remove " +const poolUsage = "Usage: /pool create [auth-reset-target] | /pool list | /pool status | /pool rotate | /pool set-auth-reset | /pool remove " // handlePool dispatches /pool subcommands to the channel-agnostic // internal/poolops package, the same package the CLI and REST API call, so @@ -1221,6 +1221,11 @@ func (h *CommandHandler) handlePool(args []string) string { return "Usage: /pool rotate " } return h.poolRotate(args[1]) + case "set-auth-reset": + if len(args) < 3 { + return "Usage: /pool set-auth-reset (a single - clears the target)" + } + return h.poolSetAuthReset(args[1], args[2]) case "remove": if len(args) < 2 { return "Usage: /pool remove " @@ -1233,14 +1238,19 @@ func (h *CommandHandler) handlePool(args []string) string { func (h *CommandHandler) poolCreate(args []string) string { if len(args) < 2 { - return "Usage: /pool create " + return "Usage: /pool create [auth-reset-target]" } name := args[0] members, err := poolops.ParseMembers(args[1]) if err != nil { return fmt.Sprintf("Failed to create pool: %v", err) } - if err := poolops.Create(h.store, name, "", members); err != nil { + // Optional 3rd positional arg is the per-pool auth-reset target. 
+ authResetTarget := "" + if len(args) >= 3 { + authResetTarget = args[2] + } + if err := poolops.CreateWithAuthResetTarget(h.store, name, "", members, authResetTarget); err != nil { return fmt.Sprintf("Failed to create pool: %v", err) } var b strings.Builder @@ -1248,10 +1258,31 @@ func (h *CommandHandler) poolCreate(args []string) string { for i, m := range members { fmt.Fprintf(&b, " [%d] %s\n", i, htmlCode(m)) } + if authResetTarget != "" { + fmt.Fprintf(&b, "auth-reset target: %s\n", htmlCode(authResetTarget)) + } b.WriteString("Bind it with /policy or " + htmlCode("sluice binding add "+name+" --destination ")) return b.String() } +func (h *CommandHandler) poolSetAuthReset(name, target string) string { + // A single "-" is the channel-uniform clear sentinel. + if target == "-" { + target = "" + } + if err := poolops.SetAuthResetTarget(h.store, name, target); err != nil { + var nf *poolops.PoolNotFoundError + if errors.As(err, &nf) { + return fmt.Sprintf("No pool named %s", htmlCode(name)) + } + return fmt.Sprintf("Failed to set auth-reset target: %v", err) + } + if target == "" { + return fmt.Sprintf("Cleared auth-reset target for pool %s", htmlCode(name)) + } + return fmt.Sprintf("Set auth-reset target for pool %s to %s", htmlCode(name), htmlCode(target)) +} + func (h *CommandHandler) poolList() string { pools, err := poolops.List(h.store) if err != nil { @@ -1269,6 +1300,9 @@ func (h *CommandHandler) poolList() string { } fmt.Fprintf(&b, "%s (strategy: %s): %s\n", htmlCode(p.Name), htmlCode(p.Strategy), htmlCode(strings.Join(names, ", "))) + if p.AuthResetTarget != "" { + fmt.Fprintf(&b, " auth-reset target: %s\n", htmlCode(p.AuthResetTarget)) + } } return b.String() } @@ -1309,6 +1343,9 @@ func (h *CommandHandler) poolStatus(name string) string { fmt.Fprintf(&b, "%s[%d] %s %s\n", marker, m.Position, htmlCode(m.Credential), status) } fmt.Fprintf(&b, "active: %s\n", htmlCode(res.Active)) + if res.AuthResetTarget != "" { + fmt.Fprintf(&b, "auth-reset target: 
%s\n", htmlCode(res.AuthResetTarget)) + } return b.String() } @@ -1387,8 +1424,8 @@ MCP Upstreams /mcp remove Credential Pools -/pool create | /pool list | /pool status -/pool rotate | /pool remove ` +/pool create [auth-reset-target] | /pool list | /pool status +/pool rotate | /pool set-auth-reset | /pool remove ` } help += ` diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go index 9c28dc4..65f6d51 100644 --- a/internal/telegram/commands_test.go +++ b/internal/telegram/commands_test.go @@ -2186,6 +2186,66 @@ func TestHandlePoolCreateListStatusRotateRemove(t *testing.T) { } } +// TestHandlePoolAuthResetTarget exercises the Telegram adapter for the +// per-pool auth_reset_target: the optional 3rd create arg, the +// set-auth-reset subcommand (set and clear via "-"), and that the value +// reaches the store (the handler is a thin poolops adapter) and is surfaced +// in list/status output. +func TestHandlePoolAuthResetTarget(t *testing.T) { + s := newTestStore(t) + seedPoolOAuthMeta(t, s, "acct_a", "acct_b") + h := newTestHandlerWithStore(t, s, nil, "") + + // Create with target as 3rd positional arg. + got := h.Handle(&Command{Name: "pool", Args: []string{"create", "codex", "acct_a,acct_b", "openai-codex"}}) + if !strings.Contains(got, "auth-reset target: ") || !strings.Contains(got, "openai-codex") { + t.Fatalf("pool create with target = %q", got) + } + if p, err := s.GetPool("codex"); err != nil { + t.Fatalf("GetPool: %v", err) + } else if p.AuthResetTarget != "openai-codex" { + t.Fatalf("stored AuthResetTarget = %q, want openai-codex", p.AuthResetTarget) + } + + // Surfaced in list and status. 
+ got = h.Handle(&Command{Name: "pool", Args: []string{"list"}}) + if !strings.Contains(got, "openai-codex") { + t.Errorf("pool list missing target = %q", got) + } + got = h.Handle(&Command{Name: "pool", Args: []string{"status", "codex"}}) + if !strings.Contains(got, "auth-reset target: ") || !strings.Contains(got, "openai-codex") { + t.Errorf("pool status missing target = %q", got) + } + + // set-auth-reset to a new value. + got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex", "other-target"}}) + if !strings.Contains(got, "Set auth-reset target") || !strings.Contains(got, "other-target") { + t.Fatalf("set-auth-reset = %q", got) + } + if p, _ := s.GetPool("codex"); p.AuthResetTarget != "other-target" { + t.Fatalf("after set stored target = %q, want other-target", p.AuthResetTarget) + } + + // Clear with "-". + got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex", "-"}}) + if !strings.Contains(got, "Cleared auth-reset target") { + t.Fatalf("clear = %q", got) + } + if p, _ := s.GetPool("codex"); p.AuthResetTarget != "" { + t.Fatalf("after clear stored target = %q, want empty", p.AuthResetTarget) + } + + // Unknown pool and usage. 
+ got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "ghost", "x"}}) + if !strings.Contains(got, "No pool named") { + t.Errorf("set-auth-reset unknown pool = %q", got) + } + got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex"}}) + if !strings.Contains(got, "Usage:") { + t.Errorf("set-auth-reset too few args = %q", got) + } +} + func TestHandlePoolCreateNoMembers(t *testing.T) { s := newTestStore(t) h := newTestHandlerWithStore(t, s, nil, "") From 57525637b06fce77b9640d53fa1ff1861a785bf6 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:23:20 +0800 Subject: [PATCH 07/19] feat(container): thread optional exec user through ExecInContainer + AgentProfile.ExecUser --- ...22-pool-exhaustion-and-agent-auth-reset.md | 10 ++-- internal/container/agent_profile.go | 23 +++++++++ internal/container/agent_profile_test.go | 14 ++++++ internal/container/docker.go | 15 ++++-- internal/container/docker_socket.go | 9 +++- internal/container/docker_socket_test.go | 49 ++++++++++++++++++- internal/container/docker_test.go | 48 +++++++++++++++++- 7 files changed, 154 insertions(+), 14 deletions(-) diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index 5197770..bb9f32e 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -333,14 +333,14 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/container/docker_test.go` (`mockClient`) + any other `ExecInContainer` callers - Modify: `internal/container/agent_profile.go` (`AgentProfile.ExecUser() string`) -- [ ] add `User string` to `execCreateRequest`; thread an optional user arg through +- [x] add `User string` to `execCreateRequest`; thread an optional user arg through `ExecInContainer` on `ContainerClient` + `SocketClient` (empty "" = current root behavior) -- [ ] update `mockClient` and every existing `ExecInContainer` caller (no-op default, no +- [x] update `mockClient` and every existing `ExecInContainer` caller (no-op default, no behavior change) -- [ ] add `ExecUser() string` to `AgentProfile` (hermes `"10000:10000"`, openclaw "") -- [ ] write tests: socket exec body carries `User` when set and omits/empties it otherwise; +- [x] add `ExecUser() string` to `AgentProfile` (hermes `"10000:10000"`, openclaw "") +- [x] write tests: socket exec body carries `User` when set and omits/empties it otherwise; profile `ExecUser` values; existing callers unaffected -- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 7 +- [x] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 7 ### Task 7: Profile `ResetAuthCmd` + `ContainerManager.ResetAuth` diff --git a/internal/container/agent_profile.go b/internal/container/agent_profile.go index 8e82f94..6783d03 100644 --- a/internal/container/agent_profile.go +++ b/internal/container/agent_profile.go @@ -34,6 +34,26 @@ type AgentProfile struct { // config in place and the operator must wire the MCP gateway // manually before starting the agent. WireMCPCmd func(name, url string) []string + + // execUser is the UID:GID (or user name) that file-writing execs + // should run as inside the agent container. Empty means run as the + // image's USER (typically root). Hermes runs its gateway as a + // non-root runtime user (UID 10000), so an exec that writes a + // hermes-owned file (e.g. 
`hermes auth reset`) must run as that + // user or it root-chowns the file and bricks the gateway. Read it + // via the ExecUser accessor, which is nil-safe. + execUser string +} + +// ExecUser returns the UID:GID (or user name) that file-writing execs +// should run as for this profile, or "" to run as the image's USER. The +// receiver is nil-safe so callers holding a possibly-nil profile can call +// it directly. +func (p *AgentProfile) ExecUser() string { + if p == nil { + return "" + } + return p.execUser } // OpenclawProfile is the default profile. Openclaw stores secrets at @@ -100,6 +120,9 @@ var HermesProfile = &AgentProfile{ Name: "hermes", EnvFileRelPath: ".hermes/.env", ReloadCmd: nil, + // Hermes' gateway runs as UID 10000; file-writing execs must match + // so they don't root-chown hermes-owned files and brick the gateway. + execUser: "10000:10000", WireMCPCmd: func(name, url string) []string { return []string{ "sh", diff --git a/internal/container/agent_profile_test.go b/internal/container/agent_profile_test.go index ee7da70..4108d94 100644 --- a/internal/container/agent_profile_test.go +++ b/internal/container/agent_profile_test.go @@ -348,3 +348,17 @@ func TestResolveProfile_NilDefaultsToOpenclaw(t *testing.T) { t.Error("non-nil profile should be returned as-is") } } + +func TestProfileExecUser(t *testing.T) { + if got := HermesProfile.ExecUser(); got != "10000:10000" { + t.Errorf("HermesProfile.ExecUser() = %q, want 10000:10000", got) + } + if got := OpenclawProfile.ExecUser(); got != "" { + t.Errorf("OpenclawProfile.ExecUser() = %q, want empty", got) + } + // Nil-safe accessor. 
+ var p *AgentProfile + if got := p.ExecUser(); got != "" { + t.Errorf("(*AgentProfile)(nil).ExecUser() = %q, want empty", got) + } +} diff --git a/internal/container/docker.go b/internal/container/docker.go index 5c10ef0..208b4a8 100644 --- a/internal/container/docker.go +++ b/internal/container/docker.go @@ -15,7 +15,11 @@ type ContainerClient interface { //nolint:revive // stuttering accepted for clar RemoveContainer(ctx context.Context, name string) error CreateContainer(ctx context.Context, spec ContainerSpec) (string, error) StartContainer(ctx context.Context, id string) error - ExecInContainer(ctx context.Context, name string, cmd []string) error + // ExecInContainer runs cmd inside the named container. user is the + // optional UID:GID (or user name) the exec runs as; an empty string + // preserves the historical behavior of running as the image's USER + // (typically root for agent containers). + ExecInContainer(ctx context.Context, name, user string, cmd []string) error } // ContainerState holds the result of inspecting a container. @@ -94,7 +98,10 @@ func (m *DockerManager) InjectEnvVars(ctx context.Context, envMap map[string]str return fmt.Errorf("build env injection script: %w", err) } - if execErr := m.client.ExecInContainer(ctx, m.containerName, + // Pass the empty user so the env-injection exec keeps its historical + // root behavior; the generated script chowns the file back to the + // runtime user itself (see BuildEnvInjectionScriptForProfile). 
+ if execErr := m.client.ExecInContainer(ctx, m.containerName, "", []string{"sh", "-c", script}); execErr != nil { return fmt.Errorf("inject env vars: %w", execErr) } @@ -117,7 +124,7 @@ func (m *DockerManager) ReloadSecrets(ctx context.Context) error { log.Printf("agent profile %q has no in-place reload; new secrets take effect on next agent run", m.profile.Name) return nil } - return m.client.ExecInContainer(ctx, m.containerName, m.profile.ReloadCmd()) + return m.client.ExecInContainer(ctx, m.containerName, "", m.profile.ReloadCmd()) } // WireMCPGateway registers sluice's MCP gateway URL inside the agent's @@ -137,7 +144,7 @@ func (m *DockerManager) WireMCPGateway(ctx context.Context, name, sluiceURL stri log.Printf("agent profile %q does not support automatic MCP wiring; configure %s manually", m.profile.Name, sluiceURL) return nil } - err := m.client.ExecInContainer(ctx, m.containerName, m.profile.WireMCPCmd(name, sluiceURL)) + err := m.client.ExecInContainer(ctx, m.containerName, "", m.profile.WireMCPCmd(name, sluiceURL)) if err != nil && m.profile.Name == OpenclawProfile.Name && strings.Contains(err.Error(), "exit") && strings.Contains(err.Error(), "137") { return nil diff --git a/internal/container/docker_socket.go b/internal/container/docker_socket.go index 7809c5c..80a023b 100644 --- a/internal/container/docker_socket.go +++ b/internal/container/docker_socket.go @@ -241,11 +241,15 @@ func (c *SocketClient) StartContainer(ctx context.Context, id string) error { return nil } -// ExecInContainer runs a command inside a running container. -func (c *SocketClient) ExecInContainer(ctx context.Context, name string, cmd []string) error { +// ExecInContainer runs a command inside a running container. user is the +// optional UID:GID (or user name) the exec runs as; an empty string omits +// the User field from the exec-create request, preserving Docker's default +// of running as the image's USER (typically root for agent containers). 
+func (c *SocketClient) ExecInContainer(ctx context.Context, name, user string, cmd []string) error { // Step 1: Create exec instance. createBody := execCreateRequest{ Cmd: cmd, + User: user, AttachStdout: true, AttachStderr: true, } @@ -395,6 +399,7 @@ type createResponseBody struct { type execCreateRequest struct { Cmd []string `json:"Cmd"` + User string `json:"User,omitempty"` AttachStdout bool `json:"AttachStdout"` AttachStderr bool `json:"AttachStderr"` } diff --git a/internal/container/docker_socket_test.go b/internal/container/docker_socket_test.go index 56cc9e0..2463fa0 100644 --- a/internal/container/docker_socket_test.go +++ b/internal/container/docker_socket_test.go @@ -316,6 +316,7 @@ func TestSocketClientExecInContainer(t *testing.T) { defer cleanup() var execCreateBody execCreateRequest + var rawCreateBody map[string]any mux.HandleFunc("/v1.25/containers/mycontainer/exec", func(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "want POST", http.StatusMethodNotAllowed) @@ -323,6 +324,7 @@ func TestSocketClientExecInContainer(t *testing.T) { } data, _ := io.ReadAll(r.Body) _ = json.Unmarshal(data, &execCreateBody) + _ = json.Unmarshal(data, &rawCreateBody) w.WriteHeader(http.StatusCreated) _ = json.NewEncoder(w).Encode(map[string]string{"Id": "exec123"}) }) @@ -339,7 +341,7 @@ func TestSocketClientExecInContainer(t *testing.T) { _ = json.NewEncoder(w).Encode(map[string]interface{}{"ExitCode": 0}) }) - err := client.ExecInContainer(context.Background(), "mycontainer", []string{"openclaw", "secrets", "reload"}) + err := client.ExecInContainer(context.Background(), "mycontainer", "", []string{"openclaw", "secrets", "reload"}) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -347,6 +349,49 @@ func TestSocketClientExecInContainer(t *testing.T) { if len(execCreateBody.Cmd) != 3 || execCreateBody.Cmd[0] != "openclaw" { t.Errorf("exec Cmd = %v, want [openclaw secrets reload]", execCreateBody.Cmd) } + if 
execCreateBody.User != "" { + t.Errorf("exec User = %q, want empty for default-root exec", execCreateBody.User) + } + if _, ok := rawCreateBody["User"]; ok { + t.Error("exec-create body should omit the User field when no user is set (omitempty)") + } +} + +// TestSocketClientExecInContainerWithUser verifies that a non-empty user is +// carried into the exec-create request body (Docker runs the exec as that +// UID:GID), and that the omitempty tag keeps it absent when empty. +func TestSocketClientExecInContainerWithUser(t *testing.T) { + client, mux, cleanup := newTestServer(t) + defer cleanup() + + var rawCreateBody map[string]any + var execCreateBody execCreateRequest + mux.HandleFunc("/v1.25/containers/mycontainer/exec", func(w http.ResponseWriter, r *http.Request) { + data, _ := io.ReadAll(r.Body) + _ = json.Unmarshal(data, &execCreateBody) + _ = json.Unmarshal(data, &rawCreateBody) + w.WriteHeader(http.StatusCreated) + _ = json.NewEncoder(w).Encode(map[string]string{"Id": "exec123"}) + }) + mux.HandleFunc("/v1.25/exec/exec123/start", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + }) + mux.HandleFunc("/v1.25/exec/exec123/json", func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(map[string]interface{}{"ExitCode": 0}) + }) + + err := client.ExecInContainer(context.Background(), "mycontainer", "10000:10000", + []string{"hermes", "auth", "reset", "openai-codex"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if execCreateBody.User != "10000:10000" { + t.Errorf("exec User = %q, want 10000:10000", execCreateBody.User) + } + if _, ok := rawCreateBody["User"]; !ok { + t.Error("exec-create body should contain a User field when a user is set") + } } func TestSocketClientExecNonZeroExit(t *testing.T) { @@ -366,7 +411,7 @@ func TestSocketClientExecNonZeroExit(t *testing.T) { _ = json.NewEncoder(w).Encode(map[string]interface{}{"ExitCode": 127}) }) - err := 
client.ExecInContainer(context.Background(), "mycontainer", []string{"openclaw", "secrets", "reload"}) + err := client.ExecInContainer(context.Background(), "mycontainer", "", []string{"openclaw", "secrets", "reload"}) if err == nil { t.Fatal("expected error for non-zero exit code") } diff --git a/internal/container/docker_test.go b/internal/container/docker_test.go index 078853a..d91932a 100644 --- a/internal/container/docker_test.go +++ b/internal/container/docker_test.go @@ -25,6 +25,8 @@ type mockClient struct { execCalled bool execCmd []string execCalls [][]string // all exec calls recorded + execUser string // user from the most recent exec call + execUsers []string // user from every exec call, in order // Track container names passed to each method. inspectedName string stoppedName string @@ -61,11 +63,13 @@ func (m *mockClient) StartContainer(_ context.Context, id string) error { return m.startErr } -func (m *mockClient) ExecInContainer(_ context.Context, name string, cmd []string) error { +func (m *mockClient) ExecInContainer(_ context.Context, name, user string, cmd []string) error { m.execCalled = true m.execName = name m.execCmd = cmd + m.execUser = user m.execCalls = append(m.execCalls, cmd) + m.execUsers = append(m.execUsers, user) // Use per-call error if available. if len(m.execErrs) > 0 { @@ -471,6 +475,48 @@ func TestInjectEnvVarsHermesProfile(t *testing.T) { } } +// TestExistingExecCallersUseDefaultUser pins the no-behavior-change +// guarantee for Task 6: the existing DockerManager exec call sites +// (InjectEnvVars, ReloadSecrets, WireMCPGateway) keep running as the +// image's USER (empty user string), regardless of the profile's +// ExecUser. Only the future auth-reset path (Task 7) opts into a user. +func TestExistingExecCallersUseDefaultUser(t *testing.T) { + // Hermes profile (ExecUser "10000:10000") exercises both the env + // write and the MCP wire exec; neither should adopt the profile user. 
+ mc := &mockClient{} + mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile) + + if err := mgr.InjectEnvVars(context.Background(), + map[string]string{"OPENAI_API_KEY": "sk-phantom"}, false); err != nil { + t.Fatalf("InjectEnvVars: %v", err) + } + if err := mgr.WireMCPGateway(context.Background(), "sluice", "http://sluice:3000/mcp"); err != nil { + t.Fatalf("WireMCPGateway: %v", err) + } + + if len(mc.execUsers) == 0 { + t.Fatal("expected at least one exec call recorded") + } + for i, u := range mc.execUsers { + if u != "" { + t.Errorf("exec call %d ran as user %q, want empty (no behavior change)", i, u) + } + } + + // Openclaw profile runs the env write + reload exec; both default too. + mcO := &mockClient{} + mgrO := NewDockerManagerForProfile(mcO, "openclaw", OpenclawProfile) + if err := mgrO.InjectEnvVars(context.Background(), + map[string]string{"OPENAI_API_KEY": "sk-phantom"}, false); err != nil { + t.Fatalf("InjectEnvVars (openclaw): %v", err) + } + for i, u := range mcO.execUsers { + if u != "" { + t.Errorf("openclaw exec call %d ran as user %q, want empty", i, u) + } + } +} + func TestWireMCPGateway_Exit137GatedToOpenclaw(t *testing.T) { // Openclaw profile: exit 137 from the gateway restart is swallowed. 
mc := &mockClient{execErr: fmt.Errorf("exec failed: exit 137")} From deb7d88fd7f350363070e8eeb6c709f1ad444dd3 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:29:29 +0800 Subject: [PATCH 08/19] feat(container): add ResetAuth + profile ResetAuthCmd for agent auth un-latch --- cmd/sluice/main_test.go | 4 + ...22-pool-exhaustion-and-agent-auth-reset.md | 15 ++-- internal/api/server_test.go | 4 + internal/container/agent_profile.go | 26 +++++++ internal/container/agent_profile_test.go | 27 +++++++ internal/container/apple.go | 15 ++++ internal/container/docker.go | 17 ++++ internal/container/docker_test.go | 77 +++++++++++++++++++ internal/container/tart.go | 15 ++++ internal/container/types.go | 34 ++++++++ internal/telegram/approval_test.go | 4 + 11 files changed, 231 insertions(+), 7 deletions(-) diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go index 53e1bb1..7b2c2bc 100644 --- a/cmd/sluice/main_test.go +++ b/cmd/sluice/main_test.go @@ -1453,6 +1453,10 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error { return nil } +func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error { + return nil +} + func (m *mockContainerMgr) Runtime() container.Runtime { return container.RuntimeDocker } diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index bb9f32e..acde716 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -350,16 +350,17 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/container/docker.go`, `apple.go`, `tart.go`, standalone (`none`) - Modify: `internal/container/agent_profile_test.go`, `docker_test.go` -- [ ] add `ResetAuthCmd func(target string) []string` to `AgentProfile`; `HermesProfile` → +- [x] add `ResetAuthCmd func(target string) []string` to `AgentProfile`; `HermesProfile` → pure-argv `["/opt/hermes/.venv/bin/hermes","auth","reset",target]`; `OpenclawProfile.ResetAuthCmd` nil (documented; Post-Completion verification) -- [ ] add `ResetAuth(ctx, target string) error` to `ContainerManager`; implement per backend; - nil `ResetAuthCmd` → log notice, return nil (mirror `ReloadSecrets`); docker exec passes - `profile.ExecUser()` (Task 6) so hermes runs as 10000:10000 -- [ ] validate `target` (non-empty, no NUL, allowlisted charset) before exec -- [ ] write tests: argv per profile; nil-cmd no-ops with notice; empty/invalid target +- [x] add `ResetAuth(ctx, target string) error` to `ContainerManager`; implement per backend + (docker/apple/tart; `none`/standalone has no manager struct — `containerMgr` is nil and + handled by the caller); nil `ResetAuthCmd` → log notice, return nil (mirror `ReloadSecrets`); + docker exec passes `profile.ExecUser()` (Task 6) so hermes runs as 10000:10000 +- [x] validate `target` (non-empty, no NUL, allowlisted charset `[A-Za-z0-9_.:-]+`) before exec +- [x] write tests: argv per profile; nil-cmd no-ops with notice; empty/invalid target rejected; **docker exec uses the runtime UID from `ExecUser` (now passable, Task 6)** -- [ ] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 8 +- [x] run `go test ./internal/container/ -race`; gofumpt; vet — pass before Task 8 ### Task 8: Wire auto-reset on the recovery edge (opt-in, per pool) diff --git a/internal/api/server_test.go b/internal/api/server_test.go index f43092f..fa069fb 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -3779,6 +3779,10 @@ func (m *mockContainerMgr) 
ReloadSecrets(_ context.Context) error { return nil } +func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error { + return nil +} + func (m *mockContainerMgr) Runtime() container.Runtime { return container.RuntimeDocker } diff --git a/internal/container/agent_profile.go b/internal/container/agent_profile.go index 6783d03..66423cf 100644 --- a/internal/container/agent_profile.go +++ b/internal/container/agent_profile.go @@ -35,6 +35,17 @@ type AgentProfile struct { // manually before starting the agent. WireMCPCmd func(name, url string) []string + // ResetAuthCmd returns the argv to exec inside the agent container in + // order to clear ("un-latch") the agent's local auth state for the + // given target so it retries after a pool quota window passes. It is + // pure argv (no `sh -c` wrapper), so there is no shell-metacharacter + // threat from the interpolated target; the target is still validated + // (validateResetAuthTarget) before exec as defense in depth. + // Returning nil means the profile has no auth-reset mechanism; the + // caller should log a notice and rely on the agent recovering on its + // own (mirrors a nil ReloadCmd). + ResetAuthCmd func(target string) []string + // execUser is the UID:GID (or user name) that file-writing execs // should run as inside the agent container. Empty means run as the // image's USER (typically root). Hermes runs its gateway as a @@ -69,6 +80,12 @@ var OpenclawProfile = &AgentProfile{ WireMCPCmd: func(name, url string) []string { return GatewayRPCNodeCommand("wire-mcp", name, url) }, + // ResetAuthCmd is nil pending verification of whether openclaw + // latches its auth state on a usage-limit exhaustion the way hermes + // does. If it turns out openclaw needs an explicit un-latch, this is + // the place to add the gateway-RPC command (see Post-Completion in + // docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md). 
+ ResetAuthCmd: nil, } // hermesMCPWireScript is a small Python script that registers an MCP @@ -133,6 +150,15 @@ var HermesProfile = &AgentProfile{ url, } }, + // ResetAuthCmd un-latches hermes' local auth state for the given + // target via `hermes auth reset <target>`. It is pure argv (no + // `sh -c`), so the target cannot smuggle shell metacharacters — but + // the caller still validates it (validateResetAuthTarget) before + // exec. The exec must run as the runtime UID (see execUser above) or + // it root-chowns hermes-owned auth files and bricks the gateway. + ResetAuthCmd: func(target string) []string { + return []string{"/opt/hermes/.venv/bin/hermes", "auth", "reset", target} + }, } // builtinProfiles is the registry consulted by ProfileFromName. diff --git a/internal/container/agent_profile_test.go b/internal/container/agent_profile_test.go index 4108d94..8386298 100644 --- a/internal/container/agent_profile_test.go +++ b/internal/container/agent_profile_test.go @@ -362,3 +362,30 @@ func TestProfileExecUser(t *testing.T) { t.Errorf("(*AgentProfile)(nil).ExecUser() = %q, want empty", got) } } + +func TestHermesProfile_ResetAuthCmd(t *testing.T) { + if HermesProfile.ResetAuthCmd == nil { + t.Fatal("hermes profile should have a ResetAuthCmd") + } + got := HermesProfile.ResetAuthCmd("openai-codex") + want := []string{"/opt/hermes/.venv/bin/hermes", "auth", "reset", "openai-codex"} + if len(got) != len(want) { + t.Fatalf("ResetAuthCmd argv = %v, want %v", got, want) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("ResetAuthCmd[%d] = %q, want %q", i, got[i], want[i]) + } + } + // Pure argv: no sh -c wrapper, so the target lands as its own + // element and cannot smuggle shell metacharacters.
+ if got[0] == "sh" { + t.Errorf("ResetAuthCmd should be pure argv, not an sh -c wrapper: %v", got) + } +} + +func TestOpenclawProfile_ResetAuthCmdIsNil(t *testing.T) { + if OpenclawProfile.ResetAuthCmd != nil { + t.Error("openclaw profile should have nil ResetAuthCmd (pending verification it latches)") + } +} diff --git a/internal/container/apple.go b/internal/container/apple.go index 50a7e0b..65bb86a 100644 --- a/internal/container/apple.go +++ b/internal/container/apple.go @@ -252,6 +252,21 @@ func (m *AppleManager) ReloadSecrets(ctx context.Context) error { return err } +// ResetAuth clears the agent's local auth state for target. The mechanism +// is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors +// ReloadSecrets). The target is validated before exec. +func (m *AppleManager) ResetAuth(ctx context.Context, target string) error { + if m.profile.ResetAuthCmd == nil { + log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name) + return nil + } + if err := validateResetAuthTarget(target); err != nil { + return err + } + _, err := m.cli.Exec(ctx, m.containerName, m.profile.ResetAuthCmd(target)) + return err +} + // WireMCPGateway registers sluice's MCP gateway URL in the agent's // config. The exact storage format depends on the profile. func (m *AppleManager) WireMCPGateway(ctx context.Context, name, sluiceURL string) error { diff --git a/internal/container/docker.go b/internal/container/docker.go index 208b4a8..b29566c 100644 --- a/internal/container/docker.go +++ b/internal/container/docker.go @@ -127,6 +127,23 @@ func (m *DockerManager) ReloadSecrets(ctx context.Context) error { return m.client.ExecInContainer(ctx, m.containerName, "", m.profile.ReloadCmd()) } +// ResetAuth clears the agent's local auth state for target so it retries +// after a credential-pool quota window passes. 
The mechanism is +// profile-specific (hermes runs `hermes auth reset <target>`); a nil +// ResetAuthCmd is a logged no-op (mirrors ReloadSecrets). The exec runs +// as the profile's ExecUser (UID 10000:10000 for hermes) so it does not +// root-chown the agent's auth files and brick the gateway. +func (m *DockerManager) ResetAuth(ctx context.Context, target string) error { + if m.profile.ResetAuthCmd == nil { + log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name) + return nil + } + if err := validateResetAuthTarget(target); err != nil { + return err + } + return m.client.ExecInContainer(ctx, m.containerName, m.profile.ExecUser(), m.profile.ResetAuthCmd(target)) +} + // WireMCPGateway registers sluice's MCP gateway URL inside the agent's // config so that its embedded runtime discovers sluice as an MCP server. // The exact storage format depends on the profile (openclaw patches its diff --git a/internal/container/docker_test.go b/internal/container/docker_test.go index d91932a..13dcaa9 100644 --- a/internal/container/docker_test.go +++ b/internal/container/docker_test.go @@ -766,3 +766,80 @@ func TestDockerManagerInjectCACertNoop(t *testing.T) { t.Error("Docker InjectCACert should not exec anything") } } + +func TestResetAuthHermesRunsAsRuntimeUID(t *testing.T) { + mc := &mockClient{} + mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile) + + if err := mgr.ResetAuth(context.Background(), "openai-codex"); err != nil { + t.Fatalf("ResetAuth: %v", err) + } + if len(mc.execCalls) != 1 { + t.Fatalf("expected 1 exec call, got %d", len(mc.execCalls)) + } + // The hermes reset exec must run as the runtime UID so it does not + // root-chown hermes-owned auth files and brick the gateway.
+ if mc.execUsers[0] != "10000:10000" { + t.Errorf("ResetAuth exec user = %q, want 10000:10000", mc.execUsers[0]) + } + want := []string{"/opt/hermes/.venv/bin/hermes", "auth", "reset", "openai-codex"} + cmd := mc.execCalls[0] + if len(cmd) != len(want) { + t.Fatalf("ResetAuth argv = %v, want %v", cmd, want) + } + for i := range want { + if cmd[i] != want[i] { + t.Errorf("ResetAuth argv[%d] = %q, want %q", i, cmd[i], want[i]) + } + } +} + +func TestResetAuthNilCmdNoOps(t *testing.T) { + // Openclaw has a nil ResetAuthCmd: ResetAuth logs a notice and + // returns nil without exec'ing anything (mirrors ReloadSecrets). + mc := &mockClient{} + mgr := NewDockerManagerForProfile(mc, "openclaw", OpenclawProfile) + + if err := mgr.ResetAuth(context.Background(), "openai-codex"); err != nil { + t.Fatalf("ResetAuth with nil cmd should return nil, got: %v", err) + } + if mc.execCalled { + t.Error("ResetAuth with nil ResetAuthCmd should not exec anything") + } +} + +func TestResetAuthRejectsInvalidTarget(t *testing.T) { + cases := []struct { + name string + target string + }{ + {"empty", ""}, + {"nul byte", "openai\x00codex"}, + {"shell metachar", "openai;rm -rf /"}, + {"space", "openai codex"}, + {"slash", "openai/codex"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + mc := &mockClient{} + mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile) + if err := mgr.ResetAuth(context.Background(), tc.target); err == nil { + t.Errorf("ResetAuth(%q) should reject invalid target, got nil error", tc.target) + } + if mc.execCalled { + t.Errorf("ResetAuth(%q) should not exec on invalid target", tc.target) + } + }) + } +} + +func TestResetAuthValidTargetWithColonAndDot(t *testing.T) { + mc := &mockClient{} + mgr := NewDockerManagerForProfile(mc, "hermes", HermesProfile) + if err := mgr.ResetAuth(context.Background(), "provider:account.v2"); err != nil { + t.Fatalf("ResetAuth with allowlisted target should pass: %v", err) + } + if !mc.execCalled { + 
t.Error("ResetAuth with valid target should exec") + } +} diff --git a/internal/container/tart.go b/internal/container/tart.go index 4d389b1..f4ad5a8 100644 --- a/internal/container/tart.go +++ b/internal/container/tart.go @@ -291,6 +291,21 @@ func (m *TartManager) ReloadSecrets(ctx context.Context) error { return err } +// ResetAuth clears the agent's local auth state for target. The mechanism +// is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors +// ReloadSecrets). The target is validated before exec. +func (m *TartManager) ResetAuth(ctx context.Context, target string) error { + if m.profile.ResetAuthCmd == nil { + log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name) + return nil + } + if err := validateResetAuthTarget(target); err != nil { + return err + } + _, err := m.cli.Exec(ctx, m.vmName, m.profile.ResetAuthCmd(target)) + return err +} + // WireMCPGateway registers sluice's MCP gateway URL in the agent's config. // The exact storage format depends on the profile. func (m *TartManager) WireMCPGateway(ctx context.Context, name, sluiceURL string) error { diff --git a/internal/container/types.go b/internal/container/types.go index 7c1766c..39bf169 100644 --- a/internal/container/types.go +++ b/internal/container/types.go @@ -76,6 +76,15 @@ type ContainerManager interface { //nolint:revive // stuttering accepted for cla // ReloadSecrets signals the agent to re-read secrets from the env file. ReloadSecrets(ctx context.Context) error + // ResetAuth clears ("un-latches") the agent's local auth state for the + // given target so the agent retries after a credential-pool quota + // window passes (the agent will not self-recover otherwise). The + // mechanism is profile-specific (hermes runs `hermes auth reset + // <target>` as its runtime UID; openclaw has no reset yet). When the + // profile provides no reset command this is a logged no-op so callers + // can treat it as best-effort.
The target is validated before exec. + ResetAuth(ctx context.Context, target string) error + // WireMCPGateway registers sluice's MCP gateway URL under // mcp.servers. in the agent's config so the embedded runtime // discovers sluice as an MCP server. Idempotent: a second call with @@ -112,6 +121,31 @@ func ValidateEnvVarKey(key string) error { return nil } +// resetAuthTargetRe matches an allowlisted auth-reset target: one or more +// letters, digits, dash, underscore, dot, or colon. Targets are agent +// identifiers like "openai-codex"; the colon allows scoped names such as +// "provider:account". The target is exec'd as a pure argv element (no +// shell), so this allowlist is defense in depth, not a shell-escape. +var resetAuthTargetRe = regexp.MustCompile(`^[A-Za-z0-9_.:-]+$`) + +// validateResetAuthTarget checks that an auth-reset target is non-empty, +// free of NUL bytes, and within the allowlisted charset before it is +// passed to ContainerManager.ResetAuth. Mirrors ValidateEnvVarKey's +// fail-closed style: an invalid target is rejected (no exec) rather than +// sanitized. +func validateResetAuthTarget(target string) error { + if target == "" { + return fmt.Errorf("auth-reset target is empty") + } + if strings.ContainsRune(target, 0) { + return fmt.Errorf("auth-reset target contains NUL byte") + } + if !resetAuthTargetRe.MatchString(target) { + return fmt.Errorf("invalid auth-reset target %q: must match [A-Za-z0-9_.:-]+", target) + } + return nil +} + // validateEnvVarValue rejects values that cannot be safely written as a // single line of a dotenv-style file. 
Newlines would split one logical // entry into multiple file lines (the second line would either be a diff --git a/internal/telegram/approval_test.go b/internal/telegram/approval_test.go index d8cc48c..081ed23 100644 --- a/internal/telegram/approval_test.go +++ b/internal/telegram/approval_test.go @@ -2092,6 +2092,10 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error { return nil } +func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error { + return nil +} + func (m *mockContainerMgr) Runtime() container.Runtime { return container.RuntimeDocker } From 8f4a12a2b5dfeb8fef61e6480f325e42b582bad1 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:36:59 +0800 Subject: [PATCH 09/19] feat(pools): auto-reset agent auth on pool recovery edge (opt-in per pool) --- cmd/sluice/main.go | 104 ++++++++ cmd/sluice/main_test.go | 250 +++++++++++++++++- ...22-pool-exhaustion-and-agent-auth-reset.md | 18 +- 3 files changed, 361 insertions(+), 11 deletions(-) diff --git a/cmd/sluice/main.go b/cmd/sluice/main.go index ef6094a..dd61bb5 100644 --- a/cmd/sluice/main.go +++ b/cmd/sluice/main.go @@ -528,6 +528,17 @@ func main() { }() }) + // Wire pool recovery side effects (Task 8): the always-fires operator + // notice (fanned across broker channels, independent of any reset target) + // and the opt-in, per-pool agent auth auto-reset. Registered + // UNCONDITIONALLY, mirroring SetOnFailover: the notice is internally gated + // on a broker being present, and the reset is internally gated on the + // recovered pool having a non-empty auth_reset_target AND a container + // manager. Both side effects run in detached goroutines so the recovery + // monitor is never blocked by a SQLite read, a Telegram round-trip, or a + // container exec. + wirePoolRecovery(srv, db, containerMgr, failoverBroker, logger) + if len(allChannels) > 0 { // Start all channels. 
if tgChannel != nil { @@ -955,6 +966,99 @@ func injectEnvVarsFromStore(db *store.Store, mgr container.ContainerManager) err return nil } +// auditLogger is the minimal audit-write surface wirePoolRecovery needs. +// *audit.FileLogger satisfies it; tests can supply a fake. +type auditLogger interface { + Log(evt audit.Event) error +} + +// wirePoolRecovery registers the two pool-recovery side effects on the proxy +// server (Task 8), mirroring the SetOnFailover wiring: +// +// - SetOnPoolRecoveredNotice: ALWAYS fires on a recovery edge, independent of +// any reset target. Builds the "pool recovered" notice via +// proxy.FormatPoolRecoveredNotice and fans it across the broker's channels. +// Internally gated only on a broker being present. +// - SetOnPoolRecovered: the opt-in, per-pool agent auth auto-reset. Looks up +// the recovered pool's auth_reset_target in the store; when non-empty AND a +// container manager exists, calls mgr.ResetAuth in a detached goroutine with +// a fresh bounded context (never a wake-scoped ctx) and emits an +// agent_auth_reset audit event on success. An empty target is the opt-out +// default (no reset). A ResetAuth error is logged, not fatal. +// +// Both callbacks run their work in detached goroutines so the recovery monitor +// is never blocked by a SQLite read, a Telegram round-trip, or a container exec. +// The two hooks are independent: the notice fires even when no reset target is +// configured, and the reset never suppresses the notice. +func wirePoolRecovery(srv *proxy.Server, db *store.Store, mgr container.ContainerManager, broker *channel.Broker, logger auditLogger) { + srv.SetOnPoolRecoveredNotice(poolRecoveredNoticeFunc(broker)) + srv.SetOnPoolRecovered(poolAuthResetFunc(db, mgr, logger)) +} + +// poolRecoveredNoticeFunc builds the always-fires recovery-notice callback. 
It +// fans proxy.FormatPoolRecoveredNotice across the broker's channels in a +// detached goroutine; a nil broker yields a callback that does nothing (the +// notice is internally gated on a broker being present, like SetOnFailover). +func poolRecoveredNoticeFunc(broker *channel.Broker) func(pool string) { + return func(pool string) { + if broker == nil { + return + } + go func() { + msg := proxy.FormatPoolRecoveredNotice(pool) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + for _, ch := range broker.Channels() { + if nerr := ch.Notify(ctx, msg); nerr != nil { + log.Printf("[POOL-RECOVERY] notice via %s failed: %v", ch.Type(), nerr) + } + } + }() + } +} + +// poolAuthResetFunc builds the opt-in, per-pool agent auth auto-reset callback. +// It looks up the recovered pool's auth_reset_target; when non-empty AND a +// container manager exists, it calls mgr.ResetAuth in a detached goroutine with +// a fresh bounded context and emits an agent_auth_reset audit event on success. +// An empty target is the opt-out default. A ResetAuth error is logged, not fatal. +func poolAuthResetFunc(db *store.Store, mgr container.ContainerManager, logger auditLogger) func(pool string) { + return func(pool string) { + if db == nil || mgr == nil { + return + } + go func() { + p, err := db.GetPool(pool) + if err != nil { + log.Printf("[POOL-RECOVERY] auth-reset lookup for pool %q failed: %v", pool, err) + return + } + if p == nil || p.AuthResetTarget == "" { + // Opt-out default: no reset target configured for this pool. 
+ return + } + target := p.AuthResetTarget + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if rerr := mgr.ResetAuth(ctx, target); rerr != nil { + log.Printf("[POOL-RECOVERY] auth reset for pool %q (target %q) failed: %v", pool, target, rerr) + return + } + log.Printf("[POOL-RECOVERY] agent auth reset for pool %q (target %q)", pool, target) + if logger != nil { + if lerr := logger.Log(audit.Event{ + Action: "agent_auth_reset", + Verdict: "recover", + Credential: pool, + Reason: target, + }); lerr != nil { + log.Printf("[POOL-RECOVERY] audit log for agent_auth_reset (pool %q) failed: %v", pool, lerr) + } + } + }() + } +} + // envDefault returns the environment variable value if set, otherwise the fallback. func envDefault(key, fallback string) string { if v := os.Getenv(key); v != "" { diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go index 7b2c2bc..c244bd1 100644 --- a/cmd/sluice/main_test.go +++ b/cmd/sluice/main_test.go @@ -13,6 +13,8 @@ import ( "time" "github.com/nemirovsky/sluice/internal/api" + "github.com/nemirovsky/sluice/internal/audit" + "github.com/nemirovsky/sluice/internal/channel" "github.com/nemirovsky/sluice/internal/container" "github.com/nemirovsky/sluice/internal/policy" "github.com/nemirovsky/sluice/internal/proxy" @@ -1420,8 +1422,19 @@ func TestStandaloneModeCredentialInjection(t *testing.T) { // mockContainerMgr implements container.ContainerManager for testing. 
type mockContainerMgr struct { - injectedEnv map[string]string - injectErr error + mu sync.Mutex + injectedEnv map[string]string + injectErr error + resetAuthCalls []string + resetAuthErr error +} + +func (m *mockContainerMgr) resetAuthTargets() []string { + m.mu.Lock() + defer m.mu.Unlock() + out := make([]string, len(m.resetAuthCalls)) + copy(out, m.resetAuthCalls) + return out } func (m *mockContainerMgr) InjectEnvVars(_ context.Context, envMap map[string]string, _ bool) error { @@ -1453,8 +1466,12 @@ func (m *mockContainerMgr) ReloadSecrets(_ context.Context) error { return nil } -func (m *mockContainerMgr) ResetAuth(_ context.Context, _ string) error { - return nil +func (m *mockContainerMgr) ResetAuth(_ context.Context, target string) error { + m.mu.Lock() + m.resetAuthCalls = append(m.resetAuthCalls, target) + err := m.resetAuthErr + m.mu.Unlock() + return err } func (m *mockContainerMgr) Runtime() container.Runtime { @@ -1672,3 +1689,228 @@ func TestDeriveMCPBaseURL(t *testing.T) { }) } } + +// recoveryMockChannel implements channel.Channel for wirePoolRecovery tests. +// It records every Notify message so the test can assert the recovered notice +// fired across the broker's channels. 
+type recoveryMockChannel struct { + mu sync.Mutex + notified []string +} + +func (c *recoveryMockChannel) RequestApproval(_ context.Context, _ channel.ApprovalRequest) error { + return nil +} +func (c *recoveryMockChannel) CancelApproval(_ string) error { return nil } +func (c *recoveryMockChannel) Commands() <-chan channel.Command { + return nil +} + +func (c *recoveryMockChannel) Notify(_ context.Context, msg string) error { + c.mu.Lock() + c.notified = append(c.notified, msg) + c.mu.Unlock() + return nil +} +func (c *recoveryMockChannel) Start() error { return nil } +func (c *recoveryMockChannel) Stop() {} +func (c *recoveryMockChannel) Type() channel.ChannelType { + return channel.ChannelTelegram +} + +func (c *recoveryMockChannel) messages() []string { + c.mu.Lock() + defer c.mu.Unlock() + out := make([]string, len(c.notified)) + copy(out, c.notified) + return out +} + +// fakeAuditLogger captures audit events for wirePoolRecovery tests. +type fakeAuditLogger struct { + mu sync.Mutex + events []audit.Event +} + +func (f *fakeAuditLogger) Log(evt audit.Event) error { + f.mu.Lock() + f.events = append(f.events, evt) + f.mu.Unlock() + return nil +} + +func (f *fakeAuditLogger) snapshot() []audit.Event { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]audit.Event, len(f.events)) + copy(out, f.events) + return out +} + +// waitFor polls cond up to ~2s; fails the test if it never becomes true. 
+func waitFor(t *testing.T, cond func() bool, msg string) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if cond() { + return + } + time.Sleep(5 * time.Millisecond) + } + t.Fatalf("condition never met: %s", msg) +} + +func newRecoveryTestStore(t *testing.T) *store.Store { + t.Helper() + db, err := store.New(":memory:") + if err != nil { + t.Fatalf("store.New: %v", err) + } + t.Cleanup(func() { _ = db.Close() }) + return db +} + +func seedRecoveryPool(t *testing.T, db *store.Store, pool string, members []string) { + t.Helper() + for _, m := range members { + if err := db.AddCredentialMeta(m, "oauth", "https://auth.example.com/token"); err != nil { + t.Fatalf("add credential meta %q: %v", m, err) + } + } + if err := db.CreatePoolWithMembers(pool, "", members); err != nil { + t.Fatalf("create pool %q: %v", pool, err) + } +} + +// TestPoolAuthResetFuncTargetSet asserts a pool with a non-empty +// auth_reset_target triggers exactly one ResetAuth(target) call and one +// agent_auth_reset audit event on recovery. 
+func TestPoolAuthResetFuncTargetSet(t *testing.T) { + db := newRecoveryTestStore(t) + seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { + t.Fatalf("SetPoolAuthResetTarget: %v", err) + } + + mgr := &mockContainerMgr{} + logger := &fakeAuditLogger{} + fn := poolAuthResetFunc(db, mgr, logger) + + fn("codex") + + waitFor(t, func() bool { return len(mgr.resetAuthTargets()) == 1 }, "ResetAuth called once") + targets := mgr.resetAuthTargets() + if targets[0] != "openai-codex" { + t.Errorf("ResetAuth target = %q, want %q", targets[0], "openai-codex") + } + + waitFor(t, func() bool { return len(logger.snapshot()) == 1 }, "one audit event") + evts := logger.snapshot() + e := evts[0] + if e.Action != "agent_auth_reset" || e.Verdict != "recover" || e.Credential != "codex" || e.Reason != "openai-codex" { + t.Errorf("audit event = %+v, want action=agent_auth_reset verdict=recover credential=codex reason=openai-codex", e) + } +} + +// TestPoolAuthResetFuncNoTarget asserts a pool with an empty auth_reset_target +// triggers no ResetAuth call (the recovered notice path is independent and +// covered by TestPoolRecoveredNoticeFunc). +func TestPoolAuthResetFuncNoTarget(t *testing.T) { + db := newRecoveryTestStore(t) + seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + + mgr := &mockContainerMgr{} + logger := &fakeAuditLogger{} + fn := poolAuthResetFunc(db, mgr, logger) + + fn("codex") + + // Give the detached goroutine time to run; it must not call ResetAuth. + time.Sleep(100 * time.Millisecond) + if got := mgr.resetAuthTargets(); len(got) != 0 { + t.Errorf("ResetAuth calls = %v, want none for a pool with empty auth_reset_target", got) + } + if got := logger.snapshot(); len(got) != 0 { + t.Errorf("audit events = %+v, want none", got) + } +} + +// TestPoolAuthResetFuncErrorNotFatal asserts a ResetAuth error is logged (no +// panic, no audit event) and does not crash. 
+func TestPoolAuthResetFuncErrorNotFatal(t *testing.T) { + db := newRecoveryTestStore(t) + seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { + t.Fatalf("SetPoolAuthResetTarget: %v", err) + } + + mgr := &mockContainerMgr{resetAuthErr: fmt.Errorf("reset boom")} + logger := &fakeAuditLogger{} + fn := poolAuthResetFunc(db, mgr, logger) + + fn("codex") + + waitFor(t, func() bool { return len(mgr.resetAuthTargets()) == 1 }, "ResetAuth attempted once") + // On error, no audit event is emitted; give the goroutine time to settle. + time.Sleep(100 * time.Millisecond) + if got := logger.snapshot(); len(got) != 0 { + t.Errorf("audit events = %+v, want none on ResetAuth error", got) + } +} + +// TestPoolAuthResetFuncNilManager asserts a nil container manager is a no-op +// (no panic), covering standalone deployments. +func TestPoolAuthResetFuncNilManager(t *testing.T) { + db := newRecoveryTestStore(t) + seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { + t.Fatalf("SetPoolAuthResetTarget: %v", err) + } + fn := poolAuthResetFunc(db, nil, &fakeAuditLogger{}) + fn("codex") // must not panic + time.Sleep(50 * time.Millisecond) +} + +// TestPoolRecoveredNoticeFunc asserts the recovered notice fires across the +// broker's channels even when no auth_reset_target is configured. +func TestPoolRecoveredNoticeFunc(t *testing.T) { + ch := &recoveryMockChannel{} + broker := channel.NewBroker([]channel.Channel{ch}) + fn := poolRecoveredNoticeFunc(broker) + + fn("codex") + + waitFor(t, func() bool { return len(ch.messages()) == 1 }, "one recovered notice") + want := proxy.FormatPoolRecoveredNotice("codex") + if got := ch.messages(); got[0] != want { + t.Errorf("notice = %q, want %q", got[0], want) + } +} + +// TestPoolRecoveredNoticeFuncNoBroker asserts a nil broker yields a no-op +// callback (no panic). 
+func TestPoolRecoveredNoticeFuncNoBroker(t *testing.T) { + fn := poolRecoveredNoticeFunc(nil) + fn("codex") // must not panic + time.Sleep(20 * time.Millisecond) +} + +// TestWirePoolRecoveryRegisters asserts wirePoolRecovery wires both callbacks +// onto the server without panicking (smoke test of the registration path). +func TestWirePoolRecoveryRegisters(t *testing.T) { + db := newRecoveryTestStore(t) + eng, err := policy.LoadFromStore(db) + if err != nil { + t.Fatalf("LoadFromStore: %v", err) + } + srv, err := proxy.New(proxy.Config{ListenAddr: "127.0.0.1:0", Policy: eng}) + if err != nil { + t.Fatalf("proxy.New: %v", err) + } + defer func() { _ = srv.Close() }() + + ch := &recoveryMockChannel{} + broker := channel.NewBroker([]channel.Channel{ch}) + mgr := &mockContainerMgr{} + wirePoolRecovery(srv, db, mgr, broker, &fakeAuditLogger{}) +} diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index acde716..6e23903 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -369,16 +369,20 @@ precedent) and emits an `agent_auth_reset` audit event. 
- Modify: `internal/proxy/pool_failover.go` / audit usage (`agent_auth_reset` action) - Modify: relevant `_test.go` -- [ ] in main.go register `SetOnPoolRecovered`: look up the recovered pool's +- [x] in main.go register `SetOnPoolRecovered`: look up the recovered pool's `auth_reset_target`; if non-empty and `containerMgr != nil`, call `containerMgr.ResetAuth(ctx, target)` in a detached goroutine using a fresh `context.WithTimeout(context.Background(), …)` (never block; never reuse a wake-scoped ctx); - log + emit `agent_auth_reset`; empty target → no reset (opt-out default) -- [ ] keep the recovered Telegram notice (Task 3) and the reset independent (notice always; - reset only when target set); a `ResetAuth` error is logged, not fatal -- [ ] write tests: target set → recovery triggers `ResetAuth(target)` once; no target → no - reset, notice still emitted; `ResetAuth` error logged not fatal -- [ ] run `go test ./... -race` for touched packages; gofumpt; vet — pass before Task 9 + log + emit `agent_auth_reset`; empty target → no reset (opt-out default). Implemented via + `wirePoolRecovery` (registered unconditionally near `SetOnFailover`), with + `poolAuthResetFunc`/`poolRecoveredNoticeFunc` factored out for direct testing. +- [x] keep the recovered Telegram notice (Task 3) and the reset independent (notice always + via `SetOnPoolRecoveredNotice` fanned across `failoverBroker.Channels()`; reset only when + target set via `SetOnPoolRecovered`); a `ResetAuth` error is logged, not fatal +- [x] write tests: target set → recovery triggers `ResetAuth(target)` once + one + `agent_auth_reset` audit event; no target → no reset; notice fires across broker channels + regardless of target; `ResetAuth` error logged not fatal; nil manager/nil broker no-op +- [x] run `go test ./... 
-race` for touched packages; gofumpt; vet — pass before Task 9 ### Task 9: Verify acceptance criteria - [ ] both members exhausted → exactly one "pool exhausted" notice (no flap stream) From bcc876a34ab0c1267a1f743b80118caef9cf25bf Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:44:51 +0800 Subject: [PATCH 10/19] test: verify acceptance criteria for pool exhaustion + auth reset --- cmd/sluice/main_test.go | 13 ++++---- ...22-pool-exhaustion-and-agent-auth-reset.md | 31 +++++++++++++++---- internal/proxy/pool_recovery_monitor_test.go | 14 ++++----- internal/vault/pool_test.go | 8 ++--- 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go index c244bd1..bcc7740 100644 --- a/cmd/sluice/main_test.go +++ b/cmd/sluice/main_test.go @@ -1770,8 +1770,9 @@ func newRecoveryTestStore(t *testing.T) *store.Store { return db } -func seedRecoveryPool(t *testing.T, db *store.Store, pool string, members []string) { +func seedRecoveryPool(t *testing.T, db *store.Store, members []string) { t.Helper() + const pool = "codex" for _, m := range members { if err := db.AddCredentialMeta(m, "oauth", "https://auth.example.com/token"); err != nil { t.Fatalf("add credential meta %q: %v", m, err) @@ -1787,7 +1788,7 @@ func seedRecoveryPool(t *testing.T, db *store.Store, pool string, members []stri // agent_auth_reset audit event on recovery. func TestPoolAuthResetFuncTargetSet(t *testing.T) { db := newRecoveryTestStore(t) - seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + seedRecoveryPool(t, db, []string{"acct_a", "acct_b"}) if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { t.Fatalf("SetPoolAuthResetTarget: %v", err) } @@ -1817,7 +1818,7 @@ func TestPoolAuthResetFuncTargetSet(t *testing.T) { // covered by TestPoolRecoveredNoticeFunc). 
func TestPoolAuthResetFuncNoTarget(t *testing.T) { db := newRecoveryTestStore(t) - seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + seedRecoveryPool(t, db, []string{"acct_a", "acct_b"}) mgr := &mockContainerMgr{} logger := &fakeAuditLogger{} @@ -1839,7 +1840,7 @@ func TestPoolAuthResetFuncNoTarget(t *testing.T) { // panic, no audit event) and does not crash. func TestPoolAuthResetFuncErrorNotFatal(t *testing.T) { db := newRecoveryTestStore(t) - seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + seedRecoveryPool(t, db, []string{"acct_a", "acct_b"}) if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { t.Fatalf("SetPoolAuthResetTarget: %v", err) } @@ -1862,7 +1863,7 @@ func TestPoolAuthResetFuncErrorNotFatal(t *testing.T) { // (no panic), covering standalone deployments. func TestPoolAuthResetFuncNilManager(t *testing.T) { db := newRecoveryTestStore(t) - seedRecoveryPool(t, db, "codex", []string{"acct_a", "acct_b"}) + seedRecoveryPool(t, db, []string{"acct_a", "acct_b"}) if err := db.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { t.Fatalf("SetPoolAuthResetTarget: %v", err) } @@ -1889,7 +1890,7 @@ func TestPoolRecoveredNoticeFunc(t *testing.T) { // TestPoolRecoveredNoticeFuncNoBroker asserts a nil broker yields a no-op // callback (no panic). -func TestPoolRecoveredNoticeFuncNoBroker(t *testing.T) { +func TestPoolRecoveredNoticeFuncNoBroker(_ *testing.T) { fn := poolRecoveredNoticeFunc(nil) fn("codex") // must not panic time.Sleep(20 * time.Millisecond) diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md index 6e23903..854324d 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -385,15 +385,34 @@ precedent) and emits an `agent_auth_reset` audit event. - [x] run `go test ./... 
-race` for touched packages; gofumpt; vet — pass before Task 9 ### Task 9: Verify acceptance criteria -- [ ] both members exhausted → exactly one "pool exhausted" notice (no flap stream) -- [ ] B1 cooldown reflects the upstream window (member not re-probed every 60s) -- [ ] recovery edge → one "pool recovered" notice; auth reset fires only for pools with a +- [x] both members exhausted → exactly one "pool exhausted" notice (no flap stream) + (covered: `TestExhaustedNoticeEdgeAcrossManyResponses`, `TestMarkPoolExhaustedEdge` in + `internal/proxy/pool_recovery_monitor_test.go`) +- [x] B1 cooldown reflects the upstream window (member not re-probed every 60s) + (covered: `TestCooldownFromResponse` + `TestCooldownFromResponseNilSafe` in + `internal/proxy/pool_failover_test.go`; `cooldownFromResponse` wired into `handlePoolFailover` + at `pool_failover.go:629`) +- [x] recovery edge → one "pool recovered" notice; auth reset fires only for pools with a target; hermes reset runs as 10000:10000 (no root-chown of auth.json) -- [ ] channel parity verified: `auth_reset_target` settable+readable via CLI, REST, Telegram -- [ ] full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`; `gofumpt -l` clean; + (covered: `TestRecoveryMonitorEdgeNoticeAndCallback`, `TestRecoveryMonitorUnequalCooldowns`, + `TestRecoveryMonitorPoolRemovedFiresNoNotice`; `TestPoolAuthResetFuncTargetSet`/`...NoTarget` + in `cmd/sluice/main_test.go` assert target-gating + the `agent_auth_reset` audit event; + `TestResetAuthHermesRunsAsRuntimeUID` asserts 10000:10000) +- [x] channel parity verified: `auth_reset_target` settable+readable via CLI, REST, Telegram + (CLI `TestHandlePoolAuthResetTarget` in `cmd/sluice/pool_test.go`; REST + `TestPostApiPoolsNameAuthResetTarget` + `TestPostApiPools_WithAuthResetTarget` in + `internal/api/server_test.go`; Telegram `TestHandlePoolAuthResetTarget` in + `internal/telegram/commands_test.go`; channel-agnostic `internal/poolops` tests + 
`TestCreateWithAuthResetTarget`/`TestSetAuthResetTargetSetAndClear`) +- [x] full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`; `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; `make generate` then `git diff --exit-code internal/api/api.gen.go` clean -- [ ] independently verify committed HEAD builds + tests pass (do not trust subagent green) + (results: build OK; `go test ./...` 2826 pass / 0 fail across 14 pkgs; vet clean; e2e vet + clean; gofumpt clean on tracked source — `internal/api/api.gen.go` is the raw oapi-codegen + output, byte-stable under `make generate`, NOT gofumpt-formatted by project convention and + CI-green on main; golangci-lint v2.9.0 0 issues; `make generate` produces no api.gen.go diff) +- [x] independently verify committed HEAD builds + tests pass (do not trust subagent green) + (verified post-commit: clean `git status`, build + full suite re-run green at HEAD) ### Task 10: [Final] Documentation - [ ] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2 diff --git a/internal/proxy/pool_recovery_monitor_test.go b/internal/proxy/pool_recovery_monitor_test.go index 4e298e4..01b6a27 100644 --- a/internal/proxy/pool_recovery_monitor_test.go +++ b/internal/proxy/pool_recovery_monitor_test.go @@ -36,9 +36,9 @@ default = "deny" // twoMemberPool builds a 2-member failover pool resolver and stores it on the // server. Returns the live resolver. 
-func twoMemberPool(t *testing.T, srv *Server, name, a, b string) *vault.PoolResolver { +func twoMemberPool(t *testing.T, srv *Server, a, b string) *vault.PoolResolver { t.Helper() - pool := store.Pool{Name: name, Strategy: store.PoolStrategyFailover} + pool := store.Pool{Name: "p", Strategy: store.PoolStrategyFailover} pool.Members = []store.PoolMember{ {Credential: a, Position: 0}, {Credential: b, Position: 1}, @@ -70,7 +70,7 @@ func TestMarkPoolExhaustedEdge(t *testing.T) { // but here we only test the edge bookkeeping, so use a pool that stays // exhausted (no resolver -> scanRecovery clears it). Store a resolver whose // members are all cooling so it stays exhausted. - pr := twoMemberPool(t, srv, "p", "a", "b") + pr := twoMemberPool(t, srv, "a", "b") far := time.Now().Add(time.Hour) pr.MarkCooldown("a", far, "429") pr.MarkCooldown("b", far, "429") @@ -139,7 +139,7 @@ func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) { // exactly one onPoolRecovered call. func TestRecoveryMonitorEdgeNoticeAndCallback(t *testing.T) { srv := newMonitorTestServer(t) - pr := twoMemberPool(t, srv, "p", "a", "b") + pr := twoMemberPool(t, srv, "a", "b") var notices, resets int32 var mu sync.Mutex @@ -195,7 +195,7 @@ func TestRecoveryMonitorEdgeNoticeAndCallback(t *testing.T) { // is still cooling. Uses short injected durations (no 60s sleep). func TestRecoveryMonitorUnequalCooldowns(t *testing.T) { srv := newMonitorTestServer(t) - pr := twoMemberPool(t, srv, "p", "memA", "memB") + pr := twoMemberPool(t, srv, "memA", "memB") var notices int32 srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) }) @@ -225,7 +225,7 @@ func TestRecoveryMonitorUnequalCooldowns(t *testing.T) { // exhausted has its state dropped and fires no recovered notice.
func TestRecoveryMonitorPoolRemovedFiresNoNotice(t *testing.T) { srv := newMonitorTestServer(t) - pr := twoMemberPool(t, srv, "p", "a", "b") + pr := twoMemberPool(t, srv, "a", "b") pr.MarkCooldown("a", time.Now().Add(time.Hour), "429") pr.MarkCooldown("b", time.Now().Add(time.Hour), "429") @@ -291,7 +291,7 @@ func TestRecoveryMonitorStopsCleanly(t *testing.T) { // stop must not be serviced (no recovered notice ever fires). var notices int32 srv.SetOnPoolRecoveredNotice(func(string) { atomic.AddInt32(&notices, 1) }) - pr := twoMemberPool(t, srv, "p", "a", "b") + pr := twoMemberPool(t, srv, "a", "b") pr.MarkCooldown("a", time.Now().Add(20*time.Millisecond), "429") pr.MarkCooldown("b", time.Now().Add(20*time.Millisecond), "429") srv.markPoolExhausted("p") diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go index 4060e8b..7dc0668 100644 --- a/internal/vault/pool_test.go +++ b/internal/vault/pool_test.go @@ -955,7 +955,7 @@ func TestHasHealthyMember(t *testing.T) { }{ { name: "both healthy", - setup: func(pr *PoolResolver) {}, + setup: func(_ *PoolResolver) {}, pool: "pool", want: true, }, @@ -1001,7 +1001,7 @@ func TestHasHealthyMember(t *testing.T) { }, { name: "unknown pool", - setup: func(pr *PoolResolver) {}, + setup: func(_ *PoolResolver) {}, pool: "nope", want: false, }, @@ -1039,7 +1039,7 @@ func TestSoonestCooldown(t *testing.T) { }{ { name: "no member cooling", - setup: func(pr *PoolResolver, base time.Time) {}, + setup: func(_ *PoolResolver, _ time.Time) {}, pool: "pool", wantOK: false, }, @@ -1085,7 +1085,7 @@ func TestSoonestCooldown(t *testing.T) { }, { name: "unknown pool", - setup: func(pr *PoolResolver, base time.Time) {}, + setup: func(_ *PoolResolver, _ time.Time) {}, pool: "nope", wantOK: false, }, From 889e9e62c0e1775b34e5480781a856b76a677b8e Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 16:49:06 +0800 Subject: [PATCH 11/19] docs: document pool exhaustion handling + agent auth auto-reset --- CLAUDE.md | 35
+++++++++++-------- ...22-pool-exhaustion-and-agent-auth-reset.md | 6 ++-- 2 files changed, 23 insertions(+), 18 deletions(-) rename docs/plans/{ => completed}/20260522-pool-exhaustion-and-agent-auth-reset.md (99%) diff --git a/CLAUDE.md b/CLAUDE.md index eec0716..853c084 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -105,14 +105,16 @@ The only acceptable single-channel features have a documented rationale making t ## Agent Profiles -Profiles abstract per-agent runtime conventions so container managers stay agent-agnostic. Each carries `EnvFileRelPath` (phantom env-var path), `ReloadCmd` (argv for in-place secret reload, or nil), `WireMCPCmd` (argv to register sluice as an MCP server). +Profiles abstract per-agent runtime conventions so container managers stay agent-agnostic. Each carries `EnvFileRelPath` (phantom env-var path), `ReloadCmd` (argv for in-place secret reload, or nil), `WireMCPCmd` (argv to register sluice as an MCP server), `ResetAuthCmd(target)` (argv to un-latch the agent's local auth state on pool recovery, or nil), and `ExecUser()` (UID:GID for file-writing execs, "" = image USER). 
-| Profile | Env file | Reload | MCP wiring | -|---------|----------|--------|------------| -| `openclaw` (default) | `~/.openclaw/.env` | `node -e secrets.reload` over the agent's WebSocket gateway | `node -e wire-mcp ` patches `mcp.servers.` | -| `hermes` | `~/.hermes/.env` | None (see caveats) | sh wrapper patches `mcp_servers..url` in `~/.hermes/config.yaml` (see caveats) | +| Profile | Env file | Reload | MCP wiring | Auth reset | Exec user | +|---------|----------|--------|------------|------------|-----------| +| `openclaw` (default) | `~/.openclaw/.env` | `node -e secrets.reload` over the agent's WebSocket gateway | `node -e wire-mcp ` patches `mcp.servers.` | none (nil; openclaw latch unverified) | "" (root) | +| `hermes` | `~/.hermes/.env` | None (see caveats) | sh wrapper patches `mcp_servers..url` in `~/.hermes/config.yaml` (see caveats) | `hermes auth reset ` (pure argv `/opt/hermes/.venv/bin/hermes auth reset `) | `10000:10000` | -Adding a profile is a single edit to `internal/container/agent_profile.go` (register a struct in `builtinProfiles`); all three backends consume it via `BuildEnvInjectionScriptForProfile`, `profile.ReloadCmd()`, `profile.WireMCPCmd()`. +`ExecUser` exists because hermes runs its gateway as the non-root runtime UID 10000; a default-root `docker exec` that writes a hermes-owned file (`auth.json`) root-chowns it and bricks the gateway, so `ResetAuth` execs as `profile.ExecUser()`. The target is validated (non-empty, no NUL, charset `[A-Za-z0-9_.:-]+`) before exec; `ResetAuthCmd` is pure argv (no `sh -c`) so there is no shell-metachar threat. + +Adding a profile is a single edit to `internal/container/agent_profile.go` (register a struct in `builtinProfiles`); all three backends consume it via `BuildEnvInjectionScriptForProfile`, `profile.ReloadCmd()`, `profile.WireMCPCmd()`, `profile.ResetAuthCmd()`, `profile.ExecUser()`. Hermes caveats: - `ReloadCmd` nil; `ReloadSecrets` logs a notice, returns nil. 
New phantom tokens take effect on next Hermes message or `/reload-mcp`. @@ -174,18 +176,19 @@ A **pool** backs one phantom identity with **N real OAuth credentials**. The age **CLI:** ``` -sluice pool create --members credA,credB[,credC] # ordered; rejects static; namespace must not collide with a credential name +sluice pool create --members credA,credB[,credC] [--auth-reset-target ] # ordered; rejects static; namespace must not collide with a credential name sluice pool list -sluice pool status # active member, per-member health (healthy / cooldown + until + reason) +sluice pool status # active member, per-member health (healthy / cooldown + until + reason), auth-reset target sluice pool rotate # operator override: advance active member +sluice pool set-auth-reset # set/clear the recovery auth-reset target (a single - clears) sluice pool remove ``` -Pools are reachable from all channels — CLI `sluice pool`, REST `/api/pools` (`GET`/`POST`, `GET`/`DELETE /api/pools/{name}`, `POST /api/pools/{name}/rotate`), and Telegram `/pool` — all via the channel-agnostic `internal/poolops`. +Pools are reachable from all channels — CLI `sluice pool`, REST `/api/pools` (`GET`/`POST`, `GET`/`DELETE /api/pools/{name}`, `POST /api/pools/{name}/rotate`, `POST /api/pools/{name}/auth-reset-target`), and Telegram `/pool` (incl. `/pool set-auth-reset ` and an optional 3rd `/pool create` arg) — all via the channel-agnostic `internal/poolops`. Auto-failover on 429/401 is primary; `pool rotate` is an override. -**Data model (migration `000006_credential_pools`):** `credential_pools` (name, strategy reserved `failover`), `credential_pool_members` (ordered, pool->credential FK), `credential_health` (`healthy|cooldown`, `cooldown_until`, `last_failure_reason`), all CHECK-constrained. Store API in `internal/store/pools.go`. 
`reloadAll` loads pool+health into an atomic-pointer-swapped `PoolResolver` (`internal/vault/pool.go`), rewired via `srv.StorePool`/`SetPoolResolver` on SIGHUP and the 2s data-version watcher. +**Data model (migration `000006_credential_pools`):** `credential_pools` (name, strategy reserved `failover`, `auth_reset_target` added by migration `000008_pool_auth_reset` — empty default = opt-out), `credential_pool_members` (ordered, pool->credential FK), `credential_health` (`healthy|cooldown`, `cooldown_until`, `last_failure_reason`), all CHECK-constrained. Store API in `internal/store/pools.go` (`SetPoolAuthResetTarget`). `reloadAll` loads pool+health into an atomic-pointer-swapped `PoolResolver` (`internal/vault/pool.go`), rewired via `srv.StorePool`/`SetPoolResolver` on SIGHUP and the 2s data-version watcher. **Phase 1 — phantom indirection (pool phantom -> active member):** @@ -199,13 +202,15 @@ Auto-failover on 429/401 is primary; `pool rotate` is an override. - **Classification** (`classifyFailover`, `internal/proxy/pool_failover.go`, from `SluiceAddon.Response` for pooled destinations): `429`/`403 + insufficient_quota` -> rate-limited; `401`/token-body `invalid_grant`/`invalid_token` -> auth-failure; `5xx`/other -> no-op. Token-endpoint body trusted only when the request URL matched the OAuth index. - **Pool attribution** (`poolForResponse`): a response is pool-attributed either (a) the flow's CONNECT host has a pooled binding (API-host 429/403), or (b) the request URL matches the OAuth token-URL index for a member (token-endpoint 401/`invalid_grant`). (b) is essential — an OAuth refresh hits `auth.openai.com` (no pool binding; only `api.openai.com` has one), so without it the token-endpoint classification is dead code for Codex. Member recovery + fail-closed are the R1 mechanism above (`OAuthIndex.MatchAll` + the refresh-token join key, never `OAuthIndex.Match`). 
-- **Synchronous in-memory failover (I1):** health is updated in-process before the response returns (`MarkCooldown` write lock, `ResolveActive` read lock) so the switch never waits on the 2s watcher (which only reconciles); a detached `onFailover` also writes `SetCredentialHealth(member,'cooldown',now+ttl,reason)` for durability. TTLs: `vault.RateLimitCooldown`=60s, `vault.AuthFailCooldown`=300s. **Cooldown extension is monotonic on both layers:** a member parked 300s for auth that then trips a 60s rate-limit keeps the LATER expiry — `MarkCooldown` and `SetCredentialHealth`'s `cooldown` upsert (CASE-compared against the stored future `cooldown_until`) both keep `max(existing-future, new)`. Only extend is monotonic: an explicit clear (zero/past `until`) and any transition to `healthy` still shorten/clear, and lazy expiry still wins over an expired stored cooldown. No in-flight retry — next request uses the new member. +- **Synchronous in-memory failover (I1):** health is updated in-process before the response returns (`MarkCooldown` write lock, `ResolveActive` read lock) so the switch never waits on the 2s watcher (which only reconciles); a detached `onFailover` also writes `SetCredentialHealth(member,'cooldown',now+ttl,reason)` for durability. **Cooldown window (B1):** `cooldownFromResponse(class, f.Response.Header)` (`internal/proxy/pool_failover.go`) derives the TTL from the upstream recovery hints — `Retry-After` (delta-seconds or HTTP-date), then `x-ratelimit-reset` / `x-ratelimit-reset-requests` / `x-ratelimit-reset-tokens` (delta or epoch) — clamped to `[floor(class), vault.MaxCooldown=6h]`; no hint falls back to the class default (`vault.RateLimitCooldown`=60s / `vault.AuthFailCooldown`=300s). Floors: rate-limit `vault.MinRateLimitFloor`=10s (a short parsed window is honored, not floored up to 60s), auth-failure `AuthFailCooldown` (a revoked/expired token is never re-probed in seconds). 
This honors the real multi-hour quota window so a usage-limited member is not re-probed every 60s (the degrade-flap root cause). No body parsing yet. **Cooldown extension is monotonic on both layers:** a member parked 300s for auth that then trips a 60s rate-limit keeps the LATER expiry — `MarkCooldown` and `SetCredentialHealth`'s `cooldown` upsert (CASE-compared against the stored future `cooldown_until`) both keep `max(existing-future, new)`. Only extend is monotonic: an explicit clear (zero/past `until`) and any transition to `healthy` still shorten/clear, and lazy expiry still wins over an expired stored cooldown. No in-flight retry — next request uses the new member. - **Reload doesn't resurrect a cooled member:** the durable write is detached/best-effort, so any reload (SIGHUP or the 2s watcher on any unrelated DB write) rebuilds the resolver from store rows via `NewPoolResolver`; `Server.StorePool` calls `PoolResolver.MergeLiveCooldowns(prev)` to carry forward still-active in-memory cooldowns before the atomic swap (monotonic; drops cooldowns for credentials no longer in any pool). -- **Audit:** `cred_failover` (Verdict `failover`, Credential = cooled member), `Reason = ":->:<429|403|401|invalid_grant>"`, emitted synchronously in `handlePoolFailover`. -- **Telegram:** best-effort non-blocking notice "pool failed over -> ()" (plain text); store write + every channel `Notify` detached into their own goroutine so the response path never blocks. +- **Exhaustion + edge-triggered notices (A1/A2):** a pool is exhausted iff `PoolResolver.HasHealthyMember(pool)` (RLock, single `now`, mirrors `cooling()` lazy-expiry) is false — NOT `to == from`. Per-pool exhaustion state lives on the long-lived `Server` (`poolExhausted` map, NOT `PoolHealth`, so it survives resolver swaps and is not pruned on membership change). `handlePoolFailover` emits one "pool exhausted" notice on the `false->true` edge and wakes a dedicated recovery monitor goroutine (cap-1 `recoveryWake`). 
The monitor (`internal/proxy/server.go`, started in `New`, stopped idempotently from both `Close` and `GracefulShutdown`) sleeps until `SoonestCooldown(pool)` (clamped to a ~1s floor), `Load()`s the current resolver each wake, and on `HasHealthyMember -> true` flips `true->false`, emits one "pool recovered" notice (`FormatPoolRecoveredNotice`), and invokes `onPoolRecovered`. This replaces the old per-cooldown-window flap that respammed `cred_failover` + a Telegram notice every ~30/60s. +- **Recovery auto-reset (opt-in, per pool):** if the recovered pool has a non-empty `auth_reset_target`, `onPoolRecovered` (wired in `cmd/sluice/main.go` via `wirePoolRecovery`) calls `containerMgr.ResetAuth(ctx, target)` in a detached goroutine with a fresh bounded context and emits an `agent_auth_reset` audit event (`Verdict "recover"`, `Credential` = pool, `Reason` = target). Empty target = no reset (opt-out default); a `ResetAuth` error is logged, not fatal. This un-latches an agent (hermes) that latched "usage limit reached" so it resumes without a manual `auth reset`. +- **Audit:** `cred_failover` (Verdict `failover`, Credential = cooled member), `Reason = ":->:<429|403|401|invalid_grant>"`, emitted synchronously in `handlePoolFailover`; `agent_auth_reset` (Verdict `recover`) on a successful recovery auto-reset. +- **Telegram:** best-effort non-blocking notice "pool failed over -> ()" on a real transition, plus the edge-triggered exhausted/recovered notices (`SetOnPoolRecoveredNotice` fans the recovered notice across `failoverBroker.Channels()` independent of the auth-reset); store write + every channel `Notify` detached into their own goroutine so the response path never blocks. - **Known limitation:** streaming responses bypass failover (`handlePoolFailover` runs only from the buffered `Response` addon; SSE / `StreamLargeBodies`-exceeding bodies set `f.Stream=true` and skip it). Impact low (quota/auth bodies are tiny JSON); the next non-streamed request fails over normally. 
-**Key files:** `internal/store/migrations/000006_credential_pools.{up,down}.sql`, `internal/store/pools.go`, `internal/vault/pool.go`, `internal/proxy/pool_failover.go`, `cmd/sluice/pool.go`, plus pool routing in `internal/proxy/addon.go` / `oauth_response.go`. +**Key files:** `internal/store/migrations/000006_credential_pools.{up,down}.sql`, `000008_pool_auth_reset.{up,down}.sql`, `internal/store/pools.go`, `internal/vault/pool.go`, `internal/proxy/pool_failover.go`, `internal/proxy/server.go` (recovery monitor), `cmd/sluice/pool.go` / `main.go` (`wirePoolRecovery`), `internal/container/agent_profile.go` (`ResetAuthCmd`/`ExecUser`), plus pool routing in `internal/proxy/addon.go` / `oauth_response.go`. ### Protocol-specific handling @@ -272,7 +277,7 @@ Two-phase: port-based guess first (standard ports 443/22/25/… route on it), by ### Audit logger -Optional. JSON lines, blake3 hash chain (`prev_hash`, genesis = blake3("")); recovers across restarts by reading the last line. `sluice audit verify` walks the log and reports broken links. Common action names: `tool_call` (MCP verdict), `inspect_block` (ContentInspector arg block), `exec_block` (ExecInspector block), `response_dlp_redact` (HTTPS response/header redacted), `inject` (phantom injected outbound), `deny` (connection denied at SOCKS5/SNI). +Optional. JSON lines, blake3 hash chain (`prev_hash`, genesis = blake3("")); recovers across restarts by reading the last line. `sluice audit verify` walks the log and reports broken links. Common action names: `tool_call` (MCP verdict), `inspect_block` (ContentInspector arg block), `exec_block` (ExecInspector block), `response_dlp_redact` (HTTPS response/header redacted), `inject` (phantom injected outbound), `deny` (connection denied at SOCKS5/SNI), `cred_failover` (pool member cooled, Verdict `failover`), `agent_auth_reset` (recovery auto-reset run, Verdict `recover`). 
### MCP gateway diff --git a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md b/docs/plans/completed/20260522-pool-exhaustion-and-agent-auth-reset.md similarity index 99% rename from docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md rename to docs/plans/completed/20260522-pool-exhaustion-and-agent-auth-reset.md index 854324d..7be9f25 100644 --- a/docs/plans/20260522-pool-exhaustion-and-agent-auth-reset.md +++ b/docs/plans/completed/20260522-pool-exhaustion-and-agent-auth-reset.md @@ -415,12 +415,12 @@ precedent) and emits an `agent_auth_reset` audit event. (verified post-commit: clean `git status`, build + full suite re-run green at HEAD) ### Task 10: [Final] Documentation -- [ ] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2 +- [x] update `CLAUDE.md` credential-pools section: B1 cooldown-from-hints, A1/A2 exhaustion+edge notices (replacing the per-window dedup wording), per-pool `auth_reset_target` + recovery auto-reset, the `agent_auth_reset` audit action, and the `ResetAuthCmd`/`ExecUser` profile hooks in the Agent Profiles table -- [ ] cross-check the CLI subcommand list + the auto-generated `api/openapi.yaml` in CLAUDE.md -- [ ] move this plan to `docs/plans/completed/` +- [x] cross-check the CLI subcommand list + the auto-generated `api/openapi.yaml` in CLAUDE.md +- [x] move this plan to `docs/plans/completed/` ## Post-Completion *Items requiring manual intervention or external systems — informational only* From be34498de0c878b829e84744b306319ab72190d9 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Fri, 22 May 2026 19:51:09 +0800 Subject: [PATCH 12/19] fix(pools): enforce auth_reset_target charset parity at set time across channels --- cmd/sluice/pool_test.go | 9 ++++ internal/api/server_test.go | 20 ++++++-- internal/container/agent_profile.go | 4 +- internal/container/apple.go | 10 +++- internal/container/docker.go | 2 +- internal/container/tart.go | 10 +++- internal/container/types.go | 11 ++++- 
internal/poolops/poolops.go | 27 ++++++----- internal/poolops/poolops_test.go | 72 +++++++++++++++++++++-------- internal/proxy/server.go | 9 ++++ internal/telegram/commands_test.go | 11 +++++ 11 files changed, 145 insertions(+), 40 deletions(-) diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go index f3189c9..d306170 100644 --- a/cmd/sluice/pool_test.go +++ b/cmd/sluice/pool_test.go @@ -165,6 +165,15 @@ func TestHandlePoolAuthResetTarget(t *testing.T) { if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex"}); err == nil { t.Error("expected usage error for set-auth-reset with too few args") } + + // A target outside the allowlist (space, slash) is rejected at set time, not + // stored with success and then silently un-executable at recovery (F1). + for name, target := range map[string]string{"space": "openai codex", "slash": "openai/codex"} { + if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", target}); err == nil { + t.Errorf("%s target: expected rejection, got nil error", name) + } + assertStoredAuthResetTarget(t, dbPath, "codex", "") + } } func assertStoredAuthResetTarget(t *testing.T, dbPath, pool, want string) { diff --git a/internal/api/server_test.go b/internal/api/server_test.go index fa069fb..60cb452 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -3098,10 +3098,22 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { t.Fatalf("after clear stored AuthResetTarget = %q, want empty", got.AuthResetTarget) } - // Invalid target (newline) -> 400. - rec = post(`{"auth_reset_target": "bad\ntarget"}`) - if rec.Code != http.StatusBadRequest { - t.Fatalf("invalid target: expected 400, got %d: %s", rec.Code, rec.Body.String()) + // Invalid targets -> 400 and no store mutation. Newline was caught by the + // old looser rule; a space and a slash were NOT (F1) and are the cases that + // would otherwise store with 200 and fail silently at recovery. 
+ for name, target := range map[string]string{ + "newline": `bad\ntarget`, + "space": `openai codex`, + "slash": `openai/codex`, + } { + rec = post(`{"auth_reset_target": "` + target + `"}`) + if rec.Code != http.StatusBadRequest { + t.Fatalf("%s target: expected 400, got %d: %s", name, rec.Code, rec.Body.String()) + } + got, _ = st.GetPool("pool1") + if got.AuthResetTarget != "" { + t.Fatalf("%s target: store mutated to %q despite 400", name, got.AuthResetTarget) + } } // Unknown pool -> 404. diff --git a/internal/container/agent_profile.go b/internal/container/agent_profile.go index 66423cf..34335f0 100644 --- a/internal/container/agent_profile.go +++ b/internal/container/agent_profile.go @@ -40,7 +40,7 @@ type AgentProfile struct { // given target so it retries after a pool quota window passes. It is // pure argv (no `sh -c` wrapper), so there is no shell-metacharacter // threat from the interpolated target; the target is still validated - // (validateResetAuthTarget) before exec as defense in depth. + // (ValidateResetAuthTarget) before exec as defense in depth. // Returning nil means the profile has no auth-reset mechanism; the // caller should log a notice and rely on the agent recovering on its // own (mirrors a nil ReloadCmd). @@ -153,7 +153,7 @@ var HermesProfile = &AgentProfile{ // ResetAuthCmd un-latches hermes' local auth state for the given // target via `hermes auth reset `. It is pure argv (no // `sh -c`), so the target cannot smuggle shell metacharacters — but - // the caller still validates it (validateResetAuthTarget) before + // the caller still validates it (ValidateResetAuthTarget) before // exec. The exec must run as the runtime UID (see execUser above) or // it root-chowns hermes-owned auth files and bricks the gateway. 
ResetAuthCmd: func(target string) []string { diff --git a/internal/container/apple.go b/internal/container/apple.go index 65bb86a..c4556ed 100644 --- a/internal/container/apple.go +++ b/internal/container/apple.go @@ -255,12 +255,20 @@ func (m *AppleManager) ReloadSecrets(ctx context.Context) error { // ResetAuth clears the agent's local auth state for target. The mechanism // is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors // ReloadSecrets). The target is validated before exec. +// +// Unlike DockerManager, this does not thread profile.ExecUser() into the exec: +// cli.Exec (the apple `container` CLI) has no per-exec user flag. That is +// acceptable because the only profile needing a non-root exec UID is hermes, +// which is docker/local-only (per CLAUDE.md the Apple/tart backends do not run +// hermes), so the root-chown hazard ExecUser guards against on docker cannot +// arise here. If a future profile needs a runtime UID on this backend, the CLI +// invocation must be extended to pass it. 
func (m *AppleManager) ResetAuth(ctx context.Context, target string) error { if m.profile.ResetAuthCmd == nil { log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name) return nil } - if err := validateResetAuthTarget(target); err != nil { + if err := ValidateResetAuthTarget(target); err != nil { return err } _, err := m.cli.Exec(ctx, m.containerName, m.profile.ResetAuthCmd(target)) diff --git a/internal/container/docker.go b/internal/container/docker.go index b29566c..5f85c2c 100644 --- a/internal/container/docker.go +++ b/internal/container/docker.go @@ -138,7 +138,7 @@ func (m *DockerManager) ResetAuth(ctx context.Context, target string) error { log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name) return nil } - if err := validateResetAuthTarget(target); err != nil { + if err := ValidateResetAuthTarget(target); err != nil { return err } return m.client.ExecInContainer(ctx, m.containerName, m.profile.ExecUser(), m.profile.ResetAuthCmd(target)) diff --git a/internal/container/tart.go b/internal/container/tart.go index f4ad5a8..f4d8c6c 100644 --- a/internal/container/tart.go +++ b/internal/container/tart.go @@ -294,12 +294,20 @@ func (m *TartManager) ReloadSecrets(ctx context.Context) error { // ResetAuth clears the agent's local auth state for target. The mechanism // is profile-specific; a nil ResetAuthCmd is a logged no-op (mirrors // ReloadSecrets). The target is validated before exec. +// +// Unlike DockerManager, this does not thread profile.ExecUser() into the exec: +// cli.Exec (the tart CLI) has no per-exec user flag. That is acceptable because +// the only profile needing a non-root exec UID is hermes, which is +// docker/local-only (per CLAUDE.md the Apple/tart backends do not run hermes), +// so the root-chown hazard ExecUser guards against on docker cannot arise here. 
+// If a future profile needs a runtime UID on this backend, the CLI invocation +// must be extended to pass it. func (m *TartManager) ResetAuth(ctx context.Context, target string) error { if m.profile.ResetAuthCmd == nil { log.Printf("agent profile %q has no auth-reset command; agent must recover on its own", m.profile.Name) return nil } - if err := validateResetAuthTarget(target); err != nil { + if err := ValidateResetAuthTarget(target); err != nil { return err } _, err := m.cli.Exec(ctx, m.vmName, m.profile.ResetAuthCmd(target)) diff --git a/internal/container/types.go b/internal/container/types.go index 39bf169..4c06237 100644 --- a/internal/container/types.go +++ b/internal/container/types.go @@ -128,12 +128,19 @@ func ValidateEnvVarKey(key string) error { // shell), so this allowlist is defense in depth, not a shell-escape. var resetAuthTargetRe = regexp.MustCompile(`^[A-Za-z0-9_.:-]+$`) -// validateResetAuthTarget checks that an auth-reset target is non-empty, +// ValidateResetAuthTarget checks that an auth-reset target is non-empty, // free of NUL bytes, and within the allowlisted charset before it is // passed to ContainerManager.ResetAuth. Mirrors ValidateEnvVarKey's // fail-closed style: an invalid target is rejected (no exec) rather than // sanitized. -func validateResetAuthTarget(target string) error { +// +// This is the single canonical auth-reset-target validator. It is shared by +// the channel-agnostic poolops layer (set-time validation on CLI/REST/ +// Telegram) and by every container backend (exec-time defense in depth) so +// the two surfaces cannot drift: a target that the store accepts is always a +// target ResetAuth can exec. An empty target ("") means "clear / no reset" +// and is rejected here; poolops normalizes that sentinel away before calling. 
+func ValidateResetAuthTarget(target string) error { if target == "" { return fmt.Errorf("auth-reset target is empty") } diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go index 49a9b3c..53a403d 100644 --- a/internal/poolops/poolops.go +++ b/internal/poolops/poolops.go @@ -15,6 +15,7 @@ import ( "strings" "time" + "github.com/nemirovsky/sluice/internal/container" "github.com/nemirovsky/sluice/internal/store" "github.com/nemirovsky/sluice/internal/vault" ) @@ -110,22 +111,26 @@ func ParseMembers(membersStr string) ([]string, error) { return members, nil } -// ErrInvalidAuthResetTarget is returned when a non-empty auth-reset target -// contains a NUL byte or newline. The target is consumed as argv (never -// shell-interpolated), so this is a minimal structural guard, not a -// shell-metachar check; channels that exec the target apply any stricter -// allowlist at exec time. -var ErrInvalidAuthResetTarget = errors.New("invalid auth_reset_target: must not contain NUL or newline characters") +// ErrInvalidAuthResetTarget wraps a non-empty auth-reset target that fails the +// canonical allowlist (container.ValidateResetAuthTarget: [A-Za-z0-9_.:-]+). +// Channels can errors.Is against it for a uniform "bad target" mapping; the +// wrapped error carries the specific reason. Set-time validation here matches +// the exec-time validation exactly so a target the store accepts is always one +// ResetAuth can exec — there is no longer a looser set-time rule that lets an +// unexecutable target (a space, a slash) store with success and then fail +// silently in the detached recovery goroutine. +var ErrInvalidAuthResetTarget = errors.New("invalid auth_reset_target") -// validateAuthResetTarget rejects a non-empty target containing a NUL byte or -// a newline. An empty target ("") is always valid and means "clear / no -// reset", so callers normalize the clear sentinel before validating. 
+// validateAuthResetTarget validates a non-empty target against the single +// canonical allowlist used at exec time (container.ValidateResetAuthTarget). +// An empty target ("") is always valid and means "clear / no reset", so +// callers normalize the clear sentinel before validating. func validateAuthResetTarget(target string) error { if target == "" { return nil } - if strings.ContainsAny(target, "\x00\n\r") { - return ErrInvalidAuthResetTarget + if err := container.ValidateResetAuthTarget(target); err != nil { + return fmt.Errorf("%w: %w", ErrInvalidAuthResetTarget, err) } return nil } diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go index 0f63dbc..7ea10ad 100644 --- a/internal/poolops/poolops_test.go +++ b/internal/poolops/poolops_test.go @@ -147,19 +147,35 @@ func TestCreateWithEmptyAuthResetTargetDefaultsEmpty(t *testing.T) { } } +// TestCreateWithInvalidAuthResetTarget asserts a target that the exec-time +// allowlist (container.ValidateResetAuthTarget: [A-Za-z0-9_.:-]+) would reject +// is also rejected at create time, on the channel-agnostic path, so it never +// stores with success and then fails silently in the detached recovery +// goroutine. A newline/NUL was caught by the old looser rule; a space and a +// slash were NOT (the F1 silent-failure bug) and are the load-bearing cases. func TestCreateWithInvalidAuthResetTarget(t *testing.T) { - db := newTestStore(t, "acct_a") - err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, "bad\ntarget") - if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { - t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err) - } - // The pool must not have been created when the target is invalid. 
- p, err := db.GetPool("p") - if err != nil { - t.Fatalf("GetPool: %v", err) - } - if p != nil { - t.Fatalf("pool created despite invalid target: %+v", p) + cases := map[string]string{ + "newline": "bad\ntarget", + "nul": "bad\x00target", + "space": "openai codex", + "slash": "openai/codex", + } + for name, target := range cases { + t.Run(name, func(t *testing.T) { + db := newTestStore(t, "acct_a") + err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, target) + if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err) + } + // The pool must not have been created when the target is invalid. + p, err := db.GetPool("p") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p != nil { + t.Fatalf("pool created despite invalid target: %+v", p) + } + }) } } @@ -202,12 +218,32 @@ func TestSetAuthResetTargetUnknownPool(t *testing.T) { } func TestSetAuthResetTargetInvalid(t *testing.T) { - db := newTestStore(t, "acct_a") - if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil { - t.Fatalf("Create: %v", err) - } - if err := poolops.SetAuthResetTarget(db, "p", "bad\x00target"); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { - t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err) + // Set-time validation mirrors the exec-time allowlist exactly: a space or + // slash (not just NUL/newline) is rejected before the store write, so an + // unexecutable target can never be persisted. 
+ for name, target := range map[string]string{ + "nul": "bad\x00target", + "newline": "bad\ntarget", + "space": "openai codex", + "slash": "openai/codex", + } { + t.Run(name, func(t *testing.T) { + db := newTestStore(t, "acct_a") + if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil { + t.Fatalf("Create: %v", err) + } + if err := poolops.SetAuthResetTarget(db, "p", target); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + t.Fatalf("err = %v, want ErrInvalidAuthResetTarget", err) + } + // Store must not have been mutated. + p, err := db.GetPool("p") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p.AuthResetTarget != "" { + t.Fatalf("target persisted despite invalid value: %q", p.AuthResetTarget) + } + }) } } diff --git a/internal/proxy/server.go b/internal/proxy/server.go index fa2c3fa..1f3ae47 100644 --- a/internal/proxy/server.go +++ b/internal/proxy/server.go @@ -3082,6 +3082,15 @@ func (s *Server) clearPoolExhausted(pool string) { // "pool recovered" notice, and invokes onPoolRecovered once. The state flip and // callback snapshot happen under poolExhaustMu so two concurrent passes cannot // both recover the same pool. +// +// Concurrency with a re-exhaustion: a 429 arriving between this scan's +// HasHealthyMember==true and the delete below races markPoolExhausted, but both +// transitions are guarded by poolExhaustMu (no torn state) and the machine +// self-corrects. If markPoolExhausted re-sets the flag after this delete, it +// returns true (a fresh edge), re-emits the exhausted notice, and wakes the +// monitor, which re-evaluates HasHealthyMember on the next scan. The worst case +// is one extra recovered/exhausted notice pair during a genuine flap — correct +// behavior, not a stuck state. 
func (s *Server) recoverPool(pool string) { s.poolExhaustMu.Lock() if !s.poolExhausted[pool] { diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go index 65f6d51..06523ff 100644 --- a/internal/telegram/commands_test.go +++ b/internal/telegram/commands_test.go @@ -2244,6 +2244,17 @@ func TestHandlePoolAuthResetTarget(t *testing.T) { if !strings.Contains(got, "Usage:") { t.Errorf("set-auth-reset too few args = %q", got) } + + // A target outside the allowlist (a slash; a space can't survive Telegram's + // space-split arg parsing) is rejected, not stored with a success message + // and then silently un-executable at recovery (F1). + got = h.Handle(&Command{Name: "pool", Args: []string{"set-auth-reset", "codex", "openai/codex"}}) + if !strings.Contains(got, "Failed to set auth-reset target") { + t.Errorf("slash target: expected failure message, got %q", got) + } + if p, _ := s.GetPool("codex"); p.AuthResetTarget != "" { + t.Errorf("slash target: store mutated to %q despite rejection", p.AuthResetTarget) + } } func TestHandlePoolCreateNoMembers(t *testing.T) { From d26a3f0d49b168794c8c7e8c6f736bdc011e9197 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 00:16:36 +0800 Subject: [PATCH 13/19] test(pools): drop always-constant param from assertStoredAuthResetTarget (unparam) --- cmd/sluice/pool_test.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go index d306170..1185141 100644 --- a/cmd/sluice/pool_test.go +++ b/cmd/sluice/pool_test.go @@ -120,7 +120,7 @@ func TestHandlePoolAuthResetTarget(t *testing.T) { } // Value reached the store. - assertStoredAuthResetTarget(t, dbPath, "codex", "openai-codex") + assertStoredAuthResetTarget(t, dbPath, "openai-codex") // Surfaced in list and status. 
out = captureStdout(t, func() { @@ -145,7 +145,7 @@ func TestHandlePoolAuthResetTarget(t *testing.T) { if !strings.Contains(out, "set to \"other-target\"") { t.Errorf("set-auth-reset output = %q", out) } - assertStoredAuthResetTarget(t, dbPath, "codex", "other-target") + assertStoredAuthResetTarget(t, dbPath, "other-target") // Clear with the "-" sentinel. out = captureStdout(t, func() { @@ -156,7 +156,7 @@ func TestHandlePoolAuthResetTarget(t *testing.T) { if !strings.Contains(out, "cleared") { t.Errorf("clear output = %q", out) } - assertStoredAuthResetTarget(t, dbPath, "codex", "") + assertStoredAuthResetTarget(t, dbPath, "") // Unknown pool and bad usage. if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "missing", "x"}); err == nil { @@ -172,17 +172,18 @@ func TestHandlePoolAuthResetTarget(t *testing.T) { if err := handlePoolCommand([]string{"set-auth-reset", "--db", dbPath, "codex", target}); err == nil { t.Errorf("%s target: expected rejection, got nil error", name) } - assertStoredAuthResetTarget(t, dbPath, "codex", "") + assertStoredAuthResetTarget(t, dbPath, "") } } -func assertStoredAuthResetTarget(t *testing.T, dbPath, pool, want string) { +func assertStoredAuthResetTarget(t *testing.T, dbPath, want string) { t.Helper() db, err := store.New(dbPath) if err != nil { t.Fatalf("open db: %v", err) } defer func() { _ = db.Close() }() + const pool = "codex" p, err := db.GetPool(pool) if err != nil { t.Fatalf("GetPool: %v", err) From 408920268d093cf3e7070b17a3d80ef7c844d2a6 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 00:31:01 +0800 Subject: [PATCH 14/19] fix(pools): atomic create-with-target and schema-valid set-auth-reset response --- cmd/sluice/cred_test.go | 4 +- cmd/sluice/main_test.go | 2 +- cmd/sluice/pool_test.go | 6 +-- internal/api/server.go | 22 +++++---- internal/api/server_test.go | 43 +++++++++++++---- internal/poolops/poolops.go | 23 ++++----- internal/poolops/poolops_test.go | 5 ++ 
internal/store/pools.go | 7 ++- internal/store/pools_test.go | 76 +++++++++++++++--------------- internal/telegram/commands_test.go | 6 +-- 10 files changed, 114 insertions(+), 80 deletions(-) diff --git a/cmd/sluice/cred_test.go b/cmd/sluice/cred_test.go index d3195de..8b36ad1 100644 --- a/cmd/sluice/cred_test.go +++ b/cmd/sluice/cred_test.go @@ -2573,7 +2573,7 @@ func TestFinding3Round9_StoreGatedVaultDeleteOnLivePoolMember(t *testing.T) { if err := db.AddCredentialMeta("pool_mem", "oauth", "https://auth.example.com/token"); err != nil { t.Fatalf("AddCredentialMeta: %v", err) } - if err := db.CreatePoolWithMembers("codex_pool", "failover", []string{"pool_mem"}); err != nil { + if err := db.CreatePoolWithMembers("codex_pool", "failover", []string{"pool_mem"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } _ = db.Close() @@ -2692,7 +2692,7 @@ func TestFinding3Round9_TOCTOUInterleaveStoreGatesVaultDelete(t *testing.T) { if e != nil { return } - _ = pdb.CreatePoolWithMembers("codex_pool", "failover", []string{"racer"}) + _ = pdb.CreatePoolWithMembers("codex_pool", "failover", []string{"racer"}, "") _ = pdb.Close() }() diff --git a/cmd/sluice/main_test.go b/cmd/sluice/main_test.go index bcc7740..16ebfea 100644 --- a/cmd/sluice/main_test.go +++ b/cmd/sluice/main_test.go @@ -1778,7 +1778,7 @@ func seedRecoveryPool(t *testing.T, db *store.Store, members []string) { t.Fatalf("add credential meta %q: %v", m, err) } } - if err := db.CreatePoolWithMembers(pool, "", members); err != nil { + if err := db.CreatePoolWithMembers(pool, "", members, ""); err != nil { t.Fatalf("create pool %q: %v", pool, err) } } diff --git a/cmd/sluice/pool_test.go b/cmd/sluice/pool_test.go index 1185141..050f49e 100644 --- a/cmd/sluice/pool_test.go +++ b/cmd/sluice/pool_test.go @@ -502,7 +502,7 @@ func TestPoolRotateGuardedAgainstConcurrentRemoval(t *testing.T) { _ = db.Close() t.Fatalf("RemovePool: %v", rerr) } - if cerr := db.CreatePoolWithMembers("codex", "failover", 
[]string{"acct_a"}); cerr != nil { + if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a"}, ""); cerr != nil { _ = db.Close() t.Fatalf("recreate pool: %v", cerr) } @@ -534,7 +534,7 @@ func TestPoolRotateGuardedAgainstConcurrentRemoval(t *testing.T) { if _, rerr := db.RemovePool("codex"); rerr != nil { t.Fatalf("final RemovePool: %v", rerr) } - if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); cerr != nil { + if cerr := db.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}, ""); cerr != nil { t.Fatalf("final recreate pool: %v", cerr) } rows, lerr := db.ListCredentialHealth() @@ -607,7 +607,7 @@ func TestPoolRotateEpochScopedRejectsCrossPoolReAdd(t *testing.T) { _ = db.Close() t.Fatalf("RemovePool(P): %v", rerr) } - if cerr := db.CreatePoolWithMembers("Q", "failover", []string{"c", "d"}); cerr != nil { + if cerr := db.CreatePoolWithMembers("Q", "failover", []string{"c", "d"}, ""); cerr != nil { _ = db.Close() t.Fatalf("recreate c,d into Q: %v", cerr) } diff --git a/internal/api/server.go b/internal/api/server.go index f1066ba..32efa7d 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -1856,9 +1856,10 @@ func (s *Server) PostApiPools(w http.ResponseWriter, r *http.Request) { //nolint effectiveStrategy = store.PoolStrategyFailover } out := storePoolToAPI(store.Pool{ - Name: req.Name, - Strategy: effectiveStrategy, - Members: membersToStorePoolMembers(req.Members), + Name: req.Name, + Strategy: effectiveStrategy, + Members: membersToStorePoolMembers(req.Members), + AuthResetTarget: authResetTarget, }) if p, err := s.store.GetPool(req.Name); err == nil && p != nil { out = storePoolToAPI(*p) @@ -1922,7 +1923,11 @@ func (s *Server) PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request, // routing through poolops.SetAuthResetTarget so the three surfaces cannot // drift (channel feature-parity principle). 
A NUL/newline in the target is a // 400 (poolops.ErrInvalidAuthResetTarget); an unknown pool is 404. On success -// the updated pool is returned so the caller sees the persisted value. +// the updated pool is returned (200). If the post-write read-back fails the +// set still succeeded, so 204 No Content is returned rather than a partial +// Pool object: the OpenAPI Pool schema requires name+strategy+members, and the +// request body alone cannot reconstruct strategy/members, so echoing it would +// emit a schema-invalid response. func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { //nolint:revive // generated interface name var req SetPoolAuthResetTargetRequest if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil { @@ -1937,14 +1942,15 @@ func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http. writeError(w, status, err.Error(), "") return } - w.Header().Set("Content-Type", "application/json") if p, err := s.store.GetPool(name); err == nil && p != nil { + w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(storePoolToAPI(*p)) return } - // The set succeeded; a read-back failure must not report failure. Echo - // the persisted value from the request instead. - _ = json.NewEncoder(w).Encode(Pool{Name: name, AuthResetTarget: &req.AuthResetTarget}) + // The set succeeded; only the post-write read-back failed. Returning a + // partial Pool (name + target, missing the required strategy/members) + // would violate the OpenAPI schema, so report success with no body. + w.WriteHeader(http.StatusNoContent) } // DeleteApiPoolsName removes a pool. 
It refuses (409) while any binding still diff --git a/internal/api/server_test.go b/internal/api/server_test.go index 60cb452..ede39a2 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -1561,7 +1561,7 @@ func TestDeleteApiCredentials_PoolGuardVsStoreFault(t *testing.T) { t.Fatalf("seed oauth cred %q: %v", n, err) } } - if err := st.CreatePoolWithMembers("p", "failover", []string{"m", "n"}); err != nil { + if err := st.CreatePoolWithMembers("p", "failover", []string{"m", "n"}, ""); err != nil { t.Fatalf("create pool: %v", err) } @@ -2801,7 +2801,7 @@ func TestPostApiPools_DuplicateName(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("dup_pool", store.PoolStrategyFailover, []string{"credA"}); err != nil { + if err := st.CreatePoolWithMembers("dup_pool", store.PoolStrategyFailover, []string{"credA"}, ""); err != nil { t.Fatalf("seed pool: %v", err) } srv := api.NewServer(st, nil, nil, "") @@ -2851,7 +2851,7 @@ func TestPostApiPools_MemberAlreadyPooled(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("pool_one", store.PoolStrategyFailover, []string{"credA"}); err != nil { + if err := st.CreatePoolWithMembers("pool_one", store.PoolStrategyFailover, []string{"credA"}, ""); err != nil { t.Fatalf("seed pool: %v", err) } srv := api.NewServer(st, nil, nil, "") @@ -2955,7 +2955,7 @@ func TestGetApiPoolsName_Status(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil { t.Fatalf("create pool: %v", err) } srv := api.NewServer(st, nil, nil, "") @@ -3011,7 +3011,7 @@ func 
TestPostApiPoolsNameRotate_Success(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil { t.Fatalf("create pool: %v", err) } srv := api.NewServer(st, nil, nil, "") @@ -3051,7 +3051,7 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil { t.Fatalf("create pool: %v", err) } srv := api.NewServer(st, nil, nil, "") @@ -3067,7 +3067,12 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { return rec } - // Set a target. + // Set a target. The 200 path returns the full, schema-complete pool via a + // read-back: name+strategy+members are all required by the OpenAPI Pool + // schema. (The read-back-failure fallback returns 204 No Content rather + // than a partial Pool that would violate that schema — Copilot #2 — but + // that path is not reachable here without failing the store mid-handler, + // which the concrete *store.Store gives no seam for.) rec := post(`{"auth_reset_target": "openai-codex"}`) if rec.Code != http.StatusOK { t.Fatalf("set: expected 200, got %d: %s", rec.Code, rec.Body.String()) @@ -3079,6 +3084,16 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { if p.AuthResetTarget == nil || *p.AuthResetTarget != "openai-codex" { t.Fatalf("response AuthResetTarget = %v, want openai-codex", p.AuthResetTarget) } + // Schema completeness: the OpenAPI Pool schema requires name+strategy+members. 
+ if p.Name != "pool1" { + t.Errorf("200 body name = %q, want pool1", p.Name) + } + if p.Strategy != store.PoolStrategyFailover { + t.Errorf("200 body strategy = %q, want %q", p.Strategy, store.PoolStrategyFailover) + } + if len(p.Members) != 2 { + t.Fatalf("200 body members = %+v, want 2 (credA, credB)", p.Members) + } // Reached the store (no inline logic; routed through poolops). got, err := st.GetPool("pool1") if err != nil { @@ -3147,6 +3162,16 @@ func TestPostApiPools_WithAuthResetTarget(t *testing.T) { if rec.Code != http.StatusCreated { t.Fatalf("expected 201, got %d: %s", rec.Code, rec.Body.String()) } + // The 201 response body must reflect the configured target. The synthetic + // store.Pool the handler builds when the read-back is skipped/fails used to + // omit it (Copilot #1), so assert it round-trips through the JSON body. + var p api.Pool + if err := json.NewDecoder(rec.Body).Decode(&p); err != nil { + t.Fatalf("decode 201 body: %v", err) + } + if p.AuthResetTarget == nil || *p.AuthResetTarget != "openai-codex" { + t.Fatalf("201 body AuthResetTarget = %v, want openai-codex", p.AuthResetTarget) + } got, err := st.GetPool("codex") if err != nil { t.Fatalf("GetPool: %v", err) @@ -3178,7 +3203,7 @@ func TestDeleteApiPoolsName_Success(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil { t.Fatalf("create pool: %v", err) } srv := api.NewServer(st, nil, nil, "") @@ -3225,7 +3250,7 @@ func TestDeleteApiPoolsName_ReferencedByBinding(t *testing.T) { st := newTestStore(t) enableHTTPChannel(t, st) seedOAuthCred(t, st, "credA", "credB") - if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}); err != nil { + if err := 
st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, ""); err != nil { t.Fatalf("create pool: %v", err) } // A binding referencing the pool by name keeps it from being removed. diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go index 53a403d..754accf 100644 --- a/internal/poolops/poolops.go +++ b/internal/poolops/poolops.go @@ -24,7 +24,7 @@ import ( // interface lets each channel pass its own already-open store and lets the // tests substitute a fake. type Store interface { - CreatePoolWithMembers(name, strategy string, members []string) error + CreatePoolWithMembers(name, strategy string, members []string, authResetTarget string) error GetPool(name string) (*store.Pool, error) ListPools() ([]store.Pool, error) RemovePoolIfUnreferenced(name string) (bool, error) @@ -144,10 +144,13 @@ func Create(s Store, name, strategy string, members []string) error { } // CreateWithAuthResetTarget is Create plus an optional per-pool -// auth_reset_target (empty = no reset). The target is set in a follow-up -// SetPoolAuthResetTarget call after the pool exists; channels that don't -// accept a target call Create. Used by every channel's create adapter so the -// create-with-target path has a single source of truth. +// auth_reset_target (empty = no reset). The target is bound in the same store +// transaction that creates the pool and its members, so create-with-target is +// atomic: a partial state where the pool exists without its target can never +// be observed, and there is no second write whose failure would leave a +// created pool plus an error (which a retry would then 409 on). Channels that +// don't accept a target call Create, which threads "". Used by every channel's +// create adapter so the create-with-target path has a single source of truth. 
func CreateWithAuthResetTarget(s Store, name, strategy string, members []string, authResetTarget string) error { if strategy == "" { strategy = store.PoolStrategyFailover @@ -158,15 +161,7 @@ func CreateWithAuthResetTarget(s Store, name, strategy string, members []string, if err := validateAuthResetTarget(authResetTarget); err != nil { return err } - if err := s.CreatePoolWithMembers(name, strategy, members); err != nil { - return err - } - if authResetTarget != "" { - if err := s.SetPoolAuthResetTarget(name, authResetTarget); err != nil { - return err - } - } - return nil + return s.CreatePoolWithMembers(name, strategy, members, authResetTarget) } // SetAuthResetTarget sets (target != "") or clears (target == "") the diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go index 7ea10ad..1f9b2fa 100644 --- a/internal/poolops/poolops_test.go +++ b/internal/poolops/poolops_test.go @@ -109,6 +109,11 @@ func TestCreateListStatusRotateRemove(t *testing.T) { } } +// TestCreateWithAuthResetTarget asserts create-with-target persists the target +// in the single CreatePoolWithMembers call (the target is bound in the same +// store transaction that creates the pool, not a separate follow-up +// SetPoolAuthResetTarget write — Copilot #5). Both the GetPool read-back and +// the derived Status reflect the configured target. func TestCreateWithAuthResetTarget(t *testing.T) { db := newTestStore(t, "acct_a", "acct_b") diff --git a/internal/store/pools.go b/internal/store/pools.go index e71a927..0c0fb8e 100644 --- a/internal/store/pools.go +++ b/internal/store/pools.go @@ -196,7 +196,10 @@ func assertCredentialNotInAnotherPoolTx(tx *sql.Tx, credential, newPool string) // existing oauth credential with a token_url. At least two members are // required for failover to be meaningful, but a single-member pool is // permitted (it degrades to a plain indirection with no failover target). 
-func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) error { +// authResetTarget is stored verbatim in the same transaction (empty = none), +// so create-with-target is atomic: there is no window where the pool exists +// without its configured target. +func (s *Store) CreatePoolWithMembers(name, strategy string, members []string, authResetTarget string) error { if name == "" { return fmt.Errorf("%w: pool name is required", ErrPoolNoMembers) } @@ -253,7 +256,7 @@ func (s *Store) CreatePoolWithMembers(name, strategy string, members []string) e } if _, err := tx.Exec( - "INSERT INTO credential_pools (name, strategy, auth_reset_target) VALUES (?, ?, '')", name, strategy, + "INSERT INTO credential_pools (name, strategy, auth_reset_target) VALUES (?, ?, ?)", name, strategy, authResetTarget, ); err != nil { return fmt.Errorf("insert pool %q: %w", name, err) } diff --git a/internal/store/pools_test.go b/internal/store/pools_test.go index cdf2291..19a6c60 100644 --- a/internal/store/pools_test.go +++ b/internal/store/pools_test.go @@ -27,7 +27,7 @@ func TestCreatePoolWithMembersAndGet(t *testing.T) { seedOAuthCred(t, s, "acct_a") seedOAuthCred(t, s, "acct_b") - if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}); err != nil { + if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } @@ -66,7 +66,7 @@ func TestCreatePoolRejectsStaticMember(t *testing.T) { if err := s.AddCredentialMeta("static_key", "static", ""); err != nil { t.Fatalf("AddCredentialMeta: %v", err) } - err := s.CreatePoolWithMembers("p", "failover", []string{"static_key"}) + err := s.CreatePoolWithMembers("p", "failover", []string{"static_key"}, "") if err == nil { t.Fatal("expected error creating pool with static member") } @@ -78,7 +78,7 @@ func TestCreatePoolRejectsStaticMember(t *testing.T) { func TestCreatePoolRejectsMissingMember(t *testing.T) { s := newTestStore(t) - 
if err := s.CreatePoolWithMembers("p", "failover", []string{"nope"}); err == nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"nope"}, ""); err == nil { t.Fatal("expected error for non-existent member credential") } } @@ -86,13 +86,13 @@ func TestCreatePoolRejectsMissingMember(t *testing.T) { func TestCreatePoolRejectsBadStrategyAndDupes(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "roundrobin", []string{"a"}); err == nil { + if err := s.CreatePoolWithMembers("p", "roundrobin", []string{"a"}, ""); err == nil { t.Error("expected error for unsupported strategy") } - if err := s.CreatePoolWithMembers("p", "failover", []string{"a", "a"}); err == nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a", "a"}, ""); err == nil { t.Error("expected error for duplicate member") } - if err := s.CreatePoolWithMembers("p", "failover", nil); err == nil { + if err := s.CreatePoolWithMembers("p", "failover", nil, ""); err == nil { t.Error("expected error for empty member list") } } @@ -101,7 +101,7 @@ func TestPoolCredentialNamespaceMutualExclusion(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "acct_a") // "acct_a" is a credential; a pool may not shadow it. 
- if err := s.CreatePoolWithMembers("acct_a", "failover", []string{"acct_a"}); err == nil { + if err := s.CreatePoolWithMembers("acct_a", "failover", []string{"acct_a"}, ""); err == nil { t.Fatal("expected namespace collision error (pool name == credential name)") } } @@ -118,12 +118,12 @@ func TestCreatePoolRejectsMemberAlreadyInAnotherPool(t *testing.T) { seedOAuthCred(t, s, "shared") seedOAuthCred(t, s, "solo") - if err := s.CreatePoolWithMembers("pool_one", "failover", []string{"shared"}); err != nil { + if err := s.CreatePoolWithMembers("pool_one", "failover", []string{"shared"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers(pool_one): %v", err) } // "shared" already belongs to pool_one; adding it to pool_two must fail. - err := s.CreatePoolWithMembers("pool_two", "failover", []string{"solo", "shared"}) + err := s.CreatePoolWithMembers("pool_two", "failover", []string{"solo", "shared"}, "") if err == nil { t.Fatal("expected error: credential already a member of another pool (Finding 5)") } @@ -149,7 +149,7 @@ func TestCreatePoolRejectsMemberAlreadyInAnotherPool(t *testing.T) { if _, err := s.RemovePool("pool_one"); err != nil { t.Fatalf("RemovePool: %v", err) } - if err := s.CreatePoolWithMembers("pool_three", "failover", []string{"shared"}); err != nil { + if err := s.CreatePoolWithMembers("pool_three", "failover", []string{"shared"}, ""); err != nil { t.Fatalf("after removing pool_one, re-adding shared to a new pool must succeed: %v", err) } } @@ -159,10 +159,10 @@ func TestListPoolsOrdersMembers(t *testing.T) { for _, n := range []string{"a", "b", "c"} { seedOAuthCred(t, s, n) } - if err := s.CreatePoolWithMembers("p1", "failover", []string{"c", "a"}); err != nil { + if err := s.CreatePoolWithMembers("p1", "failover", []string{"c", "a"}, ""); err != nil { t.Fatalf("create p1: %v", err) } - if err := s.CreatePoolWithMembers("p2", "failover", []string{"b"}); err != nil { + if err := s.CreatePoolWithMembers("p2", "failover", []string{"b"}, ""); err != nil 
{ t.Fatalf("create p2: %v", err) } pools, err := s.ListPools() @@ -185,7 +185,7 @@ func TestListPoolsOrdersMembers(t *testing.T) { func TestRemovePoolCascadesMembers(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil { t.Fatalf("create: %v", err) } removed, err := s.RemovePool("p") @@ -205,7 +205,7 @@ func TestRemovePoolCascadesMembers(t *testing.T) { func TestRemovePoolIfUnreferenced_Unreferenced(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil { t.Fatalf("create: %v", err) } if err := s.SetCredentialHealth("a", "cooldown", time.Now().Add(time.Hour), "429"); err != nil { @@ -241,7 +241,7 @@ func TestRemovePoolIfUnreferenced_Unreferenced(t *testing.T) { func TestRemovePoolIfUnreferenced_RefusedWhenBound(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil { t.Fatalf("create: %v", err) } // A binding NAMES THE POOL (pool shares the credential namespace). 
@@ -288,7 +288,7 @@ func TestRemovePoolIfUnreferenced_RefusedWhenBound(t *testing.T) { func TestRemovePoolIfUnreferenced_BindingBeforeRemovalRefuses(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil { t.Fatalf("create: %v", err) } @@ -327,7 +327,7 @@ func TestRemovePoolIfUnreferenced_BindingBeforeRemovalRefuses(t *testing.T) { func TestAddBinding_AfterPoolRemovedRefuses(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil { t.Fatalf("create pool: %v", err) } @@ -418,7 +418,7 @@ func TestRemovePoolIfUnreferenced_ConcurrentIsInternallyConsistent(t *testing.T) t.Fatalf("iter %d: new store: %v", iter, err) } seedOAuthCred(t, s, "a") - if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"a"}, ""); err != nil { t.Fatalf("iter %d: create: %v", iter, err) } @@ -502,12 +502,12 @@ func TestPoolsForMember(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "shared") seedOAuthCred(t, s, "x") - if err := s.CreatePoolWithMembers("p1", "failover", []string{"shared", "x"}); err != nil { + if err := s.CreatePoolWithMembers("p1", "failover", []string{"shared", "x"}, ""); err != nil { t.Fatalf("create p1: %v", err) } // A credential belongs to at most one pool (Finding 5): adding "shared" // to a second pool must be rejected. 
- if err := s.CreatePoolWithMembers("p2", "failover", []string{"shared"}); err == nil { + if err := s.CreatePoolWithMembers("p2", "failover", []string{"shared"}, ""); err == nil { t.Fatal("expected p2 creation to fail: shared already belongs to p1") } @@ -770,7 +770,7 @@ func TestRemoveCredentialMetaBlocksLivePoolMember(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "member") seedOAuthCred(t, s, "other") - if err := s.CreatePoolWithMembers("p", "failover", []string{"member", "other"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"member", "other"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } @@ -831,7 +831,7 @@ func TestRemoveCredentialMetaCleansHealthRow(t *testing.T) { // NOT inherit the old cooldown — GetCredentialHealth is nil (= healthy). seedOAuthCred(t, s, "x") seedOAuthCred(t, s, "y") - if err := s.CreatePoolWithMembers("fresh", "failover", []string{"x", "y"}); err != nil { + if err := s.CreatePoolWithMembers("fresh", "failover", []string{"x", "y"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers(fresh): %v", err) } if h, herr := s.GetCredentialHealth("x"); herr != nil || h != nil { @@ -847,7 +847,7 @@ func TestAddCredentialMetaRejectsPoolNameCollision(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "acct_a") seedOAuthCred(t, s, "acct_b") - if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); err != nil { + if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } @@ -867,7 +867,7 @@ func TestAddCredentialMetaRejectsPoolNameCollision(t *testing.T) { // The reverse direction still holds: CreatePoolWithMembers rejects a // name that already exists as a credential. 
- if err := s.CreatePoolWithMembers("not_a_pool", "failover", []string{"acct_a"}); err == nil { + if err := s.CreatePoolWithMembers("not_a_pool", "failover", []string{"acct_a"}, ""); err == nil { t.Fatal("expected CreatePoolWithMembers to reject a name that is already a credential") } } @@ -890,7 +890,7 @@ func TestRemoveCredentialMetaCASGuardsLivePoolMember(t *testing.T) { seedOAuthCred(t, s, "sibling") // Concurrent pool-create claims "c" between the insert and the rollback. - if err := s.CreatePoolWithMembers("p", "failover", []string{"c", "sibling"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"c", "sibling"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } @@ -967,7 +967,7 @@ func TestRemoveCredentialMetaCASGuardsLivePoolMember(t *testing.T) { func TestRemovePoolDeletesMemberHealth(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "m") - if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}, ""); err != nil { t.Fatalf("create pool: %v", err) } until := time.Now().Add(10 * time.Minute).UTC().Truncate(time.Second) @@ -1001,7 +1001,7 @@ func TestRemovePoolDeletesMemberHealth(t *testing.T) { func TestRemovePoolSparesStillPooledMemberHealth(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "m") - if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"m"}, ""); err != nil { t.Fatalf("create pool p: %v", err) } // "m" also belongs to pool q (legacy/pre-invariant row injected directly). 
@@ -1044,7 +1044,7 @@ func TestRemovePoolSparesStillPooledMemberHealth(t *testing.T) { func TestAddCredentialMetaRejectsLivePoolMemberDowngrade(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "poolcred") - if err := s.CreatePoolWithMembers("p", "failover", []string{"poolcred"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"poolcred"}, ""); err != nil { t.Fatalf("create pool: %v", err) } @@ -1101,7 +1101,7 @@ func TestAddCredentialMetaRejectsLivePoolMemberDowngrade(t *testing.T) { func TestSetCredentialHealthIfPoolMemberLiveMemberPersists(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "live") - if err := s.CreatePoolWithMembers("p", "failover", []string{"live"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"live"}, ""); err != nil { t.Fatalf("create pool: %v", err) } until := time.Now().Add(10 * time.Minute).UTC().Truncate(time.Second) @@ -1139,7 +1139,7 @@ func TestSetCredentialHealthIfPoolMemberLiveMemberPersists(t *testing.T) { func TestSetCredentialHealthIfPoolMemberSkipsRemoved(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "gone") - if err := s.CreatePoolWithMembers("p", "failover", []string{"gone"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"gone"}, ""); err != nil { t.Fatalf("create pool: %v", err) } @@ -1167,7 +1167,7 @@ func TestSetCredentialHealthIfPoolMemberSkipsRemoved(t *testing.T) { // cooldown: ListCredentialHealth (what loadPoolResolver seeds from) must // carry no row for "gone". 
seedOAuthCred(t, s, "gone") - if err := s.CreatePoolWithMembers("p2", "failover", []string{"gone"}); err != nil { + if err := s.CreatePoolWithMembers("p2", "failover", []string{"gone"}, ""); err != nil { t.Fatalf("recreate pool: %v", err) } rows, err := s.ListCredentialHealth() @@ -1197,7 +1197,7 @@ func TestSetCredentialHealthIfPoolMemberSkipsRemoved(t *testing.T) { func TestSetCredentialHealthIfPoolMemberEpochRejectsReAddedSuccessor(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "c") - if err := s.CreatePoolWithMembers("P", "failover", []string{"c"}); err != nil { + if err := s.CreatePoolWithMembers("P", "failover", []string{"c"}, ""); err != nil { t.Fatalf("create pool P: %v", err) } pP, err := s.GetPool("P") @@ -1216,7 +1216,7 @@ func TestSetCredentialHealthIfPoolMemberEpochRejectsReAddedSuccessor(t *testing. t.Fatalf("RemovePool(P) = %v, %v", removed, rerr) } seedOAuthCred(t, s, "c") - if err := s.CreatePoolWithMembers("Q", "failover", []string{"c"}); err != nil { + if err := s.CreatePoolWithMembers("Q", "failover", []string{"c"}, ""); err != nil { t.Fatalf("recreate c into Q: %v", err) } pQ, err := s.GetPool("Q") @@ -1264,7 +1264,7 @@ func TestSetCredentialHealthIfPoolMemberEpochRejectsReAddedSuccessor(t *testing. 
func TestSetCredentialHealthIfPoolMemberEpochLiveMemberSamePool(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "m") - if err := s.CreatePoolWithMembers("pool", "failover", []string{"m"}); err != nil { + if err := s.CreatePoolWithMembers("pool", "failover", []string{"m"}, ""); err != nil { t.Fatalf("create pool: %v", err) } p, _ := s.GetPool("pool") @@ -1410,7 +1410,7 @@ func TestRemoveCredentialFullyRefusesLivePoolMember(t *testing.T) { s := newTestStore(t) seedOAuthCred(t, s, "m") seedOAuthCred(t, s, "n") - if err := s.CreatePoolWithMembers("p", "failover", []string{"m", "n"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"m", "n"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } @@ -1517,7 +1517,7 @@ func TestRemoveCredentialFullyCleansHealthOnPartialCleanupFinish(t *testing.T) { // carries no row for "x". seedOAuthCred(t, s, "x") seedOAuthCred(t, s, "y") - if err := s.CreatePoolWithMembers("p", "failover", []string{"x", "y"}); err != nil { + if err := s.CreatePoolWithMembers("p", "failover", []string{"x", "y"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } hrows, err := s.ListCredentialHealth() @@ -1586,7 +1586,7 @@ func TestMigration000008DownUpPopulated(t *testing.T) { // credential_health row for one member. 
seedOAuthCred(t, s, "acct_a") seedOAuthCred(t, s, "acct_b") - if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}); err != nil { + if err := s.CreatePoolWithMembers("codex", "failover", []string{"acct_a", "acct_b"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } if err := s.SetPoolAuthResetTarget("codex", "openai-codex"); err != nil { @@ -1692,7 +1692,7 @@ func TestPoolAuthResetTargetCRUD(t *testing.T) { seedOAuthCred(t, s, "acct_a") seedOAuthCred(t, s, "acct_b") - if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}); err != nil { + if err := s.CreatePoolWithMembers("codex", "", []string{"acct_a", "acct_b"}, ""); err != nil { t.Fatalf("CreatePoolWithMembers: %v", err) } diff --git a/internal/telegram/commands_test.go b/internal/telegram/commands_test.go index 06523ff..9594bee 100644 --- a/internal/telegram/commands_test.go +++ b/internal/telegram/commands_test.go @@ -2308,7 +2308,7 @@ func TestHandlePoolRemoveUnknown(t *testing.T) { func TestHandlePoolRemoveReferencedByBinding(t *testing.T) { s := newTestStore(t) seedPoolOAuthMeta(t, s, "acct_a", "acct_b") - if err := s.CreatePoolWithMembers("codex", store.PoolStrategyFailover, []string{"acct_a", "acct_b"}); err != nil { + if err := s.CreatePoolWithMembers("codex", store.PoolStrategyFailover, []string{"acct_a", "acct_b"}, ""); err != nil { t.Fatalf("create pool: %v", err) } // A binding referencing the pool by name keeps it from being removed. @@ -2328,7 +2328,7 @@ func TestPoolStatusFormatMatchesCLI(t *testing.T) { // doesn't drift from cmd/sluice/pool.go. s := newTestStore(t) seedPoolOAuthMeta(t, s, "m0", "m1") - if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}); err != nil { + if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}, ""); err != nil { t.Fatalf("create pool: %v", err) } // Park m0 so it shows a cooldown line with the reason. 
@@ -2352,7 +2352,7 @@ func TestPoolStatusEscapesLastFailureReason(t *testing.T) { // < > & must be HTML-escaped or the Bot API rejects/garbles the message. s := newTestStore(t) seedPoolOAuthMeta(t, s, "m0", "m1") - if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}); err != nil { + if err := s.CreatePoolWithMembers("p", store.PoolStrategyFailover, []string{"m0", "m1"}, ""); err != nil { t.Fatalf("create pool: %v", err) } rawReason := `429 & "retry"` From 7008ec53a18bfc746fbb7a3ef921a9a4c51eac20 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 00:45:03 +0800 Subject: [PATCH 15/19] fix(pools): scope Retry-After date parsing, require auth_reset_target field, document 204 --- api/openapi.yaml | 4 + internal/api/api.gen.go | 161 ++++++++++++++------------- internal/api/server.go | 20 +++- internal/api/server_test.go | 49 ++++++++ internal/proxy/pool_failover.go | 15 ++- internal/proxy/pool_failover_test.go | 37 ++++++ 6 files changed, 199 insertions(+), 87 deletions(-) diff --git a/api/openapi.yaml b/api/openapi.yaml index 7437d37..9d8396b 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -678,6 +678,10 @@ paths: application/json: schema: $ref: "#/components/schemas/Pool" + "204": + description: >- + Target updated; pool representation unavailable because the + post-update read-back failed "400": description: Invalid target content: diff --git a/internal/api/api.gen.go b/internal/api/api.gen.go index e88e9f8..1beadf8 100644 --- a/internal/api/api.gen.go +++ b/internal/api/api.gen.go @@ -2029,86 +2029,87 @@ func HandlerWithOptions(si ServerInterface, options ChiServerOptions) http.Handl // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+R96XIbuXbwq6D6+6pGqkuRnDu+SY2m8kOWlWtV7LFKkpMfpouEug9FXHUDPQCaEuNS", - "VR4iT5gnSeEAvRK90JYoO/NrLDaWs299Ts+XIBRJKjhwrYLjL4EKV5BQ/OdJmkqxpvEl/JGB0uanVIoU", - 
"pGaAC0IJVEM0p/hsKWRi/hVEVMORZgkEo0BvUgiOA6Ul47fB4yiIQGnGqWaCm01bz1nk/TkVUlceMK7h", - "FmTw+DgKJPyRMQlRcPzJ7K5f4XaOqrB+LsASN/+AUJvzT7KI6TOu5WYbTRq2QhtKiIBrRuN5psAPeh/G", - "baiNglTCer6iauXdJ4GqliMN8ZWmSTqcL1qI2HvWGmTEQu151qB9eWm5yUfq14xHZv+TiFPJgK+iPfD1", - "fE2leRaBCiVL7dLgjK+ZFDwBrsmaSkZvYiCcJkCWQpJ0RbkWCdHiDjhh3KBmpW3rhhXQCGSHqBd4Mq7/", - "6VV5RFUOhLTKyTQkyi8r7hcqJd1Y2RFahCL27quwvbFNQ5LGVEM/uz2qVmFGB+M/ppG7oE7yCyrNVpLh", - "c6QzJTd2z5icaBIDVZoIDmTJII5IkilNboAo0IRxolcw49LaKnIjos2ICL0Cec8UmIdEgVyDJBJ0Jrki", - "r6ZT8ppGxJk3cpBQHa4Yv51xs/r03fnhmHzg8cZep4heUU2oBJJKUEYwzL8TEbElg2hMrkBr3L2oUGVB", - "tCCUcMGPIEn1hqxpnAGhsRIOUYWwpdTQdcZppsWRUwRC41jcE5nFQA40vb2FiNwzvSILR5UjGkXHs2w6", - "/SUsSY9/w4IIOeML83PXqkOiBN7P4Z5UwCZMES40EdKIusHufGnJYgG1QK2oIjcAnEhIxBoiklCe0Tje", - "jPBMB2X13BlniijN4pzNEaE8IpTcU8nNWqZILBDTAy7IksbxDQ3v7HVMzbgjzeGYXFClzA7KiaUsyjCK", - "zQIVxpCALAo9WJAwBioVopF49ls5twdYrV2MyCJXiMUIj3MGYzHj9jREFOVjTFCwzREeAbBsRwkKRRyz", - "CNSMIzMpRyktqCW4FVZja0p+VaT2V3Iq+DJmoR7P0MHVjOh3bfG+G0O2bZukQewd43de/5/RuN0Rw0MK", - "oXFcrStixmHOs+SmRpe2IKa6unn6qAaNz8qerijnED+NewVuxKAa19wIEQPlO/kv+8uXAHiWYKQAMdxK", - "mgSjYKV1WkGjvPkeblZC3M0VhBL0tri+fX9ySuxDFFDFbtGAuH0kpZtY0EgFHWdnMt4++D/cAR8v3+HB", - "b6+vL0hoieo5zecVcUlJvA4uXWmqM7XNq066707OZqQ2GL7SV+8A3zbr+jgwQEGNzWOeoDGCJc1iPa+E", - "qTld0H1ikMI3RnHUnVfSjOyLTBtw/RZpjRdEzG/T7FOTKTG/2WvHpo26z46TFyQ0Dy5Gq6R8de2wq1QZ", - "mZEb0PcmBqBVb2WdesUXjcl1EYGZKM3cqkiUpTELzYHHlROrHvCgPHRUPe9wxk0kQhK6IcLEaPDAMDgM", - "YUwuIbUB1MWHq2vlXGev5/yRU4nvMkPYKTmwgnVaLGotOtAwBKXmSK9tYn84yUxMhWscTQ9ymMj9Cjgx", - "N5swU9BMrw4HlCjqF5wvTb4xshF8mGsDr8bqRvidLPt96lMKCmFc2xCe3pqNoeCaMo6hwy4SZO55MtGS", - "sJSgVt1McotyLgl8atTcYGt/lEJbbfdWLNoFcRTgfr+Dv8ajgUepYFyjn99ZRHIX7Cw1PqOahTYlrtvL", - "3CaaPWPyxu4wwknsnjG6Ymvei1Pwaq9px0zCg9Wqli64yMimHQdKC4Mc8FBuUps+XeYYY/CE11oIe0Mc", - "lJR2FX5/evExVVoCTdp1WN7uaHVCkSSUR22WFw+NImZF6KJ2WdvZJeCtst8bGmhJucoLeA2G5I+QrMdE", - "6YgJcuAk5nBETKyGSeU93CgR3oEeRvqSGO1MuBCiw4JmejWXoEDPNZW3vuj6g1NGZ1TMjiPcQewOIjOe", - 
"+2l4WNFMaYiObFFBQijWgPIW3QI5wOx6IhKmjUv+F8IFwaO8ipWASXuUByIZ4Zl2QVXUDVEUOVhSFpt7", - "iTArzeHDpatVAJSWVMPtxlOtEiIm+ePfSFRR659yUH6y5sShPpS7OQnauXuZxdDK3b44pd3QU61B8peO", - "LySkMQ3BeMEWy95fqO6Ils0FEa2VplvY0VXDPq3FinUGtFuT3XwShrXGOqMvQCvS44s6HI/xAWjj8/MO", - "d/E6O3gAd2lbkfd3uHdOCeu73MbtJuovNXpM/rX0SeXPamQirxlf4P6FXWVDieYisqhGiQuMyPL4It7M", - "+KIWoCxsclJAsigYtTBRAFZ75Rqi8YzP+MXJ9elboiChXLNQHVvtxm31Q3MEmxDOeH6gLR6GmZTAdbwh", - "zkvX4qIxuRYEHkyaxMwarDraSmxtnUHb5FyNYubBYlYPxmbBMZkFs2BxOJ7x65VFbw1cKxLGDP+LRUqM", - "vMCmVhhcViPqpRTJjCsWW7iNxZFiY+7DvM2Hhiff6g7kjaD4gvlSISpM9+pFTxBaXtAIRP03jMmHhGmi", - "Rck/5/2c1DTYduVjh9mNHCTMpaBDwzsDbT2mW3p15NDrZbYU9UxKIS9BpYIrTwEiFJHfhIHZ15/52WU+", - "C/EWaKxX7TeroiAGDzRJY9x91+s63TbfjeeJcV6XoDBOb97n8jQ1Z1yB1LWSVsW3FcvUHUvTtlUhVnXm", - "Clre55rssO8mu6bzmsyF131Hles6jmsQsgFiE56Rh2Ae6nhh9AFUo5mPe5Vs4rnTiK+rj39j6jG4hr6/", - "HOWrMhOsew9ITy5c9LZzRnLyRIlIXwLyNVJQSVoKUfz/EpbBcfD/JmV7zcT11kwMEd7jnudIRlzkWMlC", - "mCIqSw2fITocmoYUt3VnJBVcfG+cuiqqqVCskaa0maXKSZV93fC0vVuhoWZr8L+6CIWII3HP5xnXLH6y", - "NpSYKj03/MgkzDv6drooMkIn5wkNVuhSNyOSA4+q634lB/mvJoo05OwXAD+xRznhckDaqH8JS5DAQ4i2", - "oow64K+mv2KfBkbKb87enV2fkQlN2SQVIlaTL0YQH11FbgXE/EqK9gFZ3EJuNtgUIiRJhCzaDtSYnFIp", - "mQuzb4GDZCHB0GRiAhySxplyEavMQp0ZSxHjW4QluYlFeIevN9xp5IBF5C+1VxC/EdcngvnFtXsLR1Qm", - "lzQERSTwCGT5LgPPPnCvEklKJdObQ5N7MGXwoiSCCN+GRMRairw5I4e9Rk8CfA2xsJVKLjQJRZbGEBEt", - "kBx5MjneCrxzlHayVzlXGb/NG7i8/vUpAscyougVsgo4O1dCBvrevr6nVhAxhWoLPE0Ota0S1nLZFOye", - "KmJVDuULuXxvxObOV0saBfbJrpYrbSunaOHPQhxIrg5Hl9qJeF6r7zUveOPIEgCvaSNgvwFvlDyKTLoO", - "Y6NW6KPCdxSAfH0w4ej1tSHFrlFB4Q66woNLUCJet1cr24p2c8FDczaN7+lGzWuFvG+q3BUAtWWgLQ3I", - "g9tg0T50QpDFsM9e6m9JLn6QmrASmQzhZcvFvXy/Am109STD+gfoa7Qf3/KWpsUk/bZVd3K9ikxjV2Wb", - "7WkgtA2AFys0Oh2lpLxzaqg9q3dG+WQI0N/PqRtPUG3N8+JhMzcxF3AXHjTj/aZjauzwXTUqEfJR499B", - "suWmtdiETYbzmPG74fSodCZ6iBHDLQ0N0BzaVFBoGnctWNOYRSVQPeFP9bj63lEdvwZs29QyegthZqLg", - "K4OroxFQCdIoiUfeL85dkdYI+5pRcvXu4/np2fzk4nx+/eHfzn43QTFZUxNCIv2Q3XhiKe7YI/dormd8", - 
"6QlyLs+urom5yqQlCeX0Nq9sX8UZC6uv2Y9cGwa/JbmIEJQiE3THLASnEta+Bu/Pr23FRmNd0x13KriW", - "IiYXMeVgLrZ2RFlofh5Px1OzS6TAacqC4+CX8XT8S4DGeYU0w6Sppg7OXBjhQ/dwHgXHwd9Bn6TspCLL", - "0qkt7vnrdGprv1w7O0tT257FBJ/8w+WsViwHS29zhmhLhB8fmy0TF1bpSoq6nn5lRSZLEio3wXHwzqRT", - "aeviUaCpyXE+BSVlPpsT6tSafGHR40TaqABVVigP7S6EqhHvPHKBBPJB0gQ0Bm2fvgTMIGF4k1fEjq1v", - "KLVIywxGFUo27fBnuxiUfi2izU5M6eJFIxZ7rKu2AerxG0Vi0O3OU3g4f1IyEddGRlxeTV89GQz1gkQX", - "BCahXoqMOxB+fQEQaCyBRpuCGERIYiLBiIhMN3TBEZfQVoXo04csYnoiIXQIdhkQs/TSrvQL/x8ZyE0p", - "/TFLmA6qAl+0Tf1tOgoS+sASE4D9PJ2aPxl3f3pS8c97sVnlLOAAc2UpQZCABLiWxtk3uVNZEovbYlmF", - "J+bZFj/WGE4M4YcNPIJn1N9aaOOhAz53RxuZtcuqZLAnVMiwompFwhVlvIMS1WpVBxVe58v2ISGtFbBt", - "slT6MApMPH4s9CwraVKWw2yRuN1B1cjw9C7E2yQ+yJH8/GQwFMTfJrZ75Ppyne2e7s92n3MMiAubu2/f", - "keOfuw5sTlC2oXbFVLNDv1o/qMvkSRTVG/or/czbMtlUVYyprJ2Pwb6yqAvrG/y9Iq7n0VPEUf2l3G3/", - "8Wo7Acip6KYa9x6E5PdXYpCGOzFw7cAfTBXClcdomJ9fig1Pb53qA8Z7jm8HmCU38vp9mKWXFeg9G8Y3", - "ldHmfP6WuPHbsoUrH0Cy9nJrBrehhlbMvsJMVktiHRHNaTlz+PwRTT41OiCi+V3oMtArcPHENNy7sCRN", - "WUnbIk3hQXoMV06jH9xw1act92y4CtZ7glf3rrpmuPZoOPL72zxhoYI+UeuRtGKetEsF7aLnJL69wYe7", - "fZtJwnxFFfG/g84bh92CTBZvX3Ok7cYB7r+C5jPIdnXUdd+i3U5dfPL9eGSfWPcytpDlsv+3T6ArK/fi", - "VirOc6dc2Q31YK+zwg+tcFiDJPCQCgXRYU8OjdurJKvg3ZtDN4n0XGn09rDrnjPpKnc6uUGj6E+YTVcJ", - "UEuoe5LlVrnz6KvrchuULFfk8nfbjdEf8bi2jR3fPfRlyKfV79K8TJJctRXD8+Quk9DjI/dG/GcxN/Wx", - "rEF2ppvvf9ZcdpDcYbcKppB2WEcsW0bdek1FEqaTYmyix7m/D9OPxdJ9ePfqcMYA9/7+9IKUuHgceH1B", - "SZokTPu99hb2z+W2PQPue/bbNbpv0zl/9qctgxcE6HfbvCZ0WzLnVcNdfHZVKl/YaRdUeSmXXQDQ57AH", - "MgXHBHps4gWu2YcxxMmm3ZIci0B3JpM6BHIS2L97rWGJ93OZweonJvZs/yypPZ1DQsR/WqOHyOPXgooC", - "N35B1TXBM1VYQyNClkB/2y+BNEhOYzuCY/mUd9bhbM8BTirary1P3rwmbmCqmetb8avX3d1MQVNJapZi", - "F7ON6vPC9hoZ+lK22krTc72v6RoTawNm+NjXNX4cwU1J3Yho85MqPpmrFu7jwTG+GcfP1A4Y9RoPeAXb", - "IoOjXve0TzmbPikP8ybtVpblPdwvKrs1zlUAIwf1OaG/kBTkkfvDzk8eDjQqk7IJ/6js1u/30Ib1jYGA", - "H6qy0D3TsOdqf1tY8NF9aTp1EdqLhAVOKr4rXbgCbcyn/SIJumGQR+iKv/mzX0P1xn5nZriy2KHKH9Na", - 
"1gZCfZ2kboKy6J78P+Xze6+35CGSGudOSSh4/obTDRiuWIrvdm+hIccfUGSEJEYAJYuA/M9//Teh0Zpy", - "V4Zzhh5l257WIaD4yZOevPIS1wxqf85nwqqi97UTZ/4b3Pect4/noO+FxM/n2OAkn+fznL2X1mqcfRwy", - "AyJiFm6I5YRv7KP6vOSk/bs3Oc6Z93zJcfULfXtOji2JPeqVxfD9JMeeF2gVnnpYWlPOCTzkH7Tp09Gz", - "B/c/ANpBuLVI4jryTWXZQvD6w/t3xEFVR84CYEWVUEXMyl78WJLj1y/I9gNXneKcZLFmKZV6shQyOYqo", - "pnX8Gt8mYLHvC7MGRdt7QHDFqGyYumGcokXqHinFbZ5JwL2GibUPgvlkNbEMqzjhF9AUFJO6JOWAoSQt", - "pUiGydLgZmwrTd9TJzbarJcqgeDlve+Wh1mt8rN2HfbKZdPPKPqNYe2OrjOcYi0S+HoGXXmEYwTmn1Uv", - "nH+MD7G3efR/dqH+1i15RsQbnx30RRyIFVP5h5NsefSXPUbfOQBG5LBQWxuRDo4/fa5ywWJEwhWEdxXa", - "W+gN7et764PVnz4b/bP/Nwar4nVY3omQxo4Sk5OLc/c/bghGAX69FkeojyeTn//6z+PpeDr++fiX6XQa", - "PH5+/N8AAAD//4Q38/5OcAAA", + "H4sIAAAAAAAC/+R9a28buXrwXyHmfYG1cWRJezanxXrRD47jnhhNNobttB+iQKJnHlk8niFnSY5sNTDQ", + "H9Ff2F9S8CHnKs5FiS0n3U8ba3h57jc+5H4JQpGkggPXKjj+EqhwBQnFf56kqRRrGl/CHxkobX5KpUhB", + "agY4IJRANURzit+WQibmX0FENRxplkAwCvQmheA4UFoyfhs8joIIlGacaia4mbT1nUXen1MhdeUD4xpu", + "QQaPj6NAwh8ZkxAFx5/M7PoWbuaoCuvnAixx8w8ItVn/JIuYPuNabrbRpGErtKGECLhmNJ5nCvyg92Hc", + "htooSCWs5yuqVt55EqhqWdIQX2mapMP5ooWIvWutQUYs1J5vDdqXm5aTfKR+zXhk5j+JOJUM+CraA1/P", + "11SabxGoULLUDg3O+JpJwRPgmqypZPQmBsJpAmQpJElXlGuREC3ugBPGDWpW2rZ2WAGNQHaIeoEn4/qf", + "XpVLVOVASKucTEOi/LLifqFS0o2VHaFFKGLvvArbG9M0JGlMNfSz26NqFWZ0MP5jGrkN6iS/oNJMJRl+", + "RzpTcmPnjMmJJjFQpYngQJYM4ogkmdLkBogCTRgnegUzLq2tIjci2oyI0CuQ90yB+UgUyDVIIkFnkivy", + "ajolr2lEnHkjBwnV4Yrx2xk3o0/fnR+OyQceb+x2iugV1YRKIKkEZQTD/DsREVsyiMbkCrTG2YsKVRZE", + "C0IJF/wIklRvyJrGGRAaK+EQVQhbSg1dZ5xmWhw5RSA0jsU9kVkM5EDT21uIyD3TK7JwVDmiUXQ8y6bT", + "X8KS9Pg3LIiQM74wP3eNOiRK4P4c7kkFbMIU4UITIY2oG+zOl5YsFlAL1IoqcgPAiYRErCEiCeUZjePN", + "CNd0UFbXnXGmiNIsztkcEcojQsk9ldyMZYrEAjE94IIsaRzf0PDObsfUjDvSHI7JBVXKzKCcWMqiDKPY", + "LFBhDAnIotCDBQljoFIhGolnvpVzu4DV2sWILHKFWIxwOWcwFjNuV0NEUT7GBAXbLOERAMt2lKBQxDGL", + "QM04MpNylNKCWoJbYTW2puRXRWp/JaeCL2MW6vEMHVzNiH7XFu+7MWTbtkkaxN4xfuf1/xmN2x0xPKQQ", + 
"GsfVOiJmHOY8S25qdGkLYqqjm6uPatD4rOzpinIO8dO4V+BGDKpxzY0QMVC+k/+yv3wJgGcJRgoQw62k", + "STAKVlqnFTTKne/hZiXE3VxBKEFvi+vb9yenxH5EAVXsFg2Im0dSuokFjVTQsXYm4+2F/8Mt8PHyHS78", + "9vr6goSWqJ7VfF4Rh5TE6+DSlaY6U9u86qT77uRsRmqD4St99Q7wbbOujwMDFNTYPOYJGiNY0izW80qY", + "mtMF3ScGKXxjFEfdeSXNyL7ItAHXb5HWuEHE/DbNfjWZEvObvXZs2qj77Dh5QULz4GK0SspX1w47SpWR", + "GbkBfW9iAFr1VtapV3zRmFwXEZiJ0syuikRZGrPQLHhcWbHqAQ/KRUfV9Q5n3EQiJKEbIkyMBg8Mg8MQ", + "xuQSUhtAXXy4ulbOdfZ6zh85lfguM4SdkgMrWKfFoNaiAw1DUGqO9Nom9oeTzMRUOMbR9CCHidyvgBOz", + "swkzBc306nBAiaK+wfnS5BsjG8GHuTbwaqxuhN/Jst+nPqWgEMa1DeHprZkYCq4p4xg67CJBZp8nEy0J", + "Swlq1c0kNyjnksCvRs0NtvZHKbTVdm/Fol0QRwHO9zv4a1waeJQKxjX6+Z1FJHfBzlLjN6pZaFPiur3M", + "baKZMyZv7AwjnMTOGaMrtua9WAW39pp2zCQ8WK1q6YKLjGzacaC0MMgBD+UmtenTZY4xBk+4rYWwN8RB", + "SWlX4fenFx9TpSXQpF2H5e2OVicUSUJ51GZ5cdEoYlaELmqbta1dAt4q+72hgZaUq7yA12BI/gnJekyU", + "jpggB05iDkfExGqYVN7DjRLhHehhpC+J0c6ECyE6LGimV3MJCvRcU3nri64/OGV0RsXMOMIZxM4gMuO5", + "n4aHFc2UhujIFhUkhGINKG/RLZADzK4nImHauOR/IVwQXMqrWAmYtEd5IJIRrmkHVEXdEEWRgyVlsdmX", + "CDPSLD5culoFQGlJNdxuPNUqIWKSf/6NRBW1/ikH5SdrThzqQ7mbk6Cdu5dZDK3c7YtT2g091Rokf+n4", + "QkIa0xCMF2yx7P2F6o5o2WwQ0VppuoUdXTXs01qsWGdAuzXZzSdhWGusM/oCtCI9vqjD8RgfgDY+X+9w", + "F6+zgwdwm7YVeX+He+eUsL7Lbdxuov5So8fkX0ufVP6sRibymvEFzl/YUTaUaA4ii2qUuMCILI8v4s2M", + "L2oBysImJwUki4JRCxMFYLVXriEaz/iMX5xcn74lChLKNQvVsdVunFZfNEewCeGM5wva4mGYSQlcxxvi", + "vHQtLhqTa0HgwaRJzIzBqqOtxNbGGbRNztUoZh4sZvVgbBYck1kwCxaH4xm/Xln01sC1ImHM8L9YpMTI", + "C2xqhcFlNaJeSpHMuGKxhdtYHCk2Zj/M23xoePKt7kDeCIovmC8VosJ0r170BKHlBo1A1L/DmHxImCZa", + "lPxz3s9JTYNtVz52mNnIQcJcCjo0vDPQ1mO6pVdHDr1eZktRz6QU8hJUKrjyFCBCEflNGJh5/ZmfHeaz", + "EG+BxnrVvrMqCmLwQJM0xtl3va7TTfPteJ4Y53UJCuP05n4uT1NzxhVIXStpVXxbMUzdsTRtGxViVWeu", + "oOU812SHfTvZMZ3bZC687luqHNexXIOQDRCb8Iw8BPNQxwujD6AazXzcq2QTz51GfF19/BtTj8E19P3l", + "KF+VmWDde0B6cuGit50zkpMnSkT6EpCvkYJK0lKI4v+XsAyOg/83KdtrJq63ZmKI8B7nPEcy4iLHShbC", + "FFFZavgM0eHQNKTYrTsjqeDiO3HqqqimQrFGmtJmliorVeZ1w9N2tkJDzdbgP7oIhYgjcc/nGdcsfrI2", + 
"lJgqPTf8yCTMO/p2uigyQifnCQ1W6FI3I5IDj6rrfiUH+a8mijTk7BcAP7FHOeFyQNqofwlLkMBDiLai", + "jDrgr6a/Yp8GRspvzt6dXZ+RCU3ZJBUiVpMvRhAfXUVuBcT8Sor2AVnsQm422BQiJEmELNoO1JicUimZ", + "C7NvgYNkIcHQZGICHJLGmXIRq8xCnRlLEeMpwpLcxCK8w+MNtxo5YBH5S+0I4jfi+kQwv7h2p3BEZXJJ", + "Q1BEAo9AlmcZuPaBO0okKZVMbw5N7sGUwYuSCCI8DYmItRR5c0YOe42eBPgaYmErlVxoEoosjSEiWiA5", + "8mRyvBV45yjtZK9yrjJ+mzdwef3rUwSOZUTRK2QVcHauhAz0vX19T60gYgrVFniaHGpbJazlsinYPVXE", + "qhzKF3L53ojNna+WNArsl10tV9pWTtHCn4U4kFwdji61E/G8Vt9rXnDHkSUAbtNGwH4D3ih5FJl0HcZG", + "rdBHhe8oAPn6YMLR62tDil2jgsIddIUHl6BEvG6vVrYV7eaCh2ZtGt/TjZrXCnnfVLkrAGrLQFsakAe3", + "waJ96IQgi2GfvdTfklz8IDVhJTIZwsuWi3v5fgXa6OpJhvUP0NdoP77llKbFJP22VXdyvYpMY1dlm+1p", + "ILQNgBcrNDodpaS8c2qoPat3RvlkCNDfz6m7nqDamufFw2ZuYi7gLjxoxvtNx9SY4dtqVCLko8a/g2TL", + "TWuxCZsM5zHjd8PpUelM9BAjhlsaGqA5tKmg0DTuGrCmMYtKoHrCn+py9bmjOn4N2LapZfQWwsxEwVcG", + "V0cjoBKkURKPvF+cuyKtEfY1o+Tq3cfz07P5ycX5/PrDv539boJisqYmhET6IbtxxVLcsUfu0WzP+NIT", + "5FyeXV0Ts5VJSxLK6W1e2b6KMxZWj9mPXBsGvyW5iBCUIhN0xywEpxLWvgbvz69txUZjXdMtdyq4liIm", + "FzHlYDa2dkRZaH4eT8dTM0ukwGnKguPgl/F0/EuAxnmFNMOkqaYOzlwY4UP3cB4Fx8HfQZ+k7KQiy9Kp", + "Lc7563Rqa79cOztLU9uexQSf/MPlrFYsB0tv8w7Rlgg/PjZbJi6s0pUUdT39yopMliRUboLj4J1Jp9LW", + "waNAU5PjfApKynw2K9SpNfnCoseJtFEBqqxQHtpdCFUj3nnkAgnkg6QJaAzaPn0JmEHC8CaviB1b31Bq", + "kZYZjCqUbNrhz3YwKP1aRJudmNLFi0Ys9lhXbQPU4zeKxKDdnafwcP6kZCKOjYy4vJq+ejIY6gWJLghM", + "Qr0UGXcg/PoCINBYAo02BTGIkMREghERmW7ogiMuoa0K0acPWcT0RELoEOwyIGbopR3pF/4/MpCbUvpj", + "ljAdVAW+aJv623QUJPSBJSYA+3k6NX8y7v70pOKf92KzyruAA8yVpQRBAhLgWhpn3+ROZUgsbothFZ6Y", + "b1v8WGM4MYQfNvAInlF/a6GNhw743S1tZNYOq5LBrlAhw4qqFQlXlPEOSlSrVR1UeJ0P24eEtFbAtslS", + "6cMoMPH4sdAzrKRJWQ6zReJ2B1Ujw9O7EG+T+CBH8vOTwVAQf5vY7pPry3W2e7o/233OMSAubO6+fUeO", + "f+46sDlB2YbaFVPNDv1q/aAukydRVG/or/Qzb8tkU1UxprJ2PgZ7ZFEX1jf4e0Vcz6OniKP6S7nb/uPV", + "dgKQU9Hdatx7EJLvX4lBGu7EwLUDfzBVCFceo2F+fik2PL11ql8w3nN8O8AsuSuv34dZelmB3rNhfFO5", + "2pzfvyXu+m3ZwpVfQLL2cusObkMNrZh9hZmslsQ6IprT8s7h80c0+a3RARHN70KXgV6Biyem4d6BJWnK", + 
"StoWaQoP0mO4chr94Iarfttyz4arYL0neHVn1TXDtUfDke/f5gkLFfSJWo+kFfdJu1TQDnpO4tsdfLjb", + "00wS5iOqiP8ddN447AZksjh9zZG2Ewe4/wqazyDb1auu+xbtduril+/HI/vEupexhSyX/b99Al0ZuRe3", + "UnGeO+XK7lIP9jorfGiFwxokgYdUKIgOe3JonF4lWQXv3hy6SaTnSqO3L7vuOZOucqeTGzSK/oTZdJUA", + "tYS6J1lulTuPvrout0HJckUuf7fdGP0Rj2vb2PHsoS9DPq2+S/MySXLVVgzPk7tMQo+P3Bvxn8Xc1K9l", + "DbIz3Xz/s+ayg+QOu1UwhbSXdcSy5apbr6lIwnRSXJvoce7vw/RjMXQf3r16OWOAe39/ekFKXDwOvD6g", + "JE0Spv1eewv753Lbngvue/bbNbpv0zn/9qctgxcE6HfbvCZ0WzLnVcNdfHZVKl/YaRdUeSmXXQDQ57AH", + "MgWvCfTYxAscsw9jiDebdktyLALdmUzqEMhJYP/utYYl3s9lBqtPTOzZ/llSezqHhIj/tEYPkcfXgooC", + "N76g6prgmSqsoREhS6C/7ZdAGiSnsb2CY/mUd9bh3Z4DvKloX1uevHlN3IWpZq5vxa9ed3d3CppKUrMU", + "u5htVJ8XttfI0Jey1Vaanuu8puuaWBsww699XePjCO6W1I2INj+p4slctXCPB8d4Mo7P1A646jUecATb", + "IoOjXve0TzmbPikP8ybtVpblPdwvKrs1zlUAIwf1e0J/ISnII/eHvT95ONCoTMom/KOyW7/fQxvWNy4E", + "/FCVhe47DXuu9reFBR/dS9Opi9C81tZCnpc1frMeSYJ76tseLGWcrimL8V2+Gwhp5h4WN3w+cs+WGxd7", + "hG9WG/f1gnGIE8PvSvmuQBt7bZ9AQcqBPEJKf/M7Y0MV1T5sM1w77S3OH9M8126g+lpX3ZXNol3z/1SQ", + "0bu9JQ+R1EQTlISC50eq7kbjiqV4mHwLDTn+gCIjJDECKFkE5H/+678JjdaUu7qf8ywo23a1DgHFN1Z6", + "EtlLHDOo3zq/hFYVva+94ubfwT0gvb08B30vJL7XY6Oh/AKhZ+299HLjZcshl05EzMINsZzw3TOpfi85", + "af/uzcZz5j1fNl59EnDP2bglsUe9shi+n2zcc2JX4amHpTXlnMBD/oJOn46ePbj/49AOwq1FEteRbyrL", + "FoLXH96/Iw6qOnIWACuqhCpiRvbix5Icv35Bti9qdYpzksWapVTqyVLI5CiimtbxazyGwGLfk7YGRdvs", + "QHDEqOzQumGcokXqvsOK0zxXD/cal9ZeIPPJamIZVnHCL6ApKCZ1ScoBQ0laSpEMk6XB3d9Wmr6n1m+0", + "WS9Vc8HNew+zh1mt8h29Dnvl0vdnFP3G7fCONje8NltUDOope+UT3lsw/6x64fz1P8TeJu7/2YX6Wzfk", + "GRFvvHPoizgQK6byl5psPfaXPUbfOQBG5LAyXLuTHRx/+lzlgsWIhCsI7yq0t9Ab2tfn1m9yf/ps9M/+", + "7x+sitdheSdCGjtKTE4uzt3/KSIYBfhcLt7ZPp5Mfv7rP4+n4+n45+NfptNp8Pj58X8DAAD//yYn3mC/", + "cAAA", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/internal/api/server.go b/internal/api/server.go index 32efa7d..0e55d5e 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -1928,13 +1928,29 @@ func (s *Server) 
PostApiPoolsNameRotate(w http.ResponseWriter, _ *http.Request, // Pool object: the OpenAPI Pool schema requires name+strategy+members, and the // request body alone cannot reconstruct strategy/members, so echoing it would // emit a schema-invalid response. +// +// auth_reset_target is REQUIRED per the OpenAPI schema: omitting it is a 400 +// (the generated request type's plain-string field can't tell an absent field +// from an explicit "", which would silently clear the target). An explicit +// empty string ("") is accepted and clears the target — that is the documented +// clear path. func (s *Server) PostApiPoolsNameAuthResetTarget(w http.ResponseWriter, r *http.Request, name string) { //nolint:revive // generated interface name - var req SetPoolAuthResetTargetRequest + // Decode into a presence-detecting type: the generated + // SetPoolAuthResetTargetRequest carries a plain (non-pointer) string, so a + // "{}" body would decode to "" and silently CLEAR the target. A pointer + // distinguishes absent (nil -> 400) from an explicit "" (clear). 
+ var req struct { + AuthResetTarget *string `json:"auth_reset_target"` + } if err := json.NewDecoder(limitedBody(w, r)).Decode(&req); err != nil { writeError(w, http.StatusBadRequest, "invalid request body", "") return } - if err := poolops.SetAuthResetTarget(s.store, name, req.AuthResetTarget); err != nil { + if req.AuthResetTarget == nil { + writeError(w, http.StatusBadRequest, "auth_reset_target is required", "") + return + } + if err := poolops.SetAuthResetTarget(s.store, name, *req.AuthResetTarget); err != nil { status := poolStatusError(err) if errors.Is(err, poolops.ErrInvalidAuthResetTarget) { status = http.StatusBadRequest diff --git a/internal/api/server_test.go b/internal/api/server_test.go index ede39a2..f018244 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -3142,6 +3142,55 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { } } +// TestPostApiPoolsNameAuthResetTarget_RequiredField asserts auth_reset_target is +// a REQUIRED field on the set route (Copilot re-review): an empty "{}" body that +// OMITS the field is a 400 and must NOT silently clear the stored target, while +// an EXPLICIT empty string ("") is still accepted and clears it (the documented +// clear path). 
+func TestPostApiPoolsNameAuthResetTarget_RequiredField(t *testing.T) { + st := newTestStore(t) + enableHTTPChannel(t, st) + seedOAuthCred(t, st, "credA", "credB") + if err := st.CreatePoolWithMembers("pool1", store.PoolStrategyFailover, []string{"credA", "credB"}, "openai-codex"); err != nil { + t.Fatalf("create pool: %v", err) + } + srv := api.NewServer(st, nil, nil, "") + t.Setenv("SLUICE_API_TOKEN", "tok") + handler := newTestHandler(t, srv, st) + + post := func(body string) *httptest.ResponseRecorder { + req := httptest.NewRequest("POST", "/api/pools/pool1/auth-reset-target", strings.NewReader(body)) + req.Header.Set("Authorization", "Bearer tok") + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + return rec + } + + // Omitting the field -> 400, and the pre-existing target is untouched. + rec := post(`{}`) + if rec.Code != http.StatusBadRequest { + t.Fatalf("omitted field: expected 400, got %d: %s", rec.Code, rec.Body.String()) + } + got, err := st.GetPool("pool1") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if got.AuthResetTarget != "openai-codex" { + t.Fatalf("omitted field cleared the target to %q; expected it untouched (openai-codex)", got.AuthResetTarget) + } + + // Explicit empty string -> still accepted and clears the target. + rec = post(`{"auth_reset_target": ""}`) + if rec.Code != http.StatusOK { + t.Fatalf("explicit clear: expected 200, got %d: %s", rec.Code, rec.Body.String()) + } + got, _ = st.GetPool("pool1") + if got.AuthResetTarget != "" { + t.Fatalf("explicit clear: stored AuthResetTarget = %q, want empty", got.AuthResetTarget) + } +} + // TestPostApiPools_WithAuthResetTarget asserts the create body accepts // auth_reset_target and persists it via poolops. 
func TestPostApiPools_WithAuthResetTarget(t *testing.T) { diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index e719d84..d1e1f9b 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -238,12 +238,17 @@ func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool) } return time.Duration(secs * float64(time.Second)), true } - // HTTP-date (Retry-After absolute form). - if t, err := http.ParseTime(raw); err == nil { - if d := t.Sub(now); d > 0 { - return d, true + // HTTP-date (Retry-After absolute form, per RFC 9110). Only Retry-After is + // permitted to carry an absolute HTTP-date; the x-ratelimit-reset* family is + // numeric (delta-seconds / unix epoch / unit-suffixed duration) and must not + // be coerced through HTTP-date parsing. + if header == "Retry-After" { + if t, err := http.ParseTime(raw); err == nil { + if d := t.Sub(now); d > 0 { + return d, true + } + return 0, false } - return 0, false } // Unit-suffixed duration (e.g. OpenAI "1.5s", "60ms"). if d, err := time.ParseDuration(raw); err == nil && d > 0 { diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index 8551be8..3c595e8 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -1408,3 +1408,40 @@ func TestCooldownFromResponseNilSafe(t *testing.T) { t.Fatalf("nil header (auth): got %v, want %v", got, vault.AuthFailCooldown) } } + +// TestParseRecoveryHintHTTPDateGatedToRetryAfter verifies the HTTP-date branch +// is reachable only for Retry-After (RFC 9110). A reset-family header carrying +// an HTTP-date string must NOT be coerced through http.ParseTime — those +// headers are numeric (delta-seconds / epoch / unit-suffixed duration), so a +// date value is simply not a usable hint and parseRecoveryHint reports ok=false. 
+func TestParseRecoveryHintHTTPDateGatedToRetryAfter(t *testing.T) { + now := time.Now() + httpDate := now.Add(90 * time.Second).UTC().Format(http.TimeFormat) + + // Retry-After with an HTTP-date still yields a positive duration. + d, ok := parseRecoveryHint("Retry-After", httpDate, now) + if !ok || d <= 0 { + t.Fatalf("Retry-After HTTP-date: got (%v, %v), want a positive duration", d, ok) + } + + // The reset family must not parse an HTTP-date as a date: no usable hint. + for _, h := range []string{"x-ratelimit-reset", "x-ratelimit-reset-requests", "x-ratelimit-reset-tokens"} { + if d, ok := parseRecoveryHint(h, httpDate, now); ok || d != 0 { + t.Fatalf("%s HTTP-date: got (%v, %v), want (0, false) — date form must be Retry-After only", h, d, ok) + } + } + + // And end-to-end through cooldownFromResponse: a reset-family HTTP-date is + // ignored and the class default applies, while the same date on Retry-After + // is honored. + resetHdr := make(http.Header) + resetHdr.Set("x-ratelimit-reset-requests", httpDate) + if got := cooldownFromResponse(failoverRateLimited, resetHdr); got != vault.RateLimitCooldown { + t.Fatalf("reset-family HTTP-date: got %v, want fallback %v", got, vault.RateLimitCooldown) + } + retryHdr := make(http.Header) + retryHdr.Set("Retry-After", httpDate) + if got := cooldownFromResponse(failoverRateLimited, retryHdr); got <= 0 || got > vault.MaxCooldown { + t.Fatalf("Retry-After HTTP-date: got %v, want a clamped positive duration", got) + } +} From 4f8fe14616f31ba741c2c31757c090da7cfd40ad Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 10:22:44 +0800 Subject: [PATCH 16/19] fix(pools): reserve "-" clear sentinel and de-flake TestSoonestCooldown --- internal/api/server_test.go | 12 +++++--- internal/poolops/poolops.go | 9 ++++++ internal/poolops/poolops_test.go | 41 +++++++++++++++++++++++++ internal/vault/pool_test.go | 51 +++++++++++++++++--------------- 4 files changed, 85 insertions(+), 28 deletions(-) diff --git 
a/internal/api/server_test.go b/internal/api/server_test.go index f018244..b69eb25 100644 --- a/internal/api/server_test.go +++ b/internal/api/server_test.go @@ -3115,11 +3115,15 @@ func TestPostApiPoolsNameAuthResetTarget(t *testing.T) { // Invalid targets -> 400 and no store mutation. Newline was caught by the // old looser rule; a space and a slash were NOT (F1) and are the cases that - // would otherwise store with 200 and fail silently at recovery. + // would otherwise store with 200 and fail silently at recovery. A literal + // "-" is the CLI/Telegram clear sentinel: CLI/Telegram translate it to "" + // before poolops, but REST forwards it verbatim, so it must be a 400 here + // (cross-channel parity) rather than a nonsensical stored target. for name, target := range map[string]string{ - "newline": `bad\ntarget`, - "space": `openai codex`, - "slash": `openai/codex`, + "newline": `bad\ntarget`, + "space": `openai codex`, + "slash": `openai/codex`, + "sentinel": `-`, } { rec = post(`{"auth_reset_target": "` + target + `"}`) if rec.Code != http.StatusBadRequest { diff --git a/internal/poolops/poolops.go b/internal/poolops/poolops.go index 754accf..7057722 100644 --- a/internal/poolops/poolops.go +++ b/internal/poolops/poolops.go @@ -129,6 +129,15 @@ func validateAuthResetTarget(target string) error { if target == "" { return nil } + // "-" is the CLI/Telegram clear sentinel (converted to "" before reaching + // here), so a literal "-" is never a valid stored target on any channel. + // container.ValidateResetAuthTarget permits "-" ([A-Za-z0-9_.:-]+), so + // without this guard REST — which forwards the value verbatim instead of + // translating the sentinel — could store auth_reset_target="-" while + // CLI/Telegram cannot. Reject it here so the gap is closed cross-channel. 
+ if target == "-" { + return fmt.Errorf("%w: %q is reserved as the clear sentinel", ErrInvalidAuthResetTarget, "-") + } if err := container.ValidateResetAuthTarget(target); err != nil { return fmt.Errorf("%w: %w", ErrInvalidAuthResetTarget, err) } diff --git a/internal/poolops/poolops_test.go b/internal/poolops/poolops_test.go index 1f9b2fa..7db7f3d 100644 --- a/internal/poolops/poolops_test.go +++ b/internal/poolops/poolops_test.go @@ -184,6 +184,47 @@ func TestCreateWithInvalidAuthResetTarget(t *testing.T) { } } +// TestAuthResetTargetDashSentinelRejected pins the cross-channel gap fix: a +// literal "-" is the CLI/Telegram clear sentinel (converted to "" before +// reaching poolops), so it must never be a valid STORED target on any channel. +// REST forwards the value verbatim, so without this guard it could persist +// auth_reset_target="-" while CLI/Telegram cannot. Both the create path and +// the set path must reject a literal "-". +func TestAuthResetTargetDashSentinelRejected(t *testing.T) { + t.Run("set", func(t *testing.T) { + db := newTestStore(t, "acct_a") + if err := poolops.Create(db, "p", "", []string{"acct_a"}); err != nil { + t.Fatalf("Create: %v", err) + } + if err := poolops.SetAuthResetTarget(db, "p", "-"); !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + t.Fatalf("SetAuthResetTarget(\"-\") err = %v, want ErrInvalidAuthResetTarget", err) + } + p, err := db.GetPool("p") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p.AuthResetTarget != "" { + t.Fatalf("sentinel persisted despite rejection: %q", p.AuthResetTarget) + } + }) + + t.Run("create", func(t *testing.T) { + db := newTestStore(t, "acct_a") + err := poolops.CreateWithAuthResetTarget(db, "p", "", []string{"acct_a"}, "-") + if !errors.Is(err, poolops.ErrInvalidAuthResetTarget) { + t.Fatalf("create-with-target \"-\" err = %v, want ErrInvalidAuthResetTarget", err) + } + // The pool must not have been created when the target is invalid. 
+ p, err := db.GetPool("p") + if err != nil { + t.Fatalf("GetPool: %v", err) + } + if p != nil { + t.Fatalf("pool created despite sentinel target: %+v", p) + } + }) +} + func TestSetAuthResetTargetSetAndClear(t *testing.T) { db := newTestStore(t, "acct_a", "acct_b") if err := poolops.Create(db, "codex", "", []string{"acct_a", "acct_b"}); err != nil { diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go index 7dc0668..56401e5 100644 --- a/internal/vault/pool_test.go +++ b/internal/vault/pool_test.go @@ -1029,13 +1029,18 @@ func TestHasHealthyMemberNil(t *testing.T) { // STRICTLY-FUTURE member cooldown, skipping already-passed entries (lazy // expiry), and reports ok=false when no member is currently cooling. func TestSoonestCooldown(t *testing.T) { - now := time.Now() tests := []struct { - name string - setup func(pr *PoolResolver, base time.Time) - pool string - wantOK bool - wantBase time.Time // expected soonest when wantOK (exact, set in setup) + name string + setup func(pr *PoolResolver, base time.Time) + pool string + wantOK bool + // wantOffset is the intended soonest cooldown expressed as an offset + // from the per-subtest `base` (the same `base` passed into setup), so + // the expected value derives from the SAME clock sample the cooldowns + // were set against. Comparing against the outer `now` instead let slow + // CI / -race / GC widen the now-vs-base gap past the tolerance and + // flake; deriving both sides from `base` makes it exact. 
+ wantOffset time.Duration }{ { name: "no member cooling", @@ -1048,9 +1053,9 @@ func TestSoonestCooldown(t *testing.T) { setup: func(pr *PoolResolver, base time.Time) { pr.MarkCooldown("a", base.Add(60*time.Second), "429") }, - pool: "pool", - wantOK: true, - wantBase: now.Add(60 * time.Second), + pool: "pool", + wantOK: true, + wantOffset: 60 * time.Second, }, { name: "two cooling -> min wins", @@ -1058,9 +1063,9 @@ func TestSoonestCooldown(t *testing.T) { pr.MarkCooldown("a", base.Add(10*time.Minute), "401") pr.MarkCooldown("b", base.Add(60*time.Second), "429") }, - pool: "pool", - wantOK: true, - wantBase: now.Add(60 * time.Second), + pool: "pool", + wantOK: true, + wantOffset: 60 * time.Second, }, { name: "already-passed entry skipped", @@ -1070,9 +1075,9 @@ func TestSoonestCooldown(t *testing.T) { pr.MarkCooldown("a", base.Add(-1*time.Second), "429") pr.MarkCooldown("b", base.Add(120*time.Second), "401") }, - pool: "pool", - wantOK: true, - wantBase: now.Add(120 * time.Second), + pool: "pool", + wantOK: true, + wantOffset: 120 * time.Second, }, { name: "all passed -> not cooling", @@ -1093,9 +1098,6 @@ func TestSoonestCooldown(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil) - // Use a base far enough in the future that the "already-passed" - // negative offsets are still safely in the past relative to the - // SoonestCooldown's own time.Now() sample. base := time.Now() tc.setup(pr, base) got, ok := pr.SoonestCooldown(tc.pool) @@ -1105,12 +1107,13 @@ func TestSoonestCooldown(t *testing.T) { if !tc.wantOK { return } - // The stored cooldown is exact (MarkCooldown stores `until` - // verbatim); SoonestCooldown returns it unchanged, so compare - // against the value set in setup within a small tolerance for the - // base/now skew. 
- if d := got.Sub(tc.wantBase); d < -2*time.Second || d > 2*time.Second { - t.Fatalf("SoonestCooldown(%q) = %v, want ~%v (delta %v)", tc.pool, got, tc.wantBase, d) + // MarkCooldown stores `until` verbatim and SoonestCooldown returns + // it unchanged, so the expected value is exactly base+offset (both + // derived from the same `base`). Assert exact equality — there is + // no clock skew between the stored value and the expectation. + want := base.Add(tc.wantOffset) + if !got.Equal(want) { + t.Fatalf("SoonestCooldown(%q) = %v, want %v (delta %v)", tc.pool, got, want, got.Sub(want)) } }) } From a0aca1be79b69e19f441383358ec3fe9fcea1db6 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 10:33:14 +0800 Subject: [PATCH 17/19] fix(proxy): reject NaN/Inf/overflow recovery hints; correct migration test comments --- internal/proxy/pool_failover.go | 35 ++++++++++++++++++-- internal/proxy/pool_failover_test.go | 49 ++++++++++++++++++++++++++++ internal/store/pools_test.go | 14 +++++--- 3 files changed, 92 insertions(+), 6 deletions(-) diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index d1e1f9b..ab93d7c 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "log" + "math" "net/http" "strconv" "strings" @@ -219,13 +220,35 @@ func cooldownFromResponse(class failoverClass, header http.Header) time.Duration // Returns ok=false when the value parses to nothing usable (negative, // zero-after-now, unparseable). A delta value carrying a unit suffix (OpenAI's // "1.5s" / "60ms" form) is handled via time.ParseDuration as a fallback. 
+// +// Non-finite (NaN, +Inf, -Inf) and out-of-range bare-numeric values are +// rejected up front: strconv.ParseFloat accepts all of them, NaN<0 and Inf<0 +// are both false so the old `secs < 0` guard let them through, and the +// subsequent float->int64 conversion (delta `time.Duration(secs*1e9)` and +// epoch `int64(secs)`) is implementation-defined for those inputs and can wrap +// to a wrong (even negative) duration before clampCooldown runs. Any finite +// magnitude above maxRecoveryHintSeconds is also rejected — it is far larger +// than any real Retry-After / reset / epoch and would overflow int64 once +// multiplied into nanoseconds. +// maxRecoveryHintSeconds is the upper bound for a bare-numeric recovery hint +// (~31,000 years). Any larger value is not a real Retry-After / reset / unix +// epoch and is rejected: it is comfortably above a present-day epoch (~1.7e9) +// yet small enough that int64(secs) cannot overflow on the epoch branch. +const maxRecoveryHintSeconds = 1e12 + func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool) { // Bare numeric: delta-seconds for Retry-After; for the reset family it may // be either delta-seconds OR a unix epoch. Disambiguate by magnitude — a // value large enough to be a plausible epoch (>= ~ year 2001) is treated // as absolute, otherwise as a delta. if secs, err := strconv.ParseFloat(raw, 64); err == nil { - if secs < 0 { + // Reject negative, non-finite (NaN/±Inf), and absurdly large + // magnitudes before any float->int64 conversion. maxRecoveryHintSeconds + // (~31,000 years) comfortably exceeds any real epoch (~1.7e9 today) or + // recovery delta, while staying well within int64 for the epoch + // branch's int64(secs); the delta branch additionally caps in float + // space (below) so secs*1e9 can never overflow int64 (~9.2e18). 
+ if secs < 0 || math.IsNaN(secs) || math.IsInf(secs, 0) || secs > maxRecoveryHintSeconds { return 0, false } const epochThreshold = 1_000_000_000 // ~2001-09; below this, treat as delta-seconds @@ -236,7 +259,15 @@ func parseRecoveryHint(header, raw string, now time.Time) (time.Duration, bool) } return 0, false } - return time.Duration(secs * float64(time.Second)), true + // Delta path: cap in float space before the *float64(time.Second) + // multiply so the int64(time.Duration) conversion can never overflow. + // clampCooldown applies the real ceiling (MaxCooldown) afterward; this + // only prevents the conversion itself from wrapping. + deltaSec := secs + if maxSec := vault.MaxCooldown.Seconds(); deltaSec > maxSec { + deltaSec = maxSec + } + return time.Duration(deltaSec * float64(time.Second)), true } // HTTP-date (Retry-After absolute form, per RFC 9110). Only Retry-After is // permitted to carry an absolute HTTP-date; the x-ratelimit-reset* family is diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index 3c595e8..0e0667a 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -1367,6 +1367,36 @@ func TestCooldownFromResponse(t *testing.T) { setup: func(h http.Header) { h.Set("Retry-After", "-5") }, want: vault.RateLimitCooldown, }, + { + name: "NaN delta ignored falls back to default", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "NaN") }, + want: vault.RateLimitCooldown, + }, + { + name: "Inf delta ignored falls back to default", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "Inf") }, + want: vault.RateLimitCooldown, + }, + { + name: "+Inf delta ignored falls back to default", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "+Inf") }, + want: vault.RateLimitCooldown, + }, + { + name: "-Inf delta ignored falls back to default", + class: failoverRateLimited, + setup: func(h 
http.Header) { h.Set("Retry-After", "-Inf") }, + want: vault.RateLimitCooldown, + }, + { + name: "overflow-magnitude delta ignored falls back to default", + class: failoverRateLimited, + setup: func(h http.Header) { h.Set("Retry-After", "1e308") }, + want: vault.RateLimitCooldown, + }, } for _, tt := range tests { @@ -1409,6 +1439,25 @@ func TestCooldownFromResponseNilSafe(t *testing.T) { } } +// TestParseRecoveryHintRejectsNonFinite verifies the bare-numeric branch +// rejects NaN/±Inf and absurdly large finite magnitudes up front (ok=false, +// d=0) instead of letting strconv.ParseFloat's acceptance of those tokens slip +// past the `secs < 0` guard into an implementation-defined float->int64 +// conversion that could wrap to a wrong/negative duration. Exercised on both +// the delta header (Retry-After) and the epoch-branch reset family (where the +// int64(secs) conversion lives). +func TestParseRecoveryHintRejectsNonFinite(t *testing.T) { + now := time.Now() + headers := []string{"Retry-After", "x-ratelimit-reset"} + for _, h := range headers { + for _, raw := range []string{"NaN", "Inf", "+Inf", "-Inf", "1e308"} { + if d, ok := parseRecoveryHint(h, raw, now); ok || d != 0 { + t.Errorf("parseRecoveryHint(%q, %q) = (%v, %v), want (0, false)", h, raw, d, ok) + } + } + } +} + // TestParseRecoveryHintHTTPDateGatedToRetryAfter verifies the HTTP-date branch // is reachable only for Retry-After (RFC 9110). 
A reset-family header carrying // an HTTP-date string must NOT be coerced through http.ParseTime — those diff --git a/internal/store/pools_test.go b/internal/store/pools_test.go index 19a6c60..d32f873 100644 --- a/internal/store/pools_test.go +++ b/internal/store/pools_test.go @@ -1571,8 +1571,12 @@ func TestRemoveCredentialMetaCASNoOpLeavesHealthIntact(t *testing.T) { // migration (000008) up -> down -> up against a POPULATED schema: a pool with // two members and a credential_health row must survive the down (which // rebuilds credential_pools while preserving the credential_pool_members FK) -// and the re-up round-trip. The down migration disables foreign_keys for the -// 12-step rebuild, so the FK-referencing member rows must NOT be cascade-wiped. +// and the re-up round-trip. golang-migrate runs the down migration inside a +// transaction where SQLite ignores PRAGMA foreign_keys=OFF, so dropping +// credential_pools during the 12-step rebuild would CASCADE-wipe the member +// rows. The down migration instead snapshots credential_pool_members into a +// temp table before the drop and restores it once the rebuilt parent rows +// exist, so the FK-referencing member rows survive. func TestMigration000008DownUpPopulated(t *testing.T) { dir := t.TempDir() dbPath := filepath.Join(dir, "m.db") @@ -1636,8 +1640,10 @@ func TestMigration000008DownUpPopulated(t *testing.T) { // Down one step (000008 -> 000007): the column goes; the rebuilt // credential_pools keeps its row; the FK-referencing member rows and the - // health row survive (foreign_keys=OFF during the rebuild prevents a - // cascade wipe). + // health row survive (PRAGMA foreign_keys=OFF is ignored inside the + // migration transaction, so the down migration snapshots the member rows + // into a temp table and restores them after the rebuild rather than letting + // the credential_pools drop cascade-wipe them). 
if err := m.Steps(-1); err != nil { t.Fatalf("down 1 (000008): %v", err) } From 47a5c21b2f2e00149a62ef0ddcc54ca899fabe74 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 10:41:56 +0800 Subject: [PATCH 18/19] fix(proxy): use [POOL-RECOVERY] log prefix and drop name-keyed test assertion --- internal/proxy/pool_failover_test.go | 12 +++--------- internal/proxy/server.go | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index 0e0667a..9b234a9 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -1319,8 +1319,9 @@ func TestCooldownFromResponse(t *testing.T) { name: "x-ratelimit-reset-tokens unit-suffixed duration honored", class: failoverRateLimited, setup: func(h http.Header) { h.Set("x-ratelimit-reset-tokens", "1500ms") }, - want: 1500 * time.Millisecond, - // 1.5s is below MinRateLimitFloor so it clamps up; assert clamp below. + // 1500ms is below MinRateLimitFloor, so the parsed hint clamps up to + // the floor; want is the post-clamp value the generic check asserts. + want: vault.MinRateLimitFloor, }, { name: "absurd value capped at MaxCooldown", @@ -1414,13 +1415,6 @@ func TestCooldownFromResponse(t *testing.T) { } return } - // The unit-suffixed 1.5s case clamps up to MinRateLimitFloor. 
- if tt.name == "x-ratelimit-reset-tokens unit-suffixed duration honored" { - if got != vault.MinRateLimitFloor { - t.Fatalf("cooldownFromResponse = %v, want clamp to %v", got, vault.MinRateLimitFloor) - } - return - } if got != tt.want { t.Fatalf("cooldownFromResponse = %v, want %v", got, tt.want) } diff --git a/internal/proxy/server.go b/internal/proxy/server.go index 1f3ae47..66ce041 100644 --- a/internal/proxy/server.go +++ b/internal/proxy/server.go @@ -3102,7 +3102,7 @@ func (s *Server) recoverPool(pool string) { noticeCb := s.onPoolRecoveredNotice s.poolExhaustMu.Unlock() - log.Printf("[POOL-FAILOVER] %s", FormatPoolRecoveredNotice(pool)) + log.Printf("[POOL-RECOVERY] %s", FormatPoolRecoveredNotice(pool)) // Notice always fires on the recovery edge (independent of auth-reset). if noticeCb != nil { noticeCb(pool) From 3668eff376abfb8f1996217ec4c2fbd0896f2fdd Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Sat, 23 May 2026 10:58:03 +0800 Subject: [PATCH 19/19] fix(proxy): persist pool cooldown durably even when the failover notice is deduped --- cmd/sluice/main.go | 10 ++- internal/proxy/pool_failover.go | 64 ++++++++++----- internal/proxy/pool_failover_test.go | 84 +++++++++++++------- internal/proxy/pool_recovery_monitor_test.go | 19 ++++- 4 files changed, 124 insertions(+), 53 deletions(-) diff --git a/cmd/sluice/main.go b/cmd/sluice/main.go index dd61bb5..77a17f4 100644 --- a/cmd/sluice/main.go +++ b/cmd/sluice/main.go @@ -510,7 +510,15 @@ func main() { log.Printf("[POOL-FAILOVER] durable health write for %q skipped: no longer a live member of pool %q at epoch %d (removed/re-added before failover landed)", ev.From, ev.Pool, ev.Epoch) } } - if failoverBroker != nil { + // The durable health write above runs for EVERY failover event + // (decoupled from the notice dedup) so the persisted cooldown + // stays monotonic through a sustained exhaustion. 
The operator + // notice, by contrast, is gated on ev.Notify — true only when + // this event passed the dedup gate in handlePoolFailover (a real + // transition's 30s window or the healthy->exhausted edge) — so a + // suppressed/deduped event still persists but does not re-spam + // the operator. + if failoverBroker != nil && ev.Notify { // Plain text: TelegramChannel.Notify sends with no parse // mode, so markdown backticks would render literally. // Exhausted: no distinct member to fail over to (every diff --git a/internal/proxy/pool_failover.go b/internal/proxy/pool_failover.go index ab93d7c..3624c0a 100644 --- a/internal/proxy/pool_failover.go +++ b/internal/proxy/pool_failover.go @@ -324,6 +324,12 @@ type FailoverEvent struct { // so a late callback firing after a remove/re-add cannot persist this // cooldown onto the re-created same-name successor (Cluster A #2). Epoch int64 + // Notify is true only when the operator notice/audit for this event + // passed the dedup gate (a real transition's 30s window, or the + // healthy->exhausted edge). The durable store write runs regardless of + // Notify; only the Telegram notice is gated by it, so cooldown + // persistence stays monotonic even when notices are suppressed. + Notify bool } // humanizeFailoverReason maps a short reason tag (the same tag embedded in the @@ -628,8 +634,13 @@ func (a *SluiceAddon) poolForResponse(f *mitmproxy.Flow) (pool, activeMember, pr // below only reconciles for durability across restarts. // 2. Computes the next active member (post-cooldown) for the audit/notice. // 3. Hands a FailoverEvent to the onFailover callback (async, best-effort): -// the callback persists SetCredentialHealth to the store and fires the -// Telegram notice. The callback MUST NOT block the response path. +// the callback persists SetCredentialHealth to the store (ALWAYS) and +// fires the Telegram notice (only when ev.Notify is true). 
The callback is +// invoked unconditionally — even for a notice-suppressed (deduped) event — +// so the durable store write is decoupled from the notice gate and the +// persisted cooldown stays monotonic through a sustained exhaustion. The +// callback MUST NOT block the response path. Only the spam-prone log line +// and audit row stay gated on the dedup decision (ev.Notify). // // No in-flight retry: the triggering request still returns its own upstream // error to the agent unmodified. The agent (or its SDK) retries on its own @@ -733,6 +744,37 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) { } else { emit = a.shouldEmitPoolNotice(pool, from, to, tag) } + + // (3) Durability + Telegram via the callback. The callback is responsible + // for being non-blocking (it runs the store write and the Telegram send in + // its own goroutine); we still guard with a nil check. Fired + // UNCONDITIONALLY — even when the notice is deduped (emit == false) — so the + // durable cooldown write is decoupled from the notice gate: the store stays + // monotonically up to date through a sustained exhaustion/retry storm where + // every notice after the first edge is suppressed. Only the operator-facing + // Telegram notice is gated, via Notify=emit; the callback itself runs the + // durable store write regardless of Notify. The member did fail in every + // case (the in-memory cooldown above already reflects that), exhausted just + // changes the operator-facing wording. + if a.onFailover != nil { + a.onFailover(FailoverEvent{ + Pool: pool, + From: from, + To: to, + Reason: tag, + Class: class, + Until: until, + Exhausted: exhausted, + Epoch: idEpoch, + Notify: emit, + }) + } + + // The log line + audit row below stay gated on the dedup decision: they are + // the spam-prone surfaces a retry storm would flood, so a suppressed notice + // (deduped window, or an already-exhausted pool whose edge already fired) + // must not append a fresh log/audit entry. 
The durable store write already + // ran above via the unconditional callback. if !emit { return } @@ -776,24 +818,6 @@ func (a *SluiceAddon) handlePoolFailover(f *mitmproxy.Flow) { log.Printf("[POOL-FAILOVER] audit log error: %v", err) } } - - // (3) Durability + Telegram via the callback. The callback is - // responsible for being non-blocking (it runs the store write and the - // Telegram send in its own goroutine); we still guard with a nil check. - // The durable cooldown is persisted even when exhausted (the member did - // fail); only the operator-facing wording differs. - if a.onFailover != nil { - a.onFailover(FailoverEvent{ - Pool: pool, - From: from, - To: to, - Reason: tag, - Class: class, - Until: until, - Exhausted: exhausted, - Epoch: idEpoch, - }) - } } // poolNoticeDedupWindow bounds how often an identical pool failover / diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index 9b234a9..3e4a005 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -8,6 +8,7 @@ import ( "path/filepath" "strconv" "strings" + "sync" "sync/atomic" "testing" "time" @@ -1075,13 +1076,15 @@ func TestFailoverPoolExhaustedNoSelfFailoverSpam(t *testing.T) { // itself -> ResolveActive degrades to memA -> no distinct target. prPtr.Load().MarkCooldown("memB", time.Now().Add(10*time.Minute), "401") - var calls int32 - var last FailoverEvent - done := make(chan struct{}, 4) + // The callback fires synchronously inside addon.Response (the production + // detaching happens in main.go's SetOnFailover closure, not here), so the + // per-call records are stable to read after the loop. + var mu sync.Mutex + var events []FailoverEvent addon.SetOnFailover(func(ev FailoverEvent) { - atomic.AddInt32(&calls, 1) - last = ev - done <- struct{}{} + mu.Lock() + events = append(events, ev) + mu.Unlock() }) // Two back-to-back identical 429s (the agent's retry storm). 
@@ -1091,25 +1094,38 @@ func TestFailoverPoolExhaustedNoSelfFailoverSpam(t *testing.T) { addon.Response(f) } - select { - case <-done: - case <-time.After(2 * time.Second): - t.Fatal("onFailover callback not invoked") + mu.Lock() + got := append([]FailoverEvent(nil), events...) + mu.Unlock() + + // New contract: the durable callback fires for EVERY qualifying failover + // response (decoupled from the notice dedup), with Notify reflecting the + // dedup decision — true on the first (edge) event, false on the suppressed + // retry. This is what keeps the durable store write monotonic across a + // sustained exhaustion while the notice/audit stay deduped. + if len(got) != 2 { + t.Fatalf("onFailover invoked %d times, want exactly 2 (one per qualifying response; durability decoupled from notice dedup)", len(got)) } - // Dedup: the second identical signal within the window is suppressed. - if got := atomic.LoadInt32(&calls); got != 1 { - t.Fatalf("onFailover invoked %d times, want exactly 1 (dedup window must collapse the retry storm)", got) + if !got[0].Notify { + t.Fatalf("first onFailover Notify = false, want true (healthy->exhausted edge passes the dedup gate)") } - if !last.Exhausted { - t.Fatalf("FailoverEvent.Exhausted = false, want true (no distinct failover target)") + if got[1].Notify { + t.Fatalf("second onFailover Notify = true, want false (the retry-storm repeat is notice-suppressed but still persisted)") } - if last.From != "memA" || last.To != "memA" { - t.Fatalf("FailoverEvent from=%q to=%q, want memA/memA (degraded to self)", last.From, last.To) + for i, ev := range got { + if !ev.Exhausted { + t.Fatalf("event %d Exhausted = false, want true (no distinct failover target)", i) + } + if ev.From != "memA" || ev.To != "memA" { + t.Fatalf("event %d from=%q to=%q, want memA/memA (degraded to self)", i, ev.From, ev.To) + } } if err := logger.Close(); err != nil { t.Fatalf("logger close: %v", err) } + // The audit row stays gated on the dedup decision (only the 
edge event + // emits), so exactly one row despite two callback invocations. if n := auditActionCount(t, logPath, "pool_exhausted"); n != 1 { t.Fatalf("pool_exhausted audit rows = %d, want exactly 1 (no per-retry spam)", n) } @@ -1150,14 +1166,15 @@ func TestFailoverExhaustedDedupCollapsesFlapDirection(t *testing.T) { prPtr.Load().MarkCooldown("memA", time.Now().Add(5*time.Minute), "429") prPtr.Load().MarkCooldown("memB", time.Now().Add(10*time.Minute), "401") - var calls int32 - done := make(chan struct{}, 4) + var mu sync.Mutex + var events []FailoverEvent addon.SetOnFailover(func(ev FailoverEvent) { if !ev.Exhausted { t.Errorf("FailoverEvent.Exhausted = false, want true (pool exhausted)") } - atomic.AddInt32(&calls, 1) - done <- struct{}{} + mu.Lock() + events = append(events, ev) + mu.Unlock() }) // Two failing responses, attributed to DIFFERENT members (flap directions). @@ -1167,15 +1184,24 @@ func TestFailoverExhaustedDedupCollapsesFlapDirection(t *testing.T) { addon.Response(f) } - select { - case <-done: - case <-time.After(2 * time.Second): - t.Fatal("onFailover callback not invoked") + mu.Lock() + got := append([]FailoverEvent(nil), events...) + mu.Unlock() + + // New contract: the durable callback fires once per qualifying response + // (both flap directions), but the COLLAPSED exhausted dedup key suppresses + // the notice on the second, so exactly one event carries Notify=true. + if len(got) != 2 { + t.Fatalf("onFailover invoked %d times, want exactly 2 (one per qualifying response; durability decoupled from notice dedup)", len(got)) + } + notify := 0 + for _, ev := range got { + if ev.Notify { + notify++ + } } - // Give any erroneous second call a moment to surface before asserting. 
- time.Sleep(50 * time.Millisecond) - if got := atomic.LoadInt32(&calls); got != 1 { - t.Fatalf("onFailover invoked %d times, want exactly 1 (collapsed exhausted dedup key must absorb both flap directions)", got) + if notify != 1 { + t.Fatalf("onFailover Notify=true on %d events, want exactly 1 (collapsed exhausted dedup key must absorb both flap directions)", notify) } if err := logger.Close(); err != nil { diff --git a/internal/proxy/pool_recovery_monitor_test.go b/internal/proxy/pool_recovery_monitor_test.go index 01b6a27..58ec1ab 100644 --- a/internal/proxy/pool_recovery_monitor_test.go +++ b/internal/proxy/pool_recovery_monitor_test.go @@ -105,13 +105,22 @@ func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) { prPtr.Load().MarkCooldown("memA", time.Now().Add(time.Hour), "429") prPtr.Load().MarkCooldown("memB", time.Now().Add(time.Hour), "401") - var notices int32 + // New contract: the durable callback fires once per qualifying response + // (decoupled from the notice dedup), but exactly one event carries + // Notify=true — the healthy->exhausted edge. The operator notice is gated on + // Notify, so this still proves "one exhausted notice across many responses" + // (edge-gated, not per-response) while the durable store write runs on every + // call. + var calls, notices int32 done := make(chan struct{}, 16) addon.SetOnFailover(func(ev FailoverEvent) { if !ev.Exhausted { t.Errorf("FailoverEvent.Exhausted = false, want true") } - atomic.AddInt32(&notices, 1) + atomic.AddInt32(&calls, 1) + if ev.Notify { + atomic.AddInt32(&notices, 1) + } done <- struct{}{} }) @@ -130,7 +139,11 @@ func TestExhaustedNoticeEdgeAcrossManyResponses(t *testing.T) { // Let any erroneous extra notice surface.
time.Sleep(50 * time.Millisecond) if got := atomic.LoadInt32(&notices); got != 1 { - t.Fatalf("exhausted notices = %d, want exactly 1 (edge-gated, not per-response)", got) + t.Fatalf("exhausted notices (Notify=true) = %d, want exactly 1 (edge-gated, not per-response)", got) + } + // The durable callback itself ran for every qualifying response. + if got := atomic.LoadInt32(&calls); got != 10 { + t.Fatalf("onFailover calls = %d, want 10 (durable callback fires per response, decoupled from the notice gate)", got) + } }