From 6ce98496cb7e0d23d5fb71e41bb31aeeadf45322 Mon Sep 17 00:00:00 2001 From: Inder Singh Date: Thu, 18 Jun 2026 20:44:46 +0000 Subject: [PATCH 1/2] fix(observe,session): tolerate a late first transcript flush (empty-mirror race) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under load, MCP/tool startup defers claude's first .jsonl write past the fixed ~3s anchor window: send() returns DELIVERY_UNCONFIRMED and the observer reads the still-absent transcript and never re-resolves, so messagesSince("0") returns [] and turnComplete=false even though wait()->completed. It is a LATE flush, not a no-flush (file appears 5-9s later under contention). Two fixes at the read/anchor seam, preserving all core promises (wait() unchanged; messages still race-free after completed — strengthened; no pane-scraping; the sentinel still reads empty): Fix #1 (settle-on-read) — SessionObserver.settleFirstRecords(): a one-time, latched (#sawRecords) bounded poll (<=5s) for the first records when the transcript path is known but no records have been seen. messagesSince/ turnComplete await it (they only run after wait()->completed, so the records the contract owes are en route). A present/already-flushed session finds records on the first iteration and pays nothing; the latch makes every later call a no-op. Zero blast radius on non-MCP / fast-flush sessions. Fix #2 (anchor-on-addressable-but-absent) — anchorOwnTurn extends its poll budget to FIRST_FLUSH_ANCHOR_POLLS (24 -> 6s) ONLY while the transcript file is addressable-but-absent (new SessionObserver.transcriptAddressableButAbsent()), so tool/heavy sessions anchor a real cursor instead of the sentinel. A present-file session never crosses the original 12-poll/3s window — byte-for- byte unchanged — and the shorter lost-Enter retry is not extended. Regression tests: session-observer settle (late file appears after N polls returns the records within budget; present-from-t0 returns immediately with no extra polls + latch holds; budget-exhausted returns empty, no hang) and handle anchor (absent-then-present file anchors a real cursor; present-from-t0 keeps the original 3s window). Full suite green (356 passed, 3 skipped). Verified live (claude 2.1.181): under 20 concurrent heavy tool-MCP sessions the unmodified build reproduced 1/20 DELIVERY_UNCONFIRMED + empty messagesSince("0"); the fixed build returned [user, assistant] on 20/20 with 0 sentinels. The fast-flush control path is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01PeMGuKstkYF9dCgcsxWHWa --- src/observe/session-observer.test.ts | 76 ++++++++++++++++++++++++++++ src/observe/session-observer.ts | 62 +++++++++++++++++++++++ src/session/handle.test.ts | 61 ++++++++++++++++++++++ src/session/handle.ts | 28 +++++++++- 4 files changed, 226 insertions(+), 1 deletion(-) diff --git a/src/observe/session-observer.test.ts b/src/observe/session-observer.test.ts index b663981..e6b0f4e 100644 --- a/src/observe/session-observer.test.ts +++ b/src/observe/session-observer.test.ts @@ -83,4 +83,80 @@ describe("SessionObserver — incremental belief + thread, hook-resolved transcr expect(b.hookChannelHealthy).toBe(false); expect(b.state).toBe("working"); // the pane is the only signal }); + + // Fix #1 — the late-first-flush settle. The transcript path is KNOWN (the hook + // reported it) but the file is not on disk yet; a writer creates it after N + // polls. The settle must wait the records out (within budget), while a + // present-from-t0 transcript must return immediately and pay no extra polls. + describe("settleFirstRecords — bounded one-time wait for a delayed first flush", () => { + it("returns the records once the absent file appears after a few polls (within budget)", async () => { + // Hook names the path, but the file does NOT exist yet — the empty-mirror + // signature (tp_exists=false at read time). + writeFileSync( + rv, + `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`, + ); + const obs = new SessionObserver({ agent: claude, rendezvousPath: rv }); + // Confirm the early read IS empty (the bug today) and the file is seen absent. + expect(obs.thread().messages.length).toBe(0); + expect(obs.transcriptAddressableButAbsent()).toBe(true); + + // The first .jsonl bytes land ~750ms later (3 poll cadences) — a late flush. + const writer = setTimeout(() => { + writeFileSync(tx, `${[userRec("u1", "hi"), asstRec("a1", "u1", "PONG")].join("\n")}\n`); + }, 750); + + const t0 = Date.now(); + await obs.settleFirstRecords(5_000); + const elapsed = Date.now() - t0; + clearTimeout(writer); + + const roles = obs.thread().messages.map((m) => m.role); + expect(roles).toEqual(["user", "assistant"]); // the contract's records, recovered + expect(elapsed).toBeGreaterThanOrEqual(700); // it genuinely waited for the flush + expect(elapsed).toBeLessThan(2_000); // and returned promptly once it appeared, well inside budget + expect(obs.transcriptAddressableButAbsent()).toBe(false); // file present now + }); + + it("control — file present from t=0 returns immediately and pays no extra wait", async () => { + writeFileSync( + rv, + `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`, + ); + writeFileSync(tx, `${[userRec("u1", "hi"), asstRec("a1", "u1", "PONG")].join("\n")}\n`); + const obs = new SessionObserver({ agent: claude, rendezvousPath: rv }); + + const t0 = Date.now(); + await obs.settleFirstRecords(5_000); + const elapsed = Date.now() - t0; + + // First iteration finds the records → no sleep → near-zero latency (the + // zero-blast-radius half: a fast-flush session is untouched). + expect(elapsed).toBeLessThan(SETTLE_POLL_MS_FLOOR); + expect(obs.thread().messages.map((m) => m.role)).toEqual(["user", "assistant"]); + + // And the latch holds: a second settle, even on a path now made absent, is a no-op. + rmSync(tx); + const t1 = Date.now(); + await obs.settleFirstRecords(5_000); + expect(Date.now() - t1).toBeLessThan(SETTLE_POLL_MS_FLOOR); // never re-enters the poll + }); + + it("budget-exhausted absent file → returns empty after the budget, never hangs", async () => { + writeFileSync( + rv, + `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`, + ); + const obs = new SessionObserver({ agent: claude, rendezvousPath: rv }); + const t0 = Date.now(); + await obs.settleFirstRecords(500); // tiny budget; file never appears + const elapsed = Date.now() - t0; + expect(elapsed).toBeGreaterThanOrEqual(450); // it waited the budget + expect(elapsed).toBeLessThan(1_500); // then gave up — bounded, no hang + expect(obs.thread().messages.length).toBe(0); // honest empty, not invented records + }); + }); }); + +// One poll cadence is 250ms; an "immediate" return must be well under it. +const SETTLE_POLL_MS_FLOOR = 200; diff --git a/src/observe/session-observer.ts b/src/observe/session-observer.ts index 55d6c9e..1e97a5d 100644 --- a/src/observe/session-observer.ts +++ b/src/observe/session-observer.ts @@ -1,8 +1,25 @@ +import { existsSync } from "node:fs"; import type { AgentDef, HookEdge } from "../agents/types.js"; import type { Message, State } from "../types.js"; +import { sleep } from "../util/sleep.js"; import { TailReader } from "./incremental.js"; import { type Belief, believe } from "./observer.js"; +/** + * The one-time first-flush settle budget (Fix #1). claude defers the first + * `.jsonl` write for a fresh session until after boot + first-turn + * processing, and MCP/tool startup (loading the server + tool schemas) pushes + * that first flush past the send/anchor window — measured 3–9 s under + * contention. Because `messagesSince`/`turnComplete` are only called AFTER + * `wait()→completed` (the documented contract), a read that finds the transcript + * still absent is reading the records too early: this bounded poll waits for the + * records `completed` already implies exist. Latched to run at most once, so a + * live / already-flushed session pays nothing. + */ +const FIRST_FLUSH_SETTLE_MS = 5_000; +/** Poll cadence for the first-flush settle — matches the anchor poll. */ +const SETTLE_POLL_MS = 250; + /** * The per-session stateful read core — the single owner of "what's true" with * **bounded** reads. It holds incremental {@link TailReader}s over the hook @@ -27,6 +44,10 @@ export class SessionObserver { #messages: Message[] = []; #parentOf = new Map(); #txPath: string | undefined; // memoized once resolved + // Latch for the one-time first-flush settle (Fix #1): once ANY record has been + // read, the late-flush race is over for this session's lifetime and the settle + // never runs again — a live session pays nothing. + #sawRecords = false; constructor(o: { agent: AgentDef; rendezvousPath?: string; agentSessionId?: string }) { this.#agent = o.agent; @@ -84,6 +105,7 @@ export class SessionObserver { const e = transcript.parseEdge?.(line); if (e !== null && e !== undefined) this.#parentOf.set(e.id, e.parentId); } + if (this.#messages.length > 0) this.#sawRecords = true; } /** @@ -127,4 +149,44 @@ export class SessionObserver { this.#refreshTranscript(); return { messages: this.#messages, parentOf: this.#parentOf }; } + + /** + * One-time bounded settle for a delayed first transcript flush (Fix #1). Run + * BEFORE the first `thread()`-backed read of a session: if no record has been + * seen yet, poll up to `budgetMs` for the first records to land on disk. A + * session whose transcript is already present/flushed returns on the first + * iteration (no added latency), and once any record has been seen the latch + * makes every later call a no-op. This converts an empty read caused by a late + * first flush — the file simply not yet written when the caller (correctly, + * post-`wait()→completed`) reads — into the records the contract owes, without + * changing ordering or returning more than the transcript holds. + */ + async settleFirstRecords(budgetMs: number = FIRST_FLUSH_SETTLE_MS): Promise { + if (this.#sawRecords) return; // already past the first flush — pay nothing + const deadline = Date.now() + budgetMs; + for (;;) { + this.#refreshEdges(); // re-resolve the hook-reported path on the way in + this.#refreshTranscript(); + if (this.#sawRecords || Date.now() >= deadline) return; + await sleep(SETTLE_POLL_MS); + } + } + + /** + * Is the transcript **addressable** (we know where it lives) but the file is + * **not yet on disk** (Fix #2)? This is the late-first-flush signature: the hook + * channel named the path or we hold an id to locate by, yet no `.jsonl` + * exists — claude hasn't written its first bytes. Distinct from + * {@link transcriptLocatable} (addressability alone) so `anchorOwnTurn` can tell + * "the file isn't created yet" (keep waiting) from "the file exists but my + * record isn't in it yet" (the normal flush trail). Returns `false` when the + * file is present (so a normal session keeps its exact anchor behaviour) or when + * the transcript is unaddressable. + */ + transcriptAddressableButAbsent(): boolean { + this.#refreshEdges(); // a hook edge may have just reported the path + const path = this.#resolveTxPath(); + if (path === undefined) return false; + return !existsSync(path); + } } diff --git a/src/session/handle.test.ts b/src/session/handle.test.ts index 994c1c9..cd10a39 100644 --- a/src/session/handle.test.ts +++ b/src/session/handle.test.ts @@ -220,6 +220,67 @@ describe("send() → cursor anchoring", () => { expect(await h.send("hello world")).toBe("own-1"); // the user record's id, not "0" }); + it("Fix #2: anchors a real cursor when the first flush is LATE (file absent-then-present)", async () => { + // The MCP/tool late-first-flush signature: the user record is NOT on disk when + // send() returns from paste — the file does not exist yet. It appears after a + // few poll cadences (here ~900ms). Today anchorOwnTurn gives up at the 3s/12-poll + // window… but the file is absent, so Fix #2's extended budget keeps polling and + // anchors the real id instead of DELIVERY_UNCONFIRMED. + rmSync(tx); // start with NO transcript file (the absent-but-addressable state) + const recording: Backend = { + ...noopBackend(), + send: async (_ref, payload: SendPayload) => { + if (payload.kind === "paste") { + // The first flush lands ~900ms after the send — past a couple of polls, + // simulating claude deferring the write while the MCP server boots. + setTimeout(() => { + writeFileSync(tx, `${userRec("own-late", null, payload.text)}\n`); + }, 900); + } + }, + }; + const h = makeHandle({ + backend: recording, + agent: agent(), + namespace: "claudemux", + name: "t", + agentSessionId: "id", + }); + expect(await h.send("hello from a tool session")).toBe("own-late"); // real cursor, NOT the sentinel + }); + + it("Fix #2: a present-from-t0 transcript keeps the ORIGINAL 3s window (extension never engages)", async () => { + // The file EXISTS (size 0) but our record never appears — the honest + // non-delivery case. The pane shows the WORKING spinner (not the `unknown` + // draft), so send() skips the lost-Enter recovery and we isolate the pure + // anchor window. Because the file is present, transcriptAddressableButAbsent() + // is false and the extended budget never engages: the anchor gives up at the + // original 12 polls (3.0s), NOT the extended 24 (6.0s). This is the sharp + // zero-blast-radius control for the anchor — a present-file session is + // byte-for-byte unchanged. + const working: Backend = { + ...noopBackend(), + capture: async () => "✶ Working… (esc to interrupt)", + }; + const h = makeHandle({ + backend: working, // never writes a record; pane is `working`, not `unknown` + agent: agent(), + namespace: "claudemux", + name: "t", + agentSessionId: "id", + }); + const t0 = Date.now(); + expect(await h.send("never lands")).toBe(DELIVERY_UNCONFIRMED); + const elapsed = Date.now() - t0; + // send() carries a fixed paste/baseline-stabilize overhead (~1.5–2s, unchanged + // by this fix) plus the anchor window. The base anchor is 12×250=3.0s; the + // EXTENDED window would add 12 more polls = +3.0s. Asserting the total stays + // well under base+extension proves the extension did NOT engage on a + // present-file session — the sharp zero-blast-radius bound for the anchor. + expect(elapsed).toBeGreaterThanOrEqual(2_900); // it DID poll the full base window + expect(elapsed).toBeLessThan(6_500); // …but NOT the extra 3s of the extended window + }); + it("returns the DELIVERY_UNCONFIRMED sentinel when no user record appears (not a count)", async () => { const h = makeHandle({ backend: noopBackend(), diff --git a/src/session/handle.ts b/src/session/handle.ts index 0313636..1bee403 100644 --- a/src/session/handle.ts +++ b/src/session/handle.ts @@ -156,11 +156,17 @@ export function makeHandle(deps: HandleDeps): SessionHandle { messagesSince: (cursor) => mutex.run(async () => { if (!observer.transcriptLocatable()) throw new TranscriptUnlocatable(deps.name); + // Fix #1: tolerate a delayed first transcript flush. A locatable session + // whose first `.jsonl` write is still in flight (MCP/tool startup under + // load) reads empty today; settle once for the records `wait()→completed` + // already implies exist. No-op once any record has been seen. + await observer.settleFirstRecords(); return messagesSince(observer, cursor); }), turnComplete: (cursor) => mutex.run(async () => { if (!observer.transcriptLocatable()) throw new TranscriptUnlocatable(deps.name); + await observer.settleFirstRecords(); return messagesSince(observer, cursor).some((m) => m.role === "assistant"); }), progress: () => @@ -299,6 +305,15 @@ function descendantsOf( } const ANCHOR_POLLS = 12; +// Fix #2: when the transcript file is addressable-but-absent (claude hasn't +// written its first `.jsonl` bytes yet — the MCP/tool late-first-flush +// signature), let anchorOwnTurn keep polling out to this longer budget (24 × +// 250ms = 6s) instead of giving up to DELIVERY_UNCONFIRMED at 3s. A normal +// session whose file is already present is never extended — it keeps the exact +// 12-poll/3s behaviour — because the extension only applies while the file is +// genuinely missing. If it never appears, the sentinel is still the honest +// answer (and Fix #1 restores the mirror on the later read). +const FIRST_FLUSH_ANCHOR_POLLS = 24; // The post-recovery re-anchor is shorter: a lost-Enter record appears within a // poll or two of the retry Enter, so a true non-delivery still reports promptly // rather than paying a second full anchor window. @@ -319,13 +334,24 @@ async function anchorOwnTurn( polls: number = ANCHOR_POLLS, ): Promise { const needle = squash(text).slice(0, 80); // prefix tolerates echo reflow / truncation - for (let attempt = 0; attempt < polls; attempt++) { + // Fix #2: the absolute attempt cap. Past the base `polls`, keep polling out to + // the extended budget ONLY while the transcript file is addressable-but-absent + // (claude's first flush hasn't landed — the MCP/tool late-flush signature). A + // session whose file is already present never crosses `polls`, so it keeps the + // exact original window byte-for-byte. The extension applies only to the FULL + // first-turn anchor, not the shorter lost-Enter retry (which must report + // promptly), so it is gated on `polls === ANCHOR_POLLS`. + const maxPolls = polls === ANCHOR_POLLS ? FIRST_FLUSH_ANCHOR_POLLS : polls; + for (let attempt = 0; attempt < maxPolls; attempt++) { const msgs = observer.thread().messages; for (let i = msgs.length - 1; i >= 0; i--) { const m = msgs[i]; if (m === undefined || m.role !== "user" || beforeIds.has(m.id)) continue; if (m.parts.some((p) => p.kind === "text" && squash(p.text).includes(needle))) return m.id; } + // Once the base budget is spent, give up UNLESS the file is still missing — the + // late-first-flush case is the only one worth waiting longer for. + if (attempt + 1 >= polls && !observer.transcriptAddressableButAbsent()) break; await sleep(ANCHOR_POLL_MS); } return undefined; From 83785dec5375e8c7b45b7e5c4170a3cd2d4239cd Mon Sep 17 00:00:00 2001 From: Inder Singh Date: Thu, 18 Jun 2026 22:21:25 +0000 Subject: [PATCH 2/2] fix(observe): bound the first-flush settle on exhaustion; record ADR 0009 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up hardening on the late-first-flush fix (6ce9849), from an architecture + code-quality review. The settle's latch was success-only: SessionObserver.settleFirstRecords() returned early only once records had been SEEN (#sawRecords). A locatable- but-never-flushing session — a crashed/slow resume, an adopt-miss locate, a read before the first turn — therefore re-paid the FULL budget on every messagesSince/turnComplete, stalling exactly the daemon recover()->turnComplete loop that walks unhealthy sessions. Add a second latch (#firstFlushSettled) set when the budget is spent once: every later call is then a no-op that still re-reads the file (so records landing later are surfaced) but never BLOCKS for the full budget again. Also from the review: - Unify the two seams onto one shared budget. Export FIRST_FLUSH_BUDGET_MS (6s) from the observer; anchorOwnTurn derives its extended poll count (Math.ceil(BUDGET / ANCHOR_POLL_MS) = 24) from it, so the read-settle and write-anchor windows cannot drift apart. - Record the design as ADR 0009 (the phenomenon, the 3-9s measurement, why two seams, why the 6s budget is sufficient given wait() precedes the read, the existsSync-vs-records-readable boundary, and the additive send+read worst case). Cross-reference wait()'s replyOnDisk gate and settleFirstRecords so a future editor cannot defeat one flush-gate without seeing the other. Tests: new "never re-pays the budget" (fails without the exhausted latch) and the write-side "absent-forever -> full extended window, then bounded give-up" (the analogue of the read-side exhaustion test). Full suite green (358 passed, 3 skipped); typecheck + biome clean. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01UgUf4t5AZkFKtgGWu8qxWL --- ...09-tolerate-late-first-transcript-flush.md | 137 ++++++++++++++++++ docs/decisions/README.md | 1 + src/io/wait.ts | 5 +- src/observe/session-observer.test.ts | 24 +++ src/observe/session-observer.ts | 69 ++++++--- src/session/handle.test.ts | 30 ++++ src/session/handle.ts | 33 +++-- 7 files changed, 262 insertions(+), 37 deletions(-) create mode 100644 docs/decisions/0009-tolerate-late-first-transcript-flush.md diff --git a/docs/decisions/0009-tolerate-late-first-transcript-flush.md b/docs/decisions/0009-tolerate-late-first-transcript-flush.md new file mode 100644 index 0000000..bbac6ae --- /dev/null +++ b/docs/decisions/0009-tolerate-late-first-transcript-flush.md @@ -0,0 +1,137 @@ +# 0009. Tolerate a late first transcript flush — a bounded settle/anchor, one shared budget + +**Status:** accepted +**Date:** 2026-06-18 + +## Context + +claude writes a session's conversation to `.jsonl` *append-only*, but the +**first** write for a fresh session is deferred until after boot + first-turn +processing. Under load — and especially when MCP servers and tool schemas are +loading — that first flush slips **3–9 s** behind the moment the turn is +otherwise "done." We measured it directly: under 20 concurrent heavy tool-MCP +sessions, ~1/20 turns flushed their first record 5–9 s after `wait()→completed`. + +This collides with two seams that read the transcript shortly after a turn: + +1. **The write-side anchor.** `send()` does not return a positional cursor — it + anchors on the *id of the user record this send produced* (ADR-adjacent: "a + real id, never a count"). `anchorOwnTurn` polls the transcript for that record + for a fixed window (12 × 250 ms = 3 s). If the first flush is late, the record + is not on disk inside that window and `send()` falls back to the + `DELIVERY_UNCONFIRMED` sentinel — telling the consumer "re-send me" for a turn + that actually landed. + +2. **The read-side mirror.** `messagesSince(cursor)` / `turnComplete(cursor)` are + called *after* `wait()→completed`. But `wait()`'s cross-process flush gate + (`io/wait.ts`, `replyOnDisk`) falls back to the **pane** when the transcript is + blind (`transcriptCount === 0`), so `completed` can legitimately fire before + the first record exists. A separate CLI process — which rebuilds its + `SessionObserver` cold on every invocation — then reads the still-absent file + and returns `[]` / `false`, even though the contract owes records. + +Both are the *same* phenomenon (a late first flush), observed from opposite +sides of the send/read split. The naïve "wait longer everywhere" risks two +regressions: stalling the **healthy** fast-flush path (the overwhelming common +case), and stalling the **unhealthy** never-flushing path (a crashed/slow +resume) on every call. + +## Decision + +Tolerate the late first flush at both seams, gated so neither regression occurs, +against **one shared budget**. + +- **`FIRST_FLUSH_BUDGET_MS = 6_000`** (`src/observe/session-observer.ts`) is the + single source of truth. The read-side settle uses it directly; the write-side + anchor derives its extended poll count from it + (`Math.ceil(FIRST_FLUSH_BUDGET_MS / ANCHOR_POLL_MS) = 24`), so the two seams + cannot drift apart. + +- **Write-side (`anchorOwnTurn`).** Past the base 3 s window, keep polling out to + the shared budget **only while `SessionObserver.transcriptAddressableButAbsent()` + is true** — the path is known but the file does not yet exist. A session whose + file is already present never crosses the base window, so its behaviour is + byte-for-byte unchanged. The extension applies only to the full first-turn + anchor, never the shorter lost-Enter retry (which must report promptly). + +- **Read-side (`settleFirstRecords`).** Before the first `thread()`-backed read, + if no record has been seen, poll up to the shared budget for the first records + to land. It is bounded **twice**: `#sawRecords` (success — records seen, the + common case, returns on the first poll with zero added latency) and + `#firstFlushSettled` (exhaustion — the budget was spent once and the file never + appeared). The exhaustion latch is load-bearing: without it a + locatable-but-never-flushing session would re-pay the full budget on *every* + `messagesSince`/`turnComplete`, stalling exactly the daemon `recover()→ + turnComplete` loop that walks crashed/slow resumes. + +**Why 6 s is sufficient despite a 9 s tail.** The budget is measured from the +*read*, not from spawn. The read-side settle only runs after `wait()→completed`, +and `wait()` itself burns wall-clock (pane stabilization, the stop-edge settle) +before returning. So the effective deadline is `read-time + 6 s`, which lands +well past the 9-s-from-spawn tail. The write-side anchor runs earlier (from +`send`), but a flush past its 6 s is *expected* to fall back to the sentinel — +the read-side settle then restores the mirror on the consumer's follow-up read. +The two seams overlap on purpose; neither has to cover the tail alone. + +## Consequences + +- **The healthy path is untouched.** A present/fast-flush transcript finds its + records on the first poll of either seam — no added latency, no extra polls. + The present-from-t0 controls assert this directly (the anchor keeps its exact + 3 s window; the settle returns sub-poll). + +- **The unhealthy path is bounded once.** A never-flushing session waits the + budget a single time, then every later read is a no-op that still re-reads the + file (so records that land later are surfaced) but never blocks again. + +- **The settle holds the per-handle mutex for up to the budget, once.** A + concurrent `kill()`/`state()` on the same handle queues behind a first-ever + settle. Bounded by the budget and the once-only latch; not moved outside the + mutex (that would reopen the byte-interleaving the mutex exists to prevent). + +- **The stateless CLI pays the budget per-invocation, not per-process-lifetime.** + `claudemux messages` / `turn-complete` rebuild the observer cold each call, so + the in-process latches reset between invocations. This is inherent to + statelessness — there is no shared state to latch across processes — and is + acceptable: a script that polls the CLI against a never-flushing session should + bound its own retries, exactly as it must for any other "not ready yet" read. + +- **The write anchor keys on file *existence*, not record presence.** The + extension engages while `existsSync(path)` is false. claude is observed to + create `.jsonl` only at its first content (the live repro's 0-sentinel + result requires a genuinely absent file), so this matches reality. A + hypothetical future claude that pre-created a zero-byte file would not engage + the extension — but the read-side `settleFirstRecords` (which keys on records + being *readable*, not the file existing) is the backstop: it would still wait + the records out on the consumer's follow-up read. The two seams deliberately + use different "is it flushed?" signals; the read side is the stronger one. + +- **A send + read pair on one never-flushing session can serialize both budgets.** + `send()`'s extended anchor (≤6 s) and a follow-up `messagesSince`/`turnComplete`'s + settle (≤6 s) hold *separate* latches (the anchor does not trip the observer's + `#firstFlushSettled`), so a single unhealthy session can pay up to ~12 s of + serialized blocking across the pair before both give up. Bounded and one-time + per observer; accepted rather than coupling the write-anchor's "found my + record?" question to the read-settle's "any records at all?" question, which + are genuinely different. + +- **The sentinel semantics are unchanged.** A flush past the write-side budget + still returns `DELIVERY_UNCONFIRMED`, which still reads empty in + `messagesSince` (re-send semantics preserved); recovery rides the read-side + settle plus a positional `"0"`/prior-id cursor, never the sentinel. + +## Evidence + +Verified against the code at HEAD, not assumed: `wait()` declares `completed` +with `replyOnDisk = transcriptCount === 0 || lastMessageRole === "assistant"` +(`src/io/wait.ts`), so completion can precede the first flush — the read-side +hole this closes. The daemon consumer `recover()` calls `turnComplete(lastCursor)` +on a resumed (possibly never-flushed) session (`src/compose.ts`) — the +unhealthy-path loop the exhaustion latch bounds. Live: the unmodified build +reproduced 1/20 `DELIVERY_UNCONFIRMED` + empty `messagesSince("0")` under 20 +concurrent heavy tool-MCP sessions; the fixed build returned `[user, assistant]` +on 20/20 with 0 sentinels (claude 2.1.181). Regression coverage: +`settleFirstRecords` (late-file-appears, present-from-t0 zero-cost, budget +exhaustion, and the never-re-pays exhausted latch) and the `anchorOwnTurn` +extension (absent-then-present anchors a real cursor; present-from-t0 keeps the +exact 3 s window). diff --git a/docs/decisions/README.md b/docs/decisions/README.md index 0ea50fc..405124c 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -20,3 +20,4 @@ record, it cites one of these repo-relative paths, which resolve on GitHub. | [0006](0006-adopt-single-writer-invariant-no-lock.md) | `adopt()` documents a single-writer invariant, adds no cross-process lock, and is a pure attach (no dialog dismissal); `state()`-after-adopt carries the safety. | | [0007](0007-pane-dead-detection-and-signal-representation.md) | _(superseded)_ `PaneDead` detection + signal representation. `PaneDead` was removed (`remain-on-exit off` reaps the pane → `SessionGone`). | | [0008](0008-scrub-parent-agent-env-for-transcript-persistence.md) | The spawned `claude` runs under an `env -u` scrub of the parent agent's env vars (`CLAUDECODE`, `CLAUDE_CODE_*`, `AI_AGENT`) so its transcript persists; threaded via a new optional `unsetEnv` on `buildArgv`; consumer `env` re-set wins. | +| [0009](0009-tolerate-late-first-transcript-flush.md) | A late first `.jsonl` flush (3–9 s under MCP/tool load) is tolerated at both seams against one shared `FIRST_FLUSH_BUDGET_MS`: the write-side anchor extends only while the file is addressable-but-absent; the read-side `settleFirstRecords` waits once, latched on success **and** exhaustion so a never-flushing session never re-pays. | diff --git a/src/io/wait.ts b/src/io/wait.ts index d35d0c7..4370bf1 100644 --- a/src/io/wait.ts +++ b/src/io/wait.ts @@ -127,7 +127,10 @@ export async function waitForOutcome( // if a readable transcript never yields the reply the CONSUMER's patience // bounds it (budget-exceeded), never a library-invented timeout — "time is the // policy's." In-process the same observer already cached the record, so the - // gate is a no-op; it only pays on the cross-process race. + // gate is a no-op; it only pays on the cross-process race. Its read-side + // complement (for a separate process that rebuilds the observer cold and reads + // before the first flush) is `SessionObserver.settleFirstRecords` — ADR 0009; + // keep the two in step if you touch either. const paneDone = belief.state === "idle" && (hookDone || armed); const replyOnDisk = belief.transcriptCount === 0 || belief.lastMessageRole === "assistant"; if (paneDone && replyOnDisk) { diff --git a/src/observe/session-observer.test.ts b/src/observe/session-observer.test.ts index e6b0f4e..81128cd 100644 --- a/src/observe/session-observer.test.ts +++ b/src/observe/session-observer.test.ts @@ -155,6 +155,30 @@ describe("SessionObserver — incremental belief + thread, hook-resolved transcr expect(elapsed).toBeLessThan(1_500); // then gave up — bounded, no hang expect(obs.thread().messages.length).toBe(0); // honest empty, not invented records }); + + // The exhausted-latch (ADR 0009). A locatable-but-NEVER-flushing session — a + // crashed/slow resume, an adopt-miss locate, a read before the first turn — + // must spend the budget at most ONCE. Without the exhausted latch every later + // settle re-pays the full budget, so a daemon polling recover()→turnComplete + // would stall the budget on EVERY call against exactly the unhealthy session. + it("never re-pays the budget: a never-flushing session settles once, then no-ops", async () => { + writeFileSync( + rv, + `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`, + ); + const obs = new SessionObserver({ agent: claude, rendezvousPath: rv }); + + const t0 = Date.now(); + await obs.settleFirstRecords(500); // first call spends the budget (file never appears) + expect(Date.now() - t0).toBeGreaterThanOrEqual(450); + + // The latch now holds even though NO record was ever seen: the SECOND call + // returns immediately instead of re-polling the budget (the flag-A fix). + const t1 = Date.now(); + await obs.settleFirstRecords(500); + expect(Date.now() - t1).toBeLessThan(SETTLE_POLL_MS_FLOOR); + expect(obs.thread().messages.length).toBe(0); // still honest empty + }); }); }); diff --git a/src/observe/session-observer.ts b/src/observe/session-observer.ts index 1e97a5d..94c6d7f 100644 --- a/src/observe/session-observer.ts +++ b/src/observe/session-observer.ts @@ -6,19 +6,24 @@ import { TailReader } from "./incremental.js"; import { type Belief, believe } from "./observer.js"; /** - * The one-time first-flush settle budget (Fix #1). claude defers the first - * `.jsonl` write for a fresh session until after boot + first-turn - * processing, and MCP/tool startup (loading the server + tool schemas) pushes - * that first flush past the send/anchor window — measured 3–9 s under - * contention. Because `messagesSince`/`turnComplete` are only called AFTER - * `wait()→completed` (the documented contract), a read that finds the transcript - * still absent is reading the records too early: this bounded poll waits for the - * records `completed` already implies exist. Latched to run at most once, so a - * live / already-flushed session pays nothing. + * The one-time budget for tolerating a LATE first transcript flush — the single + * source of truth shared by both seams that wait for it: the read-side + * {@link SessionObserver.settleFirstRecords} here, and the write-side + * `anchorOwnTurn` extension in `handle.ts` (which derives its extended poll count + * from this constant, so the two can never drift). See ADR 0009. + * + * claude defers the first `.jsonl` write for a fresh session until after boot + * + first-turn processing, and MCP/tool startup (loading the server + tool + * schemas) pushes that first flush past the send/anchor window — measured 3–9 s + * under contention. Because `messagesSince`/`turnComplete` are only called AFTER + * `wait()→completed` (the documented contract) and `wait()` itself burns + * wall-clock before returning, the EFFECTIVE settle deadline is read-time + this + * budget — comfortably past the 9 s tail measured from spawn. A present / + * already-flushed session finds its records on the first poll and pays nothing. */ -const FIRST_FLUSH_SETTLE_MS = 5_000; -/** Poll cadence for the first-flush settle — matches the anchor poll. */ -const SETTLE_POLL_MS = 250; +export const FIRST_FLUSH_BUDGET_MS = 6_000; +/** Poll cadence for the first-flush settle — matches `handle.ts`'s anchor poll. */ +const FIRST_FLUSH_POLL_MS = 250; /** * The per-session stateful read core — the single owner of "what's true" with @@ -44,10 +49,15 @@ export class SessionObserver { #messages: Message[] = []; #parentOf = new Map(); #txPath: string | undefined; // memoized once resolved - // Latch for the one-time first-flush settle (Fix #1): once ANY record has been - // read, the late-flush race is over for this session's lifetime and the settle - // never runs again — a live session pays nothing. + // Latches for the one-time first-flush settle (ADR 0009). `#sawRecords` is the + // SUCCESS guard — once any record has been read the late-flush race is over and + // the settle never runs again (a live session pays nothing). `#firstFlushSettled` + // is the EXHAUSTED guard — once the budget has been spent once without the file + // appearing, never block again either: a locatable-but-never-flushing session (a + // crashed/slow resume, an adopt-miss locate, a read before the first turn) must + // not re-pay the full budget on every messagesSince/turnComplete call. #sawRecords = false; + #firstFlushSettled = false; constructor(o: { agent: AgentDef; rendezvousPath?: string; agentSessionId?: string }) { this.#agent = o.agent; @@ -151,7 +161,7 @@ export class SessionObserver { } /** - * One-time bounded settle for a delayed first transcript flush (Fix #1). Run + * One-time bounded settle for a delayed first transcript flush (ADR 0009). Run * BEFORE the first `thread()`-backed read of a session: if no record has been * seen yet, poll up to `budgetMs` for the first records to land on disk. A * session whose transcript is already present/flushed returns on the first @@ -160,15 +170,32 @@ export class SessionObserver { * first flush — the file simply not yet written when the caller (correctly, * post-`wait()→completed`) reads — into the records the contract owes, without * changing ordering or returning more than the transcript holds. + * + * The read-side complement of `wait()`'s cross-process flush gate (`io/wait.ts`, + * the `replyOnDisk` check): that gate holds `completed` until the reply is on + * disk for the SAME-process observer, but a separate CLI process rebuilds the + * observer cold and can still read before the flush — this settle closes that + * residual window. Bounded once — on success (`#sawRecords`) AND on exhaustion + * (`#firstFlushSettled`) — so a never-flushing session never re-waits. */ - async settleFirstRecords(budgetMs: number = FIRST_FLUSH_SETTLE_MS): Promise { - if (this.#sawRecords) return; // already past the first flush — pay nothing + async settleFirstRecords(budgetMs: number = FIRST_FLUSH_BUDGET_MS): Promise { + // Past the first flush (records seen), or already spent the budget once. + if (this.#sawRecords || this.#firstFlushSettled) return; const deadline = Date.now() + budgetMs; for (;;) { this.#refreshEdges(); // re-resolve the hook-reported path on the way in - this.#refreshTranscript(); - if (this.#sawRecords || Date.now() >= deadline) return; - await sleep(SETTLE_POLL_MS); + this.#refreshTranscript(); // sets #sawRecords the moment the first records land + if (this.#sawRecords) return; + if (Date.now() >= deadline) { + // Budget spent and still nothing on disk. Latch so every later call is a + // no-op: a subsequent messagesSince/turnComplete still re-reads the file + // (via thread()) and surfaces records that land later — it just never + // BLOCKS for the full budget again. Bounds the unhealthy-session path + // (the daemon recover()→turnComplete loop against a crashed/slow resume). + this.#firstFlushSettled = true; + return; + } + await sleep(FIRST_FLUSH_POLL_MS); } } diff --git a/src/session/handle.test.ts b/src/session/handle.test.ts index cd10a39..ccc1b9c 100644 --- a/src/session/handle.test.ts +++ b/src/session/handle.test.ts @@ -281,6 +281,36 @@ describe("send() → cursor anchoring", () => { expect(elapsed).toBeLessThan(6_500); // …but NOT the extra 3s of the extended window }); + it("Fix #2: an absent-forever transcript polls the FULL extended window, then gives up (bounded)", async () => { + // The write-side analogue of the read-side budget-exhaustion test + // (session-observer.test.ts): the file is addressable-but-absent for the + // ENTIRE extended window (it never appears), so the extension engages fully — + // and must still TERMINATE at the 24-poll cap and return the sentinel, never + // hang. Pane is `working` (not the `unknown` draft), so send() skips the + // lost-Enter recovery and we measure the pure extended anchor. + rmSync(tx); // absent the whole time → transcriptAddressableButAbsent() stays true + const working: Backend = { + ...noopBackend(), + capture: async () => "✶ Working… (esc to interrupt)", + }; + const h = makeHandle({ + backend: working, // never writes a record; the file is never created + agent: agent(), + namespace: "claudemux", + name: "t", + agentSessionId: "id", + }); + const t0 = Date.now(); + expect(await h.send("never lands")).toBe(DELIVERY_UNCONFIRMED); + const elapsed = Date.now() - t0; + // The extension DID engage (file absent throughout): the full 24-poll/6s + // window ran on top of the ~1.5–2s paste/baseline overhead — well past the + // present-file control's 3s ceiling. Proves the cap is honoured (a bounded + // give-up, no hang) and that an absent file is what unlocks the extension. + expect(elapsed).toBeGreaterThanOrEqual(5_900); // the full extended window ran + expect(elapsed).toBeLessThan(11_000); // …and it terminated — never hung + }); + it("returns the DELIVERY_UNCONFIRMED sentinel when no user record appears (not a count)", async () => { const h = makeHandle({ backend: noopBackend(), diff --git a/src/session/handle.ts b/src/session/handle.ts index 1bee403..a471f36 100644 --- a/src/session/handle.ts +++ b/src/session/handle.ts @@ -7,7 +7,7 @@ import { sendOnce, submitOnce } from "../io/send.js"; import { stabilize } from "../io/stabilize.js"; import { waitForOutcome } from "../io/wait.js"; import type { Belief } from "../observe/observer.js"; -import { SessionObserver } from "../observe/session-observer.js"; +import { FIRST_FLUSH_BUDGET_MS, SessionObserver } from "../observe/session-observer.js"; import { classify } from "../state/classifier.js"; import type { BackendCommandEvent, @@ -156,10 +156,11 @@ export function makeHandle(deps: HandleDeps): SessionHandle { messagesSince: (cursor) => mutex.run(async () => { if (!observer.transcriptLocatable()) throw new TranscriptUnlocatable(deps.name); - // Fix #1: tolerate a delayed first transcript flush. A locatable session + // Tolerate a delayed first transcript flush (ADR 0009): a locatable session // whose first `.jsonl` write is still in flight (MCP/tool startup under // load) reads empty today; settle once for the records `wait()→completed` - // already implies exist. No-op once any record has been seen. + // already implies exist. No-op once any record has been seen — or once the + // budget has been spent once on a never-flushing session. await observer.settleFirstRecords(); return messagesSince(observer, cursor); }), @@ -304,21 +305,23 @@ function descendantsOf( }); } +const ANCHOR_POLL_MS = 250; const ANCHOR_POLLS = 12; -// Fix #2: when the transcript file is addressable-but-absent (claude hasn't -// written its first `.jsonl` bytes yet — the MCP/tool late-first-flush -// signature), let anchorOwnTurn keep polling out to this longer budget (24 × -// 250ms = 6s) instead of giving up to DELIVERY_UNCONFIRMED at 3s. A normal -// session whose file is already present is never extended — it keeps the exact -// 12-poll/3s behaviour — because the extension only applies while the file is -// genuinely missing. If it never appears, the sentinel is still the honest -// answer (and Fix #1 restores the mirror on the later read). -const FIRST_FLUSH_ANCHOR_POLLS = 24; +// When the transcript file is addressable-but-absent (claude hasn't written its +// first `.jsonl` bytes yet — the MCP/tool late-first-flush signature), let +// anchorOwnTurn keep polling out to this longer budget instead of giving up to +// DELIVERY_UNCONFIRMED at the base 3s window. Derived from the ONE shared +// first-flush budget (ADR 0009) so the write-anchor and read-settle seams can +// never drift apart: 6000ms / 250ms = 24 polls. A normal session whose file is +// already present is never extended — it keeps the exact 12-poll/3s behaviour — +// because the extension only applies while the file is genuinely missing. If it +// never appears, the sentinel is still the honest answer (and the read-side +// settleFirstRecords restores the mirror on the later read). +const FIRST_FLUSH_ANCHOR_POLLS = Math.ceil(FIRST_FLUSH_BUDGET_MS / ANCHOR_POLL_MS); // The post-recovery re-anchor is shorter: a lost-Enter record appears within a // poll or two of the retry Enter, so a true non-delivery still reports promptly // rather than paying a second full anchor window. const RETRY_ANCHOR_POLLS = 8; -const ANCHOR_POLL_MS = 250; /** Collapse whitespace so a reflowed/echoed prompt still matches. */ const squash = (s: string): string => s.replace(/\s+/g, " ").trim(); @@ -334,8 +337,8 @@ async function anchorOwnTurn( polls: number = ANCHOR_POLLS, ): Promise { const needle = squash(text).slice(0, 80); // prefix tolerates echo reflow / truncation - // Fix #2: the absolute attempt cap. Past the base `polls`, keep polling out to - // the extended budget ONLY while the transcript file is addressable-but-absent + // The absolute attempt cap (ADR 0009). Past the base `polls`, keep polling out + // to the extended budget ONLY while the transcript file is addressable-but-absent // (claude's first flush hasn't landed — the MCP/tool late-flush signature). A // session whose file is already present never crosses `polls`, so it keeps the // exact original window byte-for-byte. The extension applies only to the FULL