From 6ce98496cb7e0d23d5fb71e41bb31aeeadf45322 Mon Sep 17 00:00:00 2001
From: Inder Singh <inder@wastedcode.com>
Date: Thu, 18 Jun 2026 20:44:46 +0000
Subject: [PATCH 1/2] fix(observe,session): tolerate a late first transcript
 flush (empty-mirror race)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under load, MCP/tool startup defers claude's first <id>.jsonl write past the
fixed ~3s anchor window: send() returns DELIVERY_UNCONFIRMED and the observer
reads the still-absent transcript and never re-resolves, so messagesSince("0")
returns [] and turnComplete=false even though wait()->completed. It is a LATE
flush, not a no-flush (file appears 5-9s later under contention).

Two fixes at the read/anchor seam, preserving all core promises (wait()
unchanged; messages still race-free after completed — strengthened; no
pane-scraping; the sentinel still reads empty):

Fix #1 (settle-on-read) — SessionObserver.settleFirstRecords(): a one-time,
latched (#sawRecords) bounded poll (<=5s) for the first records when the
transcript path is known but no records have been seen. messagesSince/
turnComplete await it (they only run after wait()->completed, so the records
the contract owes are en route). A present/already-flushed session finds records
on the first iteration and pays nothing; the latch makes every later call a
no-op. Zero blast radius on non-MCP / fast-flush sessions.

Fix #2 (anchor-on-addressable-but-absent) — anchorOwnTurn extends its poll
budget to FIRST_FLUSH_ANCHOR_POLLS (24 -> 6s) ONLY while the transcript file is
addressable-but-absent (new SessionObserver.transcriptAddressableButAbsent()),
so tool/heavy sessions anchor a real cursor instead of the sentinel. A
present-file session never crosses the original 12-poll/3s window — byte-for-
byte unchanged — and the shorter lost-Enter retry is not extended.

Regression tests: session-observer settle (late file appears after N polls
returns the records within budget; present-from-t0 returns immediately with no
extra polls + latch holds; budget-exhausted returns empty, no hang) and handle
anchor (absent-then-present file anchors a real cursor; present-from-t0 keeps the
original 3s window). Full suite green (356 passed, 3 skipped).

Verified live (claude 2.1.181): under 20 concurrent heavy tool-MCP sessions the
unmodified build reproduced 1/20 DELIVERY_UNCONFIRMED + empty messagesSince("0");
the fixed build returned [user, assistant] on 20/20 with 0 sentinels. The
fast-flush control path is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01PeMGuKstkYF9dCgcsxWHWa
---
 src/observe/session-observer.test.ts | 76 ++++++++++++++++++++++++++++
 src/observe/session-observer.ts      | 62 +++++++++++++++++++++++
 src/session/handle.test.ts           | 61 ++++++++++++++++++++++
 src/session/handle.ts                | 28 +++++++++-
 4 files changed, 226 insertions(+), 1 deletion(-)
diff --git a/src/observe/session-observer.test.ts b/src/observe/session-observer.test.ts
index b663981..e6b0f4e 100644
--- a/src/observe/session-observer.test.ts
+++ b/src/observe/session-observer.test.ts
@@ -83,4 +83,80 @@ describe("SessionObserver — incremental belief + thread, hook-resolved transcr
     expect(b.hookChannelHealthy).toBe(false);
     expect(b.state).toBe("working"); // the pane is the only signal
   });
+
+  // Fix #1 — the late-first-flush settle. The transcript path is KNOWN (the hook
+  // reported it) but the file is not on disk yet; a writer creates it after N
+  // polls. The settle must wait the records out (within budget), while a
+  // present-from-t0 transcript must return immediately and pay no extra polls.
+  describe("settleFirstRecords — bounded one-time wait for a delayed first flush", () => {
+    it("returns the records once the absent file appears after a few polls (within budget)", async () => {
+      // Hook names the path, but the file does NOT exist yet — the empty-mirror
+      // signature (tp_exists=false at read time).
+      writeFileSync(
+        rv,
+        `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`,
+      );
+      const obs = new SessionObserver({ agent: claude, rendezvousPath: rv });
+      // Confirm the early read IS empty (the bug today) and the file is seen absent.
+      expect(obs.thread().messages.length).toBe(0);
+      expect(obs.transcriptAddressableButAbsent()).toBe(true);
+
+      // The first <id>.jsonl bytes land ~750ms later (3 poll cadences) — a late flush.
+      const writer = setTimeout(() => {
+        writeFileSync(tx, `${[userRec("u1", "hi"), asstRec("a1", "u1", "PONG")].join("\n")}\n`);
+      }, 750);
+
+      const t0 = Date.now();
+      await obs.settleFirstRecords(5_000);
+      const elapsed = Date.now() - t0;
+      clearTimeout(writer);
+
+      const roles = obs.thread().messages.map((m) => m.role);
+      expect(roles).toEqual(["user", "assistant"]); // the contract's records, recovered
+      expect(elapsed).toBeGreaterThanOrEqual(700); // it genuinely waited for the flush
+      expect(elapsed).toBeLessThan(2_000); // and returned promptly once it appeared, well inside budget
+      expect(obs.transcriptAddressableButAbsent()).toBe(false); // file present now
+    });
+
+    it("control — file present from t=0 returns immediately and pays no extra wait", async () => {
+      writeFileSync(
+        rv,
+        `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`,
+      );
+      writeFileSync(tx, `${[userRec("u1", "hi"), asstRec("a1", "u1", "PONG")].join("\n")}\n`);
+      const obs = new SessionObserver({ agent: claude, rendezvousPath: rv });
+
+      const t0 = Date.now();
+      await obs.settleFirstRecords(5_000);
+      const elapsed = Date.now() - t0;
+
+      // First iteration finds the records → no sleep → near-zero latency (the
+      // zero-blast-radius half: a fast-flush session is untouched).
+      expect(elapsed).toBeLessThan(SETTLE_POLL_MS_FLOOR);
+      expect(obs.thread().messages.map((m) => m.role)).toEqual(["user", "assistant"]);
+
+      // And the latch holds: a second settle, even on a path now made absent, is a no-op.
+      rmSync(tx);
+      const t1 = Date.now();
+      await obs.settleFirstRecords(5_000);
+      expect(Date.now() - t1).toBeLessThan(SETTLE_POLL_MS_FLOOR); // never re-enters the poll
+    });
+
+    it("budget-exhausted absent file → returns empty after the budget, never hangs", async () => {
+      writeFileSync(
+        rv,
+        `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`,
+      );
+      const obs = new SessionObserver({ agent: claude, rendezvousPath: rv });
+      const t0 = Date.now();
+      await obs.settleFirstRecords(500); // tiny budget; file never appears
+      const elapsed = Date.now() - t0;
+      expect(elapsed).toBeGreaterThanOrEqual(450); // it waited the budget
+      expect(elapsed).toBeLessThan(1_500); // then gave up — bounded, no hang
+      expect(obs.thread().messages.length).toBe(0); // honest empty, not invented records
+    });
+  });
 });
+
+// One poll cadence is 250ms; an "immediate" return must be well under it.
+const SETTLE_POLL_MS_FLOOR = 200;
diff --git a/src/observe/session-observer.ts b/src/observe/session-observer.ts
index 55d6c9e..1e97a5d 100644
--- a/src/observe/session-observer.ts
+++ b/src/observe/session-observer.ts
@@ -1,8 +1,25 @@
+import { existsSync } from "node:fs";
 import type { AgentDef, HookEdge } from "../agents/types.js";
 import type { Message, State } from "../types.js";
+import { sleep } from "../util/sleep.js";
 import { TailReader } from "./incremental.js";
 import { type Belief, believe } from "./observer.js";
 
+/**
+ * The one-time first-flush settle budget (Fix #1). claude defers the first
+ * `<id>.jsonl` write for a fresh session until after boot + first-turn
+ * processing, and MCP/tool startup (loading the server + tool schemas) pushes
+ * that first flush past the send/anchor window — measured 3–9 s under
+ * contention. Because `messagesSince`/`turnComplete` are only called AFTER
+ * `wait()→completed` (the documented contract), a read that finds the transcript
+ * still absent is reading the records too early: this bounded poll waits for the
+ * records `completed` already implies exist. Latched to run at most once, so a
+ * live / already-flushed session pays nothing.
+ */
+const FIRST_FLUSH_SETTLE_MS = 5_000;
+/** Poll cadence for the first-flush settle — matches the anchor poll. */
+const SETTLE_POLL_MS = 250;
+
 /**
  * The per-session stateful read core — the single owner of "what's true" with
  * **bounded** reads. It holds incremental {@link TailReader}s over the hook
@@ -27,6 +44,10 @@ export class SessionObserver {
   #messages: Message[] = [];
   #parentOf = new Map<string, string | undefined>();
   #txPath: string | undefined; // memoized once resolved
+  // Latch for the one-time first-flush settle (Fix #1): once ANY record has been
+  // read, the late-flush race is over for this session's lifetime and the settle
+  // never runs again — a live session pays nothing.
+  #sawRecords = false;
 
   constructor(o: { agent: AgentDef; rendezvousPath?: string; agentSessionId?: string }) {
     this.#agent = o.agent;
@@ -84,6 +105,7 @@ export class SessionObserver {
       const e = transcript.parseEdge?.(line);
       if (e !== null && e !== undefined) this.#parentOf.set(e.id, e.parentId);
     }
+    if (this.#messages.length > 0) this.#sawRecords = true;
   }
 
   /**
@@ -127,4 +149,44 @@ export class SessionObserver {
     this.#refreshTranscript();
     return { messages: this.#messages, parentOf: this.#parentOf };
   }
+
+  /**
+   * One-time bounded settle for a delayed first transcript flush (Fix #1). Run
+   * BEFORE the first `thread()`-backed read of a session: if no record has been
+   * seen yet, poll up to `budgetMs` for the first records to land on disk. A
+   * session whose transcript is already present/flushed returns on the first
+   * iteration (no added latency), and once any record has been seen the latch
+   * makes every later call a no-op. This converts an empty read caused by a late
+   * first flush — the file simply not yet written when the caller (correctly,
+   * post-`wait()→completed`) reads — into the records the contract owes, without
+   * changing ordering or returning more than the transcript holds.
+   */
+  async settleFirstRecords(budgetMs: number = FIRST_FLUSH_SETTLE_MS): Promise<void> {
+    if (this.#sawRecords) return; // already past the first flush — pay nothing
+    const deadline = Date.now() + budgetMs;
+    for (;;) {
+      this.#refreshEdges(); // re-resolve the hook-reported path on the way in
+      this.#refreshTranscript();
+      if (this.#sawRecords || Date.now() >= deadline) return;
+      await sleep(SETTLE_POLL_MS);
+    }
+  }
+
+  /**
+   * Is the transcript **addressable** (we know where it lives) but the file is
+   * **not yet on disk** (Fix #2)? This is the late-first-flush signature: the hook
+   * channel named the path or we hold an id to locate by, yet no `<id>.jsonl`
+   * exists — claude hasn't written its first bytes. Distinct from
+   * {@link transcriptLocatable} (addressability alone) so `anchorOwnTurn` can tell
+   * "the file isn't created yet" (keep waiting) from "the file exists but my
+   * record isn't in it yet" (the normal flush trail). Returns `false` when the
+   * file is present (so a normal session keeps its exact anchor behaviour) or when
+   * the transcript is unaddressable.
+   */
+  transcriptAddressableButAbsent(): boolean {
+    this.#refreshEdges(); // a hook edge may have just reported the path
+    const path = this.#resolveTxPath();
+    if (path === undefined) return false;
+    return !existsSync(path);
+  }
 }
diff --git a/src/session/handle.test.ts b/src/session/handle.test.ts
index 994c1c9..cd10a39 100644
--- a/src/session/handle.test.ts
+++ b/src/session/handle.test.ts
@@ -220,6 +220,67 @@ describe("send() → cursor anchoring", () => {
     expect(await h.send("hello world")).toBe("own-1"); // the user record's id, not "0"
   });
 
+  it("Fix #2: anchors a real cursor when the first flush is LATE (file absent-then-present)", async () => {
+    // The MCP/tool late-first-flush signature: the user record is NOT on disk when
+    // send() returns from paste — the file does not exist yet. It appears after a
+    // few poll cadences (here ~900ms). Today anchorOwnTurn gives up at the 3s/12-poll
+    // window… but the file is absent, so Fix #2's extended budget keeps polling and
+    // anchors the real id instead of DELIVERY_UNCONFIRMED.
+    rmSync(tx); // start with NO transcript file (the absent-but-addressable state)
+    const recording: Backend = {
+      ...noopBackend(),
+      send: async (_ref, payload: SendPayload) => {
+        if (payload.kind === "paste") {
+          // The first flush lands ~900ms after the send — past a couple of polls,
+          // simulating claude deferring the write while the MCP server boots.
+          setTimeout(() => {
+            writeFileSync(tx, `${userRec("own-late", null, payload.text)}\n`);
+          }, 900);
+        }
+      },
+    };
+    const h = makeHandle({
+      backend: recording,
+      agent: agent(),
+      namespace: "claudemux",
+      name: "t",
+      agentSessionId: "id",
+    });
+    expect(await h.send("hello from a tool session")).toBe("own-late"); // real cursor, NOT the sentinel
+  });
+
+  it("Fix #2: a present-from-t0 transcript keeps the ORIGINAL 3s window (extension never engages)", async () => {
+    // The file EXISTS (size 0) but our record never appears — the honest
+    // non-delivery case. The pane shows the WORKING spinner (not the `unknown`
+    // draft), so send() skips the lost-Enter recovery and we isolate the pure
+    // anchor window. Because the file is present, transcriptAddressableButAbsent()
+    // is false and the extended budget never engages: the anchor gives up at the
+    // original 12 polls (3.0s), NOT the extended 24 (6.0s). This is the sharp
+    // zero-blast-radius control for the anchor — a present-file session is
+    // byte-for-byte unchanged.
+    const working: Backend = {
+      ...noopBackend(),
+      capture: async () => "✶ Working… (esc to interrupt)",
+    };
+    const h = makeHandle({
+      backend: working, // never writes a record; pane is `working`, not `unknown`
+      agent: agent(),
+      namespace: "claudemux",
+      name: "t",
+      agentSessionId: "id",
+    });
+    const t0 = Date.now();
+    expect(await h.send("never lands")).toBe(DELIVERY_UNCONFIRMED);
+    const elapsed = Date.now() - t0;
+    // send() carries a fixed paste/baseline-stabilize overhead (~1.5–2s, unchanged
+    // by this fix) plus the anchor window. The base anchor is 12×250=3.0s; the
+    // EXTENDED window would add 12 more polls = +3.0s. Asserting the total stays
+    // well under base+extension proves the extension did NOT engage on a
+    // present-file session — the sharp zero-blast-radius bound for the anchor.
+    expect(elapsed).toBeGreaterThanOrEqual(2_900); // it DID poll the full base window
+    expect(elapsed).toBeLessThan(6_500); // …but NOT the extra 3s of the extended window
+  });
+
   it("returns the DELIVERY_UNCONFIRMED sentinel when no user record appears (not a count)", async () => {
     const h = makeHandle({
       backend: noopBackend(),
diff --git a/src/session/handle.ts b/src/session/handle.ts
index 0313636..1bee403 100644
--- a/src/session/handle.ts
+++ b/src/session/handle.ts
@@ -156,11 +156,17 @@ export function makeHandle(deps: HandleDeps): SessionHandle {
     messagesSince: (cursor) =>
       mutex.run(async () => {
         if (!observer.transcriptLocatable()) throw new TranscriptUnlocatable(deps.name);
+        // Fix #1: tolerate a delayed first transcript flush. A locatable session
+        // whose first `<id>.jsonl` write is still in flight (MCP/tool startup under
+        // load) reads empty today; settle once for the records `wait()→completed`
+        // already implies exist. No-op once any record has been seen.
+        await observer.settleFirstRecords();
         return messagesSince(observer, cursor);
       }),
     turnComplete: (cursor) =>
       mutex.run(async () => {
         if (!observer.transcriptLocatable()) throw new TranscriptUnlocatable(deps.name);
+        await observer.settleFirstRecords();
         return messagesSince(observer, cursor).some((m) => m.role === "assistant");
       }),
     progress: () =>
@@ -299,6 +305,15 @@ function descendantsOf(
 }
 
 const ANCHOR_POLLS = 12;
+// Fix #2: when the transcript file is addressable-but-absent (claude hasn't
+// written its first `<id>.jsonl` bytes yet — the MCP/tool late-first-flush
+// signature), let anchorOwnTurn keep polling out to this longer budget (24 ×
+// 250ms = 6s) instead of giving up to DELIVERY_UNCONFIRMED at 3s. A normal
+// session whose file is already present is never extended — it keeps the exact
+// 12-poll/3s behaviour — because the extension only applies while the file is
+// genuinely missing. If it never appears, the sentinel is still the honest
+// answer (and Fix #1 restores the mirror on the later read).
+const FIRST_FLUSH_ANCHOR_POLLS = 24;
 // The post-recovery re-anchor is shorter: a lost-Enter record appears within a
 // poll or two of the retry Enter, so a true non-delivery still reports promptly
 // rather than paying a second full anchor window.
@@ -319,13 +334,24 @@ async function anchorOwnTurn(
   polls: number = ANCHOR_POLLS,
 ): Promise<string | undefined> {
   const needle = squash(text).slice(0, 80); // prefix tolerates echo reflow / truncation
-  for (let attempt = 0; attempt < polls; attempt++) {
+  // Fix #2: the absolute attempt cap. Past the base `polls`, keep polling out to
+  // the extended budget ONLY while the transcript file is addressable-but-absent
+  // (claude's first flush hasn't landed — the MCP/tool late-flush signature). A
+  // session whose file is already present never crosses `polls`, so it keeps the
+  // exact original window byte-for-byte. The extension applies only to the FULL
+  // first-turn anchor, not the shorter lost-Enter retry (which must report
+  // promptly), so it is gated on `polls === ANCHOR_POLLS`.
+  const maxPolls = polls === ANCHOR_POLLS ? FIRST_FLUSH_ANCHOR_POLLS : polls;
+  for (let attempt = 0; attempt < maxPolls; attempt++) {
     const msgs = observer.thread().messages;
     for (let i = msgs.length - 1; i >= 0; i--) {
       const m = msgs[i];
       if (m === undefined || m.role !== "user" || beforeIds.has(m.id)) continue;
       if (m.parts.some((p) => p.kind === "text" && squash(p.text).includes(needle))) return m.id;
     }
+    // Once the base budget is spent, give up UNLESS the file is still missing — the
+    // late-first-flush case is the only one worth waiting longer for.
+    if (attempt + 1 >= polls && !observer.transcriptAddressableButAbsent()) break;
     await sleep(ANCHOR_POLL_MS);
   }
   return undefined;

From 83785dec5375e8c7b45b7e5c4170a3cd2d4239cd Mon Sep 17 00:00:00 2001
From: Inder Singh <inder@wastedcode.com>
Date: Thu, 18 Jun 2026 22:21:25 +0000
Subject: [PATCH 2/2] fix(observe): bound the first-flush settle on exhaustion;
 record ADR 0009
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up hardening on the late-first-flush fix (6ce9849), from an
architecture + code-quality review.

The settle's latch was success-only: SessionObserver.settleFirstRecords()
returned early only once records had been SEEN (#sawRecords). A locatable-
but-never-flushing session — a crashed/slow resume, an adopt-miss locate, a
read before the first turn — therefore re-paid the FULL budget on every
messagesSince/turnComplete, stalling exactly the daemon recover()->turnComplete
loop that walks unhealthy sessions. Add a second latch (#firstFlushSettled) set
when the budget is spent once: every later call is then a no-op that still
re-reads the file (so records landing later are surfaced) but never BLOCKS for
the full budget again.

Also from the review:
- Unify the two seams onto one shared budget. Export FIRST_FLUSH_BUDGET_MS
  (6s) from the observer; anchorOwnTurn derives its extended poll count
  (Math.ceil(BUDGET / ANCHOR_POLL_MS) = 24) from it, so the read-settle and
  write-anchor windows cannot drift apart.
- Record the design as ADR 0009 (the phenomenon, the 3-9s measurement, why two
  seams, why the 6s budget is sufficient given wait() precedes the read, the
  existsSync-vs-records-readable boundary, and the additive send+read worst
  case). Cross-reference wait()'s replyOnDisk gate and settleFirstRecords so a
  future editor cannot defeat one flush-gate without seeing the other.

Tests: new "never re-pays the budget" (fails without the exhausted latch) and
the write-side "absent-forever -> full extended window, then bounded give-up"
(the analogue of the read-side exhaustion test). Full suite green (358 passed,
3 skipped); typecheck + biome clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01UgUf4t5AZkFKtgGWu8qxWL
---
 ...09-tolerate-late-first-transcript-flush.md | 137 ++++++++++++++++++
 docs/decisions/README.md                      |   1 +
 src/io/wait.ts                                |   5 +-
 src/observe/session-observer.test.ts          |  24 +++
 src/observe/session-observer.ts               |  69 ++++++---
 src/session/handle.test.ts                    |  30 ++++
 src/session/handle.ts                         |  33 +++--
 7 files changed, 262 insertions(+), 37 deletions(-)
 create mode 100644 docs/decisions/0009-tolerate-late-first-transcript-flush.md

diff --git a/docs/decisions/0009-tolerate-late-first-transcript-flush.md b/docs/decisions/0009-tolerate-late-first-transcript-flush.md
new file mode 100644
index 0000000..bbac6ae
--- /dev/null
+++ b/docs/decisions/0009-tolerate-late-first-transcript-flush.md
@@ -0,0 +1,137 @@
+# 0009. Tolerate a late first transcript flush — a bounded settle/anchor, one shared budget
+
+**Status:** accepted
+**Date:** 2026-06-18
+
+## Context
+
+claude writes a session's conversation to `<id>.jsonl` *append-only*, but the
+**first** write for a fresh session is deferred until after boot + first-turn
+processing. Under load — and especially when MCP servers and tool schemas are
+loading — that first flush slips **3–9 s** behind the moment the turn is
+otherwise "done." We measured it directly: under 20 concurrent heavy tool-MCP
+sessions, ~1/20 turns flushed their first record 5–9 s after `wait()→completed`.
+
+This collides with two seams that read the transcript shortly after a turn:
+
+1. **The write-side anchor.** `send()` does not return a positional cursor — it
+   anchors on the *id of the user record this send produced* (ADR-adjacent: "a
+   real id, never a count"). `anchorOwnTurn` polls the transcript for that record
+   for a fixed window (12 × 250 ms = 3 s). If the first flush is late, the record
+   is not on disk inside that window and `send()` falls back to the
+   `DELIVERY_UNCONFIRMED` sentinel — telling the consumer "re-send me" for a turn
+   that actually landed.
+
+2. **The read-side mirror.** `messagesSince(cursor)` / `turnComplete(cursor)` are
+   called *after* `wait()→completed`. But `wait()`'s cross-process flush gate
+   (`io/wait.ts`, `replyOnDisk`) falls back to the **pane** when the transcript is
+   blind (`transcriptCount === 0`), so `completed` can legitimately fire before
+   the first record exists. A separate CLI process — which rebuilds its
+   `SessionObserver` cold on every invocation — then reads the still-absent file
+   and returns `[]` / `false`, even though the contract owes records.
+
+Both are the *same* phenomenon (a late first flush), observed from opposite
+sides of the send/read split. The naïve "wait longer everywhere" risks two
+regressions: stalling the **healthy** fast-flush path (the overwhelming common
+case), and stalling the **unhealthy** never-flushing path (a crashed/slow
+resume) on every call.
+
+## Decision
+
+Tolerate the late first flush at both seams, gated so neither regression occurs,
+against **one shared budget**.
+
+- **`FIRST_FLUSH_BUDGET_MS = 6_000`** (`src/observe/session-observer.ts`) is the
+  single source of truth. The read-side settle uses it directly; the write-side
+  anchor derives its extended poll count from it
+  (`Math.ceil(FIRST_FLUSH_BUDGET_MS / ANCHOR_POLL_MS) = 24`), so the two seams
+  cannot drift apart.
+
+- **Write-side (`anchorOwnTurn`).** Past the base 3 s window, keep polling out to
+  the shared budget **only while `SessionObserver.transcriptAddressableButAbsent()`
+  is true** — the path is known but the file does not yet exist. A session whose
+  file is already present never crosses the base window, so its behaviour is
+  byte-for-byte unchanged. The extension applies only to the full first-turn
+  anchor, never the shorter lost-Enter retry (which must report promptly).
+
+- **Read-side (`settleFirstRecords`).** Before the first `thread()`-backed read,
+  if no record has been seen, poll up to the shared budget for the first records
+  to land. It is bounded **twice**: `#sawRecords` (success — records seen, the
+  common case, returns on the first poll with zero added latency) and
+  `#firstFlushSettled` (exhaustion — the budget was spent once and the file never
+  appeared). The exhaustion latch is load-bearing: without it a
+  locatable-but-never-flushing session would re-pay the full budget on *every*
+  `messagesSince`/`turnComplete`, stalling exactly the daemon `recover()→
+  turnComplete` loop that walks crashed/slow resumes.
+
+**Why 6 s is sufficient despite a 9 s tail.** The budget is measured from the
+*read*, not from spawn. The read-side settle only runs after `wait()→completed`,
+and `wait()` itself burns wall-clock (pane stabilization, the stop-edge settle)
+before returning. So the effective deadline is `read-time + 6 s`, which lands
+well past the 9-s-from-spawn tail. The write-side anchor runs earlier (from
+`send`), but a flush past its 6 s is *expected* to fall back to the sentinel —
+the read-side settle then restores the mirror on the consumer's follow-up read.
+The two seams overlap on purpose; neither has to cover the tail alone.
+
+## Consequences
+
+- **The healthy path is untouched.** A present/fast-flush transcript finds its
+  records on the first poll of either seam — no added latency, no extra polls.
+  The present-from-t0 controls assert this directly (the anchor keeps its exact
+  3 s window; the settle returns sub-poll).
+
+- **The unhealthy path is bounded once.** A never-flushing session waits the
+  budget a single time, then every later read is a no-op that still re-reads the
+  file (so records that land later are surfaced) but never blocks again.
+
+- **The settle holds the per-handle mutex for up to the budget, once.** A
+  concurrent `kill()`/`state()` on the same handle queues behind a first-ever
+  settle. Bounded by the budget and the once-only latch; not moved outside the
+  mutex (that would reopen the byte-interleaving the mutex exists to prevent).
+
+- **The stateless CLI pays the budget per-invocation, not per-process-lifetime.**
+  `claudemux messages` / `turn-complete` rebuild the observer cold each call, so
+  the in-process latches reset between invocations. This is inherent to
+  statelessness — there is no shared state to latch across processes — and is
+  acceptable: a script that polls the CLI against a never-flushing session should
+  bound its own retries, exactly as it must for any other "not ready yet" read.
+
+- **The write anchor keys on file *existence*, not record presence.** The
+  extension engages while `existsSync(path)` is false. claude is observed to
+  create `<id>.jsonl` only at its first content (the live repro's 0-sentinel
+  result requires a genuinely absent file), so this matches reality. A
+  hypothetical future claude that pre-created a zero-byte file would not engage
+  the extension — but the read-side `settleFirstRecords` (which keys on records
+  being *readable*, not the file existing) is the backstop: it would still wait
+  the records out on the consumer's follow-up read. The two seams deliberately
+  use different "is it flushed?" signals; the read side is the stronger one.
+
+- **A send + read pair on one never-flushing session can serialize both budgets.**
+  `send()`'s extended anchor (≤6 s) and a follow-up `messagesSince`/`turnComplete`'s
+  settle (≤6 s) hold *separate* latches (the anchor does not trip the observer's
+  `#firstFlushSettled`), so a single unhealthy session can pay up to ~12 s of
+  serialized blocking across the pair before both give up. Bounded and one-time
+  per observer; accepted rather than coupling the write-anchor's "found my
+  record?" question to the read-settle's "any records at all?" question, which
+  are genuinely different.
+
+- **The sentinel semantics are unchanged.** A flush past the write-side budget
+  still returns `DELIVERY_UNCONFIRMED`, which still reads empty in
+  `messagesSince` (re-send semantics preserved); recovery rides the read-side
+  settle plus a positional `"0"`/prior-id cursor, never the sentinel.
+
+## Evidence
+
+Verified against the code at HEAD, not assumed: `wait()` declares `completed`
+with `replyOnDisk = transcriptCount === 0 || lastMessageRole === "assistant"`
+(`src/io/wait.ts`), so completion can precede the first flush — the read-side
+hole this closes. The daemon consumer `recover()` calls `turnComplete(lastCursor)`
+on a resumed (possibly never-flushed) session (`src/compose.ts`) — the
+unhealthy-path loop the exhaustion latch bounds. Live: the unmodified build
+reproduced 1/20 `DELIVERY_UNCONFIRMED` + empty `messagesSince("0")` under 20
+concurrent heavy tool-MCP sessions; the fixed build returned `[user, assistant]`
+on 20/20 with 0 sentinels (claude 2.1.181). Regression coverage:
+`settleFirstRecords` (late-file-appears, present-from-t0 zero-cost, budget
+exhaustion, and the never-re-pays exhausted latch) and the `anchorOwnTurn`
+extension (absent-then-present anchors a real cursor; present-from-t0 keeps the
+exact 3 s window).
diff --git a/docs/decisions/README.md b/docs/decisions/README.md
index 0ea50fc..405124c 100644
--- a/docs/decisions/README.md
+++ b/docs/decisions/README.md
@@ -20,3 +20,4 @@ record, it cites one of these repo-relative paths, which resolve on GitHub.
 | [0006](0006-adopt-single-writer-invariant-no-lock.md) | `adopt()` documents a single-writer invariant, adds no cross-process lock, and is a pure attach (no dialog dismissal); `state()`-after-adopt carries the safety. |
 | [0007](0007-pane-dead-detection-and-signal-representation.md) | _(superseded)_ `PaneDead` detection + signal representation. `PaneDead` was removed (`remain-on-exit off` reaps the pane → `SessionGone`). |
 | [0008](0008-scrub-parent-agent-env-for-transcript-persistence.md) | The spawned `claude` runs under an `env -u` scrub of the parent agent's env vars (`CLAUDECODE`, `CLAUDE_CODE_*`, `AI_AGENT`) so its transcript persists; threaded via a new optional `unsetEnv` on `buildArgv`; consumer `env` re-set wins. |
+| [0009](0009-tolerate-late-first-transcript-flush.md) | A late first `<id>.jsonl` flush (3–9 s under MCP/tool load) is tolerated at both seams against one shared `FIRST_FLUSH_BUDGET_MS`: the write-side anchor extends only while the file is addressable-but-absent; the read-side `settleFirstRecords` waits once, latched on success **and** exhaustion so a never-flushing session never re-pays. |
diff --git a/src/io/wait.ts b/src/io/wait.ts
index d35d0c7..4370bf1 100644
--- a/src/io/wait.ts
+++ b/src/io/wait.ts
@@ -127,7 +127,10 @@ export async function waitForOutcome(
     // if a readable transcript never yields the reply the CONSUMER's patience
     // bounds it (budget-exceeded), never a library-invented timeout — "time is the
     // policy's." In-process the same observer already cached the record, so the
-    // gate is a no-op; it only pays on the cross-process race.
+    // gate is a no-op; it only pays on the cross-process race. Its read-side
+    // complement (for a separate process that rebuilds the observer cold and reads
+    // before the first flush) is `SessionObserver.settleFirstRecords` — ADR 0009;
+    // keep the two in step if you touch either.
     const paneDone = belief.state === "idle" && (hookDone || armed);
     const replyOnDisk = belief.transcriptCount === 0 || belief.lastMessageRole === "assistant";
     if (paneDone && replyOnDisk) {
diff --git a/src/observe/session-observer.test.ts b/src/observe/session-observer.test.ts
index e6b0f4e..81128cd 100644
--- a/src/observe/session-observer.test.ts
+++ b/src/observe/session-observer.test.ts
@@ -155,6 +155,30 @@ describe("SessionObserver — incremental belief + thread, hook-resolved transcr
       expect(elapsed).toBeLessThan(1_500); // then gave up — bounded, no hang
       expect(obs.thread().messages.length).toBe(0); // honest empty, not invented records
     });
+
+    // The exhausted-latch (ADR 0009). A locatable-but-NEVER-flushing session — a
+    // crashed/slow resume, an adopt-miss locate, a read before the first turn —
+    // must spend the budget at most ONCE. Without the exhausted latch every later
+    // settle re-pays the full budget, so a daemon polling recover()→turnComplete
+    // would stall the budget on EVERY call against exactly the unhealthy session.
+    it("never re-pays the budget: a never-flushing session settles once, then no-ops", async () => {
+      writeFileSync(
+        rv,
+        `1700000001.0 {"hook_event_name":"SessionStart","transcript_path":"${tx}"}\n`,
+      );
+      const obs = new SessionObserver({ agent: claude, rendezvousPath: rv });
+
+      const t0 = Date.now();
+      await obs.settleFirstRecords(500); // first call spends the budget (file never appears)
+      expect(Date.now() - t0).toBeGreaterThanOrEqual(450);
+
+      // The latch now holds even though NO record was ever seen: the SECOND call
+      // returns immediately instead of re-polling the budget (the flag-A fix).
+      const t1 = Date.now();
+      await obs.settleFirstRecords(500);
+      expect(Date.now() - t1).toBeLessThan(SETTLE_POLL_MS_FLOOR);
+      expect(obs.thread().messages.length).toBe(0); // still honest empty
+    });
   });
 });
 
diff --git a/src/observe/session-observer.ts b/src/observe/session-observer.ts
index 1e97a5d..94c6d7f 100644
--- a/src/observe/session-observer.ts
+++ b/src/observe/session-observer.ts
@@ -6,19 +6,24 @@ import { TailReader } from "./incremental.js";
 import { type Belief, believe } from "./observer.js";
 
 /**
- * The one-time first-flush settle budget (Fix #1). claude defers the first
- * `<id>.jsonl` write for a fresh session until after boot + first-turn
- * processing, and MCP/tool startup (loading the server + tool schemas) pushes
- * that first flush past the send/anchor window — measured 3–9 s under
- * contention. Because `messagesSince`/`turnComplete` are only called AFTER
- * `wait()→completed` (the documented contract), a read that finds the transcript
- * still absent is reading the records too early: this bounded poll waits for the
- * records `completed` already implies exist. Latched to run at most once, so a
- * live / already-flushed session pays nothing.
+ * The one-time budget for tolerating a LATE first transcript flush — the single
+ * source of truth shared by both seams that wait for it: the read-side
+ * {@link SessionObserver.settleFirstRecords} here, and the write-side
+ * `anchorOwnTurn` extension in `handle.ts` (which derives its extended poll count
+ * from this constant, so the two can never drift). See ADR 0009.
+ *
+ * claude defers the first `<id>.jsonl` write for a fresh session until after boot
+ * + first-turn processing, and MCP/tool startup (loading the server + tool
+ * schemas) pushes that first flush past the send/anchor window — measured 3–9 s
+ * under contention. Because `messagesSince`/`turnComplete` are only called AFTER
+ * `wait()→completed` (the documented contract) and `wait()` itself burns
+ * wall-clock before returning, the EFFECTIVE settle deadline is read-time + this
+ * budget — comfortably past the 9 s tail measured from spawn. A present /
+ * already-flushed session finds its records on the first poll and pays nothing.
  */
-const FIRST_FLUSH_SETTLE_MS = 5_000;
-/** Poll cadence for the first-flush settle — matches the anchor poll. */
-const SETTLE_POLL_MS = 250;
+export const FIRST_FLUSH_BUDGET_MS = 6_000;
+/** Poll cadence for the first-flush settle — matches `handle.ts`'s anchor poll. */
+const FIRST_FLUSH_POLL_MS = 250;
 
 /**
  * The per-session stateful read core — the single owner of "what's true" with
@@ -44,10 +49,15 @@ export class SessionObserver {
   #messages: Message[] = [];
   #parentOf = new Map<string, string | undefined>();
   #txPath: string | undefined; // memoized once resolved
-  // Latch for the one-time first-flush settle (Fix #1): once ANY record has been
-  // read, the late-flush race is over for this session's lifetime and the settle
-  // never runs again — a live session pays nothing.
+  // Latches for the one-time first-flush settle (ADR 0009). `#sawRecords` is the
+  // SUCCESS guard — once any record has been read the late-flush race is over and
+  // the settle never runs again (a live session pays nothing). `#firstFlushSettled`
+  // is the EXHAUSTED guard — once the budget has been spent once without the file
+  // appearing, never block again either: a locatable-but-never-flushing session (a
+  // crashed/slow resume, an adopt-miss locate, a read before the first turn) must
+  // not re-pay the full budget on every messagesSince/turnComplete call.
   #sawRecords = false;
+  #firstFlushSettled = false;
 
   constructor(o: { agent: AgentDef; rendezvousPath?: string; agentSessionId?: string }) {
     this.#agent = o.agent;
@@ -151,7 +161,7 @@ export class SessionObserver {
   }
 
   /**
-   * One-time bounded settle for a delayed first transcript flush (Fix #1). Run
+   * One-time bounded settle for a delayed first transcript flush (ADR 0009). Run
    * BEFORE the first `thread()`-backed read of a session: if no record has been
    * seen yet, poll up to `budgetMs` for the first records to land on disk. A
    * session whose transcript is already present/flushed returns on the first
@@ -160,15 +170,32 @@ export class SessionObserver {
    * first flush — the file simply not yet written when the caller (correctly,
    * post-`wait()→completed`) reads — into the records the contract owes, without
    * changing ordering or returning more than the transcript holds.
+   *
+   * The read-side complement of `wait()`'s cross-process flush gate (`io/wait.ts`,
+   * the `replyOnDisk` check): that gate holds `completed` until the reply is on
+   * disk for the SAME-process observer, but a separate CLI process rebuilds the
+   * observer cold and can still read before the flush — this settle closes that
+   * residual window. Bounded once — on success (`#sawRecords`) AND on exhaustion
+   * (`#firstFlushSettled`) — so a never-flushing session never re-waits.
    */
-  async settleFirstRecords(budgetMs: number = FIRST_FLUSH_SETTLE_MS): Promise<void> {
-    if (this.#sawRecords) return; // already past the first flush — pay nothing
+  async settleFirstRecords(budgetMs: number = FIRST_FLUSH_BUDGET_MS): Promise<void> {
+    // Past the first flush (records seen), or already spent the budget once.
+    if (this.#sawRecords || this.#firstFlushSettled) return;
     const deadline = Date.now() + budgetMs;
     for (;;) {
       this.#refreshEdges(); // re-resolve the hook-reported path on the way in
-      this.#refreshTranscript();
-      if (this.#sawRecords || Date.now() >= deadline) return;
-      await sleep(SETTLE_POLL_MS);
+      this.#refreshTranscript(); // sets #sawRecords the moment the first records land
+      if (this.#sawRecords) return;
+      if (Date.now() >= deadline) {
+        // Budget spent and still nothing on disk. Latch so every later call is a
+        // no-op: a subsequent messagesSince/turnComplete still re-reads the file
+        // (via thread()) and surfaces records that land later — it just never
+        // BLOCKS for the full budget again. Bounds the unhealthy-session path
+        // (the daemon recover()→turnComplete loop against a crashed/slow resume).
+        this.#firstFlushSettled = true;
+        return;
+      }
+      await sleep(FIRST_FLUSH_POLL_MS);
     }
   }
 
diff --git a/src/session/handle.test.ts b/src/session/handle.test.ts
index cd10a39..ccc1b9c 100644
--- a/src/session/handle.test.ts
+++ b/src/session/handle.test.ts
@@ -281,6 +281,36 @@ describe("send() → cursor anchoring", () => {
     expect(elapsed).toBeLessThan(6_500); // …but NOT the extra 3s of the extended window
   });
 
+  it("Fix #2: an absent-forever transcript polls the FULL extended window, then gives up (bounded)", async () => {
+    // The write-side analogue of the read-side budget-exhaustion test
+    // (session-observer.test.ts): the file is addressable-but-absent for the
+    // ENTIRE extended window (it never appears), so the extension engages fully —
+    // and must still TERMINATE at the 24-poll cap and return the sentinel, never
+    // hang. Pane is `working` (not the `unknown` draft), so send() skips the
+    // lost-Enter recovery and we measure the pure extended anchor.
+    rmSync(tx); // absent the whole time → transcriptAddressableButAbsent() stays true
+    const working: Backend = {
+      ...noopBackend(),
+      capture: async () => "✶ Working… (esc to interrupt)",
+    };
+    const h = makeHandle({
+      backend: working, // never writes a record; the file is never created
+      agent: agent(),
+      namespace: "claudemux",
+      name: "t",
+      agentSessionId: "id",
+    });
+    const t0 = Date.now();
+    expect(await h.send("never lands")).toBe(DELIVERY_UNCONFIRMED);
+    const elapsed = Date.now() - t0;
+    // The extension DID engage (file absent throughout): the full 24-poll/6s
+    // window ran on top of the ~1.5–2s paste/baseline overhead — well past the
+    // present-file control's 3s ceiling. Proves the cap is honoured (a bounded
+    // give-up, no hang) and that an absent file is what unlocks the extension.
+    expect(elapsed).toBeGreaterThanOrEqual(5_900); // the full extended window ran
+    expect(elapsed).toBeLessThan(11_000); // …and it terminated — never hung
+  });
+
   it("returns the DELIVERY_UNCONFIRMED sentinel when no user record appears (not a count)", async () => {
     const h = makeHandle({
       backend: noopBackend(),
diff --git a/src/session/handle.ts b/src/session/handle.ts
index 1bee403..a471f36 100644
--- a/src/session/handle.ts
+++ b/src/session/handle.ts
@@ -7,7 +7,7 @@ import { sendOnce, submitOnce } from "../io/send.js";
 import { stabilize } from "../io/stabilize.js";
 import { waitForOutcome } from "../io/wait.js";
 import type { Belief } from "../observe/observer.js";
-import { SessionObserver } from "../observe/session-observer.js";
+import { FIRST_FLUSH_BUDGET_MS, SessionObserver } from "../observe/session-observer.js";
 import { classify } from "../state/classifier.js";
 import type {
   BackendCommandEvent,
@@ -156,10 +156,11 @@ export function makeHandle(deps: HandleDeps): SessionHandle {
     messagesSince: (cursor) =>
       mutex.run(async () => {
         if (!observer.transcriptLocatable()) throw new TranscriptUnlocatable(deps.name);
-        // Fix #1: tolerate a delayed first transcript flush. A locatable session
+        // Tolerate a delayed first transcript flush (ADR 0009): a locatable session
         // whose first `<id>.jsonl` write is still in flight (MCP/tool startup under
         // load) reads empty today; settle once for the records `wait()→completed`
-        // already implies exist. No-op once any record has been seen.
+        // already implies exist. No-op once any record has been seen — or once the
+        // budget has been spent once on a never-flushing session.
         await observer.settleFirstRecords();
         return messagesSince(observer, cursor);
       }),
@@ -304,21 +305,23 @@ function descendantsOf(
   });
 }
 
+const ANCHOR_POLL_MS = 250;
 const ANCHOR_POLLS = 12;
-// Fix #2: when the transcript file is addressable-but-absent (claude hasn't
-// written its first `<id>.jsonl` bytes yet — the MCP/tool late-first-flush
-// signature), let anchorOwnTurn keep polling out to this longer budget (24 ×
-// 250ms = 6s) instead of giving up to DELIVERY_UNCONFIRMED at 3s. A normal
-// session whose file is already present is never extended — it keeps the exact
-// 12-poll/3s behaviour — because the extension only applies while the file is
-// genuinely missing. If it never appears, the sentinel is still the honest
-// answer (and Fix #1 restores the mirror on the later read).
-const FIRST_FLUSH_ANCHOR_POLLS = 24;
+// When the transcript file is addressable-but-absent (claude hasn't written its
+// first `<id>.jsonl` bytes yet — the MCP/tool late-first-flush signature), let
+// anchorOwnTurn keep polling out to this longer budget instead of giving up to
+// DELIVERY_UNCONFIRMED at the base 3s window. Derived from the ONE shared
+// first-flush budget (ADR 0009) so the write-anchor and read-settle seams can
+// never drift apart: 6000ms / 250ms = 24 polls. A normal session whose file is
+// already present is never extended — it keeps the exact 12-poll/3s behaviour —
+// because the extension only applies while the file is genuinely missing. If it
+// never appears, the sentinel is still the honest answer (and the read-side
+// settleFirstRecords restores the mirror on the later read).
+const FIRST_FLUSH_ANCHOR_POLLS = Math.ceil(FIRST_FLUSH_BUDGET_MS / ANCHOR_POLL_MS);
 // The post-recovery re-anchor is shorter: a lost-Enter record appears within a
 // poll or two of the retry Enter, so a true non-delivery still reports promptly
 // rather than paying a second full anchor window.
 const RETRY_ANCHOR_POLLS = 8;
-const ANCHOR_POLL_MS = 250;
 /** Collapse whitespace so a reflowed/echoed prompt still matches. */
 const squash = (s: string): string => s.replace(/\s+/g, " ").trim();
 
@@ -334,8 +337,8 @@ async function anchorOwnTurn(
   polls: number = ANCHOR_POLLS,
 ): Promise<string | undefined> {
   const needle = squash(text).slice(0, 80); // prefix tolerates echo reflow / truncation
-  // Fix #2: the absolute attempt cap. Past the base `polls`, keep polling out to
-  // the extended budget ONLY while the transcript file is addressable-but-absent
+  // The absolute attempt cap (ADR 0009). Past the base `polls`, keep polling out
+  // to the extended budget ONLY while the transcript file is addressable-but-absent
   // (claude's first flush hasn't landed — the MCP/tool late-flush signature). A
   // session whose file is already present never crosses `polls`, so it keeps the
   // exact original window byte-for-byte. The extension applies only to the FULL