From 28e9ff1265316987ab8c777a80fed5e5bbfe29ee Mon Sep 17 00:00:00 2001
From: Nosko Artem <time.for.mindwar@gmail.com>
Date: Wed, 17 Jun 2026 10:13:25 +0300
Subject: [PATCH] fix(card): adaptive stall_finalize threshold + false-positive
 recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two regressions on a long-reasoning metrics-debug session: (1)
``STALL_FINALIZE_AFTER_SECONDS=90s`` fired on ``tail=tool_use idle=96s``
while Claude was legitimately thinking toward the final answer; (2)
once the STALL_NOTE was appended, the real answer landing later got
silently edited into the now-finalized card the user had scrolled
past.

Fix (1): split the threshold by tail event type. ``tool_use`` tails
get ``STALL_FINALIZE_TOOL_USE_SECONDS=300s`` because slow tools and
post-tool reasoning are routinely silent for minutes; ``text`` /
``thinking`` tails keep the original 90s because mid-emit silence is
genuinely suspicious.

Fix (2): ``maybe_finalize_stalled`` now arms ``CardState
.stall_finalized=True`` after the STALL_NOTE lands. The next call to
``update_session_card`` or ``finalize_task`` runs
``_recover_from_false_stall`` — wipes msg_id / events / pagination
and flips ``is_continuation=True`` so ``_send_card`` spawns a fresh
card below the stub instead of clobbering it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ccbot/handlers/card_model.py            |   8 +
 src/ccbot/handlers/notifications.py         |  74 ++++++-
 tests/ccbot/handlers/test_stall_recovery.py | 213 ++++++++++++++++++++
 tests/e2e/test_stalled_finalize.py          |  10 +-
 tests/test_stalled_card.py                  |  79 +++++++-
 5 files changed, 375 insertions(+), 9 deletions(-)
 create mode 100644 tests/ccbot/handlers/test_stall_recovery.py

diff --git a/src/ccbot/handlers/card_model.py b/src/ccbot/handlers/card_model.py
index a042a761..fa9c7d1e 100644
--- a/src/ccbot/handlers/card_model.py
+++ b/src/ccbot/handlers/card_model.py
@@ -200,6 +200,14 @@ class CardState:
     # transcript still spans many turn-pages. ``/clear`` leaves it True:
     # that is an intentional wipe-to-zero.
     seed_attempted: bool = False
+    # Stall-recovery flag. Set by ``maybe_finalize_stalled`` after it
+    # appends the STALL_NOTE final_text. If the stall was a false positive
+    # (a genuine assistant turn arrives after), the next
+    # ``update_session_card`` / ``finalize_task`` wipes the card binding
+    # and lets ``_send_card`` spawn a fresh message below the stalled
+    # stub — so the recovered answer is visible instead of being silently
+    # edited into a card the user has scrolled past or marked complete.
+    stall_finalized: bool = False
 
 
 def _trim(s: str, limit: int = 200) -> str:
diff --git a/src/ccbot/handlers/notifications.py b/src/ccbot/handlers/notifications.py
index b6ae9d55..0047d38b 100644
--- a/src/ccbot/handlers/notifications.py
+++ b/src/ccbot/handlers/notifications.py
@@ -576,6 +576,31 @@ def reset_card(user_id: int, session_id: str) -> None:
     _cards.pop((user_id, session_id), None)
 
 
+def _recover_from_false_stall(state: CardState) -> None:
+    """Wipe the live-card binding after a false-positive stall_finalize.
+
+    Set when a genuine assistant turn lands AFTER
+    ``maybe_finalize_stalled`` armed ``state.stall_finalized``. Clears
+    msg_id / events / pagination so the next render path goes through
+    ``_send_card`` (fresh message below the stalled stub) rather than
+    ``_edit_card`` (silent edit of the now-finalized card). The stalled
+    stub stays in chat history with its STALL_NOTE — we don't rewrite
+    it; the recovery message appears as a fresh card with
+    ``is_continuation=True`` so the header carries the ``…continued``
+    marker.
+    """
+    if state.pending_edit is not None and not state.pending_edit.done():
+        state.pending_edit.cancel()
+    state.pending_edit = None
+    state.msg_id = None
+    state.events = []
+    state.current_page_idx = None
+    state.is_continuation = True
+    state.last_rendered = ""
+    state.seed_attempted = False
+    state.stall_finalized = False
+
+
 async def cancel_pending_card_edits(timeout: float = 2.0) -> None:
     """Cancel + drain every deferred ``_edit_card`` task across all cards.
 
@@ -1498,6 +1523,12 @@ async def _update_session_card_locked(
     new_event: Event,
     replaced: bool,
 ) -> None:
+    # Recover from a prior false-positive stall_finalize. Wipe the card
+    # binding so this real assistant turn lands on a fresh message
+    # below the stalled stub instead of being silently edited into it.
+    if state.stall_finalized:
+        _recover_from_false_stall(state)
+        await _ensure_seeded(user_id, sess, state)
     # Trigger: long pause → fresh card.
     if _is_stale(state):
         state.msg_id = None
@@ -1619,6 +1650,14 @@ async def finalize_task(bot: Bot, user_id: int, sess: Session, final_text: str)
     ``paginate_events``.
     """
     state = get_card_state(user_id, sess)
+    # Recover from a prior false-positive stall_finalize: wipe the card
+    # binding so this real answer spawns a fresh card below the stalled
+    # stub. Must run before ``_ensure_seeded`` so the seed targets the
+    # cleared events list. NOT triggered by ``maybe_finalize_stalled``'s
+    # own call into ``finalize_task`` — the flag is set only AFTER that
+    # path returns.
+    if state.stall_finalized:
+        _recover_from_false_stall(state)
     # First event after a bot restart: seed JSONL history before
     # appending the final answer so the user sees their context.
     await _ensure_seeded(user_id, sess, state)
@@ -1850,7 +1889,16 @@ def is_card_busy(user_id: int, session_id: str) -> bool:
 # True and this never fires during long thinking / a slow tool. We only
 # trip when the spinner is gone or frozen AND no new renderable event
 # arrived for this long — i.e. the subprocess produced nothing.
+#
+# Two-tier threshold by tail event type. Tools legitimately run for
+# minutes (slow Bash, CHYT, Map-Reduce, network) and Claude routinely
+# spends a comparable amount of time reasoning after the last tool
+# result before emitting the final assistant turn — both produce a
+# silent JSONL tail of ``tool_use``. Pre-textual silence (``text`` /
+# ``thinking`` tail) is rarer and more suspicious because Claude is
+# mid-emit, so the original threshold still applies there.
 STALL_FINALIZE_AFTER_SECONDS = 90.0
+STALL_FINALIZE_TOOL_USE_SECONDS = 300.0
 
 # Note appended to the card when a stall is detected. Card-body strings
 # in this module are not localized (header / "context:" / "goal:" are
@@ -1906,30 +1954,46 @@ async def maybe_finalize_stalled(
     if state.in_menu_view or state.in_kb_mode:
         return False
     # Already finalized — nothing frozen to rescue.
-    if state.events[-1].type in ("final_text", "error"):
+    tail_type = state.events[-1].type
+    if tail_type in ("final_text", "error"):
         return False
     if state.last_event_ts <= 0:
         return False
     when = now if now is not None else time.time()
-    if (when - state.last_event_ts) < STALL_FINALIZE_AFTER_SECONDS:
+    threshold = (
+        STALL_FINALIZE_TOOL_USE_SECONDS
+        if tail_type == "tool_use"
+        else STALL_FINALIZE_AFTER_SECONDS
+    )
+    if (when - state.last_event_ts) < threshold:
         return False
     logger.warning(
-        "stall_finalize user=%d sess=%s wid=%s idle=%.0fs tail=%s",
+        "stall_finalize user=%d sess=%s wid=%s idle=%.0fs tail=%s threshold=%.0fs",
         user_id,
         sess.id,
         sess.window_id,
         when - state.last_event_ts,
-        state.events[-1].type,
+        tail_type,
+        threshold,
         extra={
             "event": "stall_finalize",
             "user_id": user_id,
             "session_id": sess.id,
             "window_id": sess.window_id,
             "idle_seconds": round(when - state.last_event_ts),
-            "tail_type": state.events[-1].type,
+            "tail_type": tail_type,
+            "threshold_seconds": round(threshold),
         },
     )
     await finalize_task(bot, user_id, sess, STALL_NOTE)
+    # Arm the false-positive recovery: if a real assistant turn arrives
+    # after this, the next ``update_session_card`` / ``finalize_task``
+    # spawns a fresh card below the stalled stub instead of silently
+    # editing it. ``finalize_task`` already ran and re-fetched ``state``,
+    # so re-read from ``_cards`` to set the flag on the same instance.
+    post_state = _cards.get((user_id, sess.id))
+    if post_state is not None:
+        post_state.stall_finalized = True
     return True
 
 
diff --git a/tests/ccbot/handlers/test_stall_recovery.py b/tests/ccbot/handlers/test_stall_recovery.py
new file mode 100644
index 00000000..4ee187d3
--- /dev/null
+++ b/tests/ccbot/handlers/test_stall_recovery.py
@@ -0,0 +1,213 @@
+"""Regression tests for stall_finalize false-positive recovery.
+
+When ``maybe_finalize_stalled`` fires too eagerly (e.g. the metrics-debug
+incident on 2026-06-17: ``tail=tool_use idle=96s`` while Claude was
+reasoning toward the final answer), a genuine assistant turn lands
+*after* the STALL_NOTE was already appended to the card. Without the
+recovery path the real reply would be silently edited into a card the
+user has already scrolled past or marked as complete.
+
+The fix: ``maybe_finalize_stalled`` sets ``state.stall_finalized=True``
+after the STALL_NOTE finalize_task lands. Both ``update_session_card``
+and ``finalize_task`` check the flag on entry and wipe the binding via
+``_recover_from_false_stall`` so the next render lands as a fresh
+``_send_card`` below the stub.
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from ccbot.handlers import notifications
+from ccbot.handlers.notifications import (
+    CardState,
+    Event,
+    _card_locks,
+    _cards,
+    _recover_from_false_stall,
+    _repost_intent,
+)
+from ccbot.session_models import Session
+from ccbot.session_monitor import NewMessage
+
+
+@pytest.fixture(autouse=True)
+def _clear_card_state():
+    _cards.clear()
+    _card_locks.clear()
+    _repost_intent.clear()
+    yield
+    _cards.clear()
+    _card_locks.clear()
+    _repost_intent.clear()
+
+
+def _make_sess(sid: str = "s1") -> Session:
+    return Session(
+        id=sid,
+        name="test",
+        window_id="@1",
+        workdir="/tmp",
+        state="active",
+        claude_session_id="uuid-" + sid,
+    )
+
+
+def _stalled_msg(text: str = "real answer at last") -> NewMessage:
+    return NewMessage(
+        session_id="uuid-s1",
+        text=text,
+        is_complete=True,
+        content_type="text",
+        role="assistant",
+        stop_reason="end_turn",
+    )
+
+
+def _seed_stalled_card(user_id: int, sess: Session, *, msg_id: int = 999) -> CardState:
+    """Seed a card in the post-stall_finalize state: STALL_NOTE landed,
+    flag armed, the original tool_use + STALL_NOTE final_text are in
+    events. This mirrors what ``maybe_finalize_stalled`` leaves behind."""
+    now = time.time()
+    state = _cards.setdefault((user_id, sess.id), CardState())
+    state.msg_id = msg_id
+    state.events = [
+        Event(type="user_msg", text="run the analysis", started_at=now - 600),
+        Event(type="tool_use", text="Bash(…)", started_at=now - 500),
+        Event(
+            type="final_text",
+            text=notifications.STALL_NOTE,
+            started_at=now - 100,
+        ),
+    ]
+    state.last_event_ts = now - 100
+    state.stall_finalized = True
+    return state
+
+
+def test_recover_helper_wipes_binding_and_flag():
+    """``_recover_from_false_stall`` clears msg_id, events, last_rendered
+    AND the flag itself; sets is_continuation + clears seed_attempted so
+    the next event re-pulls JSONL context."""
+    state = CardState()
+    state.msg_id = 7177
+    state.events = [Event(type="final_text", text="stub", started_at=time.time())]
+    state.last_rendered = "rendered stub"
+    state.stall_finalized = True
+    state.seed_attempted = True
+    state.current_page_idx = 3
+    state.is_continuation = False
+
+    _recover_from_false_stall(state)
+
+    assert state.msg_id is None
+    assert state.events == []
+    assert state.last_rendered == ""
+    assert state.stall_finalized is False
+    assert state.seed_attempted is False
+    assert state.current_page_idx is None
+    assert state.is_continuation is True
+
+
+@pytest.mark.asyncio
+async def test_update_session_card_recovers_after_stall(monkeypatch):
+    """A genuine assistant turn arriving after a false stall_finalize
+    must spawn a FRESH card (``_send_card``), NOT edit the stalled stub
+    (``_edit_card``). The recovery path is triggered by
+    ``state.stall_finalized=True``."""
+    sess = _make_sess()
+    bot = AsyncMock()
+    sent: list[Any] = []
+    edits: list[Any] = []
+
+    async def fake_send_card(b, uid, s, st, *, text, reply_markup=None):
+        st.msg_id = 5000 + len(sent)
+        sent.append(text)
+
+    async def fake_edit_card(b, uid, st, *, text, reply_markup=None):
+        edits.append(text)
+        return True
+
+    monkeypatch.setattr(notifications, "_send_card", fake_send_card)
+    monkeypatch.setattr(notifications, "_edit_card", fake_edit_card)
+    monkeypatch.setattr(notifications, "_ensure_seeded", AsyncMock(return_value=None))
+    fake_active = MagicMock()
+    fake_active.id = sess.id
+    monkeypatch.setattr(
+        notifications.session_manager,
+        "get_active_session",
+        lambda uid: fake_active,
+    )
+    monkeypatch.setattr(
+        notifications.session_manager,
+        "get_user_settings",
+        lambda uid: {"live_lag": 0},
+    )
+
+    user_id = 42
+    state = _seed_stalled_card(user_id, sess, msg_id=7177)
+    assert state.stall_finalized is True
+    assert state.msg_id == 7177
+
+    await notifications.update_session_card(bot, user_id, sess, _stalled_msg())
+
+    assert len(sent) == 1, f"expected fresh card spawn, got sends={sent} edits={edits}"
+    assert edits == [], "stalled stub must not be edited"
+    assert _cards[(user_id, sess.id)].stall_finalized is False
+
+
+@pytest.mark.asyncio
+async def test_finalize_task_recovers_after_stall(monkeypatch):
+    """A real end-of-turn assistant text arriving after a false stall
+    routes through ``finalize_task`` (not update_session_card). It also
+    must spawn a fresh card below the stub."""
+    sess = _make_sess()
+    bot = AsyncMock()
+    sent: list[Any] = []
+    edits: list[Any] = []
+
+    async def fake_send_card(b, uid, s, st, *, text, reply_markup=None):
+        st.msg_id = 6000 + len(sent)
+        sent.append(text)
+
+    async def fake_edit_card(b, uid, st, *, text, reply_markup=None):
+        edits.append(text)
+        return True
+
+    monkeypatch.setattr(notifications, "_send_card", fake_send_card)
+    monkeypatch.setattr(notifications, "_edit_card", fake_edit_card)
+    monkeypatch.setattr(notifications, "_ensure_seeded", AsyncMock(return_value=None))
+    fake_active = MagicMock()
+    fake_active.id = sess.id
+    monkeypatch.setattr(
+        notifications.session_manager,
+        "get_active_session",
+        lambda uid: fake_active,
+    )
+    monkeypatch.setattr(
+        notifications.session_manager,
+        "get_user_settings",
+        lambda uid: {"live_lag": 0, "card_page_lines": 45},
+    )
+    # ``finalize_task`` calls ``prewarm_pages_cache`` which would hit the
+    # JSONL file system path — stub it to a no-op.
+    fake_prewarm = AsyncMock(return_value=None)
+    monkeypatch.setattr(notifications, "_send_attachments", AsyncMock())
+
+    import ccbot.handlers.history as _history_mod
+
+    monkeypatch.setattr(_history_mod, "prewarm_pages_cache", fake_prewarm)
+
+    user_id = 42
+    state = _seed_stalled_card(user_id, sess, msg_id=7177)
+    assert state.stall_finalized is True
+
+    await notifications.finalize_task(bot, user_id, sess, "the real BT_fin answer")
+
+    assert len(sent) == 1, f"expected fresh card spawn, got sends={sent} edits={edits}"
+    assert edits == [], "stalled stub must not be edited by finalize_task"
+    assert _cards[(user_id, sess.id)].stall_finalized is False
diff --git a/tests/e2e/test_stalled_finalize.py b/tests/e2e/test_stalled_finalize.py
index b1f561e6..9e027999 100644
--- a/tests/e2e/test_stalled_finalize.py
+++ b/tests/e2e/test_stalled_finalize.py
@@ -21,6 +21,7 @@
 from ccbot.handlers import notifications
 from ccbot.handlers.notifications import (
     STALL_FINALIZE_AFTER_SECONDS,
+    STALL_FINALIZE_TOOL_USE_SECONDS,
     STALL_NOTE,
     CardState,
     Event,
@@ -41,11 +42,16 @@
 
 def _seed_frozen_card(*, tail_type: str = "tool_use") -> CardState:
     """Install an active-session card whose tail is non-terminal and whose
-    last event is older than the stall threshold."""
+    last event is older than the per-tail-type stall threshold."""
+    threshold = (
+        STALL_FINALIZE_TOOL_USE_SECONDS
+        if tail_type == "tool_use"
+        else STALL_FINALIZE_AFTER_SECONDS
+    )
     now = time.time()
     state = CardState()
     state.msg_id = 7777
-    state.last_event_ts = now - (STALL_FINALIZE_AFTER_SECONDS + 30)
+    state.last_event_ts = now - (threshold + 30)
     state.last_edit_ts = 0.0
     state.events = [
         Event(type="user_msg", text="do the long thing", started_at=now - 200),
diff --git a/tests/test_stalled_card.py b/tests/test_stalled_card.py
index 8f59e69c..8aeeac2b 100644
--- a/tests/test_stalled_card.py
+++ b/tests/test_stalled_card.py
@@ -26,6 +26,7 @@
 from ccbot.handlers import notifications
 from ccbot.handlers.notifications import (
     STALL_FINALIZE_AFTER_SECONDS,
+    STALL_FINALIZE_TOOL_USE_SECONDS,
     STALL_NOTE,
     CardState,
     Event,
@@ -118,9 +119,16 @@ async def test_idle_nonterminal_tail_finalizes(self, stub_finalize):
     @pytest.mark.asyncio
     async def test_tool_use_tail_finalizes(self, stub_finalize):
         """A tool_use whose result never came is also a stall fingerprint
-        once the pane has been idle long enough."""
+        once the pane has been idle long enough — but the threshold is
+        the longer ``STALL_FINALIZE_TOOL_USE_SECONDS`` because slow tools
+        and post-tool reasoning are legitimately silent for minutes."""
         user_id, sess = 42, _make_sess()
-        _seed_card(user_id, sess, tail_type="tool_use")
+        _seed_card(
+            user_id,
+            sess,
+            tail_type="tool_use",
+            age_seconds=STALL_FINALIZE_TOOL_USE_SECONDS + 30,
+        )
 
         fired = await maybe_finalize_stalled(
             AsyncMock(),
@@ -134,6 +142,73 @@ async def test_tool_use_tail_finalizes(self, stub_finalize):
         assert fired is True
         assert len(stub_finalize) == 1
 
+    @pytest.mark.asyncio
+    async def test_tool_use_within_extended_threshold_no_fire(self, stub_finalize):
+        """A tool_use tail idle for ``STALL_FINALIZE_AFTER_SECONDS`` but
+        under ``STALL_FINALIZE_TOOL_USE_SECONDS`` must NOT finalize — this
+        is the metrics-debug regression: Claude was reasoning ~96 s after
+        the last tool_use before emitting the final answer, which the
+        old single-threshold policy treated as a stall."""
+        user_id, sess = 42, _make_sess()
+        _seed_card(
+            user_id,
+            sess,
+            tail_type="tool_use",
+            age_seconds=STALL_FINALIZE_AFTER_SECONDS + 30,
+        )
+
+        fired = await maybe_finalize_stalled(
+            AsyncMock(),
+            user_id,
+            sess,
+            pane_busy=False,
+            interactive_waiting=False,
+            in_menu=False,
+        )
+
+        assert fired is False
+        assert stub_finalize == []
+
+    @pytest.mark.asyncio
+    async def test_stall_arms_recovery_flag(self, monkeypatch):
+        """After ``finalize_task`` lands the STALL_NOTE, the card state
+        must carry ``stall_finalized=True`` so the next genuine assistant
+        turn spawns a fresh card instead of silently editing the stub."""
+
+        # Real ``finalize_task`` replacement that mirrors the actual
+        # contract: append a ``final_text`` Event so the post-call check
+        # sees a terminal tail (defensive — recovery flag is set after
+        # finalize returns, irrespective of internals).
+        async def _fake_finalize(bot, user_id, sess, final_text):  # type: ignore[no-untyped-def]
+            st = _cards.get((user_id, sess.id))
+            if st is not None:
+                st.events.append(
+                    Event(type="final_text", text=final_text, started_at=time.time())
+                )
+
+        monkeypatch.setattr(notifications, "finalize_task", _fake_finalize)
+
+        user_id, sess = 42, _make_sess()
+        state = _seed_card(
+            user_id,
+            sess,
+            tail_type="tool_use",
+            age_seconds=STALL_FINALIZE_TOOL_USE_SECONDS + 30,
+        )
+        assert state.stall_finalized is False
+
+        fired = await maybe_finalize_stalled(
+            AsyncMock(),
+            user_id,
+            sess,
+            pane_busy=False,
+            interactive_waiting=False,
+            in_menu=False,
+        )
+
+        assert fired is True
+        assert _cards[(user_id, sess.id)].stall_finalized is True
+
 
 # ── Negative: the trigger must stay quiet ─────────────────────────────