From 28e9ff1265316987ab8c777a80fed5e5bbfe29ee Mon Sep 17 00:00:00 2001 From: Nosko Artem Date: Wed, 17 Jun 2026 10:13:25 +0300 Subject: [PATCH] fix(card): adaptive stall_finalize threshold + false-positive recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two regressions on a long-reasoning metrics-debug session: (1) ``STALL_FINALIZE_AFTER_SECONDS=90s`` fired on ``tail=tool_use idle=96s`` while Claude was legitimately thinking toward the final answer; (2) once the STALL_NOTE was appended, the real answer landing later got silently edited into the now-finalized card the user had scrolled past. Fix (1): split the threshold by tail event type. ``tool_use`` tails get ``STALL_FINALIZE_TOOL_USE_SECONDS=300s`` because slow tools and post-tool reasoning are routinely silent for minutes; ``text`` / ``thinking`` tails keep the original 90s because mid-emit silence is genuinely suspicious. Fix (2): ``maybe_finalize_stalled`` now arms ``CardState .stall_finalized=True`` after the STALL_NOTE lands. The next call to ``update_session_card`` or ``finalize_task`` runs ``_recover_from_false_stall`` — wipes msg_id / events / pagination and flips ``is_continuation=True`` so ``_send_card`` spawns a fresh card below the stub instead of clobbering it. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ccbot/handlers/card_model.py | 8 + src/ccbot/handlers/notifications.py | 74 ++++++- tests/ccbot/handlers/test_stall_recovery.py | 213 ++++++++++++++++++++ tests/e2e/test_stalled_finalize.py | 10 +- tests/test_stalled_card.py | 79 +++++++- 5 files changed, 375 insertions(+), 9 deletions(-) create mode 100644 tests/ccbot/handlers/test_stall_recovery.py diff --git a/src/ccbot/handlers/card_model.py b/src/ccbot/handlers/card_model.py index a042a761..fa9c7d1e 100644 --- a/src/ccbot/handlers/card_model.py +++ b/src/ccbot/handlers/card_model.py @@ -200,6 +200,14 @@ class CardState: # transcript still spans many turn-pages. ``/clear`` leaves it True: # that is an intentional wipe-to-zero. seed_attempted: bool = False + # Stall-recovery flag. Set by ``maybe_finalize_stalled`` after it + # appends the STALL_NOTE final_text. If the stall was a false positive + # (a genuine assistant turn arrives after), the next + # ``update_session_card`` / ``finalize_task`` wipes the card binding + # and lets ``_send_card`` spawn a fresh message below the stalled + # stub — so the recovered answer is visible instead of being silently + # edited into a card the user has scrolled past or marked complete. + stall_finalized: bool = False def _trim(s: str, limit: int = 200) -> str: diff --git a/src/ccbot/handlers/notifications.py b/src/ccbot/handlers/notifications.py index b6ae9d55..0047d38b 100644 --- a/src/ccbot/handlers/notifications.py +++ b/src/ccbot/handlers/notifications.py @@ -576,6 +576,31 @@ def reset_card(user_id: int, session_id: str) -> None: _cards.pop((user_id, session_id), None) +def _recover_from_false_stall(state: CardState) -> None: + """Wipe the live-card binding after a false-positive stall_finalize. + + Set when a genuine assistant turn lands AFTER + ``maybe_finalize_stalled`` armed ``state.stall_finalized``. Clears + msg_id / events / pagination so the next render path goes through + ``_send_card`` (fresh message below the stalled stub) rather than + ``_edit_card`` (silent edit of the now-finalized card). The stalled + stub stays in chat history with its STALL_NOTE — we don't rewrite + it; the recovery message appears as a fresh card with + ``is_continuation=True`` so the header carries the ``…continued`` + marker. + """ + if state.pending_edit is not None and not state.pending_edit.done(): + state.pending_edit.cancel() + state.pending_edit = None + state.msg_id = None + state.events = [] + state.current_page_idx = None + state.is_continuation = True + state.last_rendered = "" + state.seed_attempted = False + state.stall_finalized = False + + async def cancel_pending_card_edits(timeout: float = 2.0) -> None: """Cancel + drain every deferred ``_edit_card`` task across all cards. @@ -1498,6 +1523,12 @@ async def _update_session_card_locked( new_event: Event, replaced: bool, ) -> None: + # Recover from a prior false-positive stall_finalize. Wipe the card + # binding so this real assistant turn lands on a fresh message + # below the stalled stub instead of being silently edited into it. + if state.stall_finalized: + _recover_from_false_stall(state) + await _ensure_seeded(user_id, sess, state) # Trigger: long pause → fresh card. if _is_stale(state): state.msg_id = None @@ -1619,6 +1650,14 @@ async def finalize_task(bot: Bot, user_id: int, sess: Session, final_text: str) ``paginate_events``. """ state = get_card_state(user_id, sess) + # Recover from a prior false-positive stall_finalize: wipe the card + # binding so this real answer spawns a fresh card below the stalled + # stub. Must run before ``_ensure_seeded`` so the seed targets the + # cleared events list. NOT triggered by ``maybe_finalize_stalled``'s + # own call into ``finalize_task`` — the flag is set only AFTER that + # path returns. + if state.stall_finalized: + _recover_from_false_stall(state) # First event after a bot restart: seed JSONL history before # appending the final answer so the user sees their context. await _ensure_seeded(user_id, sess, state) @@ -1850,7 +1889,16 @@ def is_card_busy(user_id: int, session_id: str) -> bool: # True and this never fires during long thinking / a slow tool. We only # trip when the spinner is gone or frozen AND no new renderable event # arrived for this long — i.e. the subprocess produced nothing. +# +# Two-tier threshold by tail event type. Tools legitimately run for +# minutes (slow Bash, CHYT, Map-Reduce, network) and Claude routinely +# spends a comparable amount of time reasoning after the last tool +# result before emitting the final assistant turn — both produce a +# silent JSONL tail of ``tool_use``. Pre-textual silence (``text`` / +# ``thinking`` tail) is rarer and more suspicious because Claude is +# mid-emit, so the original threshold still applies there. STALL_FINALIZE_AFTER_SECONDS = 90.0 +STALL_FINALIZE_TOOL_USE_SECONDS = 300.0 # Note appended to the card when a stall is detected. Card-body strings # in this module are not localized (header / "context:" / "goal:" are @@ -1906,30 +1954,46 @@ async def maybe_finalize_stalled( if state.in_menu_view or state.in_kb_mode: return False # Already finalized — nothing frozen to rescue. - if state.events[-1].type in ("final_text", "error"): + tail_type = state.events[-1].type + if tail_type in ("final_text", "error"): return False if state.last_event_ts <= 0: return False when = now if now is not None else time.time() - if (when - state.last_event_ts) < STALL_FINALIZE_AFTER_SECONDS: + threshold = ( + STALL_FINALIZE_TOOL_USE_SECONDS + if tail_type == "tool_use" + else STALL_FINALIZE_AFTER_SECONDS + ) + if (when - state.last_event_ts) < threshold: return False logger.warning( - "stall_finalize user=%d sess=%s wid=%s idle=%.0fs tail=%s", + "stall_finalize user=%d sess=%s wid=%s idle=%.0fs tail=%s threshold=%.0fs", user_id, sess.id, sess.window_id, when - state.last_event_ts, - state.events[-1].type, + tail_type, + threshold, extra={ "event": "stall_finalize", "user_id": user_id, "session_id": sess.id, "window_id": sess.window_id, "idle_seconds": round(when - state.last_event_ts), - "tail_type": state.events[-1].type, + "tail_type": tail_type, + "threshold_seconds": round(threshold), }, ) await finalize_task(bot, user_id, sess, STALL_NOTE) + # Arm the false-positive recovery: if a real assistant turn arrives + # after this, the next ``update_session_card`` / ``finalize_task`` + # spawns a fresh card below the stalled stub instead of silently + # editing it. ``finalize_task`` already ran and re-fetched ``state``, + # so re-read from ``_cards`` to set the flag on the same instance. + post_state = _cards.get((user_id, sess.id)) + if post_state is not None: + post_state.stall_finalized = True return True diff --git a/tests/ccbot/handlers/test_stall_recovery.py b/tests/ccbot/handlers/test_stall_recovery.py new file mode 100644 index 00000000..4ee187d3 --- /dev/null +++ b/tests/ccbot/handlers/test_stall_recovery.py @@ -0,0 +1,213 @@ +"""Regression tests for stall_finalize false-positive recovery. + +When ``maybe_finalize_stalled`` fires too eagerly (e.g. the metrics-debug +incident on 2026-06-17: ``tail=tool_use idle=96s`` while Claude was +reasoning toward the final answer), a genuine assistant turn lands +*after* the STALL_NOTE was already appended to the card. Without the +recovery path the real reply would be silently edited into a card the +user has already scrolled past or marked as complete. + +The fix: ``maybe_finalize_stalled`` sets ``state.stall_finalized=True`` +after the STALL_NOTE finalize_task lands. Both ``update_session_card`` +and ``finalize_task`` check the flag on entry and wipe the binding via +``_recover_from_false_stall`` so the next render lands as a fresh +``_send_card`` below the stub. +""" + +from __future__ import annotations + +import time +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from ccbot.handlers import notifications +from ccbot.handlers.notifications import ( + CardState, + Event, + _card_locks, + _cards, + _recover_from_false_stall, + _repost_intent, +) +from ccbot.session_models import Session +from ccbot.session_monitor import NewMessage + + +@pytest.fixture(autouse=True) +def _clear_card_state(): + _cards.clear() + _card_locks.clear() + _repost_intent.clear() + yield + _cards.clear() + _card_locks.clear() + _repost_intent.clear() + + +def _make_sess(sid: str = "s1") -> Session: + return Session( + id=sid, + name="test", + window_id="@1", + workdir="/tmp", + state="active", + claude_session_id="uuid-" + sid, + ) + + +def _stalled_msg(text: str = "real answer at last") -> NewMessage: + return NewMessage( + session_id="uuid-s1", + text=text, + is_complete=True, + content_type="text", + role="assistant", + stop_reason="end_turn", + ) + + +def _seed_stalled_card(user_id: int, sess: Session, *, msg_id: int = 999) -> CardState: + """Seed a card in the post-stall_finalize state: STALL_NOTE landed, + flag armed, the original tool_use + STALL_NOTE final_text are in + events. This mirrors what ``maybe_finalize_stalled`` leaves behind.""" + now = time.time() + state = _cards.setdefault((user_id, sess.id), CardState()) + state.msg_id = msg_id + state.events = [ + Event(type="user_msg", text="run the analysis", started_at=now - 600), + Event(type="tool_use", text="Bash(…)", started_at=now - 500), + Event( + type="final_text", + text=notifications.STALL_NOTE, + started_at=now - 100, + ), + ] + state.last_event_ts = now - 100 + state.stall_finalized = True + return state + + +def test_recover_helper_wipes_binding_and_flag(): + """``_recover_from_false_stall`` clears msg_id, events, last_rendered + AND the flag itself; sets is_continuation + clears seed_attempted so + the next event re-pulls JSONL context.""" + state = CardState() + state.msg_id = 7177 + state.events = [Event(type="final_text", text="stub", started_at=time.time())] + state.last_rendered = "rendered stub" + state.stall_finalized = True + state.seed_attempted = True + state.current_page_idx = 3 + state.is_continuation = False + + _recover_from_false_stall(state) + + assert state.msg_id is None + assert state.events == [] + assert state.last_rendered == "" + assert state.stall_finalized is False + assert state.seed_attempted is False + assert state.current_page_idx is None + assert state.is_continuation is True + + +@pytest.mark.asyncio +async def test_update_session_card_recovers_after_stall(monkeypatch): + """A genuine assistant turn arriving after a false stall_finalize + must spawn a FRESH card (``_send_card``), NOT edit the stalled stub + (``_edit_card``). The recovery path is triggered by + ``state.stall_finalized=True``.""" + sess = _make_sess() + bot = AsyncMock() + sent: list[Any] = [] + edits: list[Any] = [] + + async def fake_send_card(b, uid, s, st, *, text, reply_markup=None): + st.msg_id = 5000 + len(sent) + sent.append(text) + + async def fake_edit_card(b, uid, st, *, text, reply_markup=None): + edits.append(text) + return True + + monkeypatch.setattr(notifications, "_send_card", fake_send_card) + monkeypatch.setattr(notifications, "_edit_card", fake_edit_card) + monkeypatch.setattr(notifications, "_ensure_seeded", AsyncMock(return_value=None)) + fake_active = MagicMock() + fake_active.id = sess.id + monkeypatch.setattr( + notifications.session_manager, + "get_active_session", + lambda uid: fake_active, + ) + monkeypatch.setattr( + notifications.session_manager, + "get_user_settings", + lambda uid: {"live_lag": 0}, + ) + + user_id = 42 + state = _seed_stalled_card(user_id, sess, msg_id=7177) + assert state.stall_finalized is True + assert state.msg_id == 7177 + + await notifications.update_session_card(bot, user_id, sess, _stalled_msg()) + + assert len(sent) == 1, f"expected fresh card spawn, got sends={sent} edits={edits}" + assert edits == [], "stalled stub must not be edited" + assert _cards[(user_id, sess.id)].stall_finalized is False + + +@pytest.mark.asyncio +async def test_finalize_task_recovers_after_stall(monkeypatch): + """A real end-of-turn assistant text arriving after a false stall + routes through ``finalize_task`` (not update_session_card). It also + must spawn a fresh card below the stub.""" + sess = _make_sess() + bot = AsyncMock() + sent: list[Any] = [] + edits: list[Any] = [] + + async def fake_send_card(b, uid, s, st, *, text, reply_markup=None): + st.msg_id = 6000 + len(sent) + sent.append(text) + + async def fake_edit_card(b, uid, st, *, text, reply_markup=None): + edits.append(text) + return True + + monkeypatch.setattr(notifications, "_send_card", fake_send_card) + monkeypatch.setattr(notifications, "_edit_card", fake_edit_card) + monkeypatch.setattr(notifications, "_ensure_seeded", AsyncMock(return_value=None)) + fake_active = MagicMock() + fake_active.id = sess.id + monkeypatch.setattr( + notifications.session_manager, + "get_active_session", + lambda uid: fake_active, + ) + monkeypatch.setattr( + notifications.session_manager, + "get_user_settings", + lambda uid: {"live_lag": 0, "card_page_lines": 45}, + ) + # ``finalize_task`` calls ``prewarm_pages_cache`` which would hit the + # JSONL file system path — stub it to a no-op. + fake_prewarm = AsyncMock(return_value=None) + monkeypatch.setattr(notifications, "_send_attachments", AsyncMock()) + + import ccbot.handlers.history as _history_mod + + monkeypatch.setattr(_history_mod, "prewarm_pages_cache", fake_prewarm) + + user_id = 42 + state = _seed_stalled_card(user_id, sess, msg_id=7177) + assert state.stall_finalized is True + + await notifications.finalize_task(bot, user_id, sess, "the real BT_fin answer") + + assert len(sent) == 1, f"expected fresh card spawn, got sends={sent} edits={edits}" + assert edits == [], "stalled stub must not be edited by finalize_task" + assert _cards[(user_id, sess.id)].stall_finalized is False diff --git a/tests/e2e/test_stalled_finalize.py b/tests/e2e/test_stalled_finalize.py index b1f561e6..9e027999 100644 --- a/tests/e2e/test_stalled_finalize.py +++ b/tests/e2e/test_stalled_finalize.py @@ -21,6 +21,7 @@ from ccbot.handlers import notifications from ccbot.handlers.notifications import ( STALL_FINALIZE_AFTER_SECONDS, + STALL_FINALIZE_TOOL_USE_SECONDS, STALL_NOTE, CardState, Event, @@ -41,11 +42,16 @@ def _seed_frozen_card(*, tail_type: str = "tool_use") -> CardState: """Install an active-session card whose tail is non-terminal and whose - last event is older than the stall threshold.""" + last event is older than the per-tail-type stall threshold.""" + threshold = ( + STALL_FINALIZE_TOOL_USE_SECONDS + if tail_type == "tool_use" + else STALL_FINALIZE_AFTER_SECONDS + ) now = time.time() state = CardState() state.msg_id = 7777 - state.last_event_ts = now - (STALL_FINALIZE_AFTER_SECONDS + 30) + state.last_event_ts = now - (threshold + 30) state.last_edit_ts = 0.0 state.events = [ Event(type="user_msg", text="do the long thing", started_at=now - 200), diff --git a/tests/test_stalled_card.py b/tests/test_stalled_card.py index 8f59e69c..8aeeac2b 100644 --- a/tests/test_stalled_card.py +++ b/tests/test_stalled_card.py @@ -26,6 +26,7 @@ from ccbot.handlers import notifications from ccbot.handlers.notifications import ( STALL_FINALIZE_AFTER_SECONDS, + STALL_FINALIZE_TOOL_USE_SECONDS, STALL_NOTE, CardState, Event, @@ -118,9 +119,16 @@ async def test_idle_nonterminal_tail_finalizes(self, stub_finalize): @pytest.mark.asyncio async def test_tool_use_tail_finalizes(self, stub_finalize): """A tool_use whose result never came is also a stall fingerprint - once the pane has been idle long enough.""" + once the pane has been idle long enough — but the threshold is + the longer ``STALL_FINALIZE_TOOL_USE_SECONDS`` because slow tools + and post-tool reasoning are legitimately silent for minutes.""" user_id, sess = 42, _make_sess() - _seed_card(user_id, sess, tail_type="tool_use") + _seed_card( + user_id, + sess, + tail_type="tool_use", + age_seconds=STALL_FINALIZE_TOOL_USE_SECONDS + 30, + ) fired = await maybe_finalize_stalled( AsyncMock(), @@ -134,6 +142,73 @@ async def test_tool_use_tail_finalizes(self, stub_finalize): assert fired is True assert len(stub_finalize) == 1 + @pytest.mark.asyncio + async def test_tool_use_within_extended_threshold_no_fire(self, stub_finalize): + """A tool_use tail idle for ``STALL_FINALIZE_AFTER_SECONDS`` but + under ``STALL_FINALIZE_TOOL_USE_SECONDS`` must NOT finalize — this + is the metrics-debug regression: Claude was reasoning ~96 s after + the last tool_use before emitting the final answer, which the + old single-threshold policy treated as a stall.""" + user_id, sess = 42, _make_sess() + _seed_card( + user_id, + sess, + tail_type="tool_use", + age_seconds=STALL_FINALIZE_AFTER_SECONDS + 30, + ) + + fired = await maybe_finalize_stalled( + AsyncMock(), + user_id, + sess, + pane_busy=False, + interactive_waiting=False, + in_menu=False, + ) + + assert fired is False + assert stub_finalize == [] + + @pytest.mark.asyncio + async def test_stall_arms_recovery_flag(self, monkeypatch): + """After ``finalize_task`` lands the STALL_NOTE, the card state + must carry ``stall_finalized=True`` so the next genuine assistant + turn spawns a fresh card instead of silently editing the stub.""" + + # Real ``finalize_task`` replacement that mirrors the actual + # contract: append a ``final_text`` Event so the post-call check + # sees a terminal tail (defensive — recovery flag is set after + # finalize returns, irrespective of internals). + async def _fake_finalize(bot, user_id, sess, final_text): # type: ignore[no-untyped-def] + st = _cards.get((user_id, sess.id)) + if st is not None: + st.events.append( + Event(type="final_text", text=final_text, started_at=time.time()) + ) + + monkeypatch.setattr(notifications, "finalize_task", _fake_finalize) + + user_id, sess = 42, _make_sess() + state = _seed_card( + user_id, + sess, + tail_type="tool_use", + age_seconds=STALL_FINALIZE_TOOL_USE_SECONDS + 30, + ) + assert state.stall_finalized is False + + fired = await maybe_finalize_stalled( + AsyncMock(), + user_id, + sess, + pane_busy=False, + interactive_waiting=False, + in_menu=False, + ) + + assert fired is True + assert _cards[(user_id, sess.id)].stall_finalized is True + # ── Negative: the trigger must stay quiet ─────────────────────────────