From aef04cda3ecf9dbbf26cdb8d6f5b2f2543144452 Mon Sep 17 00:00:00 2001 From: Jean Brazeau Date: Tue, 9 Jun 2026 11:32:48 -0700 Subject: [PATCH] fix(web): warm lancedb at startup to fix first-run MCP bootstrap_failure On a cold server the first lazy `import lancedb` (koan/memory/retrieval/index.py) runs synchronously on the asyncio event loop during the first run. The seconds-long native import blocks the loop in the exact window where a freshly-spawned agent's CLI fetches tools/list from koan's MCP server. The response never arrives, so the agent starts with no koan tools, never calls koan_complete_step, and the run dies with bootstrap_failure ("Process exited before first koan_complete_step call"). Subsequent runs were fine because lancedb was already warm. Warm koan.memory.retrieval.index in the server lifespan, off-loop via asyncio.to_thread, before the server accepts any run. Best-effort: a warmup failure is logged at debug and never blocks startup. Co-Authored-By: Claude Opus 4.8 (1M context) --- koan/web/app.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/koan/web/app.py b/koan/web/app.py index 34da5f9..bd9bca9 100644 --- a/koan/web/app.py +++ b/koan/web/app.py @@ -5,6 +5,7 @@ from __future__ import annotations import asyncio +import importlib import json import os import shutil @@ -1774,6 +1775,25 @@ async def lifespan(app): await _refresh_probe_state(app_state, broadcast=False) _push_initial_config_events(app_state) + # Warm the memory-retrieval stack before we accept any run. Its module + # does a top-level `import lancedb` -- a heavy, fork-unsafe native + # extension that takes ~seconds to load the first time. On a cold server + # that import otherwise fires lazily on the event loop during the first + # run, blocking the loop for the duration. 
That stall lands squarely in + # the window where a just-spawned agent's CLI fetches tools/list from our + # MCP server, so the response never arrives, the agent starts with no + # koan tools, never calls koan_complete_step, and the run dies with + # bootstrap_failure. Importing it here, off-loop and before the server is + # ready, closes that race (subsequent runs were always fine because + # lancedb was warm by then). Best-effort: a warmup failure must not stop + # the server from starting. + try: + await asyncio.to_thread( + importlib.import_module, "koan.memory.retrieval.index" + ) + except Exception: + log.debug("memory-retrieval warmup skipped", exc_info=True) + # Open browser once after server is listening if app_state.server.open_browser: app_state.server.open_browser = False # one-shot guard