From 8e98d09f02e04c0e27072d9a4e0191fb7dbbe83c Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Thu, 11 Jun 2026 20:26:12 +0200 Subject: [PATCH 01/21] feat(backend): context-window resolution, token estimation & chat compaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds proactive + reactive context-window management so chats no longer hard-fail when history exceeds a model's window (docs/adr/0009). - Context-window resolution (context-window.ts + litellm-registry.ts): manual override → provider API → vendored litellm registry → conservative default, cached with TTL and evicted on modelMeta change. - Single token estimator over a neutral CountUnit structure (token-estimate.ts), shared by Tier 1 (UIMessages) and Tier 2 (ModelMessages). - Tier 1 cross-turn compaction (compaction.ts): prune-then-summarize behind a single versioned CAS watermark writer; hysteresis trigger/target ratios. - Tier 2 in-turn compaction via prepareStep; sub-agent wiring threads the window. - Recovery middleware (recovery.ts): detect provider context-overflow, trim and retry once; always on, independent of the proactive kill switch. - agent-runner wraps the model with recovery, threads RV1 prior-messages for C4 edit-detection, and stamps §H/§I run stats onto the assistant message. - Schema: additive nullable provider.modelMeta + chat compaction state + per-agent compaction config; migration 0046. RV2 workspace-scope check on chat submit; POST /:chatId/compact for on-demand compaction. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../drizzle/0046_context_compaction.sql | 11 + apps/backend/drizzle/meta/0046_snapshot.json | 4359 +++++++++++++++++ apps/backend/drizzle/meta/_journal.json | 7 + apps/backend/src/db/schema.ts | 26 + apps/backend/src/routes/chat.test.ts | 87 +- apps/backend/src/routes/chat.ts | 64 +- apps/backend/src/routes/org-provider.ts | 5 + apps/backend/src/routes/provider.ts | 45 + apps/backend/src/runs/agent-runner.test.ts | 473 +- apps/backend/src/runs/agent-runner.ts | 270 +- apps/backend/src/runs/compaction.test.ts | 859 ++++ apps/backend/src/runs/compaction.ts | 1102 +++++ apps/backend/src/runs/context-window.test.ts | 330 ++ apps/backend/src/runs/context-window.ts | 466 ++ apps/backend/src/runs/litellm-registry.ts | 347 ++ apps/backend/src/runs/recovery.test.ts | 364 ++ apps/backend/src/runs/recovery.ts | 229 + apps/backend/src/runs/token-estimate.test.ts | 371 ++ apps/backend/src/runs/token-estimate.ts | 518 ++ apps/backend/src/runs/types.ts | 4 +- apps/backend/src/services/chat-execution.ts | 570 ++- apps/backend/src/tools/sub-agent.test.ts | 58 +- apps/backend/src/tools/sub-agent.ts | 14 +- docs/adr/0009-context-compaction.md | 136 + packages/schemas/index.test.ts | 143 + packages/schemas/index.ts | 162 +- 26 files changed, 10731 insertions(+), 289 deletions(-) create mode 100644 apps/backend/drizzle/0046_context_compaction.sql create mode 100644 apps/backend/drizzle/meta/0046_snapshot.json create mode 100644 apps/backend/src/runs/compaction.test.ts create mode 100644 apps/backend/src/runs/compaction.ts create mode 100644 apps/backend/src/runs/context-window.test.ts create mode 100644 apps/backend/src/runs/context-window.ts create mode 100644 apps/backend/src/runs/litellm-registry.ts create mode 100644 apps/backend/src/runs/recovery.test.ts create mode 100644 apps/backend/src/runs/recovery.ts create mode 100644 apps/backend/src/runs/token-estimate.test.ts create mode 100644 apps/backend/src/runs/token-estimate.ts 
create mode 100644 docs/adr/0009-context-compaction.md diff --git a/apps/backend/drizzle/0046_context_compaction.sql b/apps/backend/drizzle/0046_context_compaction.sql new file mode 100644 index 00000000..e526bc0a --- /dev/null +++ b/apps/backend/drizzle/0046_context_compaction.sql @@ -0,0 +1,11 @@ +ALTER TABLE "agent" ADD COLUMN "compaction_enabled" boolean;--> statement-breakpoint +ALTER TABLE "agent" ADD COLUMN "trigger_ratio" real;--> statement-breakpoint +ALTER TABLE "agent" ADD COLUMN "target_ratio" real;--> statement-breakpoint +ALTER TABLE "agent" ADD COLUMN "reserve_ratio" real;--> statement-breakpoint +ALTER TABLE "agent" ADD COLUMN "keep_recent_messages" integer;--> statement-breakpoint +ALTER TABLE "agent" ADD COLUMN "min_prunable_chars" integer;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "context_summary" text;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "summary_watermark" text;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "compaction_dirty" boolean DEFAULT false NOT NULL;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "version" integer DEFAULT 0 NOT NULL;--> statement-breakpoint +ALTER TABLE "provider" ADD COLUMN "model_meta" jsonb; \ No newline at end of file diff --git a/apps/backend/drizzle/meta/0046_snapshot.json b/apps/backend/drizzle/meta/0046_snapshot.json new file mode 100644 index 00000000..78402286 --- /dev/null +++ b/apps/backend/drizzle/meta/0046_snapshot.json @@ -0,0 +1,4359 @@ +{ + "id": "c302529b-3427-4f45-a87d-6615109ab2eb", + "prevId": "668db7b6-9bad-46e6-b6bc-2533fce5ce32", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.agent": { + "name": "agent", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + 
"notNull": false + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "system_prompt": { + "name": "system_prompt", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "model_id": { + "name": "model_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "max_steps": { + "name": "max_steps", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "temperature": { + "name": "temperature", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_p": { + "name": "top_p", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_k": { + "name": "top_k", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "seed": { + "name": "seed", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "presence_penalty": { + "name": "presence_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "frequency_penalty": { + "name": "frequency_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "compaction_enabled": { + "name": "compaction_enabled", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "trigger_ratio": { + "name": "trigger_ratio", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "target_ratio": { + "name": "target_ratio", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "reserve_ratio": { + "name": "reserve_ratio", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "keep_recent_messages": { + "name": "keep_recent_messages", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "min_prunable_chars": { + "name": "min_prunable_chars", + "type": "integer", + "primaryKey": false, + 
"notNull": false + }, + "tool_set_ids": { + "name": "tool_set_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "skill_ids": { + "name": "skill_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "sub_agent_ids": { + "name": "sub_agent_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "input_placeholder": { + "name": "input_placeholder", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "avatar_key": { + "name": "avatar_key", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_agent_workspace_id": { + "name": "idx_agent_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_agent_organization_id": { + "name": "idx_agent_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_agent_provider_id": { + "name": "idx_agent_provider_id", + "columns": [ + { + "expression": "provider_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "agent_organization_id_organization_id_fk": { + "name": "agent_organization_id_organization_id_fk", + "tableFrom": "agent", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + 
"columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "agent_workspace_id_workspace_id_fk": { + "name": "agent_workspace_id_workspace_id_fk", + "tableFrom": "agent", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "agent_provider_id_provider_id_fk": { + "name": "agent_provider_id_provider_id_fk", + "tableFrom": "agent", + "tableTo": "provider", + "columnsFrom": [ + "provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_agent_name_org": { + "name": "unique_agent_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.attachment": { + "name": "attachment", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_type": { + "name": "resource_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_id": { + "name": "resource_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_attachment_workspace": { + "name": "idx_attachment_workspace", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "resource_type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_attachment_resource": { + "name": "idx_attachment_resource", + "columns": [ 
+ { + "expression": "resource_type", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "resource_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "attachment_workspace_id_workspace_id_fk": { + "name": "attachment_workspace_id_workspace_id_fk", + "tableFrom": "attachment", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_attachment": { + "name": "unique_attachment", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "resource_type", + "resource_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.blueprint": { + "name": "blueprint", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "task_model_provider_id": { + "name": "task_model_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_extraction_provider_id": { + "name": "memory_extraction_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_embedding_provider_id": { + "name": "memory_embedding_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "context": { + "name": "context", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": 
true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_blueprint_organization_id": { + "name": "idx_blueprint_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "blueprint_organization_id_organization_id_fk": { + "name": "blueprint_organization_id_organization_id_fk", + "tableFrom": "blueprint", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "blueprint_task_model_provider_id_provider_id_fk": { + "name": "blueprint_task_model_provider_id_provider_id_fk", + "tableFrom": "blueprint", + "tableTo": "provider", + "columnsFrom": [ + "task_model_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "blueprint_memory_extraction_provider_id_provider_id_fk": { + "name": "blueprint_memory_extraction_provider_id_provider_id_fk", + "tableFrom": "blueprint", + "tableTo": "provider", + "columnsFrom": [ + "memory_extraction_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "blueprint_memory_embedding_provider_id_provider_id_fk": { + "name": "blueprint_memory_embedding_provider_id_provider_id_fk", + "tableFrom": "blueprint", + "tableTo": "provider", + "columnsFrom": [ + "memory_embedding_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_blueprint_name_org": { + "name": "unique_blueprint_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + } + }, + "policies": {}, + 
"checkConstraints": {}, + "isRLSEnabled": false + }, + "public.blueprint_item": { + "name": "blueprint_item", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "blueprint_id": { + "name": "blueprint_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_type": { + "name": "resource_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_id": { + "name": "resource_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_blueprint_item_blueprint": { + "name": "idx_blueprint_item_blueprint", + "columns": [ + { + "expression": "blueprint_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_blueprint_item_resource": { + "name": "idx_blueprint_item_resource", + "columns": [ + { + "expression": "resource_type", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "resource_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "blueprint_item_blueprint_id_blueprint_id_fk": { + "name": "blueprint_item_blueprint_id_blueprint_id_fk", + "tableFrom": "blueprint_item", + "tableTo": "blueprint", + "columnsFrom": [ + "blueprint_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_blueprint_item": { + "name": "unique_blueprint_item", + "nullsNotDistinct": false, + "columns": [ + "blueprint_id", + "resource_type", + "resource_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + 
"public.chat": { + "name": "chat", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "messages": { + "name": "messages", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'succeeded'" + }, + "is_pinned": { + "name": "is_pinned", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "tags": { + "name": "tags", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "agent_id": { + "name": "agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "model_id": { + "name": "model_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "system_prompt": { + "name": "system_prompt", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "temperature": { + "name": "temperature", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_p": { + "name": "top_p", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_k": { + "name": "top_k", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "seed": { + "name": "seed", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "presence_penalty": { + "name": "presence_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "frequency_penalty": { + "name": "frequency_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "context_summary": { + "name": "context_summary", + "type": "text", + "primaryKey": 
false, + "notNull": false + }, + "summary_watermark": { + "name": "summary_watermark", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "compaction_dirty": { + "name": "compaction_dirty", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "version": { + "name": "version", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "last_memory_processed_at": { + "name": "last_memory_processed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "memory_extraction_status": { + "name": "memory_extraction_status", + "type": "text", + "primaryKey": false, + "notNull": false, + "default": "'pending'" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_chat_workspace_id": { + "name": "idx_chat_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_chat_tags": { + "name": "idx_chat_tags", + "columns": [ + { + "expression": "tags", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_chat_memory_processing": { + "name": "idx_chat_memory_processing", + "columns": [ + { + "expression": "memory_extraction_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "last_memory_processed_at", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "updated_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + 
} + }, + "foreignKeys": { + "chat_workspace_id_workspace_id_fk": { + "name": "chat_workspace_id_workspace_id_fk", + "tableFrom": "chat", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.context": { + "name": "context", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "content": { + "name": "content", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_context_user_id": { + "name": "idx_context_user_id", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_context_workspace_id": { + "name": "idx_context_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "context_user_id_user_id_fk": { + "name": "context_user_id_user_id_fk", + "tableFrom": "context", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + 
"context_workspace_id_workspace_id_fk": { + "name": "context_workspace_id_workspace_id_fk", + "tableFrom": "context", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_context_user_workspace": { + "name": "unique_context_user_workspace", + "nullsNotDistinct": false, + "columns": [ + "user_id", + "workspace_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.dashboard": { + "name": "dashboard", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "desktop_layout": { + "name": "desktop_layout", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "mobile_layout": { + "name": "mobile_layout", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_dashboard_workspace_id": { + "name": "idx_dashboard_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "uq_dashboard_workspace_name": { + "name": "uq_dashboard_workspace_name", + "columns": [ + { + 
"expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "dashboard_workspace_id_workspace_id_fk": { + "name": "dashboard_workspace_id_workspace_id_fk", + "tableFrom": "dashboard", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.invitation": { + "name": "invitation", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "invited_by": { + "name": "invited_by", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "workspace_name": { + "name": "workspace_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_invitation_email": { + "name": "idx_invitation_email", + "columns": [ + { + "expression": "email", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_invitation_org_id": { + "name": 
"idx_invitation_org_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "invitation_organization_id_organization_id_fk": { + "name": "invitation_organization_id_organization_id_fk", + "tableFrom": "invitation", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "invitation_invited_by_user_id_fk": { + "name": "invitation_invited_by_user_id_fk", + "tableFrom": "invitation", + "tableTo": "user", + "columnsFrom": [ + "invited_by" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_invitation_org_email": { + "name": "unique_invitation_org_email", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.invitation_blueprint": { + "name": "invitation_blueprint", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "invitation_id": { + "name": "invitation_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "blueprint_id": { + "name": "blueprint_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "position": { + "name": "position", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_invitation_blueprint_invitation": { + "name": "idx_invitation_blueprint_invitation", + "columns": [ + { + "expression": "invitation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + 
"isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_invitation_blueprint_blueprint": { + "name": "idx_invitation_blueprint_blueprint", + "columns": [ + { + "expression": "blueprint_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "invitation_blueprint_invitation_id_invitation_id_fk": { + "name": "invitation_blueprint_invitation_id_invitation_id_fk", + "tableFrom": "invitation_blueprint", + "tableTo": "invitation", + "columnsFrom": [ + "invitation_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "invitation_blueprint_blueprint_id_blueprint_id_fk": { + "name": "invitation_blueprint_blueprint_id_blueprint_id_fk", + "tableFrom": "invitation_blueprint", + "tableTo": "blueprint", + "columnsFrom": [ + "blueprint_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_invitation_blueprint": { + "name": "unique_invitation_blueprint", + "nullsNotDistinct": false, + "columns": [ + "invitation_id", + "blueprint_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_board": { + "name": "kanban_board", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "labels": { + "name": "labels", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "created_at": { + "name": "created_at", + "type": 
"timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_board_workspace_id": { + "name": "idx_kanban_board_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_board_workspace_id_workspace_id_fk": { + "name": "kanban_board_workspace_id_workspace_id_fk", + "tableFrom": "kanban_board", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_kanban_board_name_workspace": { + "name": "unique_kanban_board_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_card": { + "name": "kanban_card", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "column_id": { + "name": "column_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "label_ids": { + "name": "label_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "assignees": { + "name": "assignees", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "due_date": { + "name": "due_date", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "priority": { + "name": 
"priority", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'none'" + }, + "position": { + "name": "position", + "type": "real", + "primaryKey": false, + "notNull": true + }, + "created_by_user_id": { + "name": "created_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_by_agent_id": { + "name": "created_by_agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "last_edited_by_user_id": { + "name": "last_edited_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "last_edited_by_agent_id": { + "name": "last_edited_by_agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_card_column_id": { + "name": "idx_kanban_card_column_id", + "columns": [ + { + "expression": "column_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_kanban_card_label_ids": { + "name": "idx_kanban_card_label_ids", + "columns": [ + { + "expression": "label_ids", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_kanban_card_assignees": { + "name": "idx_kanban_card_assignees", + "columns": [ + { + "expression": "assignees", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_kanban_card_due_date": { + "name": "idx_kanban_card_due_date", + "columns": [ + { + "expression": "due_date", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + 
"isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_kanban_card_priority": { + "name": "idx_kanban_card_priority", + "columns": [ + { + "expression": "priority", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_kanban_card_column_position": { + "name": "idx_kanban_card_column_position", + "columns": [ + { + "expression": "column_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "position", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_card_column_id_kanban_column_id_fk": { + "name": "kanban_card_column_id_kanban_column_id_fk", + "tableFrom": "kanban_card", + "tableTo": "kanban_column", + "columnsFrom": [ + "column_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "kanban_card_created_by_user_id_user_id_fk": { + "name": "kanban_card_created_by_user_id_user_id_fk", + "tableFrom": "kanban_card", + "tableTo": "user", + "columnsFrom": [ + "created_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_created_by_agent_id_agent_id_fk": { + "name": "kanban_card_created_by_agent_id_agent_id_fk", + "tableFrom": "kanban_card", + "tableTo": "agent", + "columnsFrom": [ + "created_by_agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_last_edited_by_user_id_user_id_fk": { + "name": "kanban_card_last_edited_by_user_id_user_id_fk", + "tableFrom": "kanban_card", + "tableTo": "user", + "columnsFrom": [ + "last_edited_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_last_edited_by_agent_id_agent_id_fk": { + "name": 
"kanban_card_last_edited_by_agent_id_agent_id_fk", + "tableFrom": "kanban_card", + "tableTo": "agent", + "columnsFrom": [ + "last_edited_by_agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_card_comment": { + "name": "kanban_card_comment", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "card_id": { + "name": "card_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_by_user_id": { + "name": "created_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_by_agent_id": { + "name": "created_by_agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_card_comment_card_id": { + "name": "idx_kanban_card_comment_card_id", + "columns": [ + { + "expression": "card_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_card_comment_card_id_kanban_card_id_fk": { + "name": "kanban_card_comment_card_id_kanban_card_id_fk", + "tableFrom": "kanban_card_comment", + "tableTo": "kanban_card", + "columnsFrom": [ + "card_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "kanban_card_comment_created_by_user_id_user_id_fk": { + "name": 
"kanban_card_comment_created_by_user_id_user_id_fk", + "tableFrom": "kanban_card_comment", + "tableTo": "user", + "columnsFrom": [ + "created_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_comment_created_by_agent_id_agent_id_fk": { + "name": "kanban_card_comment_created_by_agent_id_agent_id_fk", + "tableFrom": "kanban_card_comment", + "tableTo": "agent", + "columnsFrom": [ + "created_by_agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_column": { + "name": "kanban_column", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "board_id": { + "name": "board_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "position": { + "name": "position", + "type": "real", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_column_board_id": { + "name": "idx_kanban_column_board_id", + "columns": [ + { + "expression": "board_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_column_board_id_kanban_board_id_fk": { + "name": "kanban_column_board_id_kanban_board_id_fk", + "tableFrom": "kanban_column", + "tableTo": "kanban_board", + "columnsFrom": [ + "board_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": 
"cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.mcp": { + "name": "mcp", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "headers": { + "name": "headers", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "auth_type": { + "name": "auth_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "bearer_token": { + "name": "bearer_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_access_token": { + "name": "oauth_access_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_refresh_token": { + "name": "oauth_refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_token_expires_at": { + "name": "oauth_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "oauth_scope": { + "name": "oauth_scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_requested_scope": { + "name": "oauth_requested_scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_client_id": { + "name": "oauth_client_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_client_secret": { + "name": "oauth_client_secret", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + 
"primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_mcp_workspace_id": { + "name": "idx_mcp_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_mcp_organization_id": { + "name": "idx_mcp_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "mcp_organization_id_organization_id_fk": { + "name": "mcp_organization_id_organization_id_fk", + "tableFrom": "mcp", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "mcp_workspace_id_workspace_id_fk": { + "name": "mcp_workspace_id_workspace_id_fk", + "tableFrom": "mcp", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_mcp_name_org": { + "name": "unique_mcp_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + }, + "unique_mcp_name_workspace": { + "name": "unique_mcp_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.mcp_oauth_state": { + "name": "mcp_oauth_state", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "mcp_id": { + "name": "mcp_id", + "type": "text", + "primaryKey": 
false, + "notNull": true + }, + "code_verifier": { + "name": "code_verifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "redirect_uri": { + "name": "redirect_uri", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "idx_mcp_oauth_state_mcp_id": { + "name": "idx_mcp_oauth_state_mcp_id", + "columns": [ + { + "expression": "mcp_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "mcp_oauth_state_mcp_id_mcp_id_fk": { + "name": "mcp_oauth_state_mcp_id_mcp_id_fk", + "tableFrom": "mcp_oauth_state", + "tableTo": "mcp", + "columnsFrom": [ + "mcp_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.memory_daily_summary": { + "name": "memory_daily_summary", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "summary_date": { + "name": "summary_date", + "type": "date", + "primaryKey": false, + "notNull": true + }, + "summary": { + "name": "summary", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "embedding": { + "name": "embedding", + "type": "vector", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": 
false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_daily_summary_user_workspace": { + "name": "idx_daily_summary_user_workspace", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_daily_summary_date": { + "name": "idx_daily_summary_date", + "columns": [ + { + "expression": "summary_date", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "memory_daily_summary_user_id_user_id_fk": { + "name": "memory_daily_summary_user_id_user_id_fk", + "tableFrom": "memory_daily_summary", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "memory_daily_summary_workspace_id_workspace_id_fk": { + "name": "memory_daily_summary_workspace_id_workspace_id_fk", + "tableFrom": "memory_daily_summary", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_daily_summary_user_workspace_date": { + "name": "unique_daily_summary_user_workspace_date", + "nullsNotDistinct": false, + "columns": [ + "user_id", + "workspace_id", + "summary_date" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.notification": { + "name": "notification", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + 
"workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "agent_id": { + "name": "agent_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_notification_workspace_id": { + "name": "idx_notification_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_notification_agent_id": { + "name": "idx_notification_agent_id", + "columns": [ + { + "expression": "agent_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_notification_created_at": { + "name": "idx_notification_created_at", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "notification_workspace_id_workspace_id_fk": { + "name": "notification_workspace_id_workspace_id_fk", + "tableFrom": "notification", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "notification_agent_id_agent_id_fk": { + "name": "notification_agent_id_agent_id_fk", + "tableFrom": "notification", + "tableTo": "agent", + "columnsFrom": [ + "agent_id" 
+ ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.notification_read": { + "name": "notification_read", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "notification_id": { + "name": "notification_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "read_at": { + "name": "read_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_notification_read_user_id": { + "name": "idx_notification_read_user_id", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_notification_read_notification_id": { + "name": "idx_notification_read_notification_id", + "columns": [ + { + "expression": "notification_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "notification_read_notification_id_notification_id_fk": { + "name": "notification_read_notification_id_notification_id_fk", + "tableFrom": "notification_read", + "tableTo": "notification", + "columnsFrom": [ + "notification_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "notification_read_user_id_user_id_fk": { + "name": "notification_read_user_id_user_id_fk", + "tableFrom": "notification_read", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + 
"uniqueConstraints": { + "unique_notification_read": { + "name": "unique_notification_read", + "nullsNotDistinct": false, + "columns": [ + "notification_id", + "user_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.organization": { + "name": "organization", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.organization_member": { + "name": "organization_member", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "role": { + "name": "role", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'member'" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_org_member_org_id": { + "name": "idx_org_member_org_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, 
+ "method": "btree", + "with": {} + }, + "idx_org_member_user_id": { + "name": "idx_org_member_user_id", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "organization_member_organization_id_organization_id_fk": { + "name": "organization_member_organization_id_organization_id_fk", + "tableFrom": "organization_member", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "organization_member_user_id_user_id_fk": { + "name": "organization_member_user_id_user_id_fk", + "tableFrom": "organization_member", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.provider": { + "name": "provider", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider_type": { + "name": "provider_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "api_key": { + "name": "api_key", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "region": { + "name": "region", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "base_url": { + "name": "base_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "headers": { + "name": 
"headers", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "extraBody": { + "name": "extraBody", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "organization": { + "name": "organization", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "project": { + "name": "project", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "api_mode": { + "name": "api_mode", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'responses'" + }, + "native_search_enabled": { + "name": "native_search_enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "modelIds": { + "name": "modelIds", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "task_model_id": { + "name": "task_model_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "memory_extraction_model_id": { + "name": "memory_extraction_model_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "embedding_model_id": { + "name": "embedding_model_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "embedding_dimensions": { + "name": "embedding_dimensions", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "model_meta": { + "name": "model_meta", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_provider_workspace_id": { + "name": "idx_provider_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_provider_organization_id": { + "name": 
"idx_provider_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "provider_organization_id_organization_id_fk": { + "name": "provider_organization_id_organization_id_fk", + "tableFrom": "provider", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_provider_name_org": { + "name": "unique_provider_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + }, + "unique_provider_name_workspace": { + "name": "unique_provider_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sandbox": { + "name": "sandbox", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "backend": { + "name": "backend", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "credentials": { + "name": "credentials", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "admin_env": { + "name": "admin_env", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "user_env": { + "name": "user_env", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, 
+ "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "unique_sandbox_workspace_id": { + "name": "unique_sandbox_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "sandbox_workspace_id_workspace_id_fk": { + "name": "sandbox_workspace_id_workspace_id_fk", + "tableFrom": "sandbox", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sandbox_teardown_failure": { + "name": "sandbox_teardown_failure", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "backend": { + "name": "backend", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "attempted_at": { + "name": "attempted_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_sandbox_teardown_failure_workspace_id": { + "name": "idx_sandbox_teardown_failure_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + 
"concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.skill": { + "name": "skill", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_skill_workspace_id": { + "name": "idx_skill_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_skill_organization_id": { + "name": "idx_skill_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "skill_organization_id_organization_id_fk": { + "name": "skill_organization_id_organization_id_fk", + "tableFrom": "skill", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": 
"cascade", + "onUpdate": "no action" + }, + "skill_workspace_id_workspace_id_fk": { + "name": "skill_workspace_id_workspace_id_fk", + "tableFrom": "skill", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_skill_name_workspace": { + "name": "unique_skill_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + }, + "unique_skill_name_org": { + "name": "unique_skill_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.trigger": { + "name": "trigger", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "agent_id": { + "name": "agent_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "instruction": { + "name": "instruction", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "enabled": { + "name": "enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "max_runs_to_keep": { + "name": "max_runs_to_keep", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 10 + }, + "search": { + "name": "search", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": 
true + }, + "last_run_at": { + "name": "last_run_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "next_run_at": { + "name": "next_run_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_trigger_workspace_id": { + "name": "idx_trigger_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_trigger_next_run_at": { + "name": "idx_trigger_next_run_at", + "columns": [ + { + "expression": "next_run_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_trigger_type": { + "name": "idx_trigger_type", + "columns": [ + { + "expression": "type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "trigger_workspace_id_workspace_id_fk": { + "name": "trigger_workspace_id_workspace_id_fk", + "tableFrom": "trigger", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "trigger_agent_id_agent_id_fk": { + "name": "trigger_agent_id_agent_id_fk", + "tableFrom": "trigger", + "tableTo": "agent", + "columnsFrom": [ + "agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + 
"public.trigger_run": { + "name": "trigger_run", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "trigger_id": { + "name": "trigger_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "event_type": { + "name": "event_type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "event_data": { + "name": "event_data", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "error_message": { + "name": "error_message", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "stats": { + "name": "stats", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_trigger_run_trigger_id": { + "name": "idx_trigger_run_trigger_id", + "columns": [ + { + "expression": "trigger_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_trigger_run_started_at": { + "name": "idx_trigger_run_started_at", + "columns": [ + { + "expression": "started_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "trigger_run_trigger_id_trigger_id_fk": { + "name": "trigger_run_trigger_id_trigger_id_fk", + "tableFrom": "trigger_run", + "tableTo": "trigger", + "columnsFrom": [ + "trigger_id" + ], + 
"columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.webhook": { + "name": "webhook", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'Webhook'" + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "signing_secret": { + "name": "signing_secret", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "headers": { + "name": "headers", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "enabled": { + "name": "enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "events": { + "name": "events", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_webhook_workspace_id": { + "name": "idx_webhook_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "webhook_workspace_id_workspace_id_fk": { + "name": "webhook_workspace_id_workspace_id_fk", + "tableFrom": "webhook", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + 
"compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.widget": { + "name": "widget", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "dashboard_id": { + "name": "dashboard_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "data": { + "name": "data", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_widget_dashboard_id": { + "name": "idx_widget_dashboard_id", + "columns": [ + { + "expression": "dashboard_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "uq_widget_dashboard_title": { + "name": "uq_widget_dashboard_title", + "columns": [ + { + "expression": "dashboard_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "widget_dashboard_id_dashboard_id_fk": { + "name": "widget_dashboard_id_dashboard_id_fk", + "tableFrom": "widget", + "tableTo": "dashboard", + "columnsFrom": [ + "dashboard_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": 
{}, + "isRLSEnabled": false + }, + "public.workspace": { + "name": "workspace", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "owner_id": { + "name": "owner_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "context": { + "name": "context", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "task_model_provider_id": { + "name": "task_model_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_extraction_provider_id": { + "name": "memory_extraction_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_embedding_provider_id": { + "name": "memory_embedding_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "max_daily_summaries": { + "name": "max_daily_summaries", + "type": "integer", + "primaryKey": false, + "notNull": false, + "default": 90 + }, + "provider_self_management": { + "name": "provider_self_management", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "mcp_self_management": { + "name": "mcp_self_management", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_workspace_organization_id": { + "name": "idx_workspace_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + 
"concurrently": false, + "method": "btree", + "with": {} + }, + "idx_workspace_owner_id": { + "name": "idx_workspace_owner_id", + "columns": [ + { + "expression": "owner_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "workspace_organization_id_organization_id_fk": { + "name": "workspace_organization_id_organization_id_fk", + "tableFrom": "workspace", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "workspace_owner_id_user_id_fk": { + "name": "workspace_owner_id_user_id_fk", + "tableFrom": "workspace", + "tableTo": "user", + "columnsFrom": [ + "owner_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "workspace_task_model_provider_id_provider_id_fk": { + "name": "workspace_task_model_provider_id_provider_id_fk", + "tableFrom": "workspace", + "tableTo": "provider", + "columnsFrom": [ + "task_model_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "workspace_memory_extraction_provider_id_provider_id_fk": { + "name": "workspace_memory_extraction_provider_id_provider_id_fk", + "tableFrom": "workspace", + "tableTo": "provider", + "columnsFrom": [ + "memory_extraction_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "workspace_memory_embedding_provider_id_provider_id_fk": { + "name": "workspace_memory_embedding_provider_id_provider_id_fk", + "tableFrom": "workspace", + "tableTo": "provider", + "columnsFrom": [ + "memory_embedding_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + 
"public.account": { + "name": "account", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "account_id": { + "name": "account_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "refresh_token": { + "name": "refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "id_token": { + "name": "id_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "access_token_expires_at": { + "name": "access_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "refresh_token_expires_at": { + "name": "refresh_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "scope": { + "name": "scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "password": { + "name": "password", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "account_userId_idx": { + "name": "account_userId_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "account_user_id_user_id_fk": { + "name": "account_user_id_user_id_fk", + "tableFrom": "account", + "tableTo": "user", + "columnsFrom": [ + "user_id" + 
], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.session": { + "name": "session", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "token": { + "name": "token", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "ip_address": { + "name": "ip_address", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_agent": { + "name": "user_agent", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "session_userId_idx": { + "name": "session_userId_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "session_user_id_user_id_fk": { + "name": "session_user_id_user_id_fk", + "tableFrom": "session", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "session_token_unique": { + "name": "session_token_unique", + "nullsNotDistinct": false, + "columns": [ + "token" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user": { + 
"name": "user", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email_verified": { + "name": "email_verified", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "image": { + "name": "image", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "role": { + "name": "role", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'user'" + }, + "banned": { + "name": "banned", + "type": "boolean", + "primaryKey": false, + "notNull": false, + "default": false + }, + "ban_reason": { + "name": "ban_reason", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "ban_expires": { + "name": "ban_expires", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "user_email_unique": { + "name": "user_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.verification": { + "name": "verification", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "identifier": { + "name": "identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "value": { + "name": "value", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + 
"type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "verification_identifier_idx": { + "name": "verification_identifier_idx", + "columns": [ + { + "expression": "identifier", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/apps/backend/drizzle/meta/_journal.json b/apps/backend/drizzle/meta/_journal.json index 44a49c37..847f0e0f 100644 --- a/apps/backend/drizzle/meta/_journal.json +++ b/apps/backend/drizzle/meta/_journal.json @@ -323,6 +323,13 @@ "when": 1780816408681, "tag": "0045_married_roxanne_simpson", "breakpoints": true + }, + { + "idx": 46, + "version": "7", + "when": 1781201728242, + "tag": "0046_context_compaction", + "breakpoints": true } ] } \ No newline at end of file diff --git a/apps/backend/src/db/schema.ts b/apps/backend/src/db/schema.ts index 933abde6..764692dc 100644 --- a/apps/backend/src/db/schema.ts +++ b/apps/backend/src/db/schema.ts @@ -67,6 +67,13 @@ export const provider = pgTable( memoryExtractionModelId: t.text("memory_extraction_model_id").notNull(), embeddingModelId: t.text("embedding_model_id"), embeddingDimensions: t.integer("embedding_dimensions"), + // Per-model context-window / output overrides (context-compaction-plan §A). 
+ // Keyed by model id; resolveContextWindow consults this before API/registry. + modelMeta: t + .jsonb("model_meta") + .$type< + Record + >(), createdAt: t.timestamp("created_at").notNull().defaultNow(), updatedAt: t.timestamp("updated_at").notNull().defaultNow(), }), @@ -162,6 +169,16 @@ export const chat = pgTable( presencePenalty: t.real("presence_penalty"), frequencyPenalty: t.real("frequency_penalty"), + // Context-compaction state (docs/adr/0009). All additive nullable/defaulted. + // P1 view-not-delete: these change what is sent to the model, never the + // stored `messages`. `summaryWatermark` = message id of the last summarized + // message. All mutations go through the single versioned CAS writer (P3/R1); + // `version` is its compare-and-swap token. + contextSummary: t.text("context_summary"), + summaryWatermark: t.text("summary_watermark"), + compactionDirty: t.boolean("compaction_dirty").notNull().default(false), + version: t.integer("version").notNull().default(0), + // Memory processing tracking lastMemoryProcessedAt: t.timestamp("last_memory_processed_at"), memoryExtractionStatus: t @@ -215,6 +232,15 @@ export const agent = pgTable( seed: t.real("seed"), presencePenalty: t.real("presence_penalty"), frequencyPenalty: t.real("frequency_penalty"), + // Per-agent context-compaction config (context-compaction-plan §G). All + // nullable; the runtime applies defaults when unset (true / 0.8 / 0.5 / + // 0.05 / 10 / 2000). Editable surface wired in a later slice. 
+ compactionEnabled: t.boolean("compaction_enabled"), + triggerRatio: t.real("trigger_ratio"), + targetRatio: t.real("target_ratio"), + reserveRatio: t.real("reserve_ratio"), + keepRecentMessages: t.integer("keep_recent_messages"), + minPrunableChars: t.integer("min_prunable_chars"), toolSetIds: t.jsonb("tool_set_ids").$type().default([]), // Array of tool set ids skillIds: t.jsonb("skill_ids").$type().default([]), // Array of skill ids subAgentIds: t.jsonb("sub_agent_ids").$type().default([]), // Array of sub-agent ids diff --git a/apps/backend/src/routes/chat.test.ts b/apps/backend/src/routes/chat.test.ts index 12802c40..25a1766d 100644 --- a/apps/backend/src/routes/chat.test.ts +++ b/apps/backend/src/routes/chat.test.ts @@ -6,8 +6,9 @@ import { resetMockDb, } from "../test-utils.ts"; -const { mockPrepareChatTurn } = vi.hoisted(() => ({ +const { mockPrepareChatTurn, mockForceCompactChat } = vi.hoisted(() => ({ mockPrepareChatTurn: vi.fn(), + mockForceCompactChat: vi.fn(), })); vi.mock("../services/chat-execution.ts", () => { @@ -25,12 +26,20 @@ vi.mock("../services/chat-execution.ts", () => { } return { prepareChatTurn: mockPrepareChatTurn, + forceCompactChat: mockForceCompactChat, + // loadChatMessages is called by agent-runner before onStart (RV1 baseline). + loadChatMessages: vi.fn().mockResolvedValue([]), ValidationError, NotFoundError, drizzleChatTurnQueries: {}, }; }); +import { runRegistry } from "../runs/run-registry.ts"; +// Mocked above — resolves to the mock's NotFoundError class, the same one the +// route checks with `instanceof`. +import { NotFoundError } from "../services/chat-execution.ts"; + import app from "../server.ts"; // Mock AI SDK @@ -88,6 +97,10 @@ describe("Chat Routes", () => { beforeEach(() => { resetMockDb(); vi.clearAllMocks(); + // The `POST /` test starts a (mocked) run that registers chat-1 and never + // finalizes, leaving it in the process-wide registry. Clear it so the + // compact route's in-progress guard sees a clean slate. 
+ runRegistry.unregister("chat-1"); mockDb.where.mockReturnValue(mockDb); mockDb.orderBy.mockReturnValue(mockDb); mockDb.limit.mockReturnValue(mockDb); @@ -239,6 +252,7 @@ describe("Chat Routes", () => { mockDb.limit.mockResolvedValueOnce([ { ownerId: "user-1", organizationId: "org-1" }, ]); // requireWorkspaceAccess + mockDb.limit.mockResolvedValueOnce([{ workspaceId: "ws-1" }]); // RV2 chat workspace check // ChatSink.onStart upserts the chat row with status=running before // prepareChatTurn runs. Returning a non-empty array skips the insert @@ -469,4 +483,75 @@ describe("Chat Routes", () => { expect(await res.json()).toEqual(mockUpdatedChat); }); }); + + describe("POST /:chatId/compact", () => { + const ownerAccess = () => { + mockSession(); + mockDb.limit.mockResolvedValueOnce([{ role: "member" }]); // requireOrgAccess + mockDb.limit.mockResolvedValueOnce([ + { ownerId: "user-1", organizationId: "org-1" }, + ]); // requireWorkspaceAccess + owner + }; + + it("force-compacts and returns the refreshed usage", async () => { + ownerAccess(); + mockForceCompactChat.mockResolvedValueOnce({ + estimatedTokens: 1234, + contextWindow: 8192, + contextWindowIsDefault: false, + }); + + const res = await app.request(`${baseUrl}/chat-1/compact`, { + method: "POST", + }); + + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ + inputTokens: 1234, + contextWindow: 8192, + contextWindowIsDefault: false, + }); + expect(mockForceCompactChat).toHaveBeenCalledWith( + "chat-1", + workspaceId, + orgId, + ); + }); + + it("returns 409 when a run is in progress (does not compact)", async () => { + ownerAccess(); + // runRegistry is keyed by runId, which equals the chatId for top-level + // chat runs — so an in-flight run on this chat blocks the compact. 
+ runRegistry.register("chat-1"); + try { + const res = await app.request(`${baseUrl}/chat-1/compact`, { + method: "POST", + }); + expect(res.status).toBe(409); + expect(mockForceCompactChat).not.toHaveBeenCalled(); + } finally { + runRegistry.unregister("chat-1"); + } + }); + + it("returns 404 when the chat is not found / not in the workspace", async () => { + ownerAccess(); + mockForceCompactChat.mockRejectedValueOnce( + new NotFoundError("Chat not found"), + ); + + const res = await app.request(`${baseUrl}/chat-other/compact`, { + method: "POST", + }); + expect(res.status).toBe(404); + }); + + it("returns 401 without a session", async () => { + mockNoSession(); + const res = await app.request(`${baseUrl}/chat-1/compact`, { + method: "POST", + }); + expect(res.status).toBe(401); + }); + }); }); diff --git a/apps/backend/src/routes/chat.ts b/apps/backend/src/routes/chat.ts index e1b27755..37038bb8 100644 --- a/apps/backend/src/routes/chat.ts +++ b/apps/backend/src/routes/chat.ts @@ -9,7 +9,11 @@ import { provider as providerTable, workspace as workspaceTable, } from "../db/schema.ts"; -import { NotFoundError, ValidationError } from "../services/chat-execution.ts"; +import { + forceCompactChat, + NotFoundError, + ValidationError, +} from "../services/chat-execution.ts"; import { openProvider } from "../services/provider.ts"; import { chatGenerateMetadataSchema, @@ -29,6 +33,7 @@ import { type PlatypusUIMessage } from "../types.ts"; import { rewriteStorageUrls, deleteFiles } from "../storage/utils.ts"; import { getOrigin } from "../utils/get-origin.ts"; import { agentRunner } from "../runs/agent-runner.ts"; +import { runRegistry } from "../runs/run-registry.ts"; import { ChatSink } from "../runs/sinks/chat-sink.ts"; import type { RunInput } from "../runs/types.ts"; @@ -143,6 +148,23 @@ chat.post( const scope = c.get("workspaceScope")!; const data = c.req.valid("json"); + // RV2: verify the submitted chat id (if any) belongs to this workspace. 
+ // Without this check a workspace-A user could supply a workspace-B chat id + // and corrupt B's compaction state via the unscoped store writes. + if (data.id) { + const existing = await db + .select({ workspaceId: chatTable.workspaceId }) + .from(chatTable) + .where(eq(chatTable.id, data.id)) + .limit(1); + if ( + existing.length > 0 && + existing[0].workspaceId !== scope.workspaceId + ) { + return c.json({ message: "Chat not found" }, 404); + } + } + const input: RunInput = { runId: data.id, request: data, @@ -417,4 +439,44 @@ chat.post( }, ); +chat.post( + "/:chatId/compact", + requireAuth, + requireOrgAccess(), + requireWorkspaceAccess, + requireWorkspaceOwner, + async (c) => { + const orgId = c.req.param("orgId")!; + const chatId = c.req.param("chatId"); + const workspaceId = c.req.param("workspaceId")!; + + // Reject if a run is currently in flight — the frontend defers the click + // until streaming finishes (drift U4), but guard here as a belt-and-suspenders + // check to avoid CAS races with an in-progress writer. 
+ if (runRegistry.has(chatId)) { + return c.json( + { error: "Run in progress; retry after the response finishes" }, + 409, + ); + } + + try { + const result = await forceCompactChat(chatId, workspaceId, orgId); + return c.json({ + inputTokens: result.estimatedTokens, + contextWindow: result.contextWindow, + contextWindowIsDefault: result.contextWindowIsDefault, + }); + } catch (error) { + if (error instanceof NotFoundError) { + return c.json({ error: error.message }, 404); + } + if (error instanceof ValidationError) { + return c.json({ error: error.message }, 400); + } + throw error; + } + }, +); + export { chat }; diff --git a/apps/backend/src/routes/org-provider.ts b/apps/backend/src/routes/org-provider.ts index 9777fe75..d037152e 100644 --- a/apps/backend/src/routes/org-provider.ts +++ b/apps/backend/src/routes/org-provider.ts @@ -7,6 +7,7 @@ import { providerCreateSchema, providerUpdateSchema } from "@platypus/schemas"; import { eq, and } from "drizzle-orm"; import { handleEmbeddingConfigChange } from "../services/embedding-invalidation.ts"; import { dedupeArray } from "../utils.ts"; +import { contextWindowResolver } from "../runs/context-window.ts"; import { requireAuth } from "../middleware/authentication.ts"; import { requireOrgAccess } from "../middleware/authorization.ts"; import { requireSharedDeletable } from "../services/scoped-resource.ts"; @@ -117,6 +118,10 @@ orgProvider.put( throw new NotFoundError("Provider not found"); } + // RV7c: bust the cached context window so a modelMeta override takes effect + // immediately rather than waiting out the 1-hour TTL (drift T5). 
+ contextWindowResolver.evict(providerId); + return c.json(record[0], 200); }, ); diff --git a/apps/backend/src/routes/provider.ts b/apps/backend/src/routes/provider.ts index b437c335..49a16334 100644 --- a/apps/backend/src/routes/provider.ts +++ b/apps/backend/src/routes/provider.ts @@ -7,6 +7,7 @@ import { providerCreateSchema, providerUpdateSchema } from "@platypus/schemas"; import { eq, and } from "drizzle-orm"; import { handleEmbeddingConfigChange } from "../services/embedding-invalidation.ts"; import { dedupeArray } from "../utils.ts"; +import { contextWindowResolver } from "../runs/context-window.ts"; import { requireAuth } from "../middleware/authentication.ts"; import { requireOrgAccess, @@ -135,6 +136,10 @@ provider.put( ) .returning(); + // RV7c: bust the cached context window so a modelMeta override takes effect + // immediately rather than waiting out the 1-hour TTL (drift T5). + contextWindowResolver.evict(providerId); + return c.json(record[0], 200); }, ); @@ -171,4 +176,44 @@ provider.delete( }, ); +/** + * Returns the resolved context window for a specific model on this provider + * (§H ring, drift U1). Uses the cached resolver — fast for repeated calls. + * Returns `{ contextWindow: null }` when the window fell to the conservative + * default so the frontend can render the ring neutral (drift T6). 
+ */ +provider.get( + "/:providerId/context-window", + requireAuth, + requireOrgAccess(), + requireWorkspaceAccess, + async (c) => { + const orgId = c.req.param("orgId")!; + const workspaceId = c.req.param("workspaceId")!; + const providerId = c.req.param("providerId"); + const modelId = c.req.query("modelId"); + + if (!modelId) { + return c.json({ error: "modelId query parameter required" }, 400); + } + + const found = await requireScoped(db, "provider", providerId, { + orgId, + wsId: workspaceId, + }); + + const resolved = await contextWindowResolver + .resolve(found.row, modelId) + .catch(() => null); + + return c.json({ + contextWindow: + resolved && resolved.source !== "default" + ? resolved.contextWindow + : null, + source: resolved?.source ?? "default", + }); + }, +); + export { provider }; diff --git a/apps/backend/src/runs/agent-runner.test.ts b/apps/backend/src/runs/agent-runner.test.ts index 03262f3f..ea3acc5b 100644 --- a/apps/backend/src/runs/agent-runner.test.ts +++ b/apps/backend/src/runs/agent-runner.test.ts @@ -1,57 +1,12 @@ import { describe, it, expect, beforeEach, vi } from "vitest"; -const { mockPrepareChatTurn, mockGenerateText, mockStreamText, streamHarness } = - vi.hoisted(() => { - // A minimal, manually-driven async iterable standing in for the server-side - // snapshot branch of the UI message stream. The test pushes partial - // messages and ends it explicitly so timing is deterministic. 
- class AsyncQueue { - items: unknown[] = []; - resolvers: ((r: { value: unknown; done: boolean }) => void)[] = []; - ended = false; - push(item: unknown) { - const r = this.resolvers.shift(); - if (r) r({ value: item, done: false }); - else this.items.push(item); - } - end() { - this.ended = true; - let r; - while ((r = this.resolvers.shift())) - r({ value: undefined, done: true }); - } - [Symbol.asyncIterator]() { - return { - next: () => { - if (this.items.length) - return Promise.resolve({ - value: this.items.shift(), - done: false, - }); - if (this.ended) - return Promise.resolve({ value: undefined, done: true }); - return new Promise((res) => this.resolvers.push(res)); - }, - }; - } - } - return { - mockPrepareChatTurn: vi.fn(), - mockGenerateText: vi.fn(), - mockStreamText: vi.fn(), - streamHarness: { - AsyncQueue, - queue: null as InstanceType | null, - // The AI SDK callbacks the runner registers; captured so the test can - // drive step-completion and stream-completion by hand. 
- onStepFinish: undefined as ((step: unknown) => void) | undefined, - onFinish: undefined as - | ((ctx: { messages: unknown[] }) => Promise | void) - | undefined, - responseSentinel: { __isResponse: true }, - }, - }; - }); +const { mockPrepareChatTurn, mockGenerateText, mockStreamText } = vi.hoisted( + () => ({ + mockPrepareChatTurn: vi.fn(), + mockGenerateText: vi.fn(), + mockStreamText: vi.fn(), + }), +); vi.mock("../services/chat-execution.ts", () => ({ prepareChatTurn: mockPrepareChatTurn, @@ -66,8 +21,6 @@ vi.mock("ai", async () => { convertToModelMessages: vi.fn().mockReturnValue([]), createIdGenerator: vi.fn().mockReturnValue(() => "msg-1"), stepCountIs: vi.fn(), - readUIMessageStream: () => streamHarness.queue, - createUIMessageStreamResponse: () => streamHarness.responseSentinel, }; }); @@ -80,7 +33,9 @@ vi.mock("../logger.ts", () => ({ }, })); -import { AgentRunner } from "./agent-runner.ts"; +import { AgentRunner, withToolTimestamps } from "./agent-runner.ts"; +import { buildTier2PrepareStep } from "./compaction.ts"; +import type { UIMessageChunk } from "ai"; import { runRegistry, TimeoutError } from "./run-registry.ts"; import type { ResolvedRunPlan, RunInput, RunSink } from "./types.ts"; import type { WorkspaceScope } from "../scope.ts"; @@ -89,13 +44,7 @@ type LifecycleEvent = | { name: "onStart"; runId: string } | { name: "onResolved"; runId: string; plan: ResolvedRunPlan } | { name: "onProgress"; runId: string } - | { - name: "onFinish"; - runId: string; - status: string; - error?: string; - messages?: unknown[]; - }; + | { name: "onFinish"; runId: string; status: string; error?: string }; class RecordingSink implements RunSink { events: LifecycleEvent[] = []; @@ -116,14 +65,12 @@ class RecordingSink implements RunSink { runId: string; status: string; error?: Error; - messages?: unknown[]; }): Promise { this.events.push({ name: "onFinish", runId: ctx.runId, status: ctx.status, error: ctx.error?.message, - messages: ctx.messages, }); } @@ -160,6 
+107,14 @@ const fakeTurn = (overrides?: { dispose?: () => Promise }) => { providerId: "p1", modelId: "m1", }, + recovery: { + imageProvider: "default" as const, + targetTokens: 1000, + keepRecentMessages: 10, + minPrunableChars: 2000, + summarize: async (t: string) => t, + }, + tier2: null, dispose, }; }; @@ -379,163 +334,283 @@ describe("AgentRunner.cancel", () => { }); }); -describe("AgentRunner.stream — success & interruption", () => { - let runner: AgentRunner; - beforeEach(() => { - runner = new AgentRunner(); - vi.clearAllMocks(); - streamHarness.queue = null; - streamHarness.onStepFinish = undefined; - streamHarness.onFinish = undefined; +// Smoke test the TimeoutError export so the type stays public-importable +describe("AgentRunner timeout types", () => { + it("TimeoutError remains an Error subclass", () => { + const e = new TimeoutError("x", "run"); + expect(e).toBeInstanceOf(Error); + expect(e.kind).toBe("run"); }); +}); - const tick = () => new Promise((r) => setTimeout(r, 0)); - - // Make streamText return a fake result whose UI-stream callbacks the test - // can drive by hand: `onStepFinish` (per step) and `onFinish` (completion). 
- const primeStreamText = () => { - mockStreamText.mockImplementation((opts: any) => { - streamHarness.onStepFinish = opts.onStepFinish; - return { - toUIMessageStream: (uiOpts: any) => { - streamHarness.onFinish = uiOpts.onFinish; - return { tee: () => [{}, {}] }; - }, - }; - }); - }; +describe("withToolTimestamps", () => { + const FIXED_NOW = "2026-05-30T12:00:00.000Z"; - it("runs the full lifecycle on success and persists the final messages", async () => { - const dispose = vi.fn().mockResolvedValue(undefined); - mockPrepareChatTurn.mockResolvedValueOnce(fakeTurn({ dispose })); - const queue = new streamHarness.AsyncQueue(); - streamHarness.queue = queue; - primeStreamText(); + const collect = async (stream: ReadableStream): Promise => { + const out: T[] = []; + const reader = stream.getReader(); + for (;;) { + const { done, value } = await reader.read(); + if (done) break; + out.push(value); + } + return out; + }; - const sink = new RecordingSink(); - const res = await runner.stream({ - scope, - input: { ...baseInput, runId: "s-ok" }, - sink, - options: { origin: "http://test" }, + const sourceOf = (chunks: UIMessageChunk[]): ReadableStream => + new ReadableStream({ + start(controller) { + for (const chunk of chunks) controller.enqueue(chunk); + controller.close(); + }, }); - expect(res).toBe(streamHarness.responseSentinel); - // A step completes -> onProgress. - streamHarness.onStepFinish!({ - usage: { inputTokens: 3, outputTokens: 4 }, - toolCalls: [], - }); - // A partial snapshot streams in over the server-side branch. - queue.push({ id: "m1", role: "assistant", parts: [] }); - await tick(); - // Natural completion delivers the final assistant message. 
- const finalMessages = [ - { id: "m1", role: "assistant", parts: [{ type: "text", text: "hi" }] }, - ]; - await streamHarness.onFinish!({ messages: finalMessages }); - queue.end(); - await tick(); - - expect(sink.names()).toEqual([ - "onStart", - "onResolved", - "onProgress", - "onFinish", - ]); - const finish = sink.events.at(-1) as Extract< - LifecycleEvent, - { name: "onFinish" } - >; - expect(finish.status).toBe("succeeded"); - expect(finish.error).toBeUndefined(); - expect(finish.messages).toEqual(finalMessages); - expect(dispose).toHaveBeenCalledTimes(1); - expect(runRegistry.has("s-ok")).toBe(false); + const toolInputAvailable = ( + overrides: Partial< + Extract + > = {}, + ): UIMessageChunk => ({ + type: "tool-input-available", + toolCallId: "t1", + toolName: "foo", + input: { x: 1 }, + ...overrides, }); - it("finalises as cancelled with the partial messages when cancelled mid-stream", async () => { - mockPrepareChatTurn.mockResolvedValueOnce(fakeTurn()); - const queue = new streamHarness.AsyncQueue(); - streamHarness.queue = queue; - primeStreamText(); + it("injects startedAt on tool-input-available chunks", async () => { + const { stream } = withToolTimestamps( + sourceOf([toolInputAvailable()]), + () => FIXED_NOW, + ); + const result = await collect(stream); + + expect(result).toHaveLength(1); + expect( + (result[0] as { toolMetadata?: Record }).toolMetadata, + ).toEqual({ startedAt: FIXED_NOW }); + }); - const sink = new RecordingSink(); - await runner.stream({ - scope, - input: { ...baseInput, runId: "s-cancel" }, - sink, - options: { origin: "http://test" }, + it("preserves existing toolMetadata fields", async () => { + const { stream } = withToolTimestamps( + sourceOf([toolInputAvailable({ toolMetadata: { custom: "value" } })]), + () => FIXED_NOW, + ); + const result = await collect(stream); + + expect( + (result[0] as { toolMetadata?: Record }).toolMetadata, + ).toEqual({ + custom: "value", + startedAt: FIXED_NOW, }); + }); - const partial = { - 
id: "m1", - role: "assistant", - parts: [{ type: "text", text: "par" }], - }; - queue.push(partial); - await tick(); + it("passes other chunks through unchanged", async () => { + const chunks: UIMessageChunk[] = [ + { type: "text-delta", id: "a", delta: "hello" }, + { + type: "tool-output-available", + toolCallId: "t1", + output: { ok: true }, + }, + { type: "finish", finishReason: "stop" }, + ]; - expect(runner.cancel("s-cancel")).toBe(true); - // The SDK observes the abort and finishes the UI stream. - await streamHarness.onFinish!({ messages: [partial] }); - queue.end(); - await tick(); + const { stream } = withToolTimestamps(sourceOf(chunks), () => FIXED_NOW); + const result = await collect(stream); - const finish = sink.events.at(-1) as Extract< - LifecycleEvent, - { name: "onFinish" } - >; - expect(finish.status).toBe("cancelled"); - expect(finish.messages).toEqual([partial]); + expect(result).toEqual(chunks); }); - it("finalises as failed with a TimeoutError and the partial messages on per-run timeout", async () => { - mockPrepareChatTurn.mockResolvedValueOnce(fakeTurn()); - const queue = new streamHarness.AsyncQueue(); - streamHarness.queue = queue; - primeStreamText(); + it("records completedAt for tool-output-available chunks", async () => { + const { stream, completions } = withToolTimestamps( + sourceOf([ + toolInputAvailable(), + { + type: "tool-output-available", + toolCallId: "t1", + output: { ok: true }, + }, + ]), + () => FIXED_NOW, + ); + // Completions are populated as the stream drains, so consume it first. 
+ await collect(stream); - const sink = new RecordingSink(); - await runner.stream({ - scope, - input: { ...baseInput, runId: "s-timeout" }, - sink, - options: { - origin: "http://test", - timeouts: { perRunTimeoutMs: 5, perStepTimeoutMs: 1_000_000 }, + expect(completions.get("t1")).toBe(FIXED_NOW); + }); + + it("records completedAt for tool-output-error chunks", async () => { + const { stream, completions } = withToolTimestamps( + sourceOf([ + toolInputAvailable(), + { + type: "tool-output-error", + toolCallId: "t1", + errorText: "boom", + }, + ]), + () => FIXED_NOW, + ); + await collect(stream); + + expect(completions.get("t1")).toBe(FIXED_NOW); + }); + + // Mirrors AgentRunner.stream's pipeline: transform -> tee -> readUIMessageStream + // drains the snapshot branch. Verifies completions populate AND the built + // message's tool part carries the same toolCallId, so applyToolCompletions + // (matches on toolCallId) can stamp completedAt. + it("integration: completions + built tool part share toolCallId after tee+read", async () => { + const { readUIMessageStream } = + await vi.importActual("ai"); + + const chunks: UIMessageChunk[] = [ + { type: "start", messageId: "m1" }, + { type: "start-step" }, + { + type: "tool-input-available", + toolCallId: "call_xyz", + toolName: "foo", + input: { a: 1 }, }, + { + type: "tool-output-available", + toolCallId: "call_xyz", + output: { ok: true }, + }, + { type: "finish-step" }, + { type: "finish" }, + ]; + + const { stream, completions } = withToolTimestamps( + sourceOf(chunks), + () => FIXED_NOW, + ); + const [forResponse, forSnapshot] = stream.tee(); + + let lastMessage: { parts?: Array> } | undefined; + for await (const message of readUIMessageStream({ stream: forSnapshot })) { + lastMessage = message; + } + await collect(forResponse); + + expect(completions.get("call_xyz")).toBe(FIXED_NOW); + + const toolPart = lastMessage?.parts?.find( + (p) => (p as { toolCallId?: string }).toolCallId === "call_xyz", + ) as { 
toolMetadata?: Record; toolCallId?: string }; + expect(toolPart).toBeDefined(); + expect(toolPart.toolCallId).toBe("call_xyz"); + expect(toolPart.toolMetadata).toMatchObject({ startedAt: FIXED_NOW }); + }); +}); + +describe("buildTier2PrepareStep", () => { + const makeCtx = (triggerTokens = 100) => ({ + triggerTokens, + targetTokens: 50, + keepRecentMessages: 4, + minPrunableChars: 100, + imageProvider: "default" as const, + summarize: vi.fn().mockResolvedValue("summary"), + summarizerWindow: undefined, + }); + + // Invoke a PrepareStepFunction supplying only the field under test; the + // callback ignores steps/stepNumber/model/experimental_context. + const callStep = ( + fn: ReturnType, + messages: import("ai").ModelMessage[], + ) => + fn({ + messages, + steps: [], + stepNumber: 0, + model: {} as never, + experimental_context: undefined, }); - const partial = { - id: "m1", + const shortMessages: import("ai").ModelMessage[] = [ + { role: "user", content: [{ type: "text", text: "hi" }] }, + { role: "assistant", - parts: [{ type: "text", text: "par" }], - }; - queue.push(partial); - await tick(); - // Let the per-run timer fire -> registry aborts -> onTimeout -> finalize. - await new Promise((r) => setTimeout(r, 30)); - queue.end(); - await tick(); + content: [{ type: "text", text: "hello" }], + }, + ]; + + // 6 assistant/tool pairs where each tool result carries 1200 chars of text + // (≈ 300 tokens each via char/4). Total ≈ 1800+ tokens > any reasonable + // triggerTokens threshold used in these tests. 
+ const longMessages = (): import("ai").ModelMessage[] => { + const msgs: import("ai").ModelMessage[] = [ + { role: "user", content: [{ type: "text", text: "start" }] }, + ]; + for (let i = 0; i < 6; i++) { + msgs.push({ + role: "assistant", + content: [ + { + type: "tool-call", + toolCallId: `tc${i}`, + toolName: "tool", + input: {}, + }, + ], + }); + msgs.push({ + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: `tc${i}`, + toolName: "tool", + // Must use typed output shape so tokenEstimator counts the value. + output: { type: "text" as const, value: "x".repeat(1200) }, + }, + ], + }); + } + return msgs; + }; - const finish = sink.events.at(-1) as Extract< - LifecycleEvent, - { name: "onFinish" } - >; - expect(finish.status).toBe("failed"); - expect(finish.error).toMatch(/per-run timeout/); - // The snapshot accumulated before the timeout is what gets persisted. - expect(finish.messages).toEqual([partial]); - expect(runRegistry.has("s-timeout")).toBe(false); + it("returns undefined when messages are below triggerTokens (drift m3)", async () => { + const fn = buildTier2PrepareStep(makeCtx(10_000)); + const result = await callStep(fn, shortMessages); + expect(result).toBeUndefined(); }); -}); -// Smoke test the TimeoutError export so the type stays public-importable -describe("AgentRunner timeout types", () => { - it("TimeoutError remains an Error subclass", () => { - const e = new TimeoutError("x", "run"); - expect(e).toBeInstanceOf(Error); - expect(e.kind).toBe("run"); + it("compacts when messages exceed triggerTokens", async () => { + const msgs = longMessages(); + const ctx = makeCtx(1); + const fn = buildTier2PrepareStep(ctx); + const result = await callStep(fn, msgs); + expect(result?.messages).toBeDefined(); + const out = result!.messages!; + expect(out.length).toBeLessThan(msgs.length); + // Stage 2 summarizes the dropped prefix. 
+ expect(ctx.summarize).toHaveBeenCalled(); + // First surviving message is the synthetic summary (role "user"); the one + // after it starts the kept tail and must not be an orphaned tool result + // (its assistant tool-call would have been dropped into the prefix). + expect(out[1]?.role).not.toBe("tool"); + }); + + it("returns undefined when prefix is empty (no-op, drift m3 / RV4)", async () => { + // Two messages, keepRecentMessages 4 → no prefix to summarize → + // compactModelMessages drops nothing → prepareStep returns undefined so the + // SDK proceeds unchanged, and the summarizer is never called. + const ctx = makeCtx(1); + const fn = buildTier2PrepareStep(ctx); + const result = await callStep(fn, shortMessages); + expect(result).toBeUndefined(); + expect(ctx.summarize).not.toHaveBeenCalled(); + }); + + it("does not call summarize when estimate is below triggerTokens", async () => { + const ctx = makeCtx(10_000); + const fn = buildTier2PrepareStep(ctx); + await callStep(fn, shortMessages); + expect(ctx.summarize).not.toHaveBeenCalled(); }); }); diff --git a/apps/backend/src/runs/agent-runner.ts b/apps/backend/src/runs/agent-runner.ts index 90110ac5..19b7c7f1 100644 --- a/apps/backend/src/runs/agent-runner.ts +++ b/apps/backend/src/runs/agent-runner.ts @@ -8,9 +8,17 @@ import { readUIMessageStream, stepCountIs, streamText, + wrapLanguageModel, type LanguageModel, + type UIMessageChunk, } from "ai"; import { + contextOverflowRecoveryMiddleware, + isContextOverflowError, +} from "./recovery.ts"; +import { buildTier2PrepareStep } from "./compaction.ts"; +import { + loadChatMessages, prepareChatTurn, type ChatTurn, type ToolActivityEvent, @@ -33,6 +41,129 @@ import type { RunStatus, } from "./types.ts"; +/** + * Result of {@link withToolTimestamps}: the transformed stream plus a map of + * `toolCallId` → completion ISO timestamp, populated as tool-output chunks + * pass through. 
+ */ +export type ToolTimestampStream = { + stream: ReadableStream; + /** toolCallId → completedAt ISO timestamp, filled in as the stream drains. */ + completions: Map; +}; + +/** + * Stamps tool-call timing onto the stream so the UI can show each tool's run + * duration: + * + * - `startedAt` is injected into `tool-input-available` chunks via + * `toolMetadata`. It must go here (not on the output chunk) because the AI + * SDK's tool-output handlers ignore `chunk.toolMetadata` and reuse the + * invocation's existing `toolMetadata` from the input-available phase. + * - `completedAt` cannot ride the output chunk for the same reason, so it is + * recorded in the returned `completions` map keyed by `toolCallId`. The run + * loop applies it to the built message via {@link applyToolCompletions} + * before the sink persists it. + * + * Exported for unit testing. + */ +export function withToolTimestamps( + stream: ReadableStream, + now: () => string = () => new Date().toISOString(), +): ToolTimestampStream { + const completions = new Map(); + const out = stream.pipeThrough( + new TransformStream({ + transform(chunk, controller) { + if (chunk.type === "tool-input-available") { + controller.enqueue({ + ...chunk, + toolMetadata: { + ...chunk.toolMetadata, + startedAt: now(), + }, + }); + return; + } + if ( + chunk.type === "tool-output-available" || + chunk.type === "tool-output-error" + ) { + completions.set(chunk.toolCallId, now()); + } + controller.enqueue(chunk); + }, + }), + ); + return { stream: out, completions }; +} + +/** Stats stamped on the last assistant message's metadata after each stream (§H/§I). */ +export type MessageStats = { + /** Run-wide totals across every step (sum) — §I cost popover. */ + inputTokens: number; + outputTokens: number; + /** + * Input tokens of the LAST model call = peak context fullness — §H ring. + * NOT the run-wide sum (which over-counts on multi-step tool loops). 
+ */ + contextTokens: number; + startedAt: string; + firstTokenAt?: string; + finishedAt: string; + contextWindow: number; + contextWindowIsDefault: boolean; +}; + +/** + * Stamps per-run stats (token counts, timing, resolved context window) onto + * the last assistant message's `metadata.stats` in place. Applied at the same + * point as {@link applyToolCompletions} so both mutations happen before the + * sink persists the final state (§H/§I). + */ +function applyMessageStats( + messages: PlatypusUIMessage[], + stats: MessageStats, +): void { + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i].role === "assistant") { + const msg = messages[i] as PlatypusUIMessage & { + metadata?: Record; + }; + msg.metadata = { ...msg.metadata, stats }; + return; + } + } +} + +/** + * Stamps `completedAt` onto assistant tool parts in place, reading from the + * `completions` map produced by {@link withToolTimestamps}. Applied to the + * built message just before it is persisted, since the AI SDK strips + * `toolMetadata` from tool-output chunks and the end time can't be injected + * inline. Paired with the injected `startedAt`, this lets the UI compute each + * tool's run duration. + */ +function applyToolCompletions( + messages: PlatypusUIMessage[], + completions: Map, +): void { + if (completions.size === 0) return; + for (const message of messages) { + for (const part of message.parts ?? []) { + const anyPart = part as { + toolCallId?: string; + toolMetadata?: Record; + }; + const completedAt = anyPart.toolCallId + ? completions.get(anyPart.toolCallId) + : undefined; + if (!completedAt) continue; + anyPart.toolMetadata = { ...anyPart.toolMetadata, completedAt }; + } + } +} + export type StreamOptions = { origin: string; frontendUrl?: string; @@ -154,6 +285,12 @@ type RunState = { stats: RunStats; messages: PlatypusUIMessage[]; terminated: boolean; + /** + * Input tokens reported by the most recent model step = peak context + * fullness for the §H ring. 
Tracked separately from `stats.inputTokens`, + * which is the run-wide SUM and over-counts multi-step tool loops. + */ + lastStepInputTokens: number; }; /** @@ -178,6 +315,7 @@ export class AgentRunner { origin: string | undefined, frontendUrl?: string, onActivity?: (event?: ToolActivityEvent) => void, + priorMessages?: PlatypusUIMessage[], ): Promise { return prepareChatTurn({ orgId: scope.orgId, @@ -189,6 +327,7 @@ export class AgentRunner { frontendUrl, runMode: scope.principal.kind === "user" ? "interactive" : "headless", onActivity, + priorMessages, }); } @@ -219,12 +358,29 @@ export class AgentRunner { timeouts?: Pick; }) { const { scope, input, sink } = params; + + // RV1: snapshot the DB state BEFORE onStart overwrites it so + // applyTier1IfNeeded has the correct C4 baseline. Only interactive chats + // carry a `request.id`; headless runs (triggers, sub-agents) have none. + const priorMessages = input.request.id + ? await loadChatMessages(input.request.id).catch((err) => { + // Falls back to the post-overwrite DB read inside applyTier1IfNeeded, + // which cannot detect edits below the watermark — log the degradation. + logger.warn( + { err, chatId: input.request.id }, + "RV1: failed to snapshot prior messages; C4 edit-detection degraded this turn", + ); + return undefined; + }) + : undefined; + await sink.onStart({ runId: input.runId, messages: input.messages }); const state: RunState = { stats: {}, messages: input.messages, terminated: false, + lastStepInputTokens: 0, }; const finalize = async ( @@ -277,6 +433,7 @@ export class AgentRunner { params.origin, params.frontendUrl, onActivity, + priorMessages, ); } catch (error) { const err = error instanceof Error ? error : new Error(String(error)); @@ -297,6 +454,8 @@ export class AgentRunner { }): void => { handle.bumpStep(); accumulateStepStats(state.stats, step); + state.lastStepInputTokens = + step.usage?.inputTokens ?? 
state.lastStepInputTokens; logger.info( { runId: input.runId, @@ -323,12 +482,20 @@ export class AgentRunner { // an `undefined` value identically, and the streaming path has always // passed them this way in production. const modelArgs = { - model: state.turn.stream.model as LanguageModel, + // Recovery middleware (§E, P4): every model call — first call and every + // tool-loop step, stream and generate alike — gets one trim-and-retry on + // a provider "context too long" rejection. Always on; not gated by §G. + model: withOverflowRecovery(state.turn), messages: await convertToModelMessages(state.turn.stream.messages), system: state.turn.stream.system, tools: state.turn.stream.tools, stopWhen: [stepCountIs(state.turn.stream.maxSteps)], abortSignal: handle.signal, + // Tier 2 (§D): in-turn compaction before each step when the live window + // nears the limit. Undefined when the turn has no Tier 2 runtime. + prepareStep: state.turn.tier2 + ? buildTier2PrepareStep(state.turn.tier2) + : undefined, temperature: state.turn.stream.temperature, topP: state.turn.stream.topP, topK: state.turn.stream.topK, @@ -358,6 +525,9 @@ export class AgentRunner { logger.debug({ systemPrompt: modelArgs.system }, "System prompt for chat"); + const startedAt = new Date().toISOString(); + let firstTokenAt: string | undefined; + const result = streamText({ ...modelArgs, onStepFinish: (step) => onStep(step), @@ -367,8 +537,7 @@ export class AgentRunner { // one branch; we drain the other server-side so a disconnected // client (cancelling the response branch) doesn't propagate back to // the source. The source keeps pulling as long as the snapshot - // branch is being read, so `onFinish` only fires on natural - // completion — not when the consumer cancels with partial state. + // branch is being read. 
const uiStream = result.toUIMessageStream({ originalMessages: input.messages, generateMessageId: createIdGenerator({ prefix: "msg", size: 16 }), @@ -377,47 +546,89 @@ export class AgentRunner { ? { agentId: state.turn.resolved.agentId } : undefined, onError: (error) => formatStreamError(error), - onFinish: async ({ messages: finalMessages }) => { - state.messages = finalMessages; - let status: RunStatus = "succeeded"; - let err: Error | undefined; - if (handle.signal.aborted) { - const reason = handle.signal.reason; - if (reason instanceof TimeoutError) { - status = "failed"; - err = reason; - } else { - status = "cancelled"; - } - } - await finalize(status, err); - }, }); - const [forResponse, forSnapshot] = uiStream.tee(); + const { stream: timedStream, completions } = withToolTimestamps(uiStream); + const [forResponse, forSnapshot] = timedStream.tee(); // Read the snapshot branch as message snapshots and keep `state.messages` // up to date. ChatSink's FlushScheduler then writes the in-progress // assistant message to the DB on each onProgress bump, so a user who // reconnects mid-run sees the partial answer (not just their own // input message). + // + // finalize is called here (not in toUIMessageStream's onFinish) so that + // state.messages reflects the fully-drained stream — including the tool + // `completedAt` timestamps and §H/§I stats applied below — before the sink + // persists it. + // RV8: an error chunk (model/tool failure surfaced via formatStreamError) or + // an internal stream fault ends the for-await without throwing, because + // readUIMessageStream defaults terminateOnError=false. Capture it so the + // finally finalizes "failed" instead of silently persisting a partial + // message as "succeeded". 
+ let streamError: unknown; void (async () => { try { for await (const message of readUIMessageStream({ stream: forSnapshot, - onError: (err) => + onError: (err) => { + streamError = err; logger.error( { err, runId: input.runId }, "Snapshot stream parse error", - ), + ); + }, })) { + if (!firstTokenAt && message.parts?.some((p) => p.type === "text")) { + firstTokenAt = new Date().toISOString(); + } state.messages = [...input.messages, message]; } } catch (err) { + streamError = err; logger.error( { err, runId: input.runId }, "Server-side UI stream consumer error", ); + } finally { + const finishedAt = new Date().toISOString(); + applyToolCompletions(state.messages, completions); + if (state.turn) { + applyMessageStats(state.messages, { + inputTokens: state.stats.inputTokens ?? 0, + outputTokens: state.stats.outputTokens ?? 0, + contextTokens: state.lastStepInputTokens, + startedAt, + firstTokenAt, + finishedAt, + contextWindow: state.turn.resolved.contextWindow, + contextWindowIsDefault: state.turn.resolved.contextWindowIsDefault, + }); + } + let status: RunStatus = "succeeded"; + let err: Error | undefined; + if (handle.signal.aborted) { + const reason = handle.signal.reason; + if (reason instanceof TimeoutError) { + status = "failed"; + err = reason; + } else { + status = "cancelled"; + } + } else if (streamError !== undefined) { + // The stream errored (model/tool rejection or internal fault) but did + // not abort — record the run as failed rather than succeeded (RV8). + status = "failed"; + err = + streamError instanceof Error + ? streamError + : new Error( + typeof streamError === "string" + ? 
streamError + : "Server-side UI stream error", + ); + } + await finalize(status, err); } })(); @@ -489,6 +700,18 @@ export class AgentRunner { } } +/** + * Wraps the turn's model with the context-overflow recovery middleware (§E, + * P4): every model call — first call and every tool-loop step, stream and + * generate alike — gets one trim-and-retry on a provider "context too long" + * rejection. Always on; the §G kill switch does not gate it. + */ +const withOverflowRecovery = (turn: ChatTurn): LanguageModel => + wrapLanguageModel({ + model: turn.stream.model, + middleware: contextOverflowRecoveryMiddleware(turn.recovery), + }); + /** * Converts AI SDK errors into user-facing strings for the UI message stream. * Behaviour-preserving copy of the previous inline `onError` handler. @@ -498,6 +721,11 @@ const formatStreamError = (error: unknown): string => { if (LoadAPIKeyError.isInstance(error)) { return "AI provider API key is missing or not configured."; } + // Reaching here means recovery (§E) already trimmed and retried once and the + // provider still rejected the prompt — surface the actionable dead end. + if (isContextOverflowError(error)) { + return "Conversation too large for the model's context window even after trimming — start a new chat or reduce attachments."; + } if (APICallError.isInstance(error)) { if (error.statusCode === 401 || error.statusCode === 403) { return "AI provider authentication failed. 
Your API key may be invalid or expired."; diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts new file mode 100644 index 00000000..1e94e581 --- /dev/null +++ b/apps/backend/src/runs/compaction.test.ts @@ -0,0 +1,859 @@ +import { describe, it, expect, vi } from "vitest"; + +vi.mock("../index.ts", () => ({ db: {} })); // drizzle store unused in these tests +vi.mock("../logger.ts", () => ({ + logger: { warn: vi.fn(), info: vi.fn(), error: vi.fn(), debug: vi.fn() }, +})); + +import { + commitWatermark, + compactUIMessages, + compactModelMessages, + pickKeepBoundary, + softTrim, + type CompactionStore, + type CompactionState, + type WatermarkPatch, +} from "./compaction.ts"; +import type { ModelMessage } from "ai"; +import type { PlatypusUIMessage } from "../types.ts"; + +/** + * In-memory store. Since JS is single-threaded, the version check in `casWrite` + * is atomic per call — exactly the guarantee Postgres gives via the `version` + * predicate. `readState` returns a snapshot copy, so a version bump that happens + * after a read (a racing winner) makes that reader's snapshot stale → CAS fails. + */ +class FakeStore implements CompactionStore { + state: CompactionState; + casCalls = 0; + + constructor(init: Partial = {}) { + this.state = { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + ...init, + }; + } + + async readState() { + return { ...this.state }; + } + + async casWrite( + _chatId: string, + expectVersion: number, + patch: WatermarkPatch, + ) { + this.casCalls++; + if (this.state.version !== expectVersion) return false; + if ("watermark" in patch) + this.state.summaryWatermark = patch.watermark ?? null; + if ("summary" in patch) this.state.contextSummary = patch.summary ?? null; + if ("dirty" in patch) this.state.compactionDirty = patch.dirty ?? 
false; + this.state.version = expectVersion + 1; + return true; + } +} + +describe("casWrite — version-gated CAS (P3/R1)", () => { + it("applies and bumps version when the expected version matches", async () => { + const store = new FakeStore({ version: 3 }); + const won = await store.casWrite("c", 3, { summary: "s", watermark: "m1" }); + expect(won).toBe(true); + expect(store.state.version).toBe(4); + expect(store.state.contextSummary).toBe("s"); + expect(store.state.summaryWatermark).toBe("m1"); + }); + + it("two writers on the same version: one wins, the other loses", async () => { + const store = new FakeStore({ version: 0 }); + const first = await store.casWrite("c", 0, { summary: "A" }); + const second = await store.casWrite("c", 0, { summary: "B" }); + expect(first).toBe(true); + expect(second).toBe(false); // version is now 1, expected 0 + expect(store.state.contextSummary).toBe("A"); + }); + + it("an explicit null clears a field; an absent key leaves it untouched", async () => { + const store = new FakeStore({ + version: 1, + contextSummary: "old", + summaryWatermark: "m5", + }); + await store.casWrite("c", 1, { summary: null }); // reset summary only + expect(store.state.contextSummary).toBeNull(); + expect(store.state.summaryWatermark).toBe("m5"); // untouched + }); +}); + +describe("commitWatermark — loser logic (drift T10/R1)", () => { + it("applies a write on an uncontended commit", async () => { + const store = new FakeStore({ version: 2 }); + const res = await commitWatermark(store, "c", () => ({ + kind: "write", + patch: { summary: "sum", watermark: "m9" }, + })); + expect(res).toEqual({ status: "applied", version: 3 }); + expect(store.state.summaryWatermark).toBe("m9"); + }); + + it("skips immediately when the decision is a no-op", async () => { + const store = new FakeStore({ version: 0 }); + const res = await commitWatermark(store, "c", () => ({ + kind: "skip", + reason: "no-op", + })); + expect(res).toEqual({ status: "skipped", reason: "no-op" 
}); + expect(store.casCalls).toBe(0); + }); + + it("re-reads after a CAS conflict and succeeds on the retry", async () => { + const store = new FakeStore({ version: 0 }); + let firstDecision = true; + const res = await commitWatermark(store, "c", (state) => { + if (firstDecision) { + firstDecision = false; + // Simulate a racing winner committing between our read and write. + store.state.version = 1; + store.state.summaryWatermark = "winner"; + } + // Decide by the (re-read) version, not the watermark value. + return { kind: "write", patch: { summary: `at-v${state.version}` } }; + }); + expect(res.status).toBe("applied"); + // First attempt CAS expected v0 but row is v1 → lost; retry expects v1 → wins. + expect(store.state.version).toBe(2); + expect(store.state.contextSummary).toBe("at-v1"); + }); + + it("decides 'covered' on the retry and skips (winner already did the work)", async () => { + const store = new FakeStore({ version: 0, summaryWatermark: "m1" }); + let first = true; + const res = await commitWatermark(store, "c", (state) => { + if (first) { + first = false; + store.state.version = 1; + store.state.summaryWatermark = "m20"; // winner advanced past our prefix + return { kind: "write", patch: { summary: "mine", watermark: "m10" } }; + } + // On re-read we see the winner covered us → skip (decide by version). + expect(state.version).toBe(1); + return { kind: "skip", reason: "covered" }; + }); + expect(res).toEqual({ status: "skipped", reason: "covered" }); + expect(store.state.summaryWatermark).toBe("m20"); // winner's value preserved + }); + + it("gives up as 'contended' after two conflicts — no livelock", async () => { + const store = new FakeStore({ version: 0 }); + let decideCalls = 0; + const res = await commitWatermark(store, "c", (state) => { + decideCalls++; + // Every decision races a winner → both CAS attempts fail. 
+ store.state.version = state.version + 1; + return { kind: "write", patch: { summary: "x" } }; + }); + expect(res).toEqual({ status: "skipped", reason: "contended" }); + expect(decideCalls).toBe(2); // exactly MAX_ATTEMPTS, then stop + }); +}); + +// --- Slice 2b: compaction primitives ------------------------------------ + +function uiText( + id: string, + role: "user" | "assistant", + text: string, +): PlatypusUIMessage { + return { id, role, parts: [{ type: "text", text }] } as PlatypusUIMessage; +} + +function uiTool(id: string, output: unknown): PlatypusUIMessage { + return { + id, + role: "assistant", + parts: [ + { + type: "tool-doThing", + toolCallId: `${id}-call`, + state: "output-available", + input: {}, + output, + }, + ], + } as unknown as PlatypusUIMessage; +} + +const noopSummarize = async () => "SUMMARY"; + +describe("softTrim", () => { + it("keeps short text untouched", () => { + expect(softTrim("short", 500)).toBe("short"); + }); + it("trims long text to head+tail with a marker", () => { + const out = softTrim("a".repeat(2000), 100); + expect(out.startsWith("a".repeat(100))).toBe(true); + expect(out).toContain("elided 1800 chars"); + expect(out.length).toBeLessThan(2000); + }); +}); + +describe("pickKeepBoundary", () => { + it("UIMessage: any split is safe", () => { + expect(pickKeepBoundary(5, 2, () => true)).toBe(3); + }); + it("ModelMessage: walks back so recent does not start on an orphan tool result", () => { + const roles = ["user", "assistant", "tool", "user"]; + const safe = (i: number) => i >= roles.length || roles[i] !== "tool"; + // start at 4-2=2 (role "tool", unsafe) → walk back to 1 (assistant, safe) + expect(pickKeepBoundary(4, 2, safe)).toBe(1); + }); +}); + +describe("compactUIMessages (Tier 1)", () => { + const baseOpts = { + keepRecentMessages: 2, + minPrunableChars: 2000, + summarize: noopSummarize, + }; + + it("is a no-op when already within target (hysteresis precondition)", async () => { + const msgs = [uiText("a", "user", 
"hi"), uiText("b", "assistant", "yo")]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + targetTokens: 1000, + }); + expect(res.usedModelCall).toBe(false); + expect(res.messagesDropped).toBe(0); + expect(res.keptMessages).toBe(msgs); + }); + + it("Stage 1 prune reaches target WITHOUT a model call", async () => { + const summarize = vi.fn(noopSummarize); + const msgs = [ + uiTool("big", "X".repeat(4000)), // ~1000 tokens, prunes to ~250 + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize, + targetTokens: 300, + }); + expect(res.usedModelCall).toBe(false); + expect(summarize).not.toHaveBeenCalled(); + expect(res.watermarkId).toBeNull(); + expect(res.keptMessages).toHaveLength(3); // pruned prefix stays visible + expect(res.estimatedTokens).toBeLessThanOrEqual(300); + }); + + it("Stage 2 summarizes when pruning is insufficient (text-heavy prefix)", async () => { + const summarize = vi.fn(noopSummarize); + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize, + targetTokens: 300, + }); + expect(res.usedModelCall).toBe(true); + expect(summarize).toHaveBeenCalledOnce(); + expect(res.summaryText).toBe("SUMMARY"); + expect(res.watermarkId).toBe("p2"); // last folded message + expect(res.keptMessages).toHaveLength(2); // only recent kept + expect(res.estimatedTokens).toBeLessThanOrEqual(300); + }); + + it("does NOT re-fire next turn: feeding the result back is a no-op (C2)", async () => { + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + const target = 300; + const first = await compactUIMessages(msgs, { + ...baseOpts, + targetTokens: 
target, + }); + expect(first.usedModelCall).toBe(true); + + const second = await compactUIMessages(first.keptMessages, { + ...baseOpts, + targetTokens: target, + priorSummary: first.summaryText, + }); + expect(second.usedModelCall).toBe(false); // already within target + expect(second.messagesDropped).toBe(0); + }); + + it("map-reduces an oversized prefix (drift M1)", async () => { + const summarize = vi.fn(noopSummarize); + const msgs = [ + uiText("p1", "user", "Z".repeat(4000)), // ~1000 tokens of transcript + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + await compactUIMessages(msgs, { + ...baseOpts, + summarize, + targetTokens: 50, + summarizerWindow: 100, // 400-char chunks → several chunk calls + 1 reduce + }); + expect(summarize.mock.calls.length).toBeGreaterThan(1); + }); +}); + +describe("compactModelMessages (Tier 2 / recovery)", () => { + const baseOpts = { + keepRecentMessages: 2, + minPrunableChars: 2000, + summarize: noopSummarize, + }; + + it("is a no-op when within target", async () => { + const msgs: ModelMessage[] = [ + { role: "user", content: "hi" }, + { role: "assistant", content: "yo" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + targetTokens: 1000, + }); + expect(res.usedModelCall).toBe(false); + expect(res.messages).toBe(msgs); + }); + + it("summarizes and prepends one synthetic message, preserving tool pairing", async () => { + const msgs: ModelMessage[] = [ + { role: "user", content: "P".repeat(4000) }, + { + role: "assistant", + content: [ + { type: "tool-call", toolCallId: "t1", toolName: "f", input: {} }, + ], + }, + { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "t1", + toolName: "f", + output: { type: "json", value: { ok: true } }, + }, + ], + }, + { role: "user", content: "recent" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + targetTokens: 50, + }); + expect(res.usedModelCall).toBe(true); + // First message is the synthetic 
summary (user-framed). + expect(res.messages[0].role).toBe("user"); + expect(JSON.stringify(res.messages[0].content)).toContain( + "Summary of earlier conversation", + ); + // The assistant tool-call and its tool result stay adjacent (not split). + const roles = res.messages.map((m) => m.role); + const toolIdx = roles.indexOf("tool"); + expect(roles[toolIdx - 1]).toBe("assistant"); + }); + + it("force bypasses BOTH no-op gates so recovery never retries byte-identically (RV3)", async () => { + // Estimator says we are within target AND nothing is prunable (small, + // non-bulky messages). Without force both the whole-message gate and the + // post-prune gate would no-op → recovery would retry the exact same prompt + // and fail again. force must push through to a real summarize. + const msgs: ModelMessage[] = [ + { role: "user", content: "a" }, + { role: "assistant", content: "b" }, + { role: "user", content: "recent-1" }, + { role: "assistant", content: "recent-2" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + targetTokens: 100000, // estimator is well under target + force: true, + }); + expect(res.usedModelCall).toBe(true); + expect(res.messagesDropped).toBeGreaterThan(0); + expect(res.messages).not.toBe(msgs); + }); + + it("force with an empty prefix is a no-op, not a prompt-growing summary (RV4 model-side)", async () => { + // recent alone exceeds keepRecentMessages → prefix is empty. Summarizing + // nothing would ADD a synthetic message and grow the prompt, never + // converging. Surface the overflow instead. 
+ const msgs: ModelMessage[] = [ + { role: "user", content: "only-1" }, + { role: "assistant", content: "only-2" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + keepRecentMessages: 2, + targetTokens: 1, + force: true, + }); + expect(res.usedModelCall).toBe(false); + expect(res.messages.length).toBe(msgs.length); + }); +}); + +// --- Slice 2c: Tier 1 orchestration ------------------------------------- + +import { + applyTier1Compaction, + computeBudget, + resolveCompactionConfig, + invalidateCompaction, + affectedBelowWatermark, + summaryUIMessage, + DEFAULT_COMPACTION_CONFIG, + type Budget, + type CompactionConfig, +} from "./compaction.ts"; + +function storeFromState(state: Partial): FakeStore { + return new FakeStore(state); +} + +const cfg = (over: Partial = {}): CompactionConfig => ({ + ...DEFAULT_COMPACTION_CONFIG, + keepRecentMessages: 2, + ...over, +}); + +describe("resolveCompactionConfig (§G defaults)", () => { + it("returns defaults when overrides are null/undefined", () => { + expect(resolveCompactionConfig(null)).toEqual(DEFAULT_COMPACTION_CONFIG); + }); + it("applies partial overrides, keeping defaults for the rest", () => { + const c = resolveCompactionConfig({ + triggerRatio: 0.9, + compactionEnabled: false, + }); + expect(c.triggerRatio).toBe(0.9); + expect(c.compactionEnabled).toBe(false); + expect(c.targetRatio).toBe(DEFAULT_COMPACTION_CONFIG.targetRatio); + }); +}); + +describe("computeBudget (drift C3 — subtract both reserves)", () => { + it("subtracts output + safety reserve before applying ratios", () => { + const b = computeBudget( + 10000, + 2000, + cfg({ reserveRatio: 0.05, triggerRatio: 0.8, targetRatio: 0.5 }), + ); + expect(b.inputBudget).toBe(7500); // 10000 - 2000 - 500 + expect(b.triggerTokens).toBe(6000); + expect(b.targetTokens).toBe(3750); + }); + it("uses a conservative output reserve when maxOutputTokens is unknown", () => { + const b = computeBudget(10000, undefined, cfg({ reserveRatio: 0.05 })); + 
expect(b.inputBudget).toBe(7000); // 10000 - min(4096, 2500) - 500 + }); +}); + +const bigText = (id: string, role: "user" | "assistant") => + uiText(id, role, "X".repeat(4000)); + +describe("applyTier1Compaction", () => { + const baseBudget: Budget = { + inputBudget: 100, + triggerTokens: 50, + targetTokens: 50, + }; + + it("under trigger: reconstructs the persisted view, no write", async () => { + const store = storeFromState({ + version: 2, + summaryWatermark: "m2", + contextSummary: "PRIOR", + }); + const messages = ["m1", "m2", "m3", "m4"].map((id) => + uiText(id, "user", "hi"), + ); + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 2, + summaryWatermark: "m2", + contextSummary: "PRIOR", + compactionDirty: false, + }, + budget: { + inputBudget: 100000, + triggerTokens: 100000, + targetTokens: 50000, + }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(out.messages[0]).toEqual(summaryUIMessage("PRIOR")); // re-injected summary + expect(out.messages.map((m) => m.id)).toEqual([ + "context-summary", + "m3", + "m4", + ]); // dropped ≤ watermark + expect(store.casCalls).toBe(0); // nothing persisted + }); + + it("over trigger: compacts, persists summary+watermark, clears dirty, fires event", async () => { + const store = storeFromState({ version: 0 }); + const onEvent = vi.fn(); + const messages = [ + bigText("p1", "user"), + bigText("p2", "assistant"), + uiText("r1", "user", "a"), + uiText("r2", "assistant", "b"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: baseBudget, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + onEvent, + }); + expect(out.compacted).toBe(true); + expect(store.state.contextSummary).toBe("SUMMARY"); + 
expect(store.state.summaryWatermark).toBe("p2"); + expect(store.state.compactionDirty).toBe(false); + expect(store.state.version).toBe(1); + expect(out.messages[0].id).toBe("context-summary"); + expect(onEvent).toHaveBeenCalledOnce(); + }); + + it("disabled + not dirty: no compaction even when over the trigger", async () => { + const store = storeFromState({ version: 0 }); + const messages = [ + bigText("p1", "user"), + bigText("p2", "assistant"), + uiText("r1", "user", "a"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: baseBudget, + config: cfg({ compactionEnabled: false }), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(store.casCalls).toBe(0); + }); + + it("dirty forces compaction even when proactive is disabled (P4 recovery hand-off)", async () => { + const store = storeFromState({ version: 0, compactionDirty: true }); + const messages = [ + bigText("p1", "user"), + bigText("p2", "assistant"), + uiText("r1", "user", "a"), + uiText("r2", "assistant", "b"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: true, + }, + budget: baseBudget, + config: cfg({ compactionEnabled: false }), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(true); + expect(store.state.compactionDirty).toBe(false); + }); + + it("dirty but already within target: just clears the flag (no summary)", async () => { + const store = storeFromState({ version: 0, compactionDirty: true }); + const messages = [ + uiText("r1", "user", "a"), + uiText("r2", "assistant", "b"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + 
compactionDirty: true, + }, + budget: { + inputBudget: 100000, + triggerTokens: 100000, + targetTokens: 100000, + }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(store.state.compactionDirty).toBe(false); // flag cleared + expect(store.state.contextSummary).toBeNull(); // no summary written + expect(store.state.version).toBe(1); + }); +}); + +describe("invalidateCompaction (drift C4)", () => { + const ordered = ["m1", "m2", "m3", "m4"]; + + it("resets summary + watermark when a message at/below the watermark changes", async () => { + const store = storeFromState({ + version: 5, + summaryWatermark: "m2", + contextSummary: "S", + }); + const res = await invalidateCompaction(store, "c", ["m2"], ordered); + expect(res.status).toBe("applied"); + expect(store.state.summaryWatermark).toBeNull(); + expect(store.state.contextSummary).toBeNull(); + expect(store.state.version).toBe(6); // bumped so a racing compaction loses (R1) + }); + + it("is a no-op when the edit is entirely above the watermark", async () => { + const store = storeFromState({ + version: 5, + summaryWatermark: "m2", + contextSummary: "S", + }); + const res = await invalidateCompaction(store, "c", ["m4"], ordered); + expect(res).toEqual({ status: "skipped", reason: "no-op" }); + expect(store.state.contextSummary).toBe("S"); + }); + + it("resets when an affected message was deleted (missing from ordering)", async () => { + const store = storeFromState({ + version: 1, + summaryWatermark: "m3", + contextSummary: "S", + }); + const res = await invalidateCompaction(store, "c", ["gone"], ordered); + expect(res.status).toBe("applied"); + expect(store.state.summaryWatermark).toBeNull(); + }); + + it("is a no-op when there is no summary/watermark to invalidate", async () => { + const store = storeFromState({ version: 0 }); + const res = await invalidateCompaction(store, "c", ["m1"], ordered); + expect(res).toEqual({ status: 
"skipped", reason: "no-op" }); + }); +}); + +describe("affectedBelowWatermark (C4 divergence detection)", () => { + const persisted = [ + uiText("m1", "user", "one"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "three"), + ]; + + it("returns [] when the prefix is unchanged", () => { + const incoming = [ + uiText("m1", "user", "one"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "x"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual([]); + }); + + it("flags a content edit at/below the watermark", () => { + const incoming = [ + uiText("m1", "user", "EDITED"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "three"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual(["m1"]); + }); + + it("flags a deleted message below the watermark", () => { + const incoming = [ + uiText("m2", "assistant", "two"), + uiText("m3", "user", "three"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual(["m1"]); + }); + + it("flags when the watermark message itself is gone from canonical history", () => { + expect(affectedBelowWatermark(persisted, persisted, "ghost")).toEqual([ + "ghost", + ]); + }); + + it("ignores edits strictly above the watermark", () => { + const incoming = [ + uiText("m1", "user", "one"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "CHANGED"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual([]); + }); +}); + +// --- Chunk 3: C1/M2 trigger projection + recovery dirty-flag producer ----- + +import { + projectTier1Tokens, + setCompactionDirty, + COLD_START_MARGIN, +} from "./compaction.ts"; + +describe("projectTier1Tokens (drift C1/M2)", () => { + it("applies the cold-start margin when no provider baseline exists (M2)", () => { + expect( + projectTier1Tokens({ messageTokens: 100, priorSummaryTokens: 0 }), + ).toBe(Math.ceil(100 * COLD_START_MARGIN)); + }); + + it("counts the per-turn overhead toward the trigger (C1)", () => { 
+ expect( + projectTier1Tokens({ + messageTokens: 100, + priorSummaryTokens: 20, + overheadTokens: 50, + }), + ).toBe(Math.ceil(170 * COLD_START_MARGIN)); + }); + + it("uses the provider-reported count as a floor when available", () => { + // The observed live gap: char/4 said ~986, the provider said 8888. + expect( + projectTier1Tokens({ + messageTokens: 986, + priorSummaryTokens: 0, + lastInputTokens: 8888, + }), + ).toBe(8888); + }); + + it("drops the margin when a provider baseline is present", () => { + expect( + projectTier1Tokens({ + messageTokens: 100, + priorSummaryTokens: 0, + lastInputTokens: 50, + }), + ).toBe(100); + }); +}); + +describe("applyTier1Compaction — overhead in the trigger (C1)", () => { + it("fires on system/tool overhead even when messages alone are under trigger", async () => { + const store = storeFromState({ version: 0 }); + // ~4 tokens of messages — far under the 50-token trigger on their own. + const messages = [ + uiText("p1", "user", "aaaa"), + uiText("p2", "assistant", "bbbb"), + uiText("r1", "user", "cccc"), + uiText("r2", "assistant", "dddd"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: { inputBudget: 100, triggerTokens: 50, targetTokens: 25 }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + overheadTokens: 60, // tool schemas + system prompt dominate + }); + expect(out.compacted).toBe(true); + expect(store.state.summaryWatermark).toBe("p2"); + }); + + it("does not fire when messages + overhead stay under the trigger", async () => { + const store = storeFromState({ version: 0 }); + const messages = [ + uiText("r1", "user", "cccc"), + uiText("r2", "assistant", "dddd"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + 
// Recovery's only durable side effect: raise the dirty flag via the CAS writer
// and leave the summary/watermark alone.
describe("setCompactionDirty (§E recovery producer, drift T3)", () => {
  it("sets the flag through the CAS writer", async () => {
    const fake = storeFromState({ version: 3 });

    const outcome = await setCompactionDirty(fake, "c");

    expect(outcome).toEqual({ status: "applied", version: 4 });
    expect(fake.state.compactionDirty).toBe(true);
  });

  it("is a no-op when already dirty (no version churn)", async () => {
    const fake = storeFromState({ version: 3, compactionDirty: true });

    const outcome = await setCompactionDirty(fake, "c");

    expect(outcome).toEqual({ status: "skipped", reason: "no-op" });
    // No CAS attempt at all, hence the version must not have moved.
    expect(fake.casCalls).toBe(0);
    expect(fake.state.version).toBe(3);
  });

  it("never touches summary or watermark (recovery only flags)", async () => {
    const seeded = {
      version: 1,
      contextSummary: "KEEP",
      summaryWatermark: "m7",
    };
    const fake = storeFromState(seeded);

    await setCompactionDirty(fake, "c");

    expect(fake.state.contextSummary).toBe("KEEP");
    expect(fake.state.summaryWatermark).toBe("m7");
  });
});
/**
 * Durable compaction state on the chat row.
 *
 * All four fields are read together by `CompactionStore.readState`, and the
 * three non-version fields are only ever mutated through the version-gated
 * `CompactionStore.casWrite`.
 */
export type CompactionState = {
  /** CAS gate. A successful `casWrite` stores `expectVersion + 1` here. */
  version: number;
  /** Id of the last message folded into the summary, or null when there is none. */
  summaryWatermark: string | null;
  /** The folded summary text, or null when nothing has been summarized. */
  contextSummary: string | null;
  /**
   * Flag raised by recovery after a provider rejection.
   * NOTE(review): the consumer is not visible here — presumably the next
   * Tier 1 pass treats a dirty chat as "compact regardless of the estimate".
   */
  compactionDirty: boolean;
};

/**
 * A patch to the compaction fields. Only the keys present are written; absent
 * keys are left untouched. `version` is always bumped by the writer (not here).
 *
 * Presence is tested with `in`, so an explicit `null` ("clear this field") is
 * distinguishable from an omitted key ("leave it alone").
 */
export type WatermarkPatch = {
  /** Written to `summaryWatermark`; `null` resets the watermark. */
  watermark?: string | null;
  /** Written to `contextSummary`; `null` clears the summary. */
  summary?: string | null;
  /** Written to `compactionDirty`. */
  dirty?: boolean;
};
/**
 * Production {@link CompactionStore}, backed by the `chat` table via Drizzle.
 *
 * `readState` fetches the four compaction columns for one chat (null when the
 * chat does not exist). `casWrite` issues a single `UPDATE … WHERE id = ? AND
 * version = ?` so the write lands only if nobody bumped the version since the
 * caller read it.
 */
export const drizzleCompactionStore: CompactionStore = {
  async readState(chatId) {
    const columns = {
      version: chatTable.version,
      summaryWatermark: chatTable.summaryWatermark,
      contextSummary: chatTable.contextSummary,
      compactionDirty: chatTable.compactionDirty,
    };
    const [row] = await db
      .select(columns)
      .from(chatTable)
      .where(eq(chatTable.id, chatId))
      .limit(1);
    return row ?? null;
  },

  async casWrite(chatId, expectVersion, patch) {
    // Key presence (not value) decides what is written: an explicit null
    // (clear summary / reset watermark) must differ from "leave alone".
    const changes: Record<string, unknown> = {
      version: expectVersion + 1,
      updatedAt: new Date(),
      ...("watermark" in patch ? { summaryWatermark: patch.watermark } : {}),
      ...("summary" in patch ? { contextSummary: patch.summary } : {}),
      ...("dirty" in patch ? { compactionDirty: patch.dirty } : {}),
    };

    // The version predicate is the compare half of the compare-and-swap.
    const stillAtExpectedVersion = and(
      eq(chatTable.id, chatId),
      eq(chatTable.version, expectVersion),
    );
    const touched = await db
      .update(chatTable)
      .set(changes)
      .where(stillAtExpectedVersion)
      .returning({ id: chatTable.id });

    // Exactly one returned row means this writer won.
    return touched.length === 1;
  },
};
On a + * CAS conflict it re-reads and retries the decision **once**; a second conflict + * terminates as `skipped: "contended"` — never a recompute loop, so there is no + * livelock. Because `decide` is re-run against the re-read state, a racing + * invalidation (which bumps version + resets the watermark) is seen on the + * retry, and `decide` can choose to skip rather than write a stale summary. + * + * `decide` returning `skip: "covered"` means a winner already did this work; the + * caller should pass a patch that also clears `compactionDirty` in that branch + * if it wants the flag cleared (it is just another field on the patch). + */ +export async function commitWatermark( + store: CompactionStore, + chatId: string, + decide: (state: CompactionState) => WatermarkDecision, +): Promise { + const MAX_ATTEMPTS = 2; + for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) { + const state = await store.readState(chatId); + if (!state) return { status: "skipped", reason: "no-op" }; + + const decision = decide(state); + if (decision.kind === "skip") { + return { status: "skipped", reason: decision.reason }; + } + + const won = await store.casWrite(chatId, state.version, decision.patch); + if (won) return { status: "applied", version: state.version + 1 }; + // Lost the CAS — a concurrent writer moved the version. Loop to re-read and + // re-decide. The decision compares VERSION (via the re-read), not watermark + // values, so a backward watermark reset cannot be misread (R1). The metric + // gates whether the R4 read→summarize→write contention note ever needs a fix. 
/**
 * Soft-trims an over-long string to head + tail joined by an elision marker, so
 * a bulky tool result keeps some signal instead of vanishing entirely.
 *
 * @param text - The string to trim.
 * @param keepEachSide - Characters to keep at each end (default 500). The value
 *   is truncated to an integer and clamped to be non-negative; `NaN` is treated
 *   as 0.
 * @returns `text` unchanged when it is no longer than `2 * keepEachSide`;
 *   otherwise `head + "\n…[elided N chars]…\n" + tail`, where `N` is the number
 *   of characters actually removed.
 */
export function softTrim(text: string, keepEachSide = 500): string {
  // Normalize so the slice bounds and the reported elided count always agree.
  const keep = Math.max(0, Math.trunc(keepEachSide) || 0);
  if (text.length <= keep * 2) return text;

  const head = text.slice(0, keep);
  // Deliberately not `text.slice(-keep)`: when keep is 0 that is `slice(-0)`,
  // i.e. `slice(0)`, which returns the WHOLE string and makes the "trimmed"
  // result longer than the input.
  const tail = text.slice(text.length - keep);
  const elided = text.length - keep * 2;
  return `${head}\n…[elided ${elided} chars]…\n${tail}`;
}
/**
 * Chooses where to cut a message list into `prefix = [0, boundary)` and
 * `recent = [boundary, total)`.
 *
 * The first candidate is `total - keepRecent` (floored at 0). While a candidate
 * is positive and `isSafeBoundary` rejects it, the cut moves one step earlier,
 * so `recent` only grows and a tool-call/result pair is never split. Index 0 is
 * never passed to `isSafeBoundary`.
 *
 * @param total - Number of messages.
 * @param keepRecent - How many trailing messages the caller wants in `recent`.
 * @param isSafeBoundary - Whether `recent` may start at the given index.
 * @returns The chosen boundary index.
 */
export function pickKeepBoundary(
  total: number,
  keepRecent: number,
  isSafeBoundary: (index: number) => boolean,
): number {
  let candidate = Math.max(0, total - keepRecent);
  for (; candidate > 0; candidate -= 1) {
    if (isSafeBoundary(candidate)) break;
  }
  return candidate;
}
ap.output + : ap.output !== undefined + ? JSON.stringify(ap.output) + : ""; + return `[tool ${ap.type}] ${softTrim(out, 200)}`; + } + return ""; + }) + .filter(Boolean) + .join("\n"); + return `${m.role}: ${text}`; + }) + .join("\n\n"); +} + +export type UICompactOptions = { + /** Reduce the model view to at most this many tokens (hysteresis target). */ + targetTokens: number; + keepRecentMessages: number; + minPrunableChars: number; + imageProvider?: ImageProvider; + /** Existing durable summary to fold the new prefix into (incremental). */ + priorSummary?: string | null; + summarize: Summarize; + /** Token budget of one summarize call; larger prefixes are map-reduced (M1). */ + summarizerWindow?: number; + /** + * Bypass the no-op estimate gate and force compaction even when char/4 says + * we are within budget. Used for dirty-forced Tier 1 (§E/RV3): recovery sets + * the dirty flag AFTER a provider rejection, so the estimator already failed; + * re-using it as the no-op gate causes an infinite overflow→dirty→no-op loop. + */ + force?: boolean; + /** + * Pre-computed estimate of `messages` (RV9). The caller's trigger projection + * already ran the char/4 pass over this exact set, so reuse it instead of + * re-estimating the full history a second time on the hot path. + */ + knownEstimate?: number; +}; + +export type UICompactionResult = { + /** Messages to send to the model (recent verbatim; pruned prefix if no summary). */ + keptMessages: PlatypusUIMessage[]; + /** New folded summary, or unchanged prior summary, or null. */ + summaryText: string | null; + /** Id of the last message folded into the summary (the new watermark), or null. */ + watermarkId: string | null; + messagesDropped: number; + usedModelCall: boolean; + /** Post-compaction estimate incl. the summary — should be ≤ targetTokens (C2). 
*/ + estimatedTokens: number; +}; + +/** + * Summarizes a prefix transcript, map-reducing when it exceeds the summarizer's + * own window (drift M1 — a huge cold-start history can't be sent whole). + */ +async function summarizePrefix( + prefixText: string, + priorSummary: string | null | undefined, + summarize: Summarize, + summarizerWindow: number | undefined, +): Promise { + const fold = (prior: string | null | undefined, body: string) => + prior ? `Previous summary:\n${prior}\n\nNewer messages:\n${body}` : body; + + if (!summarizerWindow || textTokens(prefixText) <= summarizerWindow) { + return summarize(fold(priorSummary, prefixText)); + } + + // Map-reduce: chunk the prefix by character budget, summarize each, then + // summarize the concatenated chunk summaries folded with the prior summary. + const charBudget = summarizerWindow * CHARS_PER_TOKEN; + const chunks: string[] = []; + for (let i = 0; i < prefixText.length; i += charBudget) { + chunks.push(prefixText.slice(i, i + charBudget)); + } + const chunkSummaries: string[] = []; + for (const chunk of chunks) chunkSummaries.push(await summarize(chunk)); + return summarize(fold(priorSummary, chunkSummaries.join("\n"))); +} + +/** + * Tier 1 (durable) compaction over UIMessages. Stage 1 prunes; if that reaches + * the target, no model call is made and the prefix stays (lighter). Otherwise + * Stage 2 summarizes the prefix into one synthetic summary and drops it from the + * model view. Raw messages are never mutated by the caller (P1 — this returns a + * view). + */ +export async function compactUIMessages( + messages: PlatypusUIMessage[], + opts: UICompactOptions, +): Promise { + const provider = opts.imageProvider ?? "default"; + const priorTokens = opts.priorSummary ? 
textTokens(opts.priorSummary) : 0; + const estimate = (msgs: PlatypusUIMessage[]) => + estimateTokens(uiMessagesToCountUnits(msgs, provider)); + + // RV9: reuse the caller's already-computed estimate of `messages` rather than + // re-running the full char/4 pass on the hot path. + const initialEstimate = opts.knownEstimate ?? estimate(messages); + + // No-op when already within target (incl. the existing summary). This is what + // makes a follow-up turn after compaction NOT re-fire (hysteresis, C2). + // Bypassed when `force` is set — recovery sets the dirty flag AFTER a provider + // rejection, so the estimator already proved wrong; using it as a no-op gate + // causes an infinite overflow→dirty→no-op loop (RV3). + if (!opts.force && initialEstimate + priorTokens <= opts.targetTokens) { + return { + keptMessages: messages, + summaryText: opts.priorSummary ?? null, + watermarkId: null, + messagesDropped: 0, + usedModelCall: false, + estimatedTokens: initialEstimate + priorTokens, + }; + } + + const boundary = pickKeepBoundary( + messages.length, + opts.keepRecentMessages, + () => true, // UIMessage tool-call+result live in one message — any split is safe + ); + const prefix = messages.slice(0, boundary); + const recent = messages.slice(boundary); + + // Stage 1 — prune bulky tool results in the prefix (no model call). + const prunedPrefix = prefix.map( + (m) => pruneUIMessage(m, opts.minPrunableChars).message, + ); + const prunedAll = [...prunedPrefix, ...recent]; + if (!opts.force && estimate(prunedAll) + priorTokens <= opts.targetTokens) { + return { + keptMessages: prunedAll, + summaryText: opts.priorSummary ?? null, + watermarkId: null, // pruning advances no watermark (no new summary) + messagesDropped: 0, + usedModelCall: false, + estimatedTokens: estimate(prunedAll) + priorTokens, + }; + } + + // RV4: nothing to summarize when the prefix is empty (history fits within + // keepRecentMessages). 
Also bail when the boundary message has no id — we + // cannot anchor a watermark there, and committing a watermark:null + + // non-null summary would orphan the summary (viewAfterWatermark ignores + // contextSummary when the watermark is null, so the previously-summarised + // prefix reappears every turn). + const watermarkId = + prefix.length > 0 ? (prefix[prefix.length - 1].id ?? null) : null; + if (prefix.length === 0 || watermarkId === null) { + return { + keptMessages: prunedAll, + summaryText: opts.priorSummary ?? null, + watermarkId: null, + messagesDropped: 0, + usedModelCall: false, + estimatedTokens: estimate(prunedAll) + priorTokens, + }; + } + + // Stage 2 — summarize the pruned prefix into one synthetic summary. + const summaryText = await summarizePrefix( + renderUIMessages(prunedPrefix), + opts.priorSummary, + opts.summarize, + opts.summarizerWindow, + ); + + return { + keptMessages: recent, + summaryText, + watermarkId, + messagesDropped: prefix.length, + usedModelCall: true, + estimatedTokens: estimate(recent) + textTokens(summaryText), + }; +} + +// --- Tier 2 / recovery: ModelMessage shape ------------------------------- + +/** Soft-trims bulky tool-result parts in a ModelMessage (role "tool"). 
*/
+function pruneModelMessage(
+  message: ModelMessage,
+  minPrunableChars: number,
+): ModelMessage {
+  // Only `role: "tool"` messages with structured content carry tool results.
+  if (message.role !== "tool" || typeof message.content === "string") {
+    return message;
+  }
+  const content = message.content.map((part) => {
+    if (part.type !== "tool-result") return part;
+    const output = part.output;
+    if (output.type === "text" || output.type === "error-text") {
+      if (output.value.length > minPrunableChars) {
+        return {
+          ...part,
+          output: { ...output, value: softTrim(output.value) },
+        };
+      }
+      return part;
+    }
+    if (output.type === "json" || output.type === "error-json") {
+      const serialized = JSON.stringify(output.value);
+      if (serialized.length > minPrunableChars) {
+        // Trimmed JSON can no longer be carried as JSON, so it degrades to a
+        // text output — but keep the error flavour, otherwise a failed tool
+        // call is re-presented to the model as a successful result.
+        const type =
+          output.type === "error-json"
+            ? ("error-text" as const)
+            : ("text" as const);
+        return { ...part, output: { type, value: softTrim(serialized) } };
+      }
+    }
+    // RV5: @ai-sdk/mcp emits {type:"content"} for essentially every MCP tool
+    // result. Without this branch Stage 1 reclaims zero tokens from the bulkiest
+    // payloads and their text is invisible to the summarizer.
+    if (output.type === "content" && Array.isArray(output.value)) {
+      type ContentItem = { type: string; text?: string };
+      const items = output.value as ContentItem[];
+      const text = items
+        .filter((i) => i.type === "text")
+        .map((i) => i.text ?? "")
+        .join("\n");
+      const mediaCount = items.filter((i) => i.type !== "text").length;
+      const marker = mediaCount > 0 ? `\n[${mediaCount} media item(s)]` : "";
+      // Trim the text BEFORE appending the media marker so a huge text payload
+      // can never truncate the "[N media item(s)]" signal.
+      if (text.length + marker.length > minPrunableChars) {
+        return {
+          ...part,
+          output: {
+            type: "content" as const,
+            value: [
+              { type: "text" as const, text: `${softTrim(text)}${marker}` },
+            ],
+          },
+        };
+      }
+    }
+    return part;
+  });
+  return { ...message, content };
+}
+
+/** Builds a readable transcript of ModelMessages for the summarizer. */
+function renderModelMessages(messages: ModelMessage[]): string {
+  return messages
+    .map((m) => {
+      if (typeof m.content === "string") return `${m.role}: ${m.content}`;
+      const text = m.content
+        .map((p) => {
+          if (p.type === "text") return p.text;
+          if (p.type === "tool-call") return `[tool-call ${p.toolName}]`;
+          if (p.type === "tool-result") {
+            const o = p.output;
+            let v: string;
+            if (o.type === "text" || o.type === "error-text") {
+              v = o.value;
+            } else if (o.type === "json" || o.type === "error-json") {
+              v = JSON.stringify(o.value);
+            } else if (o.type === "content") {
+              // RV5: extract the text items from content-type MCP output.
+              type ContentItem = { type: string; text?: string };
+              v = (o.value as ContentItem[])
+                .filter((i) => i.type === "text")
+                .map((i) => i.text ?? "")
+                .join("\n");
+            } else {
+              v = "";
+            }
+            return `[tool-result] ${softTrim(v, 200)}`;
+          }
+          return "";
+        })
+        .filter(Boolean)
+        .join("\n");
+      return `${m.role}: ${text}`;
+    })
+    .join("\n\n");
+}
+
+/** A synthetic summary as a model message. User-role + clear framing is the most
+ *  broadly accepted shape (avoids mid-array system-message restrictions). */
+export function summaryModelMessage(text: string): ModelMessage {
+  return {
+    role: "user",
+    content: [
+      { type: "text", text: `[Summary of earlier conversation]\n${text}` },
+    ],
+  };
+}
+
+export type ModelCompactOptions = {
+  targetTokens: number;
+  keepRecentMessages: number;
+  minPrunableChars: number;
+  imageProvider?: ImageProvider;
+  summarize: Summarize;
+  summarizerWindow?: number;
+  /** Bypass the no-op estimate gate (same semantics as UICompactOptions.force). */
+  force?: boolean;
+  /**
+   * Estimate of `messages` the caller already computed (e.g. the Tier 2
+   * prepareStep trigger check). Reuses it for gate 1 instead of re-running a
+   * full estimate pass over the same messages.
+   */
+  knownEstimate?: number;
+};
+
+export type ModelCompactionResult = {
+  messages: ModelMessage[];
+  messagesDropped: number;
+  usedModelCall: boolean;
+  estimatedTokens: number;
+};
+
+/**
+ * Tier 2 (intra-turn) / recovery compaction over ModelMessages. Throwaway — the
+ * SDK keeps its canonical list; this only keeps a heavy response executable.
+ * Pairing rule differs from Tier 1: an assistant tool-call and its following
+ * `role:"tool"` result are separate messages and must not be split.
+ */
+export async function compactModelMessages(
+  messages: ModelMessage[],
+  opts: ModelCompactOptions,
+): Promise<ModelCompactionResult> {
+  const provider = opts.imageProvider ?? "default";
+  const estimate = (msgs: ModelMessage[]) =>
+    estimateTokens(modelMessagesToCountUnits(msgs, provider));
+
+  const initialEstimate = opts.knownEstimate ?? estimate(messages);
+  if (!opts.force && initialEstimate <= opts.targetTokens) {
+    return {
+      messages,
+      messagesDropped: 0,
+      usedModelCall: false,
+      estimatedTokens: initialEstimate,
+    };
+  }
+
+  // A boundary is unsafe if it would start `recent` on a tool result orphaned
+  // from its assistant tool-call (which would sit in the dropped prefix).
+  const boundary = pickKeepBoundary(
+    messages.length,
+    opts.keepRecentMessages,
+    (i) => i >= messages.length || messages[i].role !== "tool",
+  );
+  const prefix = messages.slice(0, boundary);
+  const recent = messages.slice(boundary);
+
+  // Stage 1 — prune.
+  const prunedPrefix = prefix.map((m) =>
+    pruneModelMessage(m, opts.minPrunableChars),
+  );
+  const prunedAll = [...prunedPrefix, ...recent];
+  // Estimated once and shared by both early returns below.
+  const prunedEstimate = estimate(prunedAll);
+  // Force-guarded like gate 1 (RV3): when recovery forces a trim the provider
+  // already rejected this prompt, so the estimator proved wrong — re-trusting
+  // it here would return a byte-identical prompt and burn the single retry.
+  if (!opts.force && prunedEstimate <= opts.targetTokens) {
+    return {
+      messages: prunedAll,
+      messagesDropped: 0,
+      usedModelCall: false,
+      estimatedTokens: prunedEstimate,
+    };
+  }
+
+  // RV4 (model-side): nothing to summarize when the prefix is empty (recent
+  // alone exceeds keepRecentMessages). Summarizing an empty prefix would add a
+  // synthetic message and GROW the prompt — never converges. Surface the
+  // overflow instead (recovery retries once, then propagates).
+  if (prefix.length === 0) {
+    return {
+      messages: prunedAll,
+      messagesDropped: 0,
+      usedModelCall: false,
+      estimatedTokens: prunedEstimate,
+    };
+  }
+
+  // Stage 2 — summarize the pruned prefix into one synthetic message.
+  const summaryText = await summarizePrefix(
+    renderModelMessages(prunedPrefix),
+    null,
+    opts.summarize,
+    opts.summarizerWindow,
+  );
+  const compacted = [summaryModelMessage(summaryText), ...recent];
+  return {
+    messages: compacted,
+    messagesDropped: prefix.length,
+    usedModelCall: true,
+    estimatedTokens: estimate(compacted),
+  };
+}
+
+// ===========================================================================
+// Slice 2c — Tier 1 orchestration (budget, view reconstruction, persist)
+//
+// `applyTier1Compaction` is the durable, cross-turn entry point invoked from
+// `prepareChatTurn`. It is dependency-injected (store + summarizer) so it is
+// unit-testable without standing up the full turn machinery. It:
+//   1. Reconstructs the compacted VIEW from persisted state every turn (P1) —
+//      drop messages up to the watermark, re-inject the stored summary.
+//   2.
Triggers a fresh compaction when the projected size crosses the trigger
+//      ratio, OR when `compactionDirty` forces it (recovery hand-off, §E).
+//   3. Persists any new summary/watermark + clears dirty via the single CAS
+//      writer (P3), the loser skipping safely on contention (R4).
+// ===========================================================================
+
+/** Resolved per-turn compaction config (§G), defaults applied. */
+export type CompactionConfig = {
+  compactionEnabled: boolean;
+  triggerRatio: number;
+  targetRatio: number;
+  reserveRatio: number;
+  keepRecentMessages: number;
+  minPrunableChars: number;
+};
+
+export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = {
+  compactionEnabled: true,
+  triggerRatio: 0.8,
+  targetRatio: 0.5,
+  reserveRatio: 0.05,
+  keepRecentMessages: 10,
+  minPrunableChars: 2000,
+};
+
+/** Agent row fields that override the compaction defaults (all optional). */
+export type CompactionConfigOverrides = {
+  compactionEnabled?: boolean | null;
+  triggerRatio?: number | null;
+  targetRatio?: number | null;
+  reserveRatio?: number | null;
+  keepRecentMessages?: number | null;
+  minPrunableChars?: number | null;
+};
+
+/**
+ * Merges per-agent overrides onto {@link DEFAULT_COMPACTION_CONFIG}. A `null`
+ * or `undefined` override field falls back to the default; any other value
+ * (including `false` and `0`) is taken as-is. An inverted ratio pair is clamped
+ * so that `targetRatio` ends up strictly below `triggerRatio`.
+ */
+export function resolveCompactionConfig(
+  overrides: CompactionConfigOverrides | null | undefined,
+): CompactionConfig {
+  const o = overrides ?? {};
+  // `== null` matches both null and undefined, so explicit falsy values survive.
+  const pick = <T,>(v: T | null | undefined, d: T): T => (v == null ? d : v);
+  const triggerRatio = pick(
+    o.triggerRatio,
+    DEFAULT_COMPACTION_CONFIG.triggerRatio,
+  );
+  let targetRatio = pick(o.targetRatio, DEFAULT_COMPACTION_CONFIG.targetRatio);
+  // Hysteresis backstop (drift C2): the post-compaction target must stay below
+  // the trigger or compaction re-fires every turn. The agent create/update zod
+  // schema already rejects an inverted pair, but a pre-existing/legacy row (or a
+  // direct DB write) could still carry one — clamp it here so the runtime can
+  // never thrash.
+  if (targetRatio >= triggerRatio) {
+    targetRatio = triggerRatio * 0.9;
+  }
+  return {
+    compactionEnabled: pick(
+      o.compactionEnabled,
+      DEFAULT_COMPACTION_CONFIG.compactionEnabled,
+    ),
+    triggerRatio,
+    targetRatio,
+    reserveRatio: pick(o.reserveRatio, DEFAULT_COMPACTION_CONFIG.reserveRatio),
+    keepRecentMessages: pick(
+      o.keepRecentMessages,
+      DEFAULT_COMPACTION_CONFIG.keepRecentMessages,
+    ),
+    minPrunableChars: pick(
+      o.minPrunableChars,
+      DEFAULT_COMPACTION_CONFIG.minPrunableChars,
+    ),
+  };
+}
+
+export type Budget = {
+  inputBudget: number;
+  triggerTokens: number;
+  targetTokens: number;
+};
+
+/**
+ * Budget math (drift C3): the trigger/target are fractions of the INPUT budget —
+ * the window minus the output reservation and a safety headroom — not of the raw
+ * window. When the resolved max output is unknown, reserve a conservative slice.
+ * The input budget is floored at 1 so an oversized reservation never yields a
+ * zero or negative budget.
+ */
+export function computeBudget(
+  contextWindow: number,
+  maxOutputTokens: number | undefined,
+  config: CompactionConfig,
+): Budget {
+  const maxOutputReserve =
+    maxOutputTokens ?? Math.min(4096, Math.floor(contextWindow * 0.25));
+  const safetyReserve = Math.floor(config.reserveRatio * contextWindow);
+  const inputBudget = Math.max(
+    1,
+    contextWindow - maxOutputReserve - safetyReserve,
+  );
+  return {
+    inputBudget,
+    triggerTokens: config.triggerRatio * inputBudget,
+    targetTokens: config.targetRatio * inputBudget,
+  };
+}
+
+/**
+ * First-turn safety margin on the char/4 projection (drift M2): char/4
+ * under-counts CJK, dense JSON, and tool chatter, and on a cold start there is
+ * no provider-reported `usage.inputTokens` to correct it.
+ */
+export const COLD_START_MARGIN = 1.15;
+
+/**
+ * The Tier 1 trigger projection (drift C1): what THIS turn is about to put on
+ * the wire, not just the stored messages.
`overheadTokens` carries the
+ * estimated system prompt + tool schemas + skill payload — invisible to a
+ * message-only estimate but sent to the model on every turn (the observed
+ * live-test gap: provider reported 8888 input tokens vs ~986 message-only).
+ * `lastInputTokens` is the provider-reported count from the prior turn — the
+ * corrective baseline for turns ≥ 2 (threaded in the §H usage-metadata chunk).
+ * When it is absent the whole char/4 projection is inflated by
+ * {@link COLD_START_MARGIN} (M2).
+ */
+export function projectTier1Tokens(args: {
+  messageTokens: number;
+  priorSummaryTokens: number;
+  overheadTokens?: number;
+  lastInputTokens?: number;
+}): number {
+  const overhead = args.overheadTokens ?? 0;
+  const charEstimate = args.messageTokens + args.priorSummaryTokens + overhead;
+  const providerCount = args.lastInputTokens;
+
+  // Cold start: no provider figure to correct char/4, so pad the projection.
+  // Otherwise there are two independent estimates of this turn's payload —
+  // the fresh char/4 pass (current but chronically low) and the provider's
+  // count from the prior turn (accurate but missing anything appended since).
+  // The larger one wins: over-counting only compacts earlier, it never
+  // causes an overflow.
+  return providerCount == null
+    ? Math.ceil(charEstimate * COLD_START_MARGIN)
+    : Math.max(Math.ceil(charEstimate), providerCount);
+}
+
+/** Wraps the persisted summary in the synthetic UIMessage injected into the view. */
+export function summaryUIMessage(text: string): PlatypusUIMessage {
+  const framed = `[Summary of earlier conversation]\n${text}`;
+  const synthetic = {
+    id: "context-summary",
+    role: "user",
+    parts: [{ type: "text", text: framed }],
+  };
+  return synthetic as PlatypusUIMessage;
+}
+
+/** Fail-loud event so the transcript shows compaction happened (§C).
*/
+export type CompactionEvent = {
+  type: "context-compacted";
+  messagesDropped: number;
+  tokensBefore: number;
+  tokensAfter: number;
+};
+
+export type Tier1Input = {
+  chatId: string;
+  /** Full durable history (post-`inlineFileUrls`, drift T2). */
+  messages: PlatypusUIMessage[];
+  state: CompactionState;
+  budget: Budget;
+  config: CompactionConfig;
+  imageProvider: ImageProvider;
+  summarize: Summarize;
+  store: CompactionStore;
+  summarizerWindow?: number;
+  /**
+   * Estimated tokens of the per-turn payload that is NOT in `messages` —
+   * system prompt, tool schemas, skill list (drift C1). Counted toward the
+   * trigger and subtracted from the compaction target (compaction cannot
+   * shrink it, so hysteresis must leave room for it — C2).
+   */
+  overheadTokens?: number;
+  /** Provider-reported `usage.inputTokens` from the prior turn (C1, via §H). */
+  lastInputTokens?: number;
+  onEvent?: (event: CompactionEvent) => void;
+};
+
+export type Tier1Output = {
+  /** The compacted view to send to the model (summary message + recent). */
+  messages: PlatypusUIMessage[];
+  /**
+   * True when a new summary was produced this turn (a summarize call ran).
+   * Whether it was also persisted is reported separately by `commit`.
+   */
+  compacted: boolean;
+  commit?: CommitResult;
+};
+
+/** Splits history at the watermark message id. Returns the messages after it and
+ *  the stored summary, or the full history with a null summary when there is no
+ *  watermark or its message id is no longer present. */
+function viewAfterWatermark(
+  messages: PlatypusUIMessage[],
+  state: CompactionState,
+): { afterWatermark: PlatypusUIMessage[]; priorSummary: string | null } {
+  if (!state.summaryWatermark) {
+    return { afterWatermark: messages, priorSummary: null };
+  }
+  const idx = messages.findIndex((m) => m.id === state.summaryWatermark);
+  if (idx === -1) {
+    // Watermark message is gone (edited/deleted before invalidation landed):
+    // distrust the summary and fall back to the full history (defensive C4).
+    return { afterWatermark: messages, priorSummary: null };
+  }
+  return {
+    afterWatermark: messages.slice(idx + 1),
+    priorSummary: state.contextSummary,
+  };
+}
+
+/**
+ * Tier 1 entry point: rebuilds the compacted view from persisted state, runs a
+ * fresh compaction when the projection crosses `budget.triggerTokens` (and
+ * compaction is enabled) or when `state.compactionDirty` forces it, and writes
+ * the outcome through `commitWatermark` pinned to the state version that was
+ * read. Returns the view to send plus the commit result, if a write was tried.
+ */
+export async function applyTier1Compaction(
+  input: Tier1Input,
+): Promise<Tier1Output> {
+  const { messages, state, budget, config, imageProvider } = input;
+  const estimate = (msgs: PlatypusUIMessage[]) =>
+    estimateTokens(uiMessagesToCountUnits(msgs, imageProvider));
+
+  const { afterWatermark, priorSummary } = viewAfterWatermark(messages, state);
+  const priorSummaryTokens = priorSummary ? textTokens(priorSummary) : 0;
+
+  const inject = (summary: string | null, msgs: PlatypusUIMessage[]) =>
+    summary ? [summaryUIMessage(summary), ...msgs] : msgs;
+
+  // The view that would be sent if we did nothing more this turn.
+  const baseView = inject(priorSummary, afterWatermark);
+  const overheadTokens = input.overheadTokens ?? 0;
+  // RV9: compute the char/4 pass over the unsummarized view once and reuse it
+  // for both the trigger projection and compactUIMessages' no-op gate.
+  const messageTokens = estimate(afterWatermark);
+  const projected = projectTier1Tokens({
+    messageTokens,
+    priorSummaryTokens,
+    overheadTokens,
+    lastInputTokens: input.lastInputTokens,
+  });
+
+  // A dirty flag forces compaction even when `compactionEnabled` is off.
+  const forceCompact = state.compactionDirty;
+  const triggered =
+    forceCompact ||
+    (config.compactionEnabled && projected >= budget.triggerTokens);
+
+  if (!triggered) {
+    return { messages: baseView, compacted: false };
+  }
+
+  // Compaction can only shrink the messages, never the per-turn overhead, so
+  // the target the messages must fit in is reduced by it (C1/C2). When the
+  // overhead alone exhausts the target, hysteresis is impossible — warn loudly
+  // (compaction will re-fire every turn) but still compact: recovery is the
+  // only other net.
+  const effectiveTarget = Math.max(0, budget.targetTokens - overheadTokens);
+  if (overheadTokens >= budget.targetTokens) {
+    logger.warn(
+      { chatId: input.chatId, overheadTokens, target: budget.targetTokens },
+      "system/tool overhead alone exceeds the compaction target — compaction will re-fire each turn",
+    );
+  }
+
+  const result = await compactUIMessages(afterWatermark, {
+    targetTokens: effectiveTarget,
+    keepRecentMessages: config.keepRecentMessages,
+    minPrunableChars: config.minPrunableChars,
+    imageProvider,
+    priorSummary,
+    summarize: input.summarize,
+    summarizerWindow: input.summarizerWindow,
+    // When dirty-forced the estimator already proved wrong (RV3): bypass the
+    // no-op gate so recovery's dirty flag actually shrinks the history.
+    force: forceCompact,
+    // RV9: the no-op gate estimates this exact set; reuse the value above.
+    knownEstimate: messageTokens,
+  });
+
+  const view = inject(result.summaryText ?? priorSummary, result.keptMessages);
+
+  // Persist through the single CAS writer (P3). The decision is gated on the
+  // version we read; if a concurrent writer advanced it, we skip rather than
+  // recompute (R4 — the wasted summarize is bounded, never corrupting). The
+  // version-pinning gate is shared so both write paths decide identically.
+  const capturedVersion = state.version;
+  const pinnedWrite = (patch: WatermarkPatch) =>
+    commitWatermark(input.store, input.chatId, (latest) =>
+      latest.version === capturedVersion
+        ? { kind: "write", patch }
+        : { kind: "skip", reason: "covered" },
+    );
+  let commit: CommitResult | undefined;
+
+  if (result.usedModelCall) {
+    commit = await pinnedWrite({
+      summary: result.summaryText,
+      watermark: result.watermarkId,
+      dirty: false,
+    });
+    logger.info(
+      {
+        metric: "compaction.fired",
+        tier: 1,
+        chatId: input.chatId,
+        tokensBefore: projected,
+        tokensAfter: result.estimatedTokens,
+        messagesDropped: result.messagesDropped,
+      },
+      "compaction.fired",
+    );
+    input.onEvent?.({
+      type: "context-compacted",
+      messagesDropped: result.messagesDropped,
+      tokensBefore: projected,
+      tokensAfter: result.estimatedTokens,
+    });
+  } else if (state.compactionDirty) {
+    // Forced by recovery but pruning/within-target sufficed: just clear the flag.
+    commit = await pinnedWrite({ dirty: false });
+  }
+
+  return { messages: view, compacted: result.usedModelCall, commit };
+}
+
+/**
+ * Detects which summarized messages (at/below the watermark) the freshly
+ * submitted history changed or dropped — the C4 trigger. Because the client
+ * resubmits the full message array each turn (there is no separate edit/delete
+ * endpoint), divergence is found by comparing the persisted canonical history
+ * against the incoming one up to the watermark. Returns the ids that an
+ * edit/delete/regenerate touched; empty means the summary is still valid.
+ */
+export function affectedBelowWatermark(
+  persisted: PlatypusUIMessage[],
+  incoming: PlatypusUIMessage[],
+  watermarkId: string | null,
+): string[] {
+  if (!watermarkId) return [];
+  const wmIdx = persisted.findIndex((m) => m.id === watermarkId);
+  if (wmIdx === -1) return [watermarkId]; // watermark message gone entirely
+  const incomingById = new Map(incoming.map((m) => [m.id, m]));
+  const affected: string[] = [];
+  for (let i = 0; i <= wmIdx; i++) {
+    const p = persisted[i];
+    if (!p.id) continue;
+    const inc = incomingById.get(p.id);
+    if (!inc || stableStringify(inc.parts) !== stableStringify(p.parts)) {
+      affected.push(p.id);
+    }
+  }
+  return affected;
+}
+
+/**
+ * Persists `compactionDirty = true` after a context-overflow recovery (§E,
+ * drift T3). Recovery never writes summary/watermark — it only flags; the next
+ * `prepareChatTurn` sees the flag, forces Tier 1, and clears it inside the same
+ * CAS write that advances the watermark. Goes through the single writer (P3);
+ * already-dirty is a no-op.
+ */
+export async function setCompactionDirty(
+  store: CompactionStore,
+  chatId: string,
+): Promise<CommitResult> {
+  return commitWatermark(store, chatId, (state) =>
+    state.compactionDirty
+      ? { kind: "skip", reason: "no-op" }
+      : { kind: "write", patch: { dirty: true } },
+  );
+}
+
+/**
+ * Clears the stored summary and watermark when any of `affectedIds` is missing
+ * from `orderedIds` or sits at/below the watermark's position in `orderedIds`.
+ * Skips (no write) when there is nothing stored to invalidate or when no
+ * affected id touches the summarized range.
+ */
+export async function invalidateCompaction(
+  store: CompactionStore,
+  chatId: string,
+  affectedIds: string[],
+  orderedIds: string[],
+): Promise<CommitResult> {
+  return commitWatermark(store, chatId, (state) => {
+    if (!state.summaryWatermark && !state.contextSummary) {
+      return { kind: "skip", reason: "no-op" };
+    }
+    const wmIndex = state.summaryWatermark
+      ? orderedIds.indexOf(state.summaryWatermark)
+      : orderedIds.length; // null watermark ⇒ everything is "summarized-from-start"
+    const affectsSummarized = affectedIds.some((id) => {
+      const i = orderedIds.indexOf(id);
+      // Affected message is missing (deleted) or sits at/below the watermark.
+      return i === -1 || (wmIndex !== -1 && i <= wmIndex);
+    });
+    if (!affectsSummarized) return { kind: "skip", reason: "no-op" };
+    return { kind: "write", patch: { summary: null, watermark: null } };
+  });
+}
+
+// --- Tier 2 in-turn compaction (§D, ADR-0009) ---
+
+/**
+ * Per-turn Tier 2 compaction context (§D). Null when the §G kill switch or
+ * agent config disables proactive compaction. Sub-agents also receive Tier 2
+ * (drift M3 — they have no durable history for Tier 1, but their tool loop
+ * can bloat intra-turn).
+ */
+export type Tier2Context = {
+  triggerTokens: number;
+  targetTokens: number;
+  keepRecentMessages: number;
+  minPrunableChars: number;
+  imageProvider: ImageProvider;
+  summarize: Summarize;
+  summarizerWindow?: number;
+};
+
+/**
+ * Builds the Tier 2 in-turn compaction `prepareStep` callback (§D). Fires
+ * before each step of a tool loop when the accumulated model messages exceed
+ * `triggerTokens` — compacts via `compactModelMessages` and returns the
+ * trimmed messages. Returns `undefined` when below the threshold, or when
+ * compaction reclaimed nothing, so the SDK proceeds unchanged (drift m3: no
+ * per-step overhead when the loop is small).
+ */
+export function buildTier2PrepareStep(ctx: Tier2Context): PrepareStepFunction {
+  return async ({ messages }) => {
+    const estimate = estimateTokens(
+      modelMessagesToCountUnits(messages, ctx.imageProvider),
+    );
+    if (estimate < ctx.triggerTokens) return undefined;
+
+    const result = await compactModelMessages(messages, {
+      targetTokens: ctx.targetTokens,
+      keepRecentMessages: ctx.keepRecentMessages,
+      minPrunableChars: ctx.minPrunableChars,
+      imageProvider: ctx.imageProvider,
+      summarize: ctx.summarize,
+      summarizerWindow: ctx.summarizerWindow,
+      // Reuse the trigger-check estimate; skips a redundant full pass (RV9).
+      knownEstimate: estimate,
+    });
+
+    // Stage 1 pruning can bring the prompt under target without dropping a
+    // single message (`messagesDropped === 0`). That pruned view must still be
+    // handed to the SDK — otherwise the reclaimed tokens are thrown away and
+    // the oversized prompt goes out unchanged. Pass through only when nothing
+    // was reclaimed at all.
+    if (result.messagesDropped === 0 && result.estimatedTokens >= estimate) {
+      return undefined;
+    }
+
+    logger.info(
+      {
+        messagesDropped: result.messagesDropped,
+        estimatedTokensBefore: estimate,
+        estimatedTokensAfter: result.estimatedTokens,
+      },
+      "Tier 2 in-turn compaction fired",
+    );
+
+    return { messages: result.messages };
+  };
+}
diff --git a/apps/backend/src/runs/context-window.test.ts b/apps/backend/src/runs/context-window.test.ts
new file mode 100644
index 00000000..bc84b978
--- /dev/null
+++ b/apps/backend/src/runs/context-window.test.ts
@@ -0,0 +1,330 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+
+vi.mock("../logger.ts", () => ({
+  logger: { warn: vi.fn(), info: vi.fn(), error: vi.fn(), debug: vi.fn() },
+}));
+
+import {
+  ContextWindowResolver,
+  lookupRegistry,
+  DEFAULT_CONTEXT_WINDOW,
+  type Registry,
+  type ProviderWindowInput,
+} from "./context-window.ts";
+
+const REGISTRY: Registry = {
+  "gpt-4o": { max_input_tokens: 128000, max_output_tokens: 16384 },
+  "claude-3-5-sonnet-20240620": {
+    max_input_tokens: 200000,
+    max_output_tokens: 8192,
+  },
+  "anthropic.claude-3-5-sonnet-20240620-v1:0": {
+    max_input_tokens: 200000,
+    max_output_tokens: 4096,
+  },
+  "legacy-model": { max_tokens: 4096 },
+};
+
+const loadRegistry = async () => REGISTRY;
+
+function resolver() {
+  return new ContextWindowResolver({ loadRegistry });
+}
+
+const openai: ProviderWindowInput = {
+  id: "prov-openai",
+  providerType: "OpenAI",
+  baseUrl: null,
+  apiKey: "sk-x",
+};
+
+describe("lookupRegistry — key normalization (drift T4)", () => {
+  it("exact match", () => {
+    expect(lookupRegistry(REGISTRY, "gpt-4o")?.max_input_tokens).toBe(128000);
+  });
+
+  it("strips a provider prefix", () => {
+    expect(lookupRegistry(REGISTRY, "openai/gpt-4o")?.max_input_tokens).toBe(
+      128000,
+    );
+  });
+
+  it("lowercases", () => {
+    expect(lookupRegistry(REGISTRY, "GPT-4o")?.max_input_tokens).toBe(128000);
+  });
+
+  it("uses the alias map for an Azure deployment name",
() => { + expect( + lookupRegistry(REGISTRY, "my-azure-deploy", { + "my-azure-deploy": "gpt-4o", + })?.max_input_tokens, + ).toBe(128000); + }); + + it("resolves a Bedrock ARN to its vendor.model id", () => { + const arn = + "arn:aws:bedrock:us-east-1::foundation-model/anthropic.claude-3-5-sonnet-20240620-v1:0"; + expect(lookupRegistry(REGISTRY, arn)?.max_output_tokens).toBe(4096); + }); + + it("family heuristic: dated suffix matches the base key", () => { + // "gpt-4o-2024-11-20" → longest prefix key "gpt-4o" + expect( + lookupRegistry(REGISTRY, "gpt-4o-2024-11-20")?.max_input_tokens, + ).toBe(128000); + }); + + it("returns undefined on a true MISS", () => { + expect(lookupRegistry(REGISTRY, "totally-unknown-xyz")).toBeUndefined(); + }); +}); + +describe("resolveContextWindow — resolution order", () => { + beforeEach(() => vi.clearAllMocks()); + + it("1. manual override wins over everything", async () => { + const r = resolver(); + const out = await r.resolve( + { + ...openai, + modelMeta: { + "gpt-4o": { contextWindow: 64000, maxOutputTokens: 2048 }, + }, + }, + "gpt-4o", + ); + expect(out).toEqual({ + contextWindow: 64000, + maxOutputTokens: 2048, + source: "override", + }); + }); + + it("3. falls to the litellm registry when no override / API", async () => { + const r = resolver(); + const out = await r.resolve({ ...openai }, "gpt-4o"); + expect(out).toEqual({ + contextWindow: 128000, + maxOutputTokens: 16384, + source: "registry", + }); + }); + + it("ignores litellm max_tokens (output cap, not window) → default (drift F1)", async () => { + // "legacy-model" has only max_tokens; that is the OUTPUT cap, so it must NOT + // be read as the context window. Falls through to the conservative default. 
+ const r = resolver(); + const out = await r.resolve({ ...openai }, "legacy-model"); + expect(out.contextWindow).toBe(DEFAULT_CONTEXT_WINDOW); + expect(out.source).toBe("default"); + }); + + it("merges a maxOutputTokens-only override onto a registry window (drift F5)", async () => { + const r = resolver(); + const out = await r.resolve( + { ...openai, modelMeta: { "gpt-4o": { maxOutputTokens: 999 } } }, + "gpt-4o", + ); + // No contextWindow override → window from registry, but output cap overridden. + expect(out).toEqual({ + contextWindow: 128000, + maxOutputTokens: 999, + source: "registry", + }); + }); + + it("4. conservative default + source=default on a MISS (drift T6)", async () => { + const r = resolver(); + const out = await r.resolve({ ...openai }, "unknown-model-zzz"); + expect(out).toEqual({ + contextWindow: DEFAULT_CONTEXT_WINDOW, + maxOutputTokens: undefined, + source: "default", + }); + }); +}); + +describe("API auto-detect parsers", () => { + it("Google: inputTokenLimit / outputTokenLimit", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + inputTokenLimit: 1048576, + outputTokenLimit: 8192, + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { + id: "g", + providerType: "Google", + baseUrl: "https://gen.example", + apiKey: "k", + }, + "gemini-1.5-pro", + ); + expect(out).toEqual({ + contextWindow: 1048576, + maxOutputTokens: 8192, + source: "api", + }); + expect(httpGetJson).toHaveBeenCalledWith( + "https://gen.example/v1beta/models/gemini-1.5-pro", + { "x-goog-api-key": "k" }, + ); + }); + + it("OpenRouter: matches id → context_length", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + data: [ + { id: "other", context_length: 1 }, + { + id: "meta-llama/llama-3.1-70b", + context_length: 131072, + top_provider: { max_completion_tokens: 4096 }, + }, + ], + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { 
+ id: "or", + providerType: "OpenRouter", + baseUrl: "https://openrouter.ai", + }, + "meta-llama/llama-3.1-70b", + ); + expect(out).toEqual({ + contextWindow: 131072, + maxOutputTokens: 4096, + source: "api", + }); + }); + + it("vLLM / OpenAI-compatible: max_model_len from a custom baseUrl", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + data: [{ id: "my-vllm-model", max_model_len: 32768 }], + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { + id: "v", + providerType: "OpenAI", + baseUrl: "http://localhost:8000", + apiKey: "x", + }, + "my-vllm-model", + ); + expect(out.contextWindow).toBe(32768); + expect(out.source).toBe("api"); + }); + + it("official OpenAI (no baseUrl) skips the probe and falls to registry", async () => { + const httpGetJson = vi.fn(); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve({ ...openai, baseUrl: null }, "gpt-4o"); + expect(httpGetJson).not.toHaveBeenCalled(); + expect(out.source).toBe("registry"); + }); + + it("a failing API probe falls through to the registry", async () => { + const httpGetJson = vi.fn().mockRejectedValue(new Error("boom")); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { id: "g", providerType: "Google", baseUrl: "https://gen.example" }, + "gpt-4o", + ); + expect(out.source).toBe("registry"); + }); +}); + +describe("registry load failure (drift F3)", () => { + it("a throwing loader degrades to empty registry → default, no reject", async () => { + const r = new ContextWindowResolver({ + loadRegistry: async () => { + throw new Error("bad vendored json"); + }, + }); + const out = await r.resolve({ ...openai }, "gpt-4o"); + expect(out.source).toBe("default"); + expect(out.contextWindow).toBe(DEFAULT_CONTEXT_WINDOW); + }); +}); + +describe("cache + evict (drift T5)", () => { + it("caches within the TTL (one probe), evict forces a 
re-probe", async () => { + const httpGetJson = vi + .fn() + .mockResolvedValue({ data: [{ id: "m", max_model_len: 1000 }] }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const p = { + id: "v", + providerType: "OpenAI", + baseUrl: "http://x", + apiKey: "k", + }; + + await r.resolve(p, "m"); + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(1); // second hit served from cache + + r.evict("v"); + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(2); // evict busted the cache + }); + + it("the cached value expires after the TTL", async () => { + let now = 1000; + const httpGetJson = vi + .fn() + .mockResolvedValue({ data: [{ id: "m", max_model_len: 1000 }] }); + const r = new ContextWindowResolver({ + loadRegistry, + httpGetJson, + ttlMs: 100, + now: () => now, + }); + const p = { + id: "v", + providerType: "OpenAI", + baseUrl: "http://x", + apiKey: "k", + }; + + await r.resolve(p, "m"); + now += 50; + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(1); // still within TTL + + now += 100; // past TTL + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(2); + }); + + it("RV7d: a default-source result is cached briefly, not for the full TTL", async () => { + let now = 0; + // API probe never yields a window and the model is not in the registry → + // every resolve falls to source:"default". 
+ const httpGetJson = vi.fn().mockResolvedValue({ data: [] }); + const r = new ContextWindowResolver({ + loadRegistry, + httpGetJson, + ttlMs: 60 * 60 * 1000, // full TTL is an hour + now: () => now, + }); + const p = { + id: "v", + providerType: "OpenAI", + baseUrl: "http://x", + apiKey: "k", + }; + + const first = await r.resolve(p, "unknown-model"); + expect(first.source).toBe("default"); + + now += 30 * 1000; // within the 60 s default-source TTL + await r.resolve(p, "unknown-model"); + expect(httpGetJson).toHaveBeenCalledTimes(1); // still cached + + now += 40 * 1000; // 70 s total — past the short TTL, far short of the hour + await r.resolve(p, "unknown-model"); + expect(httpGetJson).toHaveBeenCalledTimes(2); // re-probed, blip not pinned + }); +}); diff --git a/apps/backend/src/runs/context-window.ts b/apps/backend/src/runs/context-window.ts new file mode 100644 index 00000000..d2390633 --- /dev/null +++ b/apps/backend/src/runs/context-window.ts @@ -0,0 +1,466 @@ +/** + * Context-window resolution (context-compaction-plan §A). + * + * Resolves the usable context window (and max output tokens) for a + * provider+model, in this order: + * + * 1. Manual override — `provider.modelMeta[modelId]`. + * 2. API auto-detect — Google / OpenRouter / vLLM expose the window. + * 3. litellm registry — community model price/context JSON (covers + * OpenAI / Anthropic / Bedrock, which don't expose it). + * 4. Conservative default — {@link DEFAULT_CONTEXT_WINDOW} (8192). + * + * A fall-through to the default, and every registry key MISS, is `log.warn`'d: + * the window is then unknown and the ring must render neutral (drift T6). + * + * Results are cached per `providerId:modelId` with a TTL. Editing a `modelMeta` + * override must call {@link ContextWindowResolver.evict} immediately so the + * override takes effect without waiting for the TTL (drift T5). 
+ * + * The registry lookup and HTTP probe are injected so this module is unit + * testable without network or a vendored multi-MB JSON file (drift T4 cases are + * exercised against small fixture registries). + */ + +import { logger } from "../logger.ts"; + +/** Conservative window when nothing else resolves. */ +export const DEFAULT_CONTEXT_WINDOW = 8192; + +/** Default cache TTL: API-detected windows can drift, the override path evicts. */ +export const DEFAULT_CACHE_TTL_MS = 60 * 60 * 1000; // 1 hour + +/** + * Short TTL for `source: "default"` resolutions (defect 6 / RV7d). A registry + * MISS or a transient API failure falls to 8192; caching that for the full hour + * pins a wrong window long after the blip clears. A 60 s TTL lets the next turn + * re-probe while still collapsing a burst of same-turn lookups. + */ +export const DEFAULT_SOURCE_CACHE_TTL_MS = 60 * 1000; // 1 minute + +/** Where a resolved window came from — drives ring neutrality (T6). */ +export type WindowSource = "override" | "api" | "registry" | "default"; + +export type ResolvedWindow = { + contextWindow: number; + maxOutputTokens?: number; + source: WindowSource; +}; + +/** The slice of a provider row this module needs. */ +export type ProviderWindowInput = { + id: string; + providerType: string; + baseUrl?: string | null; + apiKey?: string | null; + modelMeta?: Record< + string, + { contextWindow?: number; maxOutputTokens?: number } + > | null; +}; + +/** A litellm registry entry (subset of the fields we read). */ +export type RegistryEntry = { + max_input_tokens?: number; + max_output_tokens?: number; + max_tokens?: number; +}; + +export type Registry = Record; + +/** Fetches and parses JSON from a URL. Injected so tests avoid network. */ +export type HttpGetJson = ( + url: string, + headers?: Record, +) => Promise; + +export type ResolverDeps = { + /** Provides the litellm registry (lazy; may be empty until vendored). 
/** Strips a Bedrock ARN down to its `vendor.model` id, if it is one. */
function bedrockModelFromArn(modelId: string): string | undefined {
  const match = /foundation-model\/(.+)$/.exec(modelId);
  return match?.[1];
}

/**
 * Reads `registry[key]` only when `key` is an own property.
 *
 * Registries are plain objects, so a bare `registry[key]` would resolve model
 * ids such as "constructor" or "toString" to `Object.prototype` members.
 */
function ownRegistryEntry(
  registry: Registry,
  key: string,
): RegistryEntry | undefined {
  return Object.prototype.hasOwnProperty.call(registry, key)
    ? registry[key]
    : undefined;
}

/**
 * True when `idLower` equals `keyLower` or extends it at a family boundary.
 *
 * "-", ":" and "/" always count as boundaries. "." counts only when it is NOT
 * followed by a digit: "gpt-4.5-preview" is a different model from "gpt-4"
 * (minor version), so it must not inherit the "gpt-4" window (RV7b).
 */
function isFamilyMatch(idLower: string, keyLower: string): boolean {
  if (!idLower.startsWith(keyLower)) return false;
  if (idLower.length === keyLower.length) return true;
  const separator = idLower.charAt(keyLower.length);
  if (separator === "-" || separator === ":" || separator === "/") return true;
  if (separator === ".") {
    return !/[0-9]/.test(idLower.charAt(keyLower.length + 1));
  }
  return false;
}

/**
 * Resolves a registry entry for a model id via the normalization chain:
 * exact → strip provider prefix → lowercase → alias map → Bedrock id/ARN →
 * family heuristic (longest registry key that prefixes the id) → MISS.
 *
 * @param registry - litellm-style registry keyed by model id.
 * @param modelId - Model id as configured on the provider (any casing, may
 *   carry a `provider/` prefix or be a Bedrock foundation-model ARN).
 * @param aliasMap - Optional model id → registry key aliases (Azure deployment
 *   names, custom vLLM names, …). Looked up by the exact `modelId`.
 * @returns The matching entry, or `undefined` on a MISS.
 */
export function lookupRegistry(
  registry: Registry,
  modelId: string,
  aliasMap: Record<string, string> = {},
): RegistryEntry | undefined {
  const own = (key: string): RegistryEntry | undefined =>
    ownRegistryEntry(registry, key);

  // 1. exact
  const exact = own(modelId);
  if (exact) return exact;

  // 2. strip provider prefix ("openai/gpt-4o" → "gpt-4o"), then
  // 3. lowercase variants of both the full and the stripped id.
  const slash = modelId.indexOf("/");
  const stripped = slash >= 0 ? modelId.slice(slash + 1) : modelId;
  const lowerFull = modelId.toLowerCase();
  const lowerStripped = stripped.toLowerCase();
  for (const key of [stripped, lowerFull, lowerStripped]) {
    const entry = own(key);
    if (entry) return entry;
  }

  // 4. alias map (own-property read so "constructor" etc. never alias).
  const alias = Object.prototype.hasOwnProperty.call(aliasMap, modelId)
    ? aliasMap[modelId]
    : undefined;
  if (alias) {
    const entry = own(alias);
    if (entry) return entry;
  }

  // 5. Bedrock: the `vendor.model` id taken from an ARN, or the plain id itself
  // (Bedrock providers are commonly configured with "anthropic.claude-…"
  // rather than an ARN), tried bare and under the "bedrock/" prefix, each also
  // lowercased (registry keys for Bedrock are lowercase; ARNs need not be).
  const bedrockId = bedrockModelFromArn(modelId) ?? stripped;
  const bedrockIdLower = bedrockId.toLowerCase();
  for (const key of [
    bedrockId,
    `bedrock/${bedrockId}`,
    bedrockIdLower,
    `bedrock/${bedrockIdLower}`,
  ]) {
    const entry = own(key);
    if (entry) return entry;
  }

  // 6. family heuristic — longest registry key that the id equals or extends
  // at a family boundary (see isFamilyMatch). Case-insensitive, and checked
  // against BOTH the full and the prefix-stripped id so provider-prefixed
  // registry keys ("Qwen/…", "meta-llama/…") can match lowercase ids.
  const idForms =
    lowerFull === lowerStripped ? [lowerFull] : [lowerFull, lowerStripped];
  let bestKey: string | undefined;
  for (const key of Object.keys(registry)) {
    const keyLower = key.toLowerCase();
    if (!idForms.some((id) => isFamilyMatch(id, keyLower))) continue;
    if (bestKey === undefined || key.length > bestKey.length) bestKey = key;
  }

  // 7. MISS when no key qualified.
  return bestKey === undefined ? undefined : registry[bestKey];
}
+ return { + contextWindow: entry.max_input_tokens, + maxOutputTokens: entry.max_output_tokens, + }; +} + +// --------------------------------------------------------------------------- +// API auto-detect parsers +// --------------------------------------------------------------------------- + +function trimSlash(url: string): string { + return url.replace(/\/+$/, ""); +} + +async function detectGoogle( + provider: ProviderWindowInput, + modelId: string, + httpGetJson: HttpGetJson, +): Promise | undefined> { + const base = trimSlash( + provider.baseUrl || "https://generativelanguage.googleapis.com", + ); + const headers = provider.apiKey + ? { "x-goog-api-key": provider.apiKey } + : undefined; + const body = (await httpGetJson( + `${base}/v1beta/models/${modelId}`, + headers, + )) as { + inputTokenLimit?: number; + outputTokenLimit?: number; + }; + if (typeof body?.inputTokenLimit === "number") { + return { + contextWindow: body.inputTokenLimit, + maxOutputTokens: body.outputTokenLimit, + source: "api", + }; + } + return undefined; +} + +async function detectOpenRouter( + provider: ProviderWindowInput, + modelId: string, + httpGetJson: HttpGetJson, +): Promise | undefined> { + const base = trimSlash(provider.baseUrl || "https://openrouter.ai"); + const body = (await httpGetJson(`${base}/api/v1/models`)) as { + data?: Array<{ + id?: string; + context_length?: number; + top_provider?: { max_completion_tokens?: number }; + }>; + }; + const entry = body?.data?.find((m) => m.id === modelId); + if (entry && typeof entry.context_length === "number") { + return { + contextWindow: entry.context_length, + maxOutputTokens: entry.top_provider?.max_completion_tokens, + source: "api", + }; + } + return undefined; +} + +async function detectOpenAiCompatible( + provider: ProviderWindowInput, + modelId: string, + httpGetJson: HttpGetJson, +): Promise | undefined> { + if (!provider.baseUrl) return undefined; // official OpenAI omits the field + const base = 
trimSlash(provider.baseUrl); + const headers = provider.apiKey + ? { authorization: `Bearer ${provider.apiKey}` } + : undefined; + const body = (await httpGetJson(`${base}/v1/models`, headers)) as { + data?: Array<{ id?: string; max_model_len?: number }>; + }; + const entry = body?.data?.find((m) => m.id === modelId); + // vLLM and most OpenAI-compatible servers expose `max_model_len`. + if (entry && typeof entry.max_model_len === "number") { + return { contextWindow: entry.max_model_len, source: "api" }; + } + return undefined; +} + +async function detectViaApi( + provider: ProviderWindowInput, + modelId: string, + httpGetJson: HttpGetJson, +): Promise | undefined> { + try { + switch (provider.providerType) { + case "Google": + return await detectGoogle(provider, modelId, httpGetJson); + case "OpenRouter": + return await detectOpenRouter(provider, modelId, httpGetJson); + case "OpenAI": + return await detectOpenAiCompatible(provider, modelId, httpGetJson); + default: + return undefined; // Anthropic / Bedrock — no API window, use registry + } + } catch (error) { + logger.warn( + { + error, + providerId: provider.id, + modelId, + providerType: provider.providerType, + }, + "context-window API auto-detect failed; falling through", + ); + return undefined; + } +} + +// --------------------------------------------------------------------------- +// Resolver (cache + evict) +// --------------------------------------------------------------------------- + +/** RV7d: 5 s hard cap so a hung provider endpoint never blocks turns for ~300 s. 
/** RV7d: 5 s hard cap so a hung provider endpoint never blocks turns for ~300 s. */
const API_DETECT_TIMEOUT_MS = 5000;

/** Default probe transport: GET the URL, fail on a non-2xx status, parse JSON. */
const defaultHttpGetJson: HttpGetJson = async (url, headers) => {
  const res = await fetch(url, {
    headers,
    signal: AbortSignal.timeout(API_DETECT_TIMEOUT_MS),
  });
  if (!res.ok) throw new Error(`GET ${url} → ${res.status}`);
  return res.json();
};

type CacheEntry = { value: ResolvedWindow; expiresAt: number };

/**
 * Resolves and caches the context window for a provider + model.
 *
 * Resolution order: manual override → API auto-detect → litellm registry →
 * conservative default. Results are cached per `providerId:modelId`; a
 * `source: "default"` result uses a shorter TTL so a transient miss is retried
 * soon. Concurrent callers for the same key share one resolution.
 */
export class ContextWindowResolver {
  #cache = new Map<string, CacheEntry>();
  /** RV7d: single-flight — concurrent callers for the same key share one fetch. */
  #inflight = new Map<string, Promise<ResolvedWindow>>();
  #loadRegistry: () => Promise<Registry>;
  /** Memoized registry load so concurrent cold resolves trigger one load. */
  #registryLoad: Promise<Registry> | undefined;
  #aliasMap: Record<string, string>;
  #httpGetJson: HttpGetJson;
  #ttlMs: number;
  #now: () => number;

  /**
   * @param deps - Injectable collaborators; every field is optional and falls
   *   back to an empty registry, no aliases, the fetch-based probe, the default
   *   TTL and `Date.now`.
   */
  constructor(deps: ResolverDeps = {}) {
    this.#loadRegistry = deps.loadRegistry ?? (async () => ({}));
    this.#aliasMap = deps.aliasMap ?? {};
    this.#httpGetJson = deps.httpGetJson ?? defaultHttpGetJson;
    this.#ttlMs = deps.ttlMs ?? DEFAULT_CACHE_TTL_MS;
    this.#now = deps.now ?? (() => Date.now());
  }

  /** Drops all cached windows for a provider — call on `modelMeta` edit (T5). */
  evict(providerId: string): void {
    const prefix = `${providerId}:`;
    for (const key of this.#cache.keys()) {
      if (key.startsWith(prefix)) this.#cache.delete(key);
    }
    // Detach (not abort) any in-flight resolution for this provider: it still
    // settles for its current callers, but resolve() will neither cache its
    // result nor hand it to new callers, so the next call re-resolves with the
    // updated modelMeta.
    for (const key of this.#inflight.keys()) {
      if (key.startsWith(prefix)) this.#inflight.delete(key);
    }
  }

  /**
   * Loads the registry at most once. A failing loader (bad vendored JSON, fs
   * error) must not reject the whole resolution — it degrades to an empty
   * registry + warn (drift F3). The promise itself is memoized, so callers
   * that arrive while the first load is pending share it.
   */
  #loadRegistryOnce(): Promise<Registry> {
    if (this.#registryLoad === undefined) {
      this.#registryLoad = (async (): Promise<Registry> => {
        try {
          return await this.#loadRegistry();
        } catch (error) {
          logger.warn(
            { error },
            "litellm registry load failed; treating as empty",
          );
          return {};
        }
      })();
    }
    return this.#registryLoad;
  }

  async #registryEntry(modelId: string): Promise<RegistryEntry | undefined> {
    const registry = await this.#loadRegistryOnce();
    return lookupRegistry(registry, modelId, this.#aliasMap);
  }

  /**
   * Returns the resolved window for `provider` + `modelId`, served from cache
   * while the entry is fresh.
   *
   * @param provider - The provider slice (id, type, base URL, key, overrides).
   * @param modelId - Model id to resolve.
   * @returns The resolved window together with the source it came from.
   */
  async resolve(
    provider: ProviderWindowInput,
    modelId: string,
  ): Promise<ResolvedWindow> {
    const cacheKey = `${provider.id}:${modelId}`;
    const cached = this.#cache.get(cacheKey);
    if (cached && cached.expiresAt > this.#now()) return cached.value;

    // RV7d: single-flight — reuse an in-flight promise rather than spawning a
    // second fetch for the same key (cold-cache stampede protection).
    const existing = this.#inflight.get(cacheKey);
    if (existing) return existing;

    // Register before awaiting so concurrent callers see the same promise.
    const pending = this.#resolveUncached(provider, modelId);
    this.#inflight.set(cacheKey, pending);
    try {
      const value = await pending;
      // Only write the cache if this promise is still the live in-flight one.
      // An evict() during the fetch deletes the inflight entry; without this
      // guard the stale pre-update value would repopulate the cache and defeat
      // the eviction for a full TTL (RV7c race).
      if (this.#inflight.get(cacheKey) === pending) {
        // RV7d / defect 6: a default-source result (MISS or transient API
        // failure) gets a short TTL so a blip doesn't pin 8192 for an hour.
        const ttl =
          value.source === "default"
            ? Math.min(DEFAULT_SOURCE_CACHE_TTL_MS, this.#ttlMs)
            : this.#ttlMs;
        this.#cache.set(cacheKey, { value, expiresAt: this.#now() + ttl });
      }
      return value;
    } finally {
      // Identity-guarded cleanup on success AND failure: after an evict() a
      // newer resolution may already be registered under this key, and it must
      // not be removed by the older one settling.
      if (this.#inflight.get(cacheKey) === pending) {
        this.#inflight.delete(cacheKey);
      }
    }
  }

  /** Runs the four-step resolution chain without consulting the cache. */
  async #resolveUncached(
    provider: ProviderWindowInput,
    modelId: string,
  ): Promise<ResolvedWindow> {
    // 1. Manual override
    const override = provider.modelMeta?.[modelId];
    if (override?.contextWindow) {
      return {
        contextWindow: override.contextWindow,
        maxOutputTokens: override.maxOutputTokens,
        source: "override",
      };
    }

    // 2. API auto-detect (an output-cap-only override still wins over the API's cap)
    const api = await detectViaApi(provider, modelId, this.#httpGetJson);
    if (api?.contextWindow) {
      return {
        contextWindow: api.contextWindow,
        maxOutputTokens: override?.maxOutputTokens ?? api.maxOutputTokens,
        source: "api",
      };
    }

    // 3. litellm registry
    const entry = await this.#registryEntry(modelId);
    if (entry) {
      const { contextWindow, maxOutputTokens } = windowFromRegistryEntry(entry);
      if (contextWindow) {
        return {
          contextWindow,
          maxOutputTokens: override?.maxOutputTokens ?? maxOutputTokens,
          source: "registry",
        };
      }
    } else {
      logger.warn(
        {
          metric: "litellm.key_miss",
          providerId: provider.id,
          modelId,
          providerType: provider.providerType,
        },
        "litellm registry key MISS — falling to default window",
      );
    }

    // 4. Conservative default
    logger.warn(
      {
        metric: "context_window.fell_to_default",
        providerId: provider.id,
        modelId,
        default: DEFAULT_CONTEXT_WINDOW,
      },
      "context window unresolved — using conservative default (ring neutral)",
    );
    return {
      contextWindow: DEFAULT_CONTEXT_WINDOW,
      maxOutputTokens: override?.maxOutputTokens,
      source: "default",
    };
  }
}
*/ +import { loadBuiltinRegistry } from "./litellm-registry.ts"; +export const contextWindowResolver = new ContextWindowResolver({ + loadRegistry: loadBuiltinRegistry, +}); diff --git a/apps/backend/src/runs/litellm-registry.ts b/apps/backend/src/runs/litellm-registry.ts new file mode 100644 index 00000000..bca74532 --- /dev/null +++ b/apps/backend/src/runs/litellm-registry.ts @@ -0,0 +1,347 @@ +/** + * Minimal vendored subset of the litellm model_prices_and_context_window.json + * (MIT licence — https://github.com/BerriAI/litellm). + * + * Only includes `max_input_tokens` and `max_output_tokens` — the two fields + * {@link ContextWindowResolver} reads. Covers providers whose context window is + * not available via a live API call (OpenAI, Anthropic, Bedrock). Google and + * OpenRouter are auto-detected at runtime and do not need entries here. + * + * Keys follow the litellm naming convention — bare model ids without a provider + * prefix. The registry lookup in context-window.ts tries exact → stripped → + * lowercase → alias → Bedrock ARN → family heuristic before a MISS. + * + * Keep sorted alphabetically within each vendor section for easier diffing. + * Update when models whose windows differ from their family default are released. 
+ */ + +import type { Registry } from "./context-window.ts"; + +const REGISTRY: Registry = { + // --------------------------------------------------------------------------- + // OpenAI + // --------------------------------------------------------------------------- + "chatgpt-4o-latest": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "gpt-3.5-turbo": { max_input_tokens: 16385, max_output_tokens: 4096 }, + "gpt-3.5-turbo-0125": { max_input_tokens: 16385, max_output_tokens: 4096 }, + "gpt-3.5-turbo-16k": { max_input_tokens: 16385, max_output_tokens: 4096 }, + "gpt-4": { max_input_tokens: 8192, max_output_tokens: 8192 }, + "gpt-4-0125-preview": { max_input_tokens: 128000, max_output_tokens: 4096 }, + "gpt-4-1106-preview": { max_input_tokens: 128000, max_output_tokens: 4096 }, + "gpt-4-turbo": { max_input_tokens: 128000, max_output_tokens: 4096 }, + "gpt-4-turbo-preview": { max_input_tokens: 128000, max_output_tokens: 4096 }, + "gpt-4-vision-preview": { max_input_tokens: 128000, max_output_tokens: 4096 }, + "gpt-4.1": { max_input_tokens: 1047576, max_output_tokens: 32768 }, + "gpt-4.1-mini": { max_input_tokens: 1047576, max_output_tokens: 32768 }, + "gpt-4.1-nano": { max_input_tokens: 1047576, max_output_tokens: 32768 }, + "gpt-4.5-preview": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "gpt-4o": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "gpt-4o-2024-05-13": { max_input_tokens: 128000, max_output_tokens: 4096 }, + "gpt-4o-2024-08-06": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "gpt-4o-2024-11-20": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "gpt-4o-audio-preview": { + max_input_tokens: 128000, + max_output_tokens: 16384, + }, + "gpt-4o-mini": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "gpt-4o-mini-2024-07-18": { + max_input_tokens: 128000, + max_output_tokens: 16384, + }, + "gpt-4o-mini-audio-preview": { + max_input_tokens: 128000, + max_output_tokens: 16384, + }, + o1: { 
max_input_tokens: 200000, max_output_tokens: 100000 }, + "o1-mini": { max_input_tokens: 128000, max_output_tokens: 65536 }, + "o1-preview": { max_input_tokens: 128000, max_output_tokens: 32768 }, + o3: { max_input_tokens: 200000, max_output_tokens: 100000 }, + "o3-mini": { max_input_tokens: 200000, max_output_tokens: 100000 }, + "o4-mini": { max_input_tokens: 200000, max_output_tokens: 100000 }, + + // --------------------------------------------------------------------------- + // Anthropic (direct API — also covered under bedrock/ below) + // --------------------------------------------------------------------------- + "claude-2": { max_input_tokens: 100000, max_output_tokens: 4096 }, + "claude-2.1": { max_input_tokens: 200000, max_output_tokens: 4096 }, + "claude-3-haiku-20240307": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "claude-3-opus-20240229": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "claude-3-sonnet-20240229": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "claude-3-5-haiku-20241022": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "claude-3-5-sonnet-20240620": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "claude-3-5-sonnet-20241022": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "claude-3-7-sonnet-20250219": { + max_input_tokens: 200000, + max_output_tokens: 128000, + }, + "claude-haiku-4-5-20251001": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "claude-opus-4-5": { max_input_tokens: 200000, max_output_tokens: 32000 }, + "claude-opus-4-8": { max_input_tokens: 200000, max_output_tokens: 32000 }, + "claude-sonnet-4-5": { max_input_tokens: 200000, max_output_tokens: 64000 }, + "claude-sonnet-4-6": { max_input_tokens: 200000, max_output_tokens: 64000 }, + "claude-instant-1": { max_input_tokens: 100000, max_output_tokens: 4096 }, + "claude-instant-1.2": { max_input_tokens: 100000, max_output_tokens: 4096 }, + + // 
--------------------------------------------------------------------------- + // Bedrock — Anthropic models + // --------------------------------------------------------------------------- + "bedrock/anthropic.claude-instant-v1": { + max_input_tokens: 100000, + max_output_tokens: 4096, + }, + "bedrock/anthropic.claude-v2": { + max_input_tokens: 100000, + max_output_tokens: 4096, + }, + "bedrock/anthropic.claude-v2:1": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "bedrock/anthropic.claude-3-haiku-20240307-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "bedrock/anthropic.claude-3-sonnet-20240229-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "bedrock/anthropic.claude-3-opus-20240229-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 128000, + }, + + // --------------------------------------------------------------------------- + // Bedrock — Meta Llama + // --------------------------------------------------------------------------- + "bedrock/meta.llama3-8b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-70b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-1-8b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-1-70b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-1-405b-instruct-v1:0": { + max_input_tokens: 128000, + 
max_output_tokens: 8192, + }, + "bedrock/meta.llama3-2-1b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-2-3b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-2-11b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + "bedrock/meta.llama3-2-90b-instruct-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 8192, + }, + + // --------------------------------------------------------------------------- + // Bedrock — Amazon Titan/Nova + // --------------------------------------------------------------------------- + "bedrock/amazon.nova-lite-v1:0": { + max_input_tokens: 300000, + max_output_tokens: 5120, + }, + "bedrock/amazon.nova-micro-v1:0": { + max_input_tokens: 128000, + max_output_tokens: 5120, + }, + "bedrock/amazon.nova-pro-v1:0": { + max_input_tokens: 300000, + max_output_tokens: 5120, + }, + "bedrock/amazon.titan-text-express-v1": { + max_input_tokens: 8192, + max_output_tokens: 8192, + }, + "bedrock/amazon.titan-text-lite-v1": { + max_input_tokens: 4096, + max_output_tokens: 4096, + }, + "bedrock/amazon.titan-text-premier-v1:0": { + max_input_tokens: 32000, + max_output_tokens: 3072, + }, + + // --------------------------------------------------------------------------- + // Bedrock — Mistral + // --------------------------------------------------------------------------- + "bedrock/mistral.mistral-7b-instruct-v0:2": { + max_input_tokens: 32768, + max_output_tokens: 8192, + }, + "bedrock/mistral.mistral-large-2402-v1:0": { + max_input_tokens: 32768, + max_output_tokens: 8192, + }, + "bedrock/mistral.mistral-large-2407-v1:0": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "bedrock/mistral.mixtral-8x7b-instruct-v0:1": { + max_input_tokens: 32768, + max_output_tokens: 8192, + }, + + // --------------------------------------------------------------------------- + // Mistral (direct API) + // 
--------------------------------------------------------------------------- + "mistral-large": { max_input_tokens: 131072, max_output_tokens: 4096 }, + "mistral-large-latest": { max_input_tokens: 131072, max_output_tokens: 4096 }, + "mistral-medium": { max_input_tokens: 32768, max_output_tokens: 4096 }, + "mistral-small": { max_input_tokens: 32768, max_output_tokens: 4096 }, + "mistral-small-latest": { max_input_tokens: 32768, max_output_tokens: 4096 }, + "mistral-tiny": { max_input_tokens: 32768, max_output_tokens: 4096 }, + "mixtral-8x7b": { max_input_tokens: 32768, max_output_tokens: 4096 }, + "mixtral-8x22b": { max_input_tokens: 65536, max_output_tokens: 4096 }, + + // --------------------------------------------------------------------------- + // Meta Llama (direct / OpenAI-compat, e.g. Together.ai, Fireworks) + // --------------------------------------------------------------------------- + "meta-llama/Llama-2-7b-chat-hf": { + max_input_tokens: 4096, + max_output_tokens: 4096, + }, + "meta-llama/Llama-2-13b-chat-hf": { + max_input_tokens: 4096, + max_output_tokens: 4096, + }, + "meta-llama/Llama-2-70b-chat-hf": { + max_input_tokens: 4096, + max_output_tokens: 4096, + }, + "meta-llama/Meta-Llama-3-8B-Instruct": { + max_input_tokens: 8192, + max_output_tokens: 8192, + }, + "meta-llama/Meta-Llama-3-70B-Instruct": { + max_input_tokens: 8192, + max_output_tokens: 8192, + }, + "meta-llama/Meta-Llama-3.1-8B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Meta-Llama-3.1-70B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Meta-Llama-3.1-405B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Llama-3.2-1B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Llama-3.2-3B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Llama-3.2-11B-Vision-Instruct": { + max_input_tokens: 131072, + 
max_output_tokens: 8192, + }, + "meta-llama/Llama-3.2-90B-Vision-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Llama-3.3-70B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "meta-llama/Llama-4-Scout-17B-16E-Instruct": { + max_input_tokens: 10000000, + max_output_tokens: 16384, + }, + "meta-llama/Llama-4-Maverick-17B-128E-Instruct": { + max_input_tokens: 1000000, + max_output_tokens: 16384, + }, + + // --------------------------------------------------------------------------- + // Qwen (via OpenAI-compat, e.g. vLLM / Together) + // --------------------------------------------------------------------------- + "Qwen/Qwen2-7B-Instruct": { + max_input_tokens: 32768, + max_output_tokens: 8192, + }, + "Qwen/Qwen2-72B-Instruct": { + max_input_tokens: 32768, + max_output_tokens: 8192, + }, + "Qwen/Qwen2.5-7B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "Qwen/Qwen2.5-14B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "Qwen/Qwen2.5-72B-Instruct": { + max_input_tokens: 131072, + max_output_tokens: 8192, + }, + "Qwen/Qwen3-8B": { max_input_tokens: 131072, max_output_tokens: 8192 }, + "Qwen/Qwen3-14B": { max_input_tokens: 131072, max_output_tokens: 8192 }, + "Qwen/Qwen3-32B": { max_input_tokens: 131072, max_output_tokens: 8192 }, + "Qwen/Qwen3-72B": { max_input_tokens: 131072, max_output_tokens: 8192 }, +}; + +/** Returns the built-in minimal registry. Async so the signature matches the + * injected `loadRegistry` slot and allows a future async fetch path. 
*/ +export async function loadBuiltinRegistry(): Promise { + return REGISTRY; +} diff --git a/apps/backend/src/runs/recovery.test.ts b/apps/backend/src/runs/recovery.test.ts new file mode 100644 index 00000000..372474ca --- /dev/null +++ b/apps/backend/src/runs/recovery.test.ts @@ -0,0 +1,364 @@ +import { describe, it, expect, vi } from "vitest"; + +vi.mock("../index.ts", () => ({ db: {} })); // drizzle store unused in these tests +vi.mock("../logger.ts", () => ({ + logger: { warn: vi.fn(), info: vi.fn(), error: vi.fn(), debug: vi.fn() }, +})); + +import { APICallError } from "ai"; +import { + contextOverflowRecoveryMiddleware, + isContextOverflowError, + trimOverflowingPrompt, + type RecoveryContext, +} from "./recovery.ts"; + +const apiError = (args: { + message?: string; + statusCode: number; + responseBody?: string; +}) => + new APICallError({ + message: args.message ?? "Bad Request", + url: "https://provider.example/v1", + requestBodyValues: {}, + statusCode: args.statusCode, + responseBody: args.responseBody, + }); + +// --- isContextOverflowError — per-provider body matrix (drift T9) --------- + +describe("isContextOverflowError (drift T9)", () => { + it("matches the OpenAI phrasing + code", () => { + const err = apiError({ + statusCode: 400, + responseBody: JSON.stringify({ + error: { + message: + "This model's maximum context length is 8192 tokens. However, your messages resulted in 10042 tokens. 
Please reduce the length of the messages.", + type: "invalid_request_error", + code: "context_length_exceeded", + }, + }), + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the Anthropic phrasing", () => { + const err = apiError({ + statusCode: 400, + message: "prompt is too long: 210042 tokens > 200000 maximum", + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the vLLM / OpenAI-compatible phrasing", () => { + const err = apiError({ + statusCode: 400, + responseBody: + '{"object":"error","message":"This model\'s maximum context length is 40960 tokens. However, you requested 45123 tokens (40123 in the messages, 5000 in the completion). Please reduce the length of the messages or completion.","code":40303}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the Google phrasing", () => { + const err = apiError({ + statusCode: 400, + responseBody: + '{"error":{"code":400,"message":"The input token count (1200000) exceeds the maximum number of tokens allowed (1048576).","status":"INVALID_ARGUMENT"}}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the Bedrock ValidationException phrasing", () => { + const err = apiError({ + statusCode: 400, + responseBody: '{"message":"Input is too long for requested model."}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches a 413 payload-too-large with a token message", () => { + const err = apiError({ + statusCode: 413, + responseBody: '{"error":"too many tokens in request"}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("rejects a 400 that is not about context (validation error)", () => { + const err = apiError({ + statusCode: 400, + responseBody: + '{"error":{"message":"Invalid value for temperature: must be between 0 and 2."}}', + }); + expect(isContextOverflowError(err)).toBe(false); + }); + + it("rejects 429 / 401 / 5xx regardless of body", () => { + for (const 
statusCode of [401, 429, 500, 503]) { + const err = apiError({ + statusCode, + responseBody: '{"error":"maximum context length exceeded"}', + }); + expect(isContextOverflowError(err)).toBe(false); + } + }); + + it("rejects non-APICallError values", () => { + expect(isContextOverflowError(new Error("prompt is too long"))).toBe(false); + expect(isContextOverflowError(undefined)).toBe(false); + }); +}); + +// --- middleware: trim + retry-once (§E, drift T3) -------------------------- + +type PromptMsg = { role: string; content: unknown }; + +const text = (role: "user" | "assistant", t: string): PromptMsg => ({ + role, + content: [{ type: "text", text: t }], +}); + +/** system + 2 big + 2 small messages: prune can't help (no tool results), so + * the trim must go through the shared summarize stage (drift T3). */ +const overflowPrompt = (): PromptMsg[] => [ + { role: "system", content: "SYS" }, + text("user", "X".repeat(4000)), + text("assistant", "Y".repeat(4000)), + text("user", "recent question"), + text("assistant", "recent answer"), +]; + +const ctx = (over: Partial = {}): RecoveryContext => ({ + chatId: "chat-1", + imageProvider: "default", + targetTokens: 100, + keepRecentMessages: 4, // recovery halves this → keep 2 + minPrunableChars: 2000, + summarize: async () => "RSUM", + ...over, +}); + +const overflow = () => + apiError({ + statusCode: 400, + responseBody: '{"error":{"code":"context_length_exceeded"}}', + }); + +/** Fake V3 model capturing retry params. 
*/ +const fakeModel = (result: unknown = "RETRIED", fail?: unknown) => { + const calls: Array<{ prompt: PromptMsg[] }> = []; + const impl = async (params: { prompt: PromptMsg[] }) => { + calls.push(params); + if (fail) throw fail; + return result; + }; + return { calls, model: { doGenerate: impl, doStream: impl } }; +}; + +const runWrapGenerate = ( + mw: ReturnType, + args: { + doGenerate: () => Promise; + params: { prompt: PromptMsg[] }; + model: unknown; + }, +) => + (mw.wrapGenerate as (o: unknown) => Promise)({ + doStream: async () => { + throw new Error("unused"); + }, + ...args, + }); + +describe("contextOverflowRecoveryMiddleware (§E)", () => { + it("trims via the shared compactor and retries exactly once on overflow", async () => { + const markDirty = vi.fn(async () => undefined); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const { calls, model } = fakeModel(); + const doGenerate = vi.fn(async () => { + throw overflow(); + }); + + const result = await runWrapGenerate(mw, { + doGenerate, + params: { prompt: overflowPrompt() }, + model, + }); + + expect(result).toBe("RETRIED"); + expect(doGenerate).toHaveBeenCalledTimes(1); + expect(calls).toHaveLength(1); + + const retried = calls[0].prompt; + // System head pinned verbatim at the front (§C). + expect(retried[0]).toEqual({ role: "system", content: "SYS" }); + // The big prefix was replaced by the shared summary message (drift T3 — + // compactModelMessages' shape, not a bespoke trim). + const summary = retried[1] as { content: Array<{ text: string }> }; + expect(summary.content[0].text).toContain( + "[Summary of earlier conversation]", + ); + expect(summary.content[0].text).toContain("RSUM"); + // Recent messages kept verbatim. + expect(retried.at(-1)).toEqual(text("assistant", "recent answer")); + // Dirty flag persisted on DETECTION (before the retry outcome is known). 
+ expect(markDirty).toHaveBeenCalledTimes(1); + }); + + it("propagates the second overflow — no infinite retry", async () => { + const markDirty = vi.fn(async () => undefined); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const second = overflow(); + const { model } = fakeModel(undefined, second); + + await expect( + runWrapGenerate(mw, { + doGenerate: async () => { + throw overflow(); + }, + params: { prompt: overflowPrompt() }, + model, + }), + ).rejects.toBe(second); + // Flag persisted anyway: the NEXT turn must compact durably (drift T3). + expect(markDirty).toHaveBeenCalledTimes(1); + }); + + it("rethrows non-overflow errors without retrying or flagging", async () => { + const markDirty = vi.fn(async () => undefined); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const { calls, model } = fakeModel(); + const authError = apiError({ statusCode: 401, message: "bad key" }); + + await expect( + runWrapGenerate(mw, { + doGenerate: async () => { + throw authError; + }, + params: { prompt: overflowPrompt() }, + model, + }), + ).rejects.toBe(authError); + expect(calls).toHaveLength(0); + expect(markDirty).not.toHaveBeenCalled(); + }); + + it("still retries when persisting the dirty flag fails (best-effort)", async () => { + const markDirty = vi.fn(async () => { + throw new Error("db down"); + }); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const { calls, model } = fakeModel(); + + const result = await runWrapGenerate(mw, { + doGenerate: async () => { + throw overflow(); + }, + params: { prompt: overflowPrompt() }, + model, + }); + expect(result).toBe("RETRIED"); + expect(calls).toHaveLength(1); + }); + + it("surfaces the ORIGINAL overflow when the trim itself fails", async () => { + const first = overflow(); + const mw = contextOverflowRecoveryMiddleware( + ctx({ + summarize: async () => { + throw new Error("summarizer down"); + }, + }), + ); + const { calls, model } = fakeModel(); + + await 
expect( + runWrapGenerate(mw, { + doGenerate: async () => { + throw first; + }, + params: { prompt: overflowPrompt() }, + model, + }), + ).rejects.toBe(first); + expect(calls).toHaveLength(0); + }); + + it("covers the stream path: doStream rejection is trimmed and retried", async () => { + const mw = contextOverflowRecoveryMiddleware(ctx()); + const { calls, model } = fakeModel("STREAMED"); + + const result = await (mw.wrapStream as (o: unknown) => Promise)({ + doGenerate: async () => { + throw new Error("unused"); + }, + doStream: async () => { + throw overflow(); + }, + params: { prompt: overflowPrompt() }, + model, + }); + expect(result).toBe("STREAMED"); + expect(calls).toHaveLength(1); + expect(calls[0].prompt[0]).toEqual({ role: "system", content: "SYS" }); + }); +}); + +describe("trimOverflowingPrompt", () => { + it("pins multiple leading system messages and halves keep-recent", async () => { + const prompt: PromptMsg[] = [ + { role: "system", content: "S1" }, + { role: "system", content: "S2" }, + text("user", "A".repeat(4000)), + text("assistant", "B".repeat(4000)), + text("user", "u2"), + text("assistant", "a2"), + ]; + const { prompt: out, messagesDropped } = await trimOverflowingPrompt( + prompt, + ctx(), // keepRecentMessages 4 → recovery keeps 2 + ); + expect(out[0]).toEqual({ role: "system", content: "S1" }); + expect(out[1]).toEqual({ role: "system", content: "S2" }); + expect(messagesDropped).toBe(2); // the two big messages summarized away + expect(out.at(-2)).toEqual(text("user", "u2")); + expect(out.at(-1)).toEqual(text("assistant", "a2")); + }); + + it("never orphans a tool result at the keep boundary", async () => { + const toolCall: PromptMsg = { + role: "assistant", + content: [ + { type: "tool-call", toolCallId: "t1", toolName: "search", input: {} }, + ], + }; + const toolResult: PromptMsg = { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "t1", + toolName: "search", + output: { type: "text", value: "Z".repeat(4000) 
}, + }, + ], + }; + const prompt: PromptMsg[] = [ + { role: "system", content: "SYS" }, + text("user", "Q".repeat(4000)), + toolCall, + toolResult, // boundary at keep-2 would start recent here — must walk back + text("assistant", "done"), + ]; + const { prompt: out } = await trimOverflowingPrompt(prompt, ctx()); + const firstNonSystem = out.findIndex((m) => m.role !== "system"); + // Recent must not begin with an orphaned role:"tool" message. + expect(out[firstNonSystem].role).not.toBe("tool"); + const toolIdx = out.findIndex((m) => m.role === "tool"); + if (toolIdx !== -1) { + expect(out[toolIdx - 1].role).toBe("assistant"); + } + }); +}); diff --git a/apps/backend/src/runs/recovery.ts b/apps/backend/src/runs/recovery.ts new file mode 100644 index 00000000..b75669e6 --- /dev/null +++ b/apps/backend/src/runs/recovery.ts @@ -0,0 +1,229 @@ +/** + * Context-overflow recovery (context-compaction-plan §E, principle P4). + * + * Recovery is the NET, proactive compaction is the plan: even when Tier 1/2 are + * disabled (kill switch §G) or their estimates were wrong, a provider 400/413 + * "context too long" must not hard-fail the turn. The middleware here wraps the + * language model so EVERY individual model call — the first call of a turn and + * every later step of a tool loop, in both the stream and generate paths — gets + * one trim-and-retry: + * + * 1. Detect the overflow ({@link isContextOverflowError}, per-provider body + * matrix — drift T9). + * 2. Persist `compactionDirty = true` through the single CAS writer so the + * NEXT `prepareChatTurn` forces a durable Tier 1 compaction (drift T3 — + * recovery never writes summary/watermark itself; it only flags). + * 3. Trim in-memory via {@link compactModelMessages} — the shared Tier 2 + * adapter, NOT a bespoke trim (drift T3) — and retry the call once. + * 4. A second failure propagates; {@link formatStreamError} in agent-runner + * surfaces the "conversation too large" message. No infinite retry. 
+ * + * The middleware operates on the `LanguageModelV3Prompt`. Its message shape is + * a structural subset of `ModelMessage` for everything compaction touches + * (roles, text / tool-call / tool-result / file parts, output wrappers), so the + * prompt is passed to `compactModelMessages` directly rather than through a + * lossy converter — one estimator, one trimmer (P2/T3). The leading system + * message(s) are split off first and re-attached verbatim (§C: pin the system + * prompt; the summary must never swallow it). + */ + +import { + APICallError, + type LanguageModelMiddleware, + type ModelMessage, +} from "ai"; +import { logger } from "../logger.ts"; +import { compactModelMessages, type Summarize } from "./compaction.ts"; +import type { ImageProvider } from "./token-estimate.ts"; + +/** + * Everything the middleware needs to trim and retry, resolved once per turn by + * `prepareChatTurn`. `markDirty` is absent for headless runs (triggers, + * sub-agents) — they have no durable chat row to flag. + */ +export type RecoveryContext = { + /** Chat id, for log correlation only. Absent on headless runs. */ + chatId?: string; + imageProvider: ImageProvider; + /** Trim down to this many tokens (the Tier 1 hysteresis target). */ + targetTokens: number; + /** The configured keep-recent; recovery halves it (aggressive trim, §E). */ + keepRecentMessages: number; + minPrunableChars: number; + summarize: Summarize; + summarizerWindow?: number; + /** + * Persists `compactionDirty = true` (via the single CAS writer). Called as + * soon as an overflow is DETECTED — before the retry — so the next turn + * compacts durably even if this retry fails. Best-effort: a failure here + * never blocks the retry. + */ + markDirty?: () => Promise; +}; + +/** + * Per-provider context-overflow phrasings (drift T9). 
Matched against the + * error message AND raw response body, case-insensitive: + * - OpenAI / vLLM / OpenAI-compatible: "This model's maximum context length is + * N tokens…" + code "context_length_exceeded" + * - Anthropic: "prompt is too long: N tokens > N maximum" + * - Google: "The input token count (N) exceeds the maximum number of tokens + * allowed (N)" + * - Bedrock: "Input is too long for requested model." (ValidationException) + * - Generic gateways: "too many tokens", "exceed context limit" + */ +const CONTEXT_OVERFLOW_PATTERN = + /context[ _]length|context_length_exceeded|prompt is too long|too many tokens|maximum context|exceeds the (?:maximum|max)(?: number of)? (?:input )?tokens|input is too long|exceeds? (?:the )?context limit/i; + +/** + * True when `error` is a provider context-overflow rejection: an `APICallError` + * with status 400 or 413 whose message/body matches a known overflow phrasing. + * Rate limits (429), auth (401/403), and 5xx are deliberately excluded — those + * have their own handling and a trim-retry would not help. + */ +export function isContextOverflowError(error: unknown): boolean { + if (!APICallError.isInstance(error)) return false; + if (error.statusCode !== 400 && error.statusCode !== 413) return false; + const haystack = `${error.message ?? ""}\n${ + typeof error.responseBody === "string" ? error.responseBody : "" + }`; + return CONTEXT_OVERFLOW_PATTERN.test(haystack); +} + +/** A V3 prompt message — structurally compatible with ModelMessage (see header). */ +type PromptMessage = { role: string; content: unknown }; + +/** + * Trims an overflowing prompt via the shared Tier 2 adapter. The system head + * (leading `role:"system"` messages) is pinned and re-attached verbatim. + * Exported for unit testing. 
+ */ +export async function trimOverflowingPrompt( + prompt: T[], + ctx: RecoveryContext, +): Promise<{ prompt: T[]; messagesDropped: number }> { + let systemEnd = 0; + while (systemEnd < prompt.length && prompt[systemEnd].role === "system") { + systemEnd++; + } + const systemHead = prompt.slice(0, systemEnd); + const rest = prompt.slice(systemEnd) as unknown as ModelMessage[]; + + const result = await compactModelMessages(rest, { + // Aggressive: halve the configured keep-recent (§E), floor of 2 so a + // user/assistant pair survives. + keepRecentMessages: Math.max(2, Math.ceil(ctx.keepRecentMessages / 2)), + targetTokens: ctx.targetTokens, + minPrunableChars: ctx.minPrunableChars, + imageProvider: ctx.imageProvider, + summarize: ctx.summarize, + summarizerWindow: ctx.summarizerWindow, + // The provider already rejected this prompt, so the estimator is wrong; + // bypass the no-op gate or the retry will be byte-identical (RV3). + force: true, + }); + + return { + prompt: [...systemHead, ...(result.messages as unknown as T[])], + messagesDropped: result.messagesDropped, + }; +} + +/** + * Wraps both `doGenerate` and `doStream` with the detect → flag → trim → retry- + * once sequence. Apply via `wrapLanguageModel({ model, middleware })` in + * agent-runner. Note a stream that overflows MID-stream (after chunks started + * flowing) is not recoverable — providers reject oversized prompts up front, so + * the rejection surfaces from the `doStream()` promise itself, which is caught. + */ +export function contextOverflowRecoveryMiddleware( + ctx: RecoveryContext, +): LanguageModelMiddleware { + // Shared by both wrappers: returns the retried params, or rethrows. + const recoverParams = async

( + error: unknown, + params: P, + ): Promise

=> { + if (!isContextOverflowError(error)) throw error; + + logger.warn( + { + metric: "recovery.overflow_detected", + chatId: ctx.chatId, + error: String(error), + }, + "context overflow detected; trimming and retrying once", + ); + + // Flag durable compaction for the NEXT turn first (drift T3) — even if the + // retry below fails, the next prepareChatTurn must force Tier 1. + if (ctx.markDirty) { + try { + await ctx.markDirty(); + } catch (err) { + logger.error( + { err, chatId: ctx.chatId }, + "failed to persist compactionDirty after overflow", + ); + } + } + + try { + const { prompt, messagesDropped } = await trimOverflowingPrompt( + params.prompt, + ctx, + ); + logger.info( + { metric: "recovery.retry", chatId: ctx.chatId, messagesDropped }, + "overflow recovery trim complete; retrying model call", + ); + return { ...params, prompt }; + } catch (trimError) { + // The trim itself failed (e.g. the summarize call errored). Surface the + // ORIGINAL overflow so the user sees the actionable message. + logger.error( + { err: trimError, chatId: ctx.chatId }, + "overflow recovery trim failed", + ); + throw error; + } + }; + + // Runs the single retry and logs recovery.failed if the provider rejects the + // trimmed prompt too (the dead end formatStreamError then surfaces to the user). 
+ const retry = async (op: () => PromiseLike): Promise => { + try { + return await op(); + } catch (retryError) { + logger.error( + { + metric: "recovery.failed", + chatId: ctx.chatId, + error: String(retryError), + }, + "overflow recovery retry still rejected by provider", + ); + throw retryError; + } + }; + + return { + specificationVersion: "v3", + wrapGenerate: async ({ doGenerate, params, model }) => { + try { + return await doGenerate(); + } catch (error) { + const next = await recoverParams(error, params); + return retry(() => model.doGenerate(next)); + } + }, + wrapStream: async ({ doStream, params, model }) => { + try { + return await doStream(); + } catch (error) { + const next = await recoverParams(error, params); + return retry(() => model.doStream(next)); + } + }, + }; +} diff --git a/apps/backend/src/runs/token-estimate.test.ts b/apps/backend/src/runs/token-estimate.test.ts new file mode 100644 index 00000000..ebac3e74 --- /dev/null +++ b/apps/backend/src/runs/token-estimate.test.ts @@ -0,0 +1,371 @@ +import { describe, it, expect } from "vitest"; +import { convertToModelMessages, type UIMessage } from "ai"; +import { + estimateTokens, + uiMessagesToCountUnits, + modelMessagesToCountUnits, + parseImageDimensions, + imageProviderFor, + CHARS_PER_TOKEN, + DEFAULT_NONTEXT_TOKENS, + MODEL_BOUND_UI_PART_TYPES, + type CountUnit, +} from "./token-estimate.ts"; +import type { ModelMessage } from "ai"; +import type { PlatypusUIMessage } from "../types.ts"; + +// A 24-byte PNG: 8-byte signature + IHDR length/type + width@16 + height@20. 
+function fakePng(width: number, height: number): Uint8Array { + const b = new Uint8Array(24); + b.set([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a], 0); // signature + b.set([0, 0, 0, 13], 8); // IHDR length + b.set([0x49, 0x48, 0x44, 0x52], 12); // "IHDR" + new DataView(b.buffer).setUint32(16, width); + new DataView(b.buffer).setUint32(20, height); + return b; +} + +// A minimal JPEG with a single SOF0 marker carrying dimensions. +function fakeJpeg(width: number, height: number): Uint8Array { + const b = new Uint8Array(12); + b.set([0xff, 0xd8, 0xff, 0xc0, 0x00, 0x11, 0x08], 0); // SOI + SOF0 + len + prec + const view = new DataView(b.buffer); + view.setUint16(7, height); + view.setUint16(9, width); + return b; +} + +function dataUrl(bytes: Uint8Array, mediaType = "image/png"): string { + return `data:${mediaType};base64,${Buffer.from(bytes).toString("base64")}`; +} + +describe("estimateTokens (the single estimator, P2)", () => { + it("applies char/4 to text only, rounding up", () => { + const units: CountUnit[] = [ + { role: "user", text: "abcdefgh", nonText: [] }, + ]; + expect(estimateTokens(units)).toBe(8 / CHARS_PER_TOKEN); + + const odd: CountUnit[] = [{ role: "user", text: "abcde", nonText: [] }]; + expect(estimateTokens(odd)).toBe(2); // ceil(5/4) + }); + + it("sums across multiple units (role-agnostic total)", () => { + const units: CountUnit[] = [ + { role: "system", text: "aaaa", nonText: [] }, + { role: "user", text: "bbbb", nonText: [] }, + { role: "assistant", text: "cccc", nonText: [] }, + ]; + expect(estimateTokens(units)).toBe(3); + }); +}); + +describe("modality table (drift T2 — never char/4 an image)", () => { + it("anthropic: ceil(w*h/750)", () => { + const units: CountUnit[] = [ + { + role: "user", + text: "", + nonText: [{ provider: "anthropic", width: 100, height: 100 }], + }, + ]; + expect(estimateTokens(units)).toBe(Math.ceil((100 * 100) / 750)); // 14 + }); + + it("openai high detail: 85 + 170 per tile", () => { + const units: 
CountUnit[] = [ + { + role: "user", + text: "", + nonText: [{ provider: "openai", width: 100, height: 100 }], + }, + ]; + expect(estimateTokens(units)).toBe(85 + 170 * 1); // single tile + }); + + it("openai low detail is a flat 85, even without dimensions", () => { + const withDims: CountUnit[] = [ + { + role: "user", + text: "", + nonText: [ + { provider: "openai", width: 4000, height: 4000, detail: "low" }, + ], + }, + ]; + expect(estimateTokens(withDims)).toBe(85); + + const noDims: CountUnit[] = [ + { + role: "user", + text: "", + nonText: [{ provider: "openai", detail: "low" }], + }, + ]; + expect(estimateTokens(noDims)).toBe(85); + }); + + it("missing dimensions fall to the conservative default", () => { + const units: CountUnit[] = [ + { role: "user", text: "", nonText: [{ provider: "anthropic" }] }, + ]; + expect(estimateTokens(units)).toBe(DEFAULT_NONTEXT_TOKENS); + }); + + it("unknown provider falls to the conservative default", () => { + const units: CountUnit[] = [ + { + role: "user", + text: "", + nonText: [{ provider: "default", width: 100, height: 100 }], + }, + ]; + expect(estimateTokens(units)).toBe(DEFAULT_NONTEXT_TOKENS); + }); + + it("an image is NOT counted as char/4 of its base64 bytes", () => { + const png = fakePng(64, 64); + const ui: PlatypusUIMessage[] = [ + { + id: "m1", + role: "user", + parts: [{ type: "file", mediaType: "image/png", url: dataUrl(png) }], + } as PlatypusUIMessage, + ]; + const tokens = estimateTokens(uiMessagesToCountUnits(ui, "anthropic")); + // char/4 of the base64 data URL would be far larger than the table cost. 
+ const charsIfNaive = Math.ceil(dataUrl(png).length / CHARS_PER_TOKEN); + expect(tokens).toBe(Math.ceil((64 * 64) / 750)); + expect(tokens).toBeLessThan(charsIfNaive); + }); +}); + +describe("parseImageDimensions (cheap header parse)", () => { + it("reads PNG IHDR dimensions", () => { + expect(parseImageDimensions(fakePng(800, 600))).toEqual({ + width: 800, + height: 600, + }); + }); + + it("reads JPEG SOF dimensions", () => { + expect(parseImageDimensions(fakeJpeg(320, 240))).toEqual({ + width: 320, + height: 240, + }); + }); + + it("returns undefined for unrecognized bytes", () => { + expect(parseImageDimensions(new Uint8Array([1, 2, 3, 4]))).toBeUndefined(); + }); +}); + +describe("MODEL_BOUND filter (drift T1 — UI-only parts excluded)", () => { + it("counts text but ignores reasoning / source / step-start / data parts", () => { + const ui: PlatypusUIMessage[] = [ + { + id: "m1", + role: "assistant", + parts: [ + { type: "reasoning", text: "thinking hard about it" }, + { type: "text", text: "hello" }, + { type: "step-start" }, + { type: "source-url", sourceId: "s1", url: "https://example.com" }, + { type: "data-custom", data: { hidden: "payload" } }, + ], + } as unknown as PlatypusUIMessage, + ]; + const units = uiMessagesToCountUnits(ui); + expect(units).toHaveLength(1); + expect(units[0].text).toBe("hello"); + expect(units[0].nonText).toHaveLength(0); + }); + + it("only text/file UI part types are model-bound (RV10 — the documented set)", () => { + expect([...MODEL_BOUND_UI_PART_TYPES]).toEqual(["text", "file"]); + // The UI-only types the adapter must drop are NOT in the model-bound set. 
+ for (const uiOnly of [ + "reasoning", + "source-url", + "source-document", + "step-start", + "data-custom", + ]) { + expect(MODEL_BOUND_UI_PART_TYPES).not.toContain(uiOnly); + } + }); +}); + +describe("tool-result output variants (RV10 — model adapter)", () => { + const unit = (output: unknown): CountUnit => { + const msg = { + role: "tool", + content: [ + { type: "tool-result", toolCallId: "c1", toolName: "t", output }, + ], + } as unknown as ModelMessage; + return modelMessagesToCountUnits([msg])[0]; + }; + + it("folds text / json / content value into char/4 text", () => { + expect(unit({ type: "text", value: "hello world" }).text).toContain( + "hello", + ); + expect(unit({ type: "json", value: { a: 1 } }).text).toContain('"a"'); + expect( + unit({ type: "content", value: [{ type: "text", text: "deep" }] }).text, + ).toContain("deep"); + }); + + it("uses the reason (not a value) for execution-denied", () => { + expect( + unit({ type: "execution-denied", reason: "blocked" }).text, + ).toContain("blocked"); + }); +}); + +describe("adapter equality (drift T1 — one estimate across both shapes)", () => { + it("estimate(UI) === estimate(convertToModelMessages(UI)) exactly", async () => { + const png = fakePng(128, 128); + const ui: UIMessage[] = [ + { + id: "s", + role: "system", + parts: [{ type: "text", text: "You are helpful." }], + }, + { + id: "u", + role: "user", + parts: [ + { type: "text", text: "What is the weather and look at this image?" }, + { type: "file", mediaType: "image/png", url: dataUrl(png) }, + ], + }, + { + id: "a", + role: "assistant", + parts: [ + { type: "text", text: "Let me check." }, + { + type: "tool-getWeather", + toolCallId: "call-1", + state: "output-available", + input: { city: "San Francisco", units: "metric" }, + output: { temperatureC: 18, condition: "foggy" }, + }, + ], + } as unknown as UIMessage, + { + id: "a2", + role: "assistant", + parts: [{ type: "text", text: "It is 18C and foggy." 
}], + }, + ]; + + const model = await convertToModelMessages(ui); + + const uiTokens = estimateTokens( + uiMessagesToCountUnits(ui as PlatypusUIMessage[], "openai"), + ); + const modelTokens = estimateTokens( + modelMessagesToCountUnits(model, "openai"), + ); + + expect(uiTokens).toBe(modelTokens); + expect(uiTokens).toBeGreaterThan(0); + }); +}); + +describe("imageProviderFor", () => { + it("maps provider types to cost families", () => { + expect(imageProviderFor("Anthropic")).toBe("anthropic"); + expect(imageProviderFor("Bedrock")).toBe("anthropic"); + expect(imageProviderFor("OpenAI")).toBe("openai"); + expect(imageProviderFor("OpenRouter")).toBe("default"); + expect(imageProviderFor("Google")).toBe("default"); + }); +}); + +// --- estimateOverheadTokens (drift C1) ------------------------------------- + +import { z } from "zod"; +import { tool } from "ai"; +import { estimateOverheadTokens } from "./token-estimate.ts"; + +describe("estimateOverheadTokens (drift C1)", () => { + it("counts the system prompt at char/4", () => { + const sys = "S".repeat(400); + expect(estimateOverheadTokens(sys, {})).toBe(100); + }); + + it("handles missing system prompt and tools", () => { + expect(estimateOverheadTokens(undefined, undefined)).toBe(0); + }); + + it("counts tool name, description, and serialized JSON schema", () => { + const sys = "system"; + const base = estimateOverheadTokens(sys, {}); + const withTool = estimateOverheadTokens(sys, { + searchDocuments: tool({ + description: + "Searches the workspace document store and returns ranked matches.", + inputSchema: z.object({ + query: z.string().describe("Full-text query string"), + limit: z.number().optional().describe("Maximum results to return"), + }), + }), + }); + // Name + description alone are ~20 tokens; the serialized schema (with + // property names and descriptions) must push it well past that. 
+ expect(withTool).toBeGreaterThan(base + 40); + }); + + it("falls back to a conservative flat cost for unserializable schemas", () => { + const tokens = estimateOverheadTokens("", { + weird: { description: "", inputSchema: 42 } as never, + }); + // Either the fallback constant fired or some serialization succeeded — + // never zero, never a throw. + expect(tokens).toBeGreaterThanOrEqual(2); // ≥ name chars / 4 + expect(Number.isFinite(tokens)).toBe(true); + }); + + it("scales with a realistic multi-tool agent (the 8888-vs-986 gap)", () => { + const sys = "You are a helpful agent.\n".repeat(40); // ~1k chars + const tools = Object.fromEntries( + Array.from({ length: 8 }, (_, i) => [ + `tool_${i}`, + tool({ + description: + "A realistically verbose tool description explaining inputs, outputs, constraints, and error behaviour for the model.", + inputSchema: z.object({ + target: z.string().describe("The resource identifier to act on"), + options: z + .object({ + recursive: z.boolean().optional(), + depth: z.number().optional(), + filter: z.string().optional(), + }) + .optional(), + }), + }), + ]), + ); + // The point of C1: this payload is large even with a short history. + expect(estimateOverheadTokens(sys, tools)).toBeGreaterThan(500); + }); + + it("is stable across repeated calls (RV9 schema-cache must not change counts)", () => { + const sys = "system prompt"; + const tools = { + lookup: tool({ + description: "Look something up by id.", + inputSchema: z.object({ id: z.string().describe("identifier") }), + }), + }; + const first = estimateOverheadTokens(sys, tools); + // Same tool objects → WeakMap hit on the second call; the memoized schema + // length must reproduce the exact token count, never drift. 
+ expect(estimateOverheadTokens(sys, tools)).toBe(first); + }); +}); diff --git a/apps/backend/src/runs/token-estimate.ts b/apps/backend/src/runs/token-estimate.ts new file mode 100644 index 00000000..0913cd87 --- /dev/null +++ b/apps/backend/src/runs/token-estimate.ts @@ -0,0 +1,518 @@ +/** + * The single token estimator (context-compaction-plan §B, principle P2). + * + * Token counting lives in **exactly one** function — {@link estimateTokens} — + * over **one** neutral structure ({@link CountUnit}). Tier 1 operates on + * UIMessages and Tier 2 on ModelMessages; both normalize into `CountUnit[]` via + * the adapters here, so the two tiers can never diverge on a count (drift T1). + * + * Hard rules baked in: + * - **char/4 applies to text only.** Tool-call inputs and tool-result outputs + * are text-like to the model, so they fold into a unit's `text`. Image / + * binary bytes are NEVER char/4'd — they go through the modality table + * ({@link nonTextTokens}, drift T2). + * - **UI-only parts are excluded on both sides.** `reasoning`, `source-url`, + * `source-document`, `step-start`, and `data-*` never reach the model, so + * they are dropped by both adapters (drift T1). + * - The estimate is content-only — **no per-message role framing overhead** — + * so the total is invariant to how messages are grouped. That is what lets + * the UIMessage and ModelMessage adapters agree exactly even though + * `convertToModelMessages` splits one UI message into several model messages. + * + * The char/4 estimate runs every turn. The provider-reported + * `usage.inputTokens` from the prior turn acts as a corrective baseline when + * available (`Tier1Input.lastInputTokens` — threaded by the §H usage-metadata + * chunk); until then the cold-start margin (M2) compensates for under-counts. 
+ */ + +import { + asSchema, + type ModelMessage, + type Tool, + type ToolResultPart, + type DataContent, +} from "ai"; +import type { PlatypusUIMessage } from "../types.ts"; + +/** Number of characters approximated as one token (text only). */ +export const CHARS_PER_TOKEN = 4; + +/** + * Conservative flat cost for a non-text part whose true cost we cannot compute + * (unknown provider, missing image dimensions, non-image binary file). Over- + * counting beats overflow (drift T2). + */ +export const DEFAULT_NONTEXT_TOKENS = 1200; + +/** OpenAI's flat cost for a `detail: "low"` image, independent of size. */ +const OPENAI_LOW_DETAIL_TOKENS = 85; + +/** + * The provider families with a known image-cost formula. Everything else maps + * to `"default"` and pays the conservative flat cost. + */ +export type ImageProvider = "anthropic" | "openai" | "default"; + +/** + * A non-text, model-bound part reduced to what the estimator needs: which + * provider formula applies, and (when known) the decoded pixel dimensions. + * `width`/`height` undefined → the provider's missing-dimension fallback. + */ +export type NonTextPart = { + provider: ImageProvider; + width?: number; + height?: number; + /** OpenAI image detail hint. Unset is treated as `"high"` (over-count). */ + detail?: "low" | "high"; +}; + +/** Message role, neutral across UIMessage and ModelMessage shapes. */ +export type CountRole = "system" | "user" | "assistant" | "tool"; + +/** + * The neutral counting structure. One per source message. `text` is the + * char/4'd blob (text parts + serialized tool input/output); `nonText` holds + * images/binaries counted via the modality table. + */ +export type CountUnit = { + role: CountRole; + text: string; + nonText: NonTextPart[]; +}; + +/** + * UIMessage part `type`s that reach the model and are therefore counted. Kept + * as data so the test can assert the UI-only parts are excluded (drift T1). 
+ * Tool parts are matched separately by the `tool-`/`dynamic-tool` prefix.
+ */
+export const MODEL_BOUND_UI_PART_TYPES = ["text", "file"] as const;
+
+// ---------------------------------------------------------------------------
+// The estimator (the one function — P2)
+// ---------------------------------------------------------------------------
+
+/**
+ * Token cost of one non-text part, looked up in the modality table.
+ *
+ * - OpenAI `detail: "low"` is a flat cost whether or not dimensions are known.
+ * - Unknown dimensions, or a provider family without a formula, pay the
+ *   conservative flat default.
+ * - Anthropic and OpenAI (high detail) use their per-pixel / per-tile formulas.
+ */
+function nonTextTokens(part: NonTextPart): number {
+  const { provider, width, height, detail } = part;
+
+  // Flat-rate case first: it does not depend on the pixel dimensions at all.
+  if (provider === "openai" && detail === "low") {
+    return OPENAI_LOW_DETAIL_TOKENS;
+  }
+
+  // No dimensions in hand → nothing to feed a formula; over-count instead.
+  if (width == null || height == null) {
+    return DEFAULT_NONTEXT_TOKENS;
+  }
+
+  if (provider === "anthropic") {
+    // Anthropic's documented approximation: tokens ≈ (w × h) / 750.
+    return Math.ceil((width * height) / 750);
+  }
+
+  if (provider === "openai") {
+    // Unset detail is treated as "high" (the more expensive reading).
+    return openaiHighDetailTokens(width, height);
+  }
+
+  return DEFAULT_NONTEXT_TOKENS;
+}
+
+/**
+ * High-detail image cost for OpenAI (gpt-4o family). The image is first shrunk
+ * to fit a 2048×2048 box, then shrunk again so its shortest side is at most
+ * 768px; the result is billed as 85 base tokens plus 170 per 512px tile.
+ */
+function openaiHighDetailTokens(w: number, h: number): number {
+  let size = { width: w, height: h };
+  // Applies one uniform scale factor to the current size, rounding each side.
+  const scaled = (factor: number) => ({
+    width: Math.round(size.width * factor),
+    height: Math.round(size.height * factor),
+  });
+
+  const longSide = Math.max(size.width, size.height);
+  if (longSide > 2048) size = scaled(2048 / longSide);
+
+  const shortSide = Math.min(size.width, size.height);
+  if (shortSide > 768) size = scaled(768 / shortSide);
+
+  const tilesAcross = Math.ceil(size.width / 512);
+  const tilesDown = Math.ceil(size.height / 512);
+  return 85 + 170 * (tilesAcross * tilesDown);
+}
+
+/**
+ * The single estimator. Sums char/4 of each unit's text plus the modality-table
+ * cost of each non-text part. Content-only, role-agnostic (see file header). 
+ */
+export const estimateTokens = (units: CountUnit[]): number => {
+  let total = 0;
+  for (const unit of units) {
+    total += Math.ceil(unit.text.length / CHARS_PER_TOKEN);
+    for (const part of unit.nonText) total += nonTextTokens(part);
+  }
+  return total;
+};
+
+// ---------------------------------------------------------------------------
+// Shared helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * Deterministic JSON with sorted keys, so the same value serializes to the same
+ * string from either adapter (the UIMessage and ModelMessage shapes must agree
+ * exactly — drift T1). Cheaper than guarding key order at every call site.
+ *
+ * The output only needs to be stable and length-representative, not valid
+ * JSON: values `JSON.stringify` cannot represent (undefined, functions,
+ * symbols) contribute an empty string, and bigints contribute their decimal
+ * digits instead of throwing.
+ *
+ * @param value Any value; expected to be acyclic.
+ * @returns A key-order-independent serialization of `value`.
+ */
+export function stableStringify(value: unknown): string {
+  // JSON.stringify throws a TypeError on bigint; the estimator must not throw
+  // because a tool input/output happens to carry one.
+  if (typeof value === "bigint") return value.toString();
+  if (value === null || typeof value !== "object")
+    return JSON.stringify(value) ?? "";
+  if (Array.isArray(value))
+    return `[${value.map((item) => stableStringify(item)).join(",")}]`;
+  const obj = value as Record<string, unknown>;
+  const keys = Object.keys(obj).sort();
+  return `{${keys
+    .map((k) => `${JSON.stringify(k)}:${stableStringify(obj[k])}`)
+    .join(",")}}`;
+}
+
+/** True when `mediaType` is a string starting with `image/`. */
+function isImageMediaType(mediaType: string | undefined): boolean {
+  return typeof mediaType === "string" && mediaType.startsWith("image/");
+}
+
+/**
+ * Builds a {@link NonTextPart} for an image, parsing pixel dimensions from the
+ * bytes when available (drift T2: a cheap header read, no full decode).
+ * Without bytes (or with an unrecognized header) width/height stay undefined.
+ */
+function imagePart(
+  provider: ImageProvider,
+  bytes: Uint8Array | undefined,
+  detail?: "low" | "high",
+): NonTextPart {
+  const dims = bytes ? parseImageDimensions(bytes) : undefined;
+  return { provider, width: dims?.width, height: dims?.height, detail };
+}
+
+/** A non-image binary file: conservative flat cost, no formula. 
*/
+function binaryPart(): NonTextPart {
+  return { provider: "default" };
+}
+
+// ---------------------------------------------------------------------------
+// Image dimension parsing (cheap header parse — PNG IHDR / JPEG SOF)
+// ---------------------------------------------------------------------------
+
+/**
+ * Reads pixel dimensions from PNG / JPEG headers without decoding the image.
+ * Returns undefined for unrecognized formats or truncated data — the caller
+ * then falls to the conservative constant (drift T2).
+ *
+ * A header that declares a zero width or height (corrupt PNG, or a JPEG SOF
+ * with height 0) is also reported as undefined: a 0-pixel area would otherwise
+ * produce a zero or base-only token estimate, and over-counting beats overflow.
+ *
+ * @param bytes The image bytes, or any prefix long enough to contain the header.
+ * @returns The declared dimensions, or undefined when they cannot be trusted.
+ */
+export function parseImageDimensions(
+  bytes: Uint8Array,
+): { width: number; height: number } | undefined {
+  // DataView reads are big-endian by default, matching both formats.
+  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+  // Only strictly positive dimensions are usable by the cost formulas.
+  const sized = (width: number, height: number) =>
+    width > 0 && height > 0 ? { width, height } : undefined;
+
+  // PNG: 8-byte signature, then IHDR chunk with width@16, height@20 (BE).
+  if (
+    bytes.length >= 24 &&
+    bytes[0] === 0x89 &&
+    bytes[1] === 0x50 &&
+    bytes[2] === 0x4e &&
+    bytes[3] === 0x47
+  ) {
+    return sized(view.getUint32(16), view.getUint32(20));
+  }
+
+  // JPEG: 0xFFD8 start, then walk segment markers to the SOF that carries dims.
+  if (bytes.length >= 4 && bytes[0] === 0xff && bytes[1] === 0xd8) {
+    let offset = 2;
+    // `offset + 9` keeps a whole SOF header in bounds: marker (2), length (2),
+    // precision (1), height (2), width (2), component count (1).
+    while (offset + 9 < bytes.length) {
+      if (bytes[offset] !== 0xff) {
+        offset++;
+        continue;
+      }
+      const marker = bytes[offset + 1];
+      // 0xFF fill bytes pad before a real marker; consume one and re-read so a
+      // run of fill bytes doesn't get mistaken for a segment (RV10).
+      if (marker === 0xff) {
+        offset++;
+        continue;
+      }
+      // 0xFF00 is a stuffed data byte inside entropy-coded data, not a marker.
+      if (marker === 0x00) {
+        offset += 2;
+        continue;
+      }
+      // SOF0..SOF15 carry frame dimensions, excluding DHT(C4)/JPG(C8)/DAC(CC).
+      const isSof =
+        marker >= 0xc0 &&
+        marker <= 0xcf &&
+        marker !== 0xc4 &&
+        marker !== 0xc8 &&
+        marker !== 0xcc;
+      if (isSof) {
+        // SOF payload order is height first, then width.
+        const height = view.getUint16(offset + 5);
+        const width = view.getUint16(offset + 7);
+        return sized(width, height);
+      }
+      // Standalone markers with no length payload: SOI(D8), EOI(D9),
+      // RSTn(D0-D7), TEM(01) (RV10). Skip the 2-byte marker.
+      if (
+        marker === 0xd8 ||
+        marker === 0xd9 ||
+        marker === 0x01 ||
+        (marker >= 0xd0 && marker <= 0xd7)
+      ) {
+        offset += 2;
+        continue;
+      }
+      // The segment length counts its own two bytes, so anything below 2 is
+      // malformed and would stall or rewind the walk.
+      const segLength = view.getUint16(offset + 2);
+      if (segLength < 2) return undefined;
+      offset += 2 + segLength;
+    }
+  }
+
+  return undefined;
+}
+
+/**
+ * Upper bound on bytes decoded from a data URL for header parsing (RV9). PNG
+ * dimensions live in the first 24 bytes; a JPEG SOF marker is almost always
+ * within the first few KB. Decoding only a 64 KB prefix avoids materializing a
+ * multi-MB image on every estimation pass — we never need the pixel data, only
+ * the header. base64 packs 3 bytes per 4 chars, so cap the input accordingly.
+ */
+const HEADER_DECODE_MAX_BYTES = 64 * 1024;
+const HEADER_DECODE_MAX_B64_CHARS = Math.ceil(HEADER_DECODE_MAX_BYTES / 3) * 4;
+
+/**
+ * Decodes the bytes behind a UIMessage file URL when it is a base64 data URL.
+ * Hosted (http/https) URLs return undefined — we have no bytes in hand, so the
+ * caller falls to the conservative constant. Only a bounded prefix is decoded
+ * (RV9) since the caller only reads image headers. 
+ */
+function bytesFromUrl(url: string): Uint8Array | undefined {
+  // Split on the first comma instead of running a regex over the whole
+  // (possibly multi-MB) payload: only the short metadata prefix is inspected.
+  if (!url.startsWith("data:")) return undefined;
+  const comma = url.indexOf(",");
+  if (comma < 0) return undefined;
+  // RFC 2397 allows parameters before the encoding marker
+  // (`data:image/png;charset=utf-8;base64,…`), so only require that the
+  // metadata ends with `;base64`.
+  if (!/;base64$/i.test(url.slice(5, comma))) return undefined;
+  return decodeBase64Prefix(url.slice(comma + 1));
+}
+
+/**
+ * Decodes at most {@link HEADER_DECODE_MAX_B64_CHARS} leading base64 chars —
+ * enough for image-header parsing without materializing the whole payload.
+ */
+function decodeBase64Prefix(b64: string): Uint8Array | undefined {
+  try {
+    const prefix = b64.slice(0, HEADER_DECODE_MAX_B64_CHARS);
+    return new Uint8Array(Buffer.from(prefix, "base64"));
+  } catch {
+    return undefined;
+  }
+}
+
+/** Matches a leading URI scheme (`data:`, `https:`, `storage:`, …). */
+const URL_SCHEME_PATTERN = /^[a-z][a-z0-9+.-]*:/i;
+
+/**
+ * Normalizes the various ModelMessage byte containers into a Uint8Array.
+ *
+ * Strings come in two flavours: URL-shaped strings (a data URL carries bytes,
+ * a hosted URL does not) and raw base64 payloads, which is what the AI SDK
+ * documents a string `DataContent` to be. The base64 alphabet has no `:`, so a
+ * leading URI scheme cleanly separates the two.
+ */
+function bytesFromDataContent(data: DataContent | URL): Uint8Array | undefined {
+  if (typeof data === "string") {
+    return URL_SCHEME_PATTERN.test(data)
+      ? bytesFromUrl(data)
+      : decodeBase64Prefix(data);
+  }
+  // A URL object only has bytes in hand when it is itself a base64 data URL.
+  if (data instanceof URL) return bytesFromUrl(data.href);
+  // Node's Buffer is a Uint8Array subclass, so this branch covers it too.
+  if (data instanceof Uint8Array) return data;
+  if (data instanceof ArrayBuffer) return new Uint8Array(data);
+  return undefined;
+}
+
+// ---------------------------------------------------------------------------
+// Tier 1 adapter — UIMessage → CountUnit (one unit per message)
+// ---------------------------------------------------------------------------
+
+/**
+ * Reduces one UIMessage to a {@link CountUnit}: text and tool payloads are
+ * concatenated into `text`, file parts become `nonText` entries, and every
+ * other part type is dropped as UI-only.
+ */
+function uiMessageToCountUnit(
+  message: PlatypusUIMessage,
+  provider: ImageProvider,
+): CountUnit {
+  let text = "";
+  const nonText: NonTextPart[] = [];
+
+  for (const part of message.parts ?? []) {
+    const type = part.type;
+
+    if (type === "text") {
+      text += (part as { text: string }).text;
+      continue;
+    }
+
+    if (type === "file") {
+      const file = part as { mediaType?: string; url: string };
+      const bytes = bytesFromUrl(file.url);
+      if (isImageMediaType(file.mediaType)) {
+        nonText.push(imagePart(provider, bytes));
+      } else {
+        nonText.push(binaryPart());
+      }
+      continue;
+    }
+
+    // Tool invocations (`tool-<name>` and `dynamic-tool`) are model-bound and
+    // text-like: fold their input + output into the char/4 blob.
+    if (type === "dynamic-tool" || type.startsWith("tool-")) {
+      const tool = part as {
+        input?: unknown;
+        output?: unknown;
+        errorText?: unknown;
+      };
+      if (tool.input !== undefined) text += stableStringify(tool.input);
+      if (tool.output !== undefined) {
+        text += stableStringify(tool.output);
+      } else if (typeof tool.errorText === "string") {
+        // A failed invocation has no `output`, only `errorText`. The model
+        // adapter counts the `error-text` tool-result value via
+        // `toolResultOutputText`, so count the same string here to keep the
+        // two adapters in agreement (drift T1).
+        // NOTE(review): relies on convertToModelMessages emitting an
+        // `error-text` result for errored tool parts — confirm against the
+        // pinned `ai` version.
+        text += stableStringify(tool.errorText);
+      }
+      continue;
+    }
+
+    // Everything else (reasoning, source-url, source-document, step-start,
+    // data-*) is UI-only and excluded on both sides (drift T1).
+  }
+
+  return { role: message.role, text, nonText };
+}
+
+/** Tier 1 adapter: UIMessages → neutral count units. */
+export function uiMessagesToCountUnits(
+  messages: PlatypusUIMessage[],
+  provider: ImageProvider = "default",
+): CountUnit[] {
+  return messages.map((m) => uiMessageToCountUnit(m, provider));
+}
+
+// ---------------------------------------------------------------------------
+// Tier 2 adapter — ModelMessage → CountUnit (one unit per message)
+// ---------------------------------------------------------------------------
+
+/**
+ * Extracts the model-visible string from a tool-result output wrapper. Only two
+ * behaviours exist: `execution-denied` carries a `reason`; every other variant
+ * (`text` / `error-text` / `json` / `error-json` / `content`) carries a `value`
+ * that is char/4'd via `stableStringify` — mirroring the UI adapter, which folds
+ * the raw output the same way (RV10: the old per-label switch collapsed to these
+ * two and carried an unreachable `default`).
+ */
+function toolResultOutputText(output: ToolResultPart["output"]): string {
+  return output.type === "execution-denied"
+    ? stableStringify(output.reason ?? "")
+    : stableStringify(output.value);
+}
+
+/**
+ * Reduces one ModelMessage to a {@link CountUnit}. String content is taken as
+ * is; for part arrays, text / tool-call input / tool-result output fold into
+ * `text`, image and file parts become `nonText` entries, and all other part
+ * types are ignored.
+ */
+function modelMessageToCountUnit(
+  message: ModelMessage,
+  provider: ImageProvider,
+): CountUnit {
+  const role = message.role;
+  let text = "";
+  const nonText: NonTextPart[] = [];
+
+  const { content } = message;
+  if (typeof content === "string") {
+    return { role, text: content, nonText };
+  }
+
+  for (const part of content) {
+    switch (part.type) {
+      case "text":
+        text += part.text;
+        break;
+      case "tool-call":
+        text += stableStringify(part.input);
+        break;
+      case "tool-result":
+        text += toolResultOutputText(part.output);
+        break;
+      case "image": {
+        const img = part;
+        nonText.push(imagePart(provider, bytesFromDataContent(img.image)));
+        break;
+      }
+      case "file": {
+        const file = part;
+        if (isImageMediaType(file.mediaType)) {
+          nonText.push(imagePart(provider, bytesFromDataContent(file.data)));
+        } else {
+          nonText.push(binaryPart());
+        }
+        break;
+      }
+      // reasoning / tool-approval-* are UI-only or control parts — excluded.
+      default:
+        break;
+    }
+  }
+
+  return { role, text, nonText };
+}
+
+/** Tier 2 adapter: ModelMessages → neutral count units. */
+export function modelMessagesToCountUnits(
+  messages: ModelMessage[],
+  provider: ImageProvider = "default",
+): CountUnit[] {
+  return messages.map((m) => modelMessageToCountUnit(m, provider));
+}
+
+// ---------------------------------------------------------------------------
+// Per-turn overhead — system prompt + tool schemas (drift C1)
+// ---------------------------------------------------------------------------
+
+/**
+ * Flat fallback for a tool whose input schema cannot be serialized (e.g. a
+ * provider-defined tool with no JSON-schema representation). Conservative —
+ * over-counting beats overflow.
+ */
+export const TOOL_SCHEMA_FALLBACK_TOKENS = 200;
+
+/**
+ * Serialized-schema char length cached per input-schema object (RV9). The
+ * `asSchema(...) 
→ stableStringify` conversion is the expensive part of overhead
+ * estimation and a tool's schema object is stable across turns, so memoize it.
+ * A WeakMap keyed by the schema object never pins a tool that goes out of scope.
+ */
+const schemaLenCache = new WeakMap<object, number>();
+
+/**
+ * Estimates the tokens of the per-turn payload that is NOT in the message
+ * history: the rendered system prompt plus every tool's name, description, and
+ * JSON input schema — all sent to the model on every turn, and the dominant
+ * cause of the C1 trigger under-count on tool-bearing agents (observed 8888
+ * provider-reported vs ~986 message-only). Same char/4 rule as the single
+ * estimator; the result feeds `Tier1Input.overheadTokens`.
+ *
+ * @param systemPrompt Rendered system prompt; undefined counts as empty.
+ * @param tools Tool set keyed by tool name; undefined counts as no tools.
+ * @returns The estimated token count; never throws on a schema that cannot be
+ *   converted (that tool pays `TOOL_SCHEMA_FALLBACK_TOKENS` instead).
+ */
+export function estimateOverheadTokens(
+  systemPrompt: string | undefined,
+  tools: Record<string, Tool> | undefined,
+): number {
+  let tokens = Math.ceil((systemPrompt ?? "").length / CHARS_PER_TOKEN);
+  for (const [name, tool] of Object.entries(tools ?? {})) {
+    const t = tool as { description?: string; inputSchema?: unknown };
+    let schemaLen = 0;
+    if (t.inputSchema != null) {
+      // Only objects can be WeakMap keys; anything else is computed every call.
+      const key = typeof t.inputSchema === "object" ? t.inputSchema : undefined;
+      const cached = key ? schemaLenCache.get(key) : undefined;
+      if (cached !== undefined) {
+        schemaLen = cached;
+      } else {
+        try {
+          // asSchema is the SDK's own conversion to the wire-format JSON schema.
+          schemaLen = stableStringify(
+            asSchema(t.inputSchema as never).jsonSchema,
+          ).length;
+        } catch {
+          // Express the flat fallback as an equivalent char length. It is an
+          // exact multiple of CHARS_PER_TOKEN, so the ceil below yields
+          // exactly TOOL_SCHEMA_FALLBACK_TOKENS on top of name + description.
+          schemaLen = TOOL_SCHEMA_FALLBACK_TOKENS * CHARS_PER_TOKEN;
+        }
+        // Memoize failures too, so a schema that cannot be converted is not
+        // re-converted (and re-thrown) on every turn.
+        if (key) schemaLenCache.set(key, schemaLen);
+      }
+    }
+    // Concatenated length == sum of lengths, so this stays numerically identical
+    // to folding the schema string into `text` before the single char/4 divide.
+    const baseLen = (name + (t.description ?? "")).length + schemaLen;
+    tokens += Math.ceil(baseLen / CHARS_PER_TOKEN);
+  }
+  return tokens;
+}
+
+/**
+ * Maps a provider `providerType` (as stored on the provider row) to the image
+ * cost family. 
Bedrock most commonly serves Anthropic models, so it maps to + * `anthropic`; OpenRouter is heterogeneous and maps to `default`. + */ +export function imageProviderFor(providerType: string): ImageProvider { + switch (providerType) { + case "Anthropic": + case "Bedrock": + return "anthropic"; + case "OpenAI": + return "openai"; + default: + return "default"; + } +} diff --git a/apps/backend/src/runs/types.ts b/apps/backend/src/runs/types.ts index 56ac27f3..6a91d72a 100644 --- a/apps/backend/src/runs/types.ts +++ b/apps/backend/src/runs/types.ts @@ -1,5 +1,5 @@ import type { PlatypusUIMessage } from "../types.ts"; -import type { ChatSubmitData, ChatTurn } from "../services/chat-execution.ts"; +import type { ChatTurnRequest, ChatTurn } from "../services/chat-execution.ts"; export type RunId = string; @@ -19,7 +19,7 @@ export type RunStats = { */ export type RunInput = { runId: RunId; - request: ChatSubmitData; + request: ChatTurnRequest; messages: PlatypusUIMessage[]; }; diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 2386db9a..fdfca441 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -23,16 +23,40 @@ import { retrieveRecentSummaries, type MemorySummary, } from "./memory-retrieval.ts"; -import type { - ChatSubmitData as ChatSubmitDataSchema, - Provider, - Skill, -} from "@platypus/schemas"; -import type { Tool } from "ai"; +import type { Provider, Skill } from "@platypus/schemas"; +import { generateText, type Tool } from "ai"; import { logger } from "../logger.ts"; import { buildMcpTransportConfig } from "./mcp-oauth-provider.ts"; import { inlineFileUrls } from "../storage/utils.ts"; import type { PlatypusUIMessage } from "../types.ts"; +import { chat as chatTable } from "../db/schema.ts"; +import { + contextWindowResolver, + DEFAULT_CONTEXT_WINDOW, +} from "../runs/context-window.ts"; +import { + estimateTokens, + estimateOverheadTokens, + 
imageProviderFor, + uiMessagesToCountUnits, + type ImageProvider, +} from "../runs/token-estimate.ts"; +import { + applyTier1Compaction, + affectedBelowWatermark, + buildTier2PrepareStep, + computeBudget, + drizzleCompactionStore, + invalidateCompaction, + resolveCompactionConfig, + setCompactionDirty, + type Budget, + type CompactionConfig, + type CompactionState, + type Summarize, + type Tier2Context, +} from "../runs/compaction.ts"; +import type { RecoveryContext } from "../runs/recovery.ts"; // --- Errors --- @@ -85,7 +109,20 @@ type GenerationConfig = { skills?: Array>; }; -type ChatSubmitData = { +/** + * The slim request shape `prepareChatTurn` actually consumes: agent/provider + * selection plus generation overrides. Distinct from `@platypus/schemas`' + * `ChatSubmitData` (the HTTP payload, which also carries id/workspaceId/ + * messages) — those arrive as separate `PrepareChatTurnInput` fields. + */ +export type ChatTurnRequest = { + /** + * Chat id. Present for interactive chat turns (the chatSubmit payload); + * absent for headless callers (triggers, sub-agents) whose `request` carries + * no chat. Tier 1 compaction keys on it — see the skip guard in + * `prepareChatTurn` (plan M3: headless runs are Tier 2 only). + */ + id?: string; agentId?: string; providerId?: string; modelId?: string; @@ -124,7 +161,23 @@ export type ChatTurn = { frequencyPenalty?: number; presencePenalty?: number; seed?: number; + /** Resolved context window for the main model (§H ring, §I stats). */ + contextWindow: number; + /** True when contextWindow fell to the conservative default (T6: ring → neutral). */ + contextWindowIsDefault: boolean; }; + /** + * Context-overflow recovery wiring (§E, P4). Always present — recovery is + * the safety net and stays on even when proactive compaction is disabled. + * agent-runner wraps the model with the recovery middleware using this. + */ + recovery: RecoveryContext; + /** + * Tier 2 in-turn compaction config (§D). 
Null when proactive compaction is + * disabled (§G kill switch or agent override). agent-runner builds the + * prepareStep callback from this and wires it into streamText/generateText. + */ + tier2: Tier2Context | null; dispose: () => Promise; }; @@ -132,7 +185,7 @@ export type PrepareChatTurnInput = { orgId: string; workspaceId: string; user: { id: string; name: string }; - request: ChatSubmitDataSchema; + request: ChatTurnRequest; messages: PlatypusUIMessage[]; /** * Used to rewrite `storage://` URLs in messages to absolute HTTP URLs so @@ -156,6 +209,13 @@ export type PrepareChatTurnInput = { * yield bumps invoke with no event (timer-only). */ onActivity?: (event?: ToolActivityEvent) => void; + /** + * Messages as they were in the DB BEFORE this submission's `ChatSink.onStart` + * overwrote them — the C4 baseline for detecting edits below the watermark + * (RV1). Loaded by agent-runner before calling onStart. When absent the C4 + * check falls back to a DB read that now returns the post-overwrite state. + */ + priorMessages?: PlatypusUIMessage[]; }; /** @@ -413,6 +473,215 @@ export const drizzleChatTurnQueries: ChatTurnQueries = { }, }; +// --- Tier 1 context compaction (ADR-0009) --- + +const EMPTY_COMPACTION_STATE: CompactionState = { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, +}; + +/** + * Loads the canonical (raw) persisted history for a chat. Exported so + * agent-runner can snapshot it BEFORE `ChatSink.onStart` overwrites the row — + * that snapshot is the C4 baseline (RV1: onStart runs before prepareChatTurn, + * so a read inside applyTier1IfNeeded would see the just-submitted messages). + */ +export async function loadChatMessages( + chatId: string, +): Promise { + const rows = await db + .select({ messages: chatTable.messages }) + .from(chatTable) + .where(eq(chatTable.id, chatId)) + .limit(1); + return (rows[0]?.messages as PlatypusUIMessage[] | null) ?? 
[]; +} + +/** + * Everything the compaction machinery needs that is resolved once per turn: + * the budget (from the resolved context window), the effective config, the + * summarizer, and the summarizer's own window (drift M1). Shared by Tier 1 + * and the recovery middleware (§E) so the two never disagree. + */ +type CompactionRuntime = { + budget: Budget; + config: CompactionConfig; + imageProvider: ImageProvider; + summarize: Summarize; + summarizerWindow?: number; + /** Resolved context window for the main model (§H ring). */ + contextWindow: number; + /** True when the window fell to the conservative default (T6: ring → neutral). */ + contextWindowIsDefault: boolean; +}; + +/** + * Builds the per-turn compaction runtime. Never throws: a failed window + * resolution falls back to the conservative default so recovery (P4) always + * has a working configuration. + */ +async function buildCompactionRuntime(args: { + chatId?: string; + provider: Provider; + resolvedModelId: string; + agent: AgentRow | null; + opened: ReturnType; +}): Promise { + const { chatId, provider, resolvedModelId, agent, opened } = args; + + const config = resolveCompactionConfig(agent); + // Global kill switch (§G) gates proactive compaction; recovery is unaffected. + if (process.env.COMPACTION_ENABLED === "false") { + config.compactionEnabled = false; + } + + // RV7d: resolve both windows concurrently (they are independent). + const taskModelId = provider.taskModelId || resolvedModelId; + const [mainWindow, summarizerWindowResult] = await Promise.all([ + contextWindowResolver.resolve(provider, resolvedModelId).catch((error) => { + logger.error( + { error, chatId, resolvedModelId }, + "context window resolution failed; using conservative default", + ); + return null; + }), + contextWindowResolver.resolve(provider, taskModelId).catch(() => null), + ]); + + const contextWindow = mainWindow?.contextWindow ?? 
DEFAULT_CONTEXT_WINDOW; + const maxOutputTokens = mainWindow?.maxOutputTokens; + const budget = computeBudget(contextWindow, maxOutputTokens, config); + + const summarizerWindow = summarizerWindowResult + ? computeBudget( + summarizerWindowResult.contextWindow, + summarizerWindowResult.maxOutputTokens, + config, + ).inputBudget + : undefined; + + // Summarizer uses the provider's task model, falling back to the main model + // when unset (drift T7). generateText is one-shot, no tools. + const summarize = async (text: string): Promise => { + const startedAt = Date.now(); + const { text: summary, usage } = await generateText({ + model: opened.languageModel(taskModelId), + system: + "You compress conversation history for context reuse. Produce a dense summary capturing decisions made, facts established, files/tools touched, open questions, and the user's intent. Drop pleasantries and redundancy. Output only the summary.", + prompt: text, + }); + logger.info( + { + metric: "summarize.latency_ms", + latencyMs: Date.now() - startedAt, + chatId, + taskModelId, + usage, + }, + "context compaction summarize", + ); + return summary; + }; + + return { + budget, + config, + imageProvider: imageProviderFor(provider.providerType), + summarize, + summarizerWindow, + contextWindow, + contextWindowIsDefault: !mainWindow || mainWindow.source === "default", + }; +} + +type ApplyTier1Args = { + chatId: string; + runtime: CompactionRuntime; + /** Post-inlineFileUrls messages — used for the compaction itself (T2). */ + messages: PlatypusUIMessage[]; + /** + * Pre-inlineFileUrls messages from this submission — used as the incoming + * side of the C4 divergence check (RV1). Must NOT be inlined: the persisted + * side also uses storage:// / http:// URLs, so both sides are comparable. + */ + rawMessages: PlatypusUIMessage[]; + /** + * Messages as they were in the DB BEFORE this submission's onStart overwrote + * them (RV1). 
When absent, the C4 check falls back to a fresh DB read, which + * returns the post-overwrite state and therefore never detects edits. + */ + priorMessages?: PlatypusUIMessage[]; + /** Estimated system-prompt + tool-schema payload for this turn (drift C1). */ + overheadTokens: number; + /** Provider-reported `usage.inputTokens` from the prior turn (C1, §H). */ + lastInputTokens?: number; +}; + +/** + * Reconstructs/advances the compacted view and persists any new summary — all + * best-effort. Any throw degrades to the uncompacted messages (recovery §E + * remains the safety net). Returns the message array to send to the model. + */ +async function applyTier1IfNeeded( + args: ApplyTier1Args, +): Promise { + const { chatId, runtime, messages, rawMessages } = args; + try { + const store = drizzleCompactionStore; + let state = (await store.readState(chatId)) ?? EMPTY_COMPACTION_STATE; + + // C4 invalidation: if the submitted history changed at/below the watermark + // (edit/delete/regenerate), reset the stale summary before compacting. The + // single submit endpoint is the only "edit handler" in this architecture. + // + // RV1 fix: the baseline must be the DB state BEFORE this submission's + // onStart overwrote the row. agent-runner reads it before calling onStart + // and threads it here as `priorMessages`. We also compare the pre-inline + // (`rawMessages`) side so file-URL inlining doesn't trigger false positives. + if (state.summaryWatermark || state.contextSummary) { + const persisted = args.priorMessages ?? (await loadChatMessages(chatId)); + const affected = affectedBelowWatermark( + persisted, + rawMessages, + state.summaryWatermark, + ); + if (affected.length > 0) { + const orderedIds = rawMessages + .map((m) => m.id) + .filter((id): id is string => Boolean(id)); + await invalidateCompaction(store, chatId, affected, orderedIds); + state = (await store.readState(chatId)) ?? 
state; + } + } + + const result = await applyTier1Compaction({ + chatId, + messages, + state, + budget: runtime.budget, + config: runtime.config, + imageProvider: runtime.imageProvider, + summarize: runtime.summarize, + summarizerWindow: runtime.summarizerWindow, + overheadTokens: args.overheadTokens, + lastInputTokens: args.lastInputTokens, + store, + onEvent: (event) => + logger.info({ chatId, ...event }, "context-compacted"), + }); + + return result.messages; + } catch (error) { + logger.error( + { error, chatId }, + "Tier 1 compaction failed; sending uncompacted history", + ); + return messages; + } +} + /** * Whether the provider's native web_search tool should be injected for this * turn. True only when the request opted into search AND the provider hasn't @@ -515,11 +784,7 @@ export const prepareChatTurn = async ( runMode, }; - const generation = resolveGenerationConfig( - request, - agent, - promptCtx, - ); + const generation = resolveGenerationConfig(request, agent, promptCtx); if (skills.length > 0) { tools.loadSkill = createLoadSkillTool(orgId, workspaceId); @@ -556,12 +821,77 @@ export const prepareChatTurn = async ( const systemPrompt = generation.systemPrompt!; + // --- Context compaction & recovery (ADR-0009) --- + // The runtime (window budget, config, summarizer) is resolved once and shared + // by Tier 1 and the recovery middleware so they never disagree. Never throws. + const compactionRuntime = await buildCompactionRuntime({ + chatId: request.id, + provider, + resolvedModelId, + agent: agent ?? null, + opened, + }); + + // Per-turn overhead: system prompt + tool schemas, sent on every turn but + // invisible to a message-only estimate (drift C1). + const overheadTokens = estimateOverheadTokens(systemPrompt, wrappedTools); + + // Tier 1 is best-effort: a failure here must never break the turn — recovery + // (§E) is the net. Runs AFTER inlineFileUrls so the estimate sees the real + // payload (T2). 
Cross-turn durable compaction is keyed by chat id; headless + // runs (triggers, sub-agents) carry no chat id and have no durable history to + // compact (plan M3 — they are Tier 2 only), so send messages uncompacted. + const chatId = request.id; + const compactedMessages = chatId + ? await applyTier1IfNeeded({ + chatId, + runtime: compactionRuntime, + messages: inlinedMessages, + // Pre-inline messages for C4 comparison (RV1): both sides must use the + // same URL format (storage:// / http://) to avoid false positives. + rawMessages: messages, + // Pre-overwrite baseline threaded from agent-runner (RV1). + priorMessages: input.priorMessages, + overheadTokens, + // Prior turn's provider-reported input token count (C1 / §H): the last + // assistant message carries metadata.stats.contextTokens (stamped by + // applyMessageStats) — the corrective baseline for the Tier 1 trigger + // projection on turns ≥ 2. Absent on turn 1 → cold-start margin applies. + lastInputTokens: ( + messages.findLast((m) => m.role === "assistant")?.metadata as + | { stats?: { contextTokens?: number } } + | undefined + )?.stats?.contextTokens, + }) + : inlinedMessages; + + // Recovery (§E, P4): always wired, even when proactive compaction is off. + // Headless runs get trim+retry but no dirty flag (no durable chat row). + const recovery: RecoveryContext = { + chatId, + imageProvider: compactionRuntime.imageProvider, + // RV6: subtract the per-turn overhead so recovery uses the same effective + // target as Tier 1. Without this, a large overhead (e.g. 65%+ of the window) + // means the recovery retry still overflows even after trimming. + targetTokens: Math.max( + 0, + compactionRuntime.budget.targetTokens - overheadTokens, + ), + keepRecentMessages: compactionRuntime.config.keepRecentMessages, + minPrunableChars: compactionRuntime.config.minPrunableChars, + summarize: compactionRuntime.summarize, + summarizerWindow: compactionRuntime.summarizerWindow, + markDirty: chatId + ? 
() => setCompactionDirty(drizzleCompactionStore, chatId) + : undefined, + }; + return { stream: { model, tools: wrappedTools, system: systemPrompt, - messages: inlinedMessages, + messages: compactedMessages, maxSteps: resolvedMaxSteps, temperature: generation.temperature, topP: generation.topP, @@ -583,7 +913,34 @@ export const prepareChatTurn = async ( frequencyPenalty: agent ? undefined : generation.frequencyPenalty, presencePenalty: agent ? undefined : generation.presencePenalty, seed: agent ? undefined : request.seed, + contextWindow: compactionRuntime.contextWindow, + contextWindowIsDefault: compactionRuntime.contextWindowIsDefault, }, + recovery, + tier2: compactionRuntime.config.compactionEnabled + ? { + // RV6 (Tier 2): the prepareStep estimate counts ModelMessages only — + // system prompt + tool schemas go as separate streamText params and + // are invisible to it, yet they consume the same window. Subtract the + // per-turn overhead so the trigger/target reflect the real wire + // payload (mirrors the Tier 1 and recovery targets above). Without + // this, a large overhead lets the payload blow past the budget before + // Tier 2 ever fires — exactly the tool-heavy case it exists for. 
+ triggerTokens: Math.max( + 0, + compactionRuntime.budget.triggerTokens - overheadTokens, + ), + targetTokens: Math.max( + 0, + compactionRuntime.budget.targetTokens - overheadTokens, + ), + keepRecentMessages: compactionRuntime.config.keepRecentMessages, + minPrunableChars: compactionRuntime.config.minPrunableChars, + imageProvider: compactionRuntime.imageProvider, + summarize: compactionRuntime.summarize, + summarizerWindow: compactionRuntime.summarizerWindow, + } + : null, dispose, }; }; @@ -728,7 +1085,7 @@ const wrapToolsWithBump = ( const resolveChatContext = async ( queries: ChatTurnQueries, - data: ChatSubmitData, + data: ChatTurnRequest, orgId: string, workspaceId: string, ): Promise => { @@ -849,7 +1206,7 @@ const loadTools = async ( }; const resolveGenerationConfig = ( - data: ChatSubmitData, + data: ChatTurnRequest, agent: AgentRow | undefined, promptCtx: SystemPromptContext, ): GenerationConfig => { @@ -907,20 +1264,74 @@ const loadSubAgents = async ( description: sa.description, })); + // Provider lookups are memoized so the Tier 2 loop below and the + // createModelFn callback don't each re-fetch + re-open the same provider + // (F1): one getProvider + openProvider per distinct providerId per turn. + const providerCache = new Map< + string, + { provider: Provider; opened: ReturnType } | null + >(); + const resolveSubProvider = async (providerId: string) => { + if (!providerCache.has(providerId)) { + const p = await queries.getProvider(providerId, orgId, workspaceId); + providerCache.set( + providerId, + p ? { provider: p, opened: openProvider(p) } : null, + ); + } + return providerCache.get(providerId) ?? null; + }; + + // Tier 2 only for sub-agents (drift M3: no durable history for Tier 1). + // Resolve per-sub-agent compaction runtime so each sub-agent's tool loop + // gets a prepareStep calibrated to its own model's context window. 
+ const subAgentPrepareSteps = new Map< + string, + import("ai").PrepareStepFunction + >(); + await Promise.all( + subAgentRecords.map(async (sa) => { + try { + const resolved = await resolveSubProvider(sa.providerId); + if (!resolved) return; + const runtime = await buildCompactionRuntime({ + // Sub-agents have no chat row; tag logs with the sub-agent id (F3). + chatId: sa.id, + provider: resolved.provider, + resolvedModelId: sa.modelId, + agent: sa, + opened: resolved.opened, + }); + if (!runtime.config.compactionEnabled) return; + const tier2: Tier2Context = { + triggerTokens: Math.max(0, runtime.budget.triggerTokens), + targetTokens: Math.max(0, runtime.budget.targetTokens), + keepRecentMessages: runtime.config.keepRecentMessages, + minPrunableChars: runtime.config.minPrunableChars, + imageProvider: runtime.imageProvider, + summarize: runtime.summarize, + summarizerWindow: runtime.summarizerWindow, + }; + subAgentPrepareSteps.set(sa.id, buildTier2PrepareStep(tier2)); + } catch (error) { + logger.warn( + { error, subAgentId: sa.id }, + "Failed to build Tier 2 for sub-agent; skipping", + ); + } + }), + ); + const subAgentMcpClients: any[] = []; const subAgentTools = await createSubAgentTools( subAgentRecords, async (providerId: string, modelId: string) => { - const subProvider = await queries.getProvider( - providerId, - orgId, - workspaceId, - ); - if (!subProvider) { + const resolved = await resolveSubProvider(providerId); + if (!resolved) { throw new Error(`Provider '${providerId}' not found for sub-agent`); } - return openProvider(subProvider).languageModel(modelId); + return resolved.opened.languageModel(modelId); }, async (subAgentId: string, toolSetIds: string[]) => { const subAgentRecord = subAgentRecords.find((sa) => sa.id === subAgentId); @@ -935,7 +1346,120 @@ const loadSubAgents = async ( return subTools; }, onProgress, + (id) => subAgentPrepareSteps.get(id), ); return { subAgents, subAgentTools, subAgentMcpClients }; }; + +// --- Force-compact 
endpoint (§J) --- + +/** + * Runs Tier 1 compaction unconditionally for a chat (§J: clickable ring). + * Forces the compaction regardless of the token threshold by injecting + * compactionDirty=true so the RV3 force path bypasses the estimate gate. + * Called from `POST /chats/:id/compact`; the route guards against concurrent + * runs before calling here. + */ +export async function forceCompactChat( + chatId: string, + workspaceId: string, + orgId: string, +): Promise<{ + estimatedTokens: number; + contextWindow: number; + contextWindowIsDefault: boolean; +}> { + // Load the chat record (workspace-scoped). + const chatRows = await db + .select({ + agentId: chatTable.agentId, + providerId: chatTable.providerId, + modelId: chatTable.modelId, + }) + .from(chatTable) + .where( + and(eq(chatTable.id, chatId), eq(chatTable.workspaceId, workspaceId)), + ) + .limit(1); + if (chatRows.length === 0) throw new NotFoundError("Chat not found"); + const chatRow = chatRows[0]; + + // Resolve provider + model via the shared query layer (respects org-scoped + // Shared resources and the ADR-0007 attachment gate). 
+ let provider: Provider; + let resolvedModelId: string; + let agent: AgentRow | null = null; + + if (chatRow.agentId) { + const agentRow = await drizzleChatTurnQueries.getAgent( + chatRow.agentId, + orgId, + workspaceId, + ); + if (!agentRow) throw new NotFoundError("Agent not found"); + agent = agentRow; + resolvedModelId = agent.modelId; + const providerRow = await drizzleChatTurnQueries.getProvider( + agent.providerId, + orgId, + workspaceId, + ); + if (!providerRow) throw new NotFoundError("Provider not found"); + provider = providerRow; + } else if (chatRow.providerId && chatRow.modelId) { + const providerRow = await drizzleChatTurnQueries.getProvider( + chatRow.providerId, + orgId, + workspaceId, + ); + if (!providerRow) throw new NotFoundError("Provider not found"); + provider = providerRow; + resolvedModelId = chatRow.modelId; + } else { + throw new ValidationError("Chat has no provider/model configured"); + } + + const opened = openProvider(provider); + const runtime = await buildCompactionRuntime({ + chatId, + provider, + resolvedModelId, + agent, + opened, + }); + + const messages = await loadChatMessages(chatId); + const rawState = + (await drizzleCompactionStore.readState(chatId)) ?? EMPTY_COMPACTION_STATE; + + // Force-trigger by marking dirty in the in-memory copy (RV3: bypass the + // estimate gate so the compaction actually shrinks the history). + const forcedState: CompactionState = { ...rawState, compactionDirty: true }; + + const result = await applyTier1Compaction({ + chatId, + messages, + state: forcedState, + budget: runtime.budget, + config: runtime.config, + imageProvider: runtime.imageProvider, + summarize: runtime.summarize, + store: drizzleCompactionStore, + summarizerWindow: runtime.summarizerWindow, + }); + + // Message-only estimate (no per-turn system/tool overhead): the ring uses it + // as a transient post-compact value that the next response's provider count + // supersedes. 
It therefore reads slightly low vs the live ring numerator + // (which includes overhead) — acceptable for an immediate visual refresh. + const estimatedTokens = estimateTokens( + uiMessagesToCountUnits(result.messages, runtime.imageProvider), + ); + + return { + estimatedTokens, + contextWindow: runtime.contextWindow, + contextWindowIsDefault: runtime.contextWindowIsDefault, + }; +} diff --git a/apps/backend/src/tools/sub-agent.test.ts b/apps/backend/src/tools/sub-agent.test.ts index c91b3885..ce73732d 100644 --- a/apps/backend/src/tools/sub-agent.test.ts +++ b/apps/backend/src/tools/sub-agent.test.ts @@ -26,13 +26,16 @@ function createMockFullStream( }; } -const { mockStream, MockToolLoopAgent } = vi.hoisted(() => { +const { mockStream, MockToolLoopAgent, capturedSettings } = vi.hoisted(() => { const mockStream = vi.fn(); + const capturedSettings: any[] = []; class MockToolLoopAgent { - constructor() {} + constructor(settings: any) { + capturedSettings.push(settings); + } stream = mockStream; } - return { mockStream, MockToolLoopAgent }; + return { mockStream, MockToolLoopAgent, capturedSettings }; }); vi.mock("ai", async () => { @@ -55,6 +58,25 @@ describe("createSubAgentTool", () => { tools: {}, }; + beforeEach(() => { + capturedSettings.length = 0; + }); + + describe("Tier 2 prepareStep (drift M3)", () => { + it("passes prepareStep to ToolLoopAgent when provided", () => { + const mockPrepareStep = vi.fn(); + createSubAgentTool({ ...baseOptions, prepareStep: mockPrepareStep }); + expect(capturedSettings[0]).toMatchObject({ + prepareStep: mockPrepareStep, + }); + }); + + it("passes undefined prepareStep when not provided", () => { + createSubAgentTool(baseOptions); + expect(capturedSettings[0].prepareStep).toBeUndefined(); + }); + }); + describe("toolName generation", () => { it("generates PascalCase delegateTo prefix", () => { const { toolName } = createSubAgentTool(baseOptions); @@ -379,4 +401,34 @@ describe("createSubAgentTools", () => { 
expect(Object.keys(result)).toHaveLength(1); }); + + it("threads prepareStepFn to ToolLoopAgent for each sub-agent (drift M3)", async () => { + capturedSettings.length = 0; + const subAgents = [ + { id: "sa-1", name: "Alpha", providerId: "p1", modelId: "m1" }, + { id: "sa-2", name: "Beta", providerId: "p1", modelId: "m1" }, + ]; + const mockStep1 = vi.fn(); + const mockStep2 = vi.fn(); + const prepareStepFn = vi + .fn() + .mockImplementation((id: string) => + id === "sa-1" ? mockStep1 : mockStep2, + ); + + const createModelFn = vi.fn().mockResolvedValue({}); + const loadToolsFn = vi.fn().mockResolvedValue({}); + + await createSubAgentTools( + subAgents, + createModelFn, + loadToolsFn, + undefined, + prepareStepFn, + ); + + expect(capturedSettings).toHaveLength(2); + expect(capturedSettings[0].prepareStep).toBe(mockStep1); + expect(capturedSettings[1].prepareStep).toBe(mockStep2); + }); }); diff --git a/apps/backend/src/tools/sub-agent.ts b/apps/backend/src/tools/sub-agent.ts index c4030d12..f67803eb 100644 --- a/apps/backend/src/tools/sub-agent.ts +++ b/apps/backend/src/tools/sub-agent.ts @@ -1,4 +1,10 @@ -import { stepCountIs, tool, ToolLoopAgent, type Tool } from "ai"; +import { + stepCountIs, + tool, + ToolLoopAgent, + type PrepareStepFunction, + type Tool, +} from "ai"; import { z } from "zod"; import { logger } from "../logger.ts"; @@ -43,6 +49,8 @@ interface SubAgentToolOptions { maxSteps?: number; /** Called on each activity update from the sub-agent. Used to reset the parent run's per-step timeout. */ onProgress?: () => void; + /** Tier 2 in-turn compaction callback (§D, drift M3). Undefined when compaction is disabled.
*/ + prepareStep?: PrepareStepFunction; } /** @@ -62,6 +70,7 @@ export const createSubAgentTool = (options: SubAgentToolOptions) => { tools, maxSteps = 50, onProgress, + prepareStep, } = options; const toolName = subAgentToolName({ name }); @@ -73,6 +82,7 @@ export const createSubAgentTool = (options: SubAgentToolOptions) => { `You are a specialized sub-agent named "${name}". Complete the task you are given thoroughly and accurately.`, tools, stopWhen: [stepCountIs(maxSteps)], + prepareStep, }); return { @@ -183,6 +193,7 @@ export const createSubAgentTools = async ( toolSetIds: string[], ) => Promise>, onProgress?: () => void, + prepareStepFn?: (id: string) => PrepareStepFunction | undefined, ): Promise> => { const tools: Record = {}; @@ -207,6 +218,7 @@ export const createSubAgentTools = async ( tools: subAgentTools, maxSteps: subAgent.maxSteps || 50, onProgress, + prepareStep: prepareStepFn?.(subAgent.id), }); tools[toolName] = tool; diff --git a/docs/adr/0009-context-compaction.md b/docs/adr/0009-context-compaction.md new file mode 100644 index 00000000..07a26081 --- /dev/null +++ b/docs/adr/0009-context-compaction.md @@ -0,0 +1,136 @@ +--- +status: proposed +--- + +# Chat Context Compaction + +Chats hard-fail when message history exceeds a model's context window. This ADR +records **how** we decided to keep them alive and **why** the obvious simpler +options were rejected. The implementation spec (the _how_) lives in +`context-compaction-plan.md`; this ADR is the _why_ and the gate that +implementation answers to. + +Status is **proposed** until step 1 (window resolution + estimator + schema) +confirms the foundation holds; promote to **accepted** then. If implementation +forces a different choice, supersede with a new ADR rather than editing this one. 
+ +## Decision + +A **two-tier, view-not-delete** compaction model, fed by a **single token +estimator**, with all durable state mutated through a **single versioned CAS +writer**, and an always-on **recovery net** for overflow errors the proactive +path misses. + +### Two tiers, not one + +- **Tier 1 — cross-turn, durable.** Runs in `prepareChatTurn` before a response. + Summarizes/prunes old history, persists a summary + watermark. Owns durable + state. +- **Tier 2 — intra-turn, throwaway.** Runs in the AI SDK `prepareStep` hook to + keep a single heavy response (many tool/sub-agent calls) executable mid-loop. + Not persisted — the SDK's canonical message list commits to history as normal, + and next turn Tier 1 folds it into the durable summary. + +One tier cannot cover both cases: a single response can blow the window without +any cross-turn history growth (Tier 2's job), and durable history must be +compacted before a turn even starts (Tier 1's job). Sub-agents — which start +fresh each invocation with no cross-turn history — therefore use **Tier 2 only**. + +### Compaction is a view, not a delete + +The watermark + summary change _what is sent to the model_, never _what is +stored_. Raw messages persist in the DB untouched. This makes forced/automatic +compaction non-destructive in the data sense (a user can still read full +history; a future "expand summary" UI is free), and reduces "irreversible data +loss" objections to a UX-courtesy confirmation rather than a correctness +concern. + +### One estimator + +Token counting lives in exactly one function over one neutral structure +(`CountUnit[]`). Tier 1 (UIMessages) and Tier 2 (ModelMessages) both normalize +into it, counting only model-bound parts. Divergence between the two tiers is +impossible by construction rather than monitored — a tier cannot fire on a +number the other never sees. 
+ +### One durable writer (versioned CAS) + +All mutations of compaction state (`summaryWatermark`, `contextSummary`, +`compactionDirty`) go through a single compare-and-swap function keyed on a +`version` column. Concurrent runs on one chat (e.g. a trigger run and a user +run), and the interaction between compaction and history-edit invalidation, are +resolved by **version**, not by comparing watermark values — so a watermark that +moves _backward_ on an edit cannot be misread as "not yet advanced" and produce a +stale summary over mutated history. + +### Recovery is the net, not the plan + +A `400/413` context-overflow error is caught, the messages aggressively trimmed +in-memory (via the same Tier 2 adapter), and the call retried **once**. Recovery +never writes durable state directly — it flags `compactionDirty`, and the next +turn's Tier 1 does the durable compaction. Recovery stays on even when proactive +compaction is globally disabled; it is the last line of defense, not a risk +surface. + +### Char/4 estimate, not a real tokenizer + +Pre-call token counting uses a `char/4` heuristic (text parts only; a modality +table for images) on the **first turn only**. Every later turn uses the +provider-reported real `usage.inputTokens`. We accept first-turn imprecision +(guarded by a 1.15 margin and the recovery net) rather than ship a per-provider +tokenizer dependency. + +### Window source: API → litellm registry → default + +Resolve the context window per-model: manual override → provider API auto-detect +(Google/OpenRouter/vLLM expose it) → the community-maintained litellm registry +JSON (covers OpenAI/Anthropic/Bedrock, which don't) → a conservative 8192 +default. We do **not** maintain our own context-window table. + +## Considered Options + +- **Single-tier compaction (cross-turn only)** — rejected. Cannot rescue a + single response whose own tool loop overflows the window; would force the whole + response to fail even though durable history was fine. 
+- **One estimator per tier** — rejected. Two estimators over two message shapes + drift; one tier ends up firing on a count the other never computes, making + contention and threshold bugs undebuggable. Collapsed to one estimator + two + adapters. +- **Hard-delete / truncate old messages** — rejected. Irreversible, and the + "drops the middle silently" failure mode seen in gateway truncation (OpenRouter) + and silent clipping (Ollama). View-not-delete keeps the data and makes the + action auditable (a visible `context-compacted` event). +- **Homegrown context-window lookup table** — rejected. Unmaintainable across + providers and model churn; the litellm registry is the industry "don't maintain + your own table" answer (AnythingLLM dropped its hardcoded table for it). +- **A real pre-call tokenizer** — rejected for v1. Per-provider tokenizers are a + heavy dependency for a number that the provider returns accurately after the + first call anyway. +- **Optimistic-without-version concurrency (compare watermark values)** — + rejected. Breaks when history-edit invalidation moves the watermark backward; + the versioned CAS removes the monotonicity assumption entirely. +- **Compacting to the trigger threshold** — rejected; it re-fires every turn + (the Cline #5616 thrash). Trigger and target ratios are deliberately distinct + (0.8 vs 0.5 of the input budget) for hysteresis. + +## Consequences + +- **Schema additions.** `provider.modelMeta` (JSONB, per-model window/output + overrides); chat/run gain `contextSummary`, `summaryWatermark`, + `compactionDirty`, and `version`. All additive nullable columns. +- **Lazy rollout, no backfill.** Existing chats compact only on their next turn; + no eager backfill job (it would create a thundering herd of summarize calls). +- **A summarize call costs money and latency.** Stage 1 prunes without a model + call first; Stage 2 summarizes only when pruning is insufficient, using the task + model (falling back to the main model). 
+- **First-turn token estimates are imprecise**, especially for + image-heavy/CJK/JSON content; the recovery net absorbs the misses and a + divergence metric tunes the image constants over time. +- **A global `COMPACTION_ENABLED` kill switch** disables proactive compaction in + prod without a deploy; recovery is unaffected. +- **Observability is part of the contract** — compaction/recovery/CAS-conflict + metrics gate the two deferred optimizations (CAS contention, projected-input + ring arc); without the metrics those decisions are guesses. +- **Frontend gains a context-usage ring** (window resolved from the _selected_ + model, neutral when unknown) and a per-message stats popover, reusing the + existing tool-call timing mechanism. diff --git a/packages/schemas/index.test.ts b/packages/schemas/index.test.ts index c8a3b881..592943d6 100644 --- a/packages/schemas/index.test.ts +++ b/packages/schemas/index.test.ts @@ -13,7 +13,10 @@ import { sandboxEnvSchema, SANDBOX_ENV_MAX_ENTRIES, SANDBOX_ENV_MAX_VALUE_BYTES, + providerSchema, + providerUpdateSchema, providerCreateSchema, + chatSchema, } from "./index"; describe("Organization Schema", () => { @@ -286,6 +289,146 @@ describe("Agent Schema", () => { }); }); +describe("Provider modelMeta (context-compaction §A)", () => { + const base = { + id: "prov-1", + workspaceId: "ws-1", + name: "My Provider", + providerType: "OpenAI" as const, + apiKey: "sk-x", + modelIds: ["gpt-4o"], + taskModelId: "gpt-4o", + memoryExtractionModelId: "gpt-4o", + createdAt: new Date(), + updatedAt: new Date(), + }; + + it("is valid with modelMeta omitted (additive, optional)", () => { + expect(providerSchema.safeParse(base).success).toBe(true); + }); + + it("accepts per-model contextWindow / maxOutputTokens overrides", () => { + const result = providerSchema.safeParse({ + ...base, + modelMeta: { + "gpt-4o": { contextWindow: 128000, maxOutputTokens: 16384 }, + "o1-mini": { contextWindow: 200000 }, + }, + }); + expect(result.success).toBe(true); + 
}); + + it("rejects a non-positive contextWindow", () => { + const result = providerSchema.safeParse({ + ...base, + modelMeta: { "gpt-4o": { contextWindow: 0 } }, + }); + expect(result.success).toBe(false); + }); + + it("rejects a non-integer window", () => { + const result = providerSchema.safeParse({ + ...base, + modelMeta: { "gpt-4o": { contextWindow: 1.5 } }, + }); + expect(result.success).toBe(false); + }); + + it("carries modelMeta through the update schema", () => { + const result = providerUpdateSchema.safeParse({ + name: "My Provider", + providerType: "OpenAI", + apiKey: "sk-x", + modelIds: ["gpt-4o"], + taskModelId: "gpt-4o", + memoryExtractionModelId: "gpt-4o", + modelMeta: { "gpt-4o": { contextWindow: 128000 } }, + }); + expect(result.success).toBe(true); + }); +}); + +describe("Chat compaction state (context-compaction §C)", () => { + const base = { + id: "chat-1", + workspaceId: "ws-1", + title: "My Chat Title", + status: "succeeded" as const, + isPinned: false, + createdAt: new Date(), + updatedAt: new Date(), + }; + + it("is valid with compaction fields omitted (existing rows)", () => { + expect(chatSchema.safeParse(base).success).toBe(true); + }); + + it("accepts a populated summary + watermark + version", () => { + const result = chatSchema.safeParse({ + ...base, + contextSummary: "Summary of earlier turns.", + summaryWatermark: "msg-42", + compactionDirty: true, + version: 3, + }); + expect(result.success).toBe(true); + }); + + it("accepts an explicitly null summary / watermark", () => { + const result = chatSchema.safeParse({ + ...base, + contextSummary: null, + summaryWatermark: null, + }); + expect(result.success).toBe(true); + }); + + it("rejects a non-integer version", () => { + const result = chatSchema.safeParse({ ...base, version: 1.5 }); + expect(result.success).toBe(false); + }); +}); + +describe("Agent compaction config (context-compaction §G)", () => { + const base = { + id: "789", + workspaceId: "456", + providerId: "provider-123", + 
name: "Test Agent", + description: "A test agent", + modelId: "gpt-4", + createdAt: new Date(), + updatedAt: new Date(), + }; + + it("is valid with no compaction config (defaults applied in code)", () => { + expect(agentSchema.safeParse(base).success).toBe(true); + }); + + it("accepts a full compaction config", () => { + const result = agentSchema.safeParse({ + ...base, + compactionEnabled: true, + triggerRatio: 0.8, + targetRatio: 0.5, + reserveRatio: 0.05, + keepRecentMessages: 10, + minPrunableChars: 2000, + }); + expect(result.success).toBe(true); + }); + + it("rejects a ratio above 1", () => { + const result = agentSchema.safeParse({ ...base, triggerRatio: 1.2 }); + expect(result.success).toBe(false); + }); + + it("rejects a negative keepRecentMessages", () => { + const result = agentSchema.safeParse({ ...base, keepRecentMessages: -1 }); + expect(result.success).toBe(false); + }); +}); + describe("Provider Create Schema", () => { const baseProvider = { organizationId: "org-123", diff --git a/packages/schemas/index.ts b/packages/schemas/index.ts index 2073c784..4ea61096 100644 --- a/packages/schemas/index.ts +++ b/packages/schemas/index.ts @@ -101,6 +101,13 @@ export const chatSchema = z.object({ seed: z.number().optional(), presencePenalty: z.number().optional(), frequencyPenalty: z.number().optional(), + // Context-compaction state (docs/adr/0009). Server-managed; intentionally NOT + // part of chatSubmit/chatUpdate. summaryWatermark is the message id of the + // last summarized message (P1: a view over history, never a delete). 
+ contextSummary: z.string().nullable().optional(), + summaryWatermark: z.string().nullable().optional(), + compactionDirty: z.boolean().optional(), + version: z.number().int().optional(), createdAt: z.date(), updatedAt: z.date(), }); @@ -198,6 +205,15 @@ const agentBaseSchema = z.object({ seed: z.number().optional(), presencePenalty: z.number().optional(), frequencyPenalty: z.number().optional(), + // Per-agent context-compaction config (context-compaction-plan §G). All + // optional; the runtime applies defaults when unset. Editable through the + // agentCreateSchema / agentUpdateSchema picks below. + compactionEnabled: z.boolean().optional(), + triggerRatio: z.number().min(0).max(1).optional(), + targetRatio: z.number().min(0).max(1).optional(), + reserveRatio: z.number().min(0).max(1).optional(), + keepRecentMessages: z.number().int().min(1).optional(), + minPrunableChars: z.number().int().nonnegative().optional(), toolSetIds: z.array(z.string()).optional(), skillIds: z.array(z.string()).optional(), subAgentIds: z.array(z.string()).optional(), @@ -222,44 +238,78 @@ export const agentSchema = agentBaseSchema.refine( export type Agent = z.infer; -export const agentCreateSchema = agentBaseSchema.pick({ - workspaceId: true, - providerId: true, - name: true, - description: true, - systemPrompt: true, - modelId: true, - maxSteps: true, - temperature: true, - topP: true, - topK: true, - seed: true, - presencePenalty: true, - frequencyPenalty: true, - toolSetIds: true, - skillIds: true, - subAgentIds: true, - inputPlaceholder: true, -}); - -export const agentUpdateSchema = agentBaseSchema.pick({ - providerId: true, - name: true, - description: true, - systemPrompt: true, - modelId: true, - maxSteps: true, - temperature: true, - topP: true, - topK: true, - seed: true, - presencePenalty: true, - frequencyPenalty: true, - toolSetIds: true, - skillIds: true, - subAgentIds: true, - inputPlaceholder: true, -}); + +// Hysteresis guard
(context-compaction-plan §C2 / drift C2): the post-compaction +// target must sit BELOW the trigger, otherwise compaction re-fires every turn +// (the Cline #5616 thrash). Per-field bounds are 0..1; this enforces the +// relationship. Only checked when BOTH are supplied (either may be omitted to +// fall back to the runtime default). +const compactionRatioOrder = (data: { + triggerRatio?: number; + targetRatio?: number; +}) => + data.triggerRatio == null || + data.targetRatio == null || + data.targetRatio < data.triggerRatio; + +const compactionRatioOrderIssue = { + message: "targetRatio must be less than triggerRatio", + path: ["targetRatio"], +}; + +export const agentCreateSchema = agentBaseSchema + .pick({ + workspaceId: true, + providerId: true, + name: true, + description: true, + systemPrompt: true, + modelId: true, + maxSteps: true, + temperature: true, + topP: true, + topK: true, + seed: true, + presencePenalty: true, + frequencyPenalty: true, + toolSetIds: true, + skillIds: true, + subAgentIds: true, + inputPlaceholder: true, + compactionEnabled: true, + triggerRatio: true, + targetRatio: true, + reserveRatio: true, + keepRecentMessages: true, + minPrunableChars: true, + }) + .refine(compactionRatioOrder, compactionRatioOrderIssue); + +export const agentUpdateSchema = agentBaseSchema + .pick({ + providerId: true, + name: true, + description: true, + systemPrompt: true, + modelId: true, + maxSteps: true, + temperature: true, + topP: true, + topK: true, + seed: true, + presencePenalty: true, + frequencyPenalty: true, + toolSetIds: true, + skillIds: true, + subAgentIds: true, + inputPlaceholder: true, + compactionEnabled: true, + triggerRatio: true, + targetRatio: true, + reserveRatio: true, + keepRecentMessages: true, + minPrunableChars: true, + }) + .refine(compactionRatioOrder, compactionRatioOrderIssue); // Skill @@ -549,6 +599,19 @@ export const providerApiModeSchema = z.enum(["chat", "responses"]); export type ProviderApiMode = z.infer; +// Per-model 
context-window / output overrides (context-compaction-plan §A). +// Keyed by model id; both fields optional so an override can set just one. +export const modelMetaEntrySchema = z.object({ + contextWindow: z.number().int().positive().optional(), + maxOutputTokens: z.number().int().positive().optional(), +}); + +export type ModelMetaEntry = z.infer; + +export const modelMetaSchema = z.record(z.string(), modelMetaEntrySchema); + +export type ModelMeta = z.infer; + const providerBaseSchema = z.object({ id: z.string(), organizationId: z.string().optional(), @@ -588,6 +651,7 @@ const providerBaseSchema = z.object({ .max(4096) .nullable() .optional(), + modelMeta: modelMetaSchema.optional(), createdAt: z.date(), updatedAt: z.date(), }); @@ -639,6 +703,7 @@ export const providerCreateSchema = providerBaseSchema.pick({ memoryExtractionModelId: true, embeddingModelId: true, embeddingDimensions: true, + modelMeta: true, }); // Sandbox @@ -774,6 +839,7 @@ export const providerUpdateSchema = providerBaseSchema.pick({ memoryExtractionModelId: true, embeddingModelId: true, embeddingDimensions: true, + modelMeta: true, }); export type ProviderUpdateData = z.infer; @@ -1523,3 +1589,23 @@ export const dashboardUpdateSchema = z.object({ desktopLayout: z.array(rglLayoutItemSchema).optional(), mobileLayout: z.array(rglLayoutItemSchema).optional(), }); + +// Message stats (context-compaction-plan §H/§I) +// Stamped on the last assistant message's metadata.stats after each stream run. +// Used by the frontend context-usage ring (§H) and per-message stats popover (§I). + +export const messageStatsSchema = z.object({ + // Run-wide totals across every step (sum) — for the §I cost popover. + inputTokens: z.number().nonnegative(), + outputTokens: z.number().nonnegative(), + // Input tokens of the LAST model call = peak context fullness — for the §H + // ring. NOT the run-wide sum (which over-counts on multi-step tool loops). 
+ contextTokens: z.number().nonnegative(), + startedAt: z.string(), + firstTokenAt: z.string().optional(), + finishedAt: z.string(), + contextWindow: z.number().positive(), + contextWindowIsDefault: z.boolean(), +}); + +export type MessageStats = z.infer; From 7e6f9b8150cd1320d4da564ca6972c70763bcf56 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Thu, 11 Jun 2026 20:26:22 +0200 Subject: [PATCH 02/21] feat(frontend): context-usage ring, per-message stats & tool-call durations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces the compaction work in the chat UI. - Context-usage ring beside the model selector (§H): fill = last input tokens / resolved window for the selected model; neutral grey when the window is unknown/default. Clickable to compact on demand (§J). - Per-message stats popover next to Regenerate (§I): input/output tokens, TTFT and total generation time from the server-stamped metadata.stats. - Tool-call run durations in the tool header, reusing server start/complete timestamps with a client-observed fallback (useToolDuration). The hook is written to satisfy the react-hooks purity / set-state-in-effect / refs rules: render reads only state, and all writes are deferred into timer callbacks. - Per-agent compaction config fields in the agent form. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/frontend/components/agent-form.tsx | 134 ++++++++++++++ apps/frontend/components/ai-elements/tool.tsx | 13 ++ apps/frontend/components/chat-message.tsx | 93 +++++++++- apps/frontend/components/chat.tsx | 166 ++++++++++++++++-- .../components/context-usage-ring.tsx | 132 ++++++++++++++ .../components/dynamic-tool-header.tsx | 47 +++-- apps/frontend/hooks/use-tool-completed-at.ts | 90 ++++++++++ apps/frontend/lib/utils.test.ts | 30 +++- apps/frontend/lib/utils.ts | 30 ++++ 9 files changed, 703 insertions(+), 32 deletions(-) create mode 100644 apps/frontend/components/context-usage-ring.tsx create mode 100644 apps/frontend/hooks/use-tool-completed-at.ts diff --git a/apps/frontend/components/agent-form.tsx b/apps/frontend/components/agent-form.tsx index c2015858..c1acfba6 100644 --- a/apps/frontend/components/agent-form.tsx +++ b/apps/frontend/components/agent-form.tsx @@ -152,6 +152,12 @@ const AgentForm = ({ seed: undefined as number | undefined, presencePenalty: undefined as number | undefined, frequencyPenalty: undefined as number | undefined, + compactionEnabled: undefined as boolean | undefined, + triggerRatio: undefined as number | undefined, + targetRatio: undefined as number | undefined, + reserveRatio: undefined as number | undefined, + keepRecentMessages: undefined as number | undefined, + minPrunableChars: undefined as number | undefined, }); const [isSubmitting, setIsSubmitting] = useState(false); const [validationErrors, setValidationErrors] = useState< @@ -203,6 +209,12 @@ const AgentForm = ({ toolSetIds: agent.toolSetIds || [], skillIds: agent.skillIds || [], subAgentIds: agent.subAgentIds || [], + compactionEnabled: agent.compactionEnabled ?? undefined, + triggerRatio: agent.triggerRatio ?? undefined, + targetRatio: agent.targetRatio ?? undefined, + reserveRatio: agent.reserveRatio ?? undefined, + keepRecentMessages: agent.keepRecentMessages ?? 
undefined, + minPrunableChars: agent.minPrunableChars ?? undefined, }); if (agent.avatarUrl) { setAvatarPreviewUrl(agent.avatarUrl); @@ -327,6 +339,12 @@ const AgentForm = ({ toolSetIds: formData.toolSetIds, skillIds: formData.skillIds, subAgentIds: formData.subAgentIds, + compactionEnabled: formData.compactionEnabled, + triggerRatio: formData.triggerRatio, + targetRatio: formData.targetRatio, + reserveRatio: formData.reserveRatio, + keepRecentMessages: formData.keepRecentMessages, + minPrunableChars: formData.minPrunableChars, }; const url = agentId @@ -896,6 +914,122 @@ const AgentForm = ({ /> + +

+

Context compaction

+ + Override per-agent compaction behaviour. Leave blank to use + defaults (trigger 80%, target 50%, reserve 5%, keep last 10 + messages, prune outputs >2000 chars). Target ratio must stay + below trigger ratio. Has no effect when compaction is disabled + globally (COMPACTION_ENABLED). + + + + setFormData((prev) => ({ + ...prev, + compactionEnabled: checked, + })) + } + disabled={isSubmitting || readOnly} + /> + + Enable compaction + + + + + + Trigger ratio (0–1) + + + handleFloatChange("triggerRatio", e.target.value) + } + disabled={isSubmitting || readOnly} + /> + + + + Target ratio (0–1) + + + handleFloatChange("targetRatio", e.target.value) + } + disabled={isSubmitting || readOnly} + /> + + + + Reserve ratio (0–1) + + + handleFloatChange("reserveRatio", e.target.value) + } + disabled={isSubmitting || readOnly} + /> + + + + Keep recent messages + + + handleNumberChange("keepRecentMessages", e.target.value) + } + disabled={isSubmitting || readOnly} + /> + + + + Min prunable chars + + + handleNumberChange("minPrunableChars", e.target.value) + } + disabled={isSubmitting || readOnly} + /> + + +
diff --git a/apps/frontend/components/ai-elements/tool.tsx b/apps/frontend/components/ai-elements/tool.tsx index fee20b81..658ab27f 100644 --- a/apps/frontend/components/ai-elements/tool.tsx +++ b/apps/frontend/components/ai-elements/tool.tsx @@ -7,6 +7,7 @@ import { CollapsibleTrigger, } from "@/components/ui/collapsible"; import { cn } from "@/lib/utils"; +import { useToolDuration } from "@/hooks/use-tool-completed-at"; import type { ToolUIPart } from "ai"; import { ArrowRightLeftIcon, @@ -172,6 +173,10 @@ export type ToolHeaderProps = { label?: string; type: ToolUIPart["type"]; state: ToolUIPart["state"]; + /** ISO timestamp of when this tool call began, if known. */ + startedAt?: string; + /** ISO timestamp of when this tool call completed, if known. */ + completedAt?: string; className?: string; }; @@ -210,8 +215,11 @@ export const ToolHeader = ({ label, type, state, + startedAt, + completedAt, ...props }: ToolHeaderProps) => { + const duration = useToolDuration(state, startedAt, completedAt); // getToolIcon returns a stable module-level Lucide icon; render via // createElement so the dynamic selection isn't flagged as a component // created during render. 
@@ -237,6 +245,11 @@ export const ToolHeader = ({ )} {getStatusBadge(state)} + {duration && ( + + {duration} + + )} diff --git a/apps/frontend/components/chat-message.tsx b/apps/frontend/components/chat-message.tsx index 102e8bb1..3cd638f9 100644 --- a/apps/frontend/components/chat-message.tsx +++ b/apps/frontend/components/chat-message.tsx @@ -35,7 +35,7 @@ import { TextUIPart, type ChatStatus, } from "ai"; -import { Agent } from "@platypus/schemas"; +import { Agent, type MessageStats } from "@platypus/schemas"; import { BotIcon, CheckIcon, @@ -44,10 +44,76 @@ import { TrashIcon, RefreshCwIcon, XIcon, + InfoIcon, } from "lucide-react"; import { Textarea } from "./ui/textarea"; import { LoadSkillTool } from "./load-skill-tool"; import { SubAgentTool } from "./sub-agent-tool"; +import { Button } from "./ui/button"; +import { Popover, PopoverContent, PopoverTrigger } from "./ui/popover"; +import { formatDurationMs } from "@/lib/utils"; + +const getToolStartedAt = (part: unknown): string | undefined => { + const raw = (part as { toolMetadata?: { startedAt?: unknown } })?.toolMetadata + ?.startedAt; + return typeof raw === "string" ? raw : undefined; +}; + +const getToolCompletedAt = (part: unknown): string | undefined => { + const raw = (part as { toolMetadata?: { completedAt?: unknown } }) + ?.toolMetadata?.completedAt; + return typeof raw === "string" ? raw : undefined; +}; + +function MessageStatsPopover({ stats }: { stats: MessageStats }) { + const ttft = stats.firstTokenAt + ? formatDurationMs( + new Date(stats.firstTokenAt).getTime() - + new Date(stats.startedAt).getTime(), + ) + : undefined; + const total = formatDurationMs( + new Date(stats.finishedAt).getTime() - new Date(stats.startedAt).getTime(), + ); + return ( + + + + + +
+

+ Response stats +

+

+ In:{" "} + {stats.inputTokens.toLocaleString()}{" "} + Out:{" "} + {stats.outputTokens.toLocaleString()} +

+ {ttft && ( +

+ TTFT: {ttft} +

+ )} + {total && ( +

+ Total: {total} +

+ )} +
+
+
+ ); +} interface ChatMessageProps { /** The message object to render */ @@ -120,6 +186,7 @@ export const ChatMessage = memo(function ChatMessage({ )); + const fileParts = message.parts?.filter( (part): part is FileUIPart => part.type === "file" && !part.mediaType?.startsWith("image/"), @@ -134,6 +201,11 @@ export const ChatMessage = memo(function ChatMessage({ .map((part) => part.text) .join("") || ""; + const assistantStats = + message.role === "assistant" + ? (message.metadata as { stats?: MessageStats } | undefined)?.stats + : undefined; + return ( {fileParts && fileParts.length > 0 && ( @@ -154,7 +226,17 @@ export const ChatMessage = memo(function ChatMessage({ )} {message.parts?.map((part, i) => { - if (part.type === "text") { + if (part.type === "step-start") { + // The SDK emits step-start at every round boundary. We don't render + // it — tool-call timestamps appear inside the tool header below. + return null; + } else if (part.type === "text") { + const partText = (part as TextUIPart).text; + + // Skip empty text parts on assistant messages — the SDK emits them + // between steps; rendering would leave a bare avatar bubble. 
+ if (message.role === "assistant" && !partText.trim()) return null; + if (isEditing) { const isFirstTextPart = i === message.parts.findIndex((p) => p.type === "text"); @@ -186,7 +268,7 @@ export const ChatMessage = memo(function ChatMessage({ avatar={assistantAvatar} > - {(part as TextUIPart).text} + {partText} ); @@ -212,6 +294,8 @@ export const ChatMessage = memo(function ChatMessage({ @@ -251,6 +335,8 @@ export const ChatMessage = memo(function ChatMessage({ state={toolPart.state} type={toolPart.type} label={toolLabel} + startedAt={getToolStartedAt(toolPart)} + completedAt={getToolCompletedAt(toolPart)} /> @@ -354,6 +440,7 @@ export const ChatMessage = memo(function ChatMessage({ )} + {assistantStats && } ))} diff --git a/apps/frontend/components/chat.tsx b/apps/frontend/components/chat.tsx index cfc38d1a..4c741f0a 100644 --- a/apps/frontend/components/chat.tsx +++ b/apps/frontend/components/chat.tsx @@ -32,6 +32,7 @@ import { Agent, ToolSet, Skill, + type MessageStats, } from "@platypus/schemas"; import { type PlatypusUIMessage } from "@platypus/backend/src/types"; import useSWR from "swr"; @@ -55,6 +56,7 @@ import { TooltipTrigger, } from "@/components/ui/tooltip"; import { ChatMessage } from "./chat-message"; +import { ContextUsageRing } from "./context-usage-ring"; import { ModelSelectorDialog } from "./model-selector-dialog"; import { toast } from "sonner"; @@ -378,19 +380,45 @@ export const Chat = ({ [messages, setMessages], ); - // TODO: Ideally show a loading indicator here - if (isLoading || !providersData) return null; + // Resolve the effective provider+model for the ring (drift U1: use selected + // model's window, not last message's window). When an agent is selected we + // look up its provider/model; otherwise use the directly selected values. + const effectiveRingProviderId = agentId + ? (agents.find((a) => a.id === agentId)?.providerId ?? "") + : providerId; + const effectiveRingModelId = agentId + ? 
(agents.find((a) => a.id === agentId)?.modelId ?? "") + : modelId; + + // Fetch resolved context window for the currently-selected model (cached on + // the backend). Returns null contextWindow when source = "default" so the ring + // renders neutral (drift T6). Re-fetches automatically on model/agent change. + const { data: contextWindowData } = useSWR<{ + contextWindow: number | null; + source: string; + }>( + backendUrl && user && effectiveRingProviderId && effectiveRingModelId + ? joinUrl( + backendUrl, + `/organizations/${orgId}/workspaces/${workspaceId}/providers/${effectiveRingProviderId}/context-window?modelId=${encodeURIComponent(effectiveRingModelId)}`, + ) + : null, + fetcher, + ); - // Show alert if no providers are configured - if (providers.length === 0) { - return ( -
-
- -
-
- ); - } + // Stats from the last completed assistant message for the ring (§H) and + // per-message stats popover (§I). + const lastAssistantStats = useMemo(() => { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + const stats = (msg.metadata as { stats?: MessageStats } | undefined) + ?.stats; + if (msg.role === "assistant" && stats) { + return stats; + } + } + return null; + }, [messages]); const selectedAgent = agentId ? agents.find((a) => a.id === agentId) : null; // Resolve the provider backing the current selection, whether that's a raw @@ -416,6 +444,105 @@ export const Chat = ({ chatData?.status === "running" && status === "ready"; const effectiveStatus = isReconnectedToRunningRun ? "streaming" : status; + // §J: compact on demand — state for pending (deferred while streaming), + // in-flight compaction spinner, and the post-compact token estimate that + // refreshes the ring immediately (before the next completed message). + const [compactPending, setCompactPending] = useState(false); + const [isCompacting, setIsCompacting] = useState(false); + // Post-compact estimate, tagged with the message count at compaction time so + // it auto-expires once a new message arrives (the next provider count is + // authoritative). Tagging avoids a set-state-in-effect reset. + const [compacted, setCompacted] = useState<{ + atMessageCount: number; + tokens: number; + } | null>(null); + + const runCompact = useCallback(async () => { + if (!backendUrl) return; + setIsCompacting(true); + try { + const res = await fetch( + joinUrl( + backendUrl, + `/organizations/${orgId}/workspaces/${workspaceId}/chat/${chatId}/compact`, + ), + { method: "POST", credentials: "include" }, + ); + if (!res.ok) { + const body = await res.json().catch(() => ({})); + toast.error((body as { error?: string }).error ?? "Compact failed"); + return; + } + // Refresh the ring immediately from the post-compact estimate (§J). 
This + // is a message-only char/4 estimate (no per-turn system/tool overhead), + // so it reads slightly low until the next real response replaces it with + // the provider's authoritative count. + const body = (await res.json().catch(() => ({}))) as { + inputTokens?: number; + }; + if (typeof body.inputTokens === "number") { + setCompacted({ + atMessageCount: messages.length, + tokens: body.inputTokens, + }); + } + toast.success("Context compacted"); + } catch { + toast.error("Compact request failed"); + } finally { + setIsCompacting(false); + } + }, [backendUrl, orgId, workspaceId, chatId, messages.length]); + + const handleCompact = useCallback(() => { + // Confirm at click time (not after the deferred run fires) so the prompt + // never surprises the user mid-stream. Per P1 this is a view change, not + // data loss — the full history is preserved. + if ( + !window.confirm( + "This will summarize older messages to reduce context usage. The full conversation history is preserved. Continue?", + ) + ) + return; + if (effectiveStatus === "streaming" || effectiveStatus === "submitted") { + setCompactPending(true); + } else { + void runCompact(); + } + }, [effectiveStatus, runCompact]); + + // Fire deferred compact once streaming finishes (drift U4). Already confirmed + // at click time, so this just runs. + useEffect(() => { + if ( + compactPending && + effectiveStatus !== "streaming" && + effectiveStatus !== "submitted" + ) { + // Reacting to a streaming→idle transition to fire a queued action is the + // intended use of an effect here; clearing the flag prevents a re-fire. + // eslint-disable-next-line react-hooks/set-state-in-effect + setCompactPending(false); + void runCompact(); + } + }, [compactPending, effectiveStatus, runCompact]); + + // Early returns live below ALL hooks so hook order stays unconditional + // (react-hooks/rules-of-hooks). The §H/§J ring hooks above must always run. 
+ // TODO: Ideally show a loading indicator here + if (isLoading || !providersData) return null; + + // Show alert if no providers are configured + if (providers.length === 0) { + return ( +
+
+ +
+
+ ); + } + const handleSubmit = async (message: PromptInputMessage) => { // Stop the stream if currently streaming or submitted if (effectiveStatus === "streaming" || effectiveStatus === "submitted") { @@ -576,6 +703,21 @@ export const Chat = ({ Search )} + = 0.7, red >= 0.9. + * Shows neutral grey with no percentage when contextWindow is unknown/default + * (drift T6) or when no run has completed yet. + * + * When `onClick` is provided the ring is clickable (§J: compact on demand). + * - While `isPending` (click queued, waiting for streaming to finish): shows + * a pending badge and is disabled (drift U4). + * - While `isCompacting`: shows a spinner. + * - `isStreaming` disables clicks entirely (frontend defers via pending flag). + */ +export function ContextUsageRing({ + usedTokens, + contextWindow, + onClick, + isStreaming, + isPending, + isCompacting, +}: { + usedTokens?: number; + contextWindow?: number | null; + onClick?: () => void; + isStreaming?: boolean; + isPending?: boolean; + isCompacting?: boolean; +}) { + const r = 7; + const circumference = 2 * Math.PI * r; + + const isNeutral = !contextWindow || usedTokens === undefined; + const fill = isNeutral + ? 0 + : Math.min(1, Math.max(0, usedTokens / contextWindow)); + + const color = isNeutral + ? "var(--color-muted-foreground)" + : fill >= 0.9 + ? "var(--color-destructive)" + : fill >= 0.7 + ? "#f59e0b" + : "var(--color-primary)"; + + const isDisabled = isPending || isCompacting || isStreaming || !onClick; + const isClickable = !!onClick && !isDisabled; + + // Append the compact affordance whenever the ring is actually clickable — + // including the neutral (unknown-window) state, where the user can still + // force a compaction even though no fill is shown. + const clickHint = isClickable ? 
" · click to compact" : ""; + let tooltipLabel: string; + if (isPending) { + tooltipLabel = "Will compact when response finishes"; + } else if (isCompacting) { + tooltipLabel = "Compacting…"; + } else if (isNeutral) { + tooltipLabel = `Context usage unknown${clickHint}`; + } else { + tooltipLabel = `Last response: ${usedTokens!.toLocaleString()} / ${contextWindow!.toLocaleString()} (${Math.round(fill * 100)}%) · current input not yet counted${clickHint}`; + } + + return ( + + +
{ + if (e.key === "Enter" || e.key === " ") { + e.preventDefault(); + onClick!(); + } + } + : undefined + } + tabIndex={isClickable ? 0 : undefined} + role={isClickable ? "button" : undefined} + aria-label={tooltipLabel} + aria-disabled={isDisabled || undefined} + > + {isCompacting ? ( + + ) : ( + + {/* Track */} + + {/* Fill */} + + {/* Pending dot */} + {isPending && } + + )} +
+
+ + {tooltipLabel} + +
+ ); +} diff --git a/apps/frontend/components/dynamic-tool-header.tsx b/apps/frontend/components/dynamic-tool-header.tsx index db72bdbf..766402fb 100644 --- a/apps/frontend/components/dynamic-tool-header.tsx +++ b/apps/frontend/components/dynamic-tool-header.tsx @@ -3,6 +3,7 @@ import { Badge } from "@/components/ui/badge"; import { CollapsibleTrigger } from "@/components/ui/collapsible"; import { cn } from "@/lib/utils"; +import { useToolDuration } from "@/hooks/use-tool-completed-at"; import type { DynamicToolUIPart } from "ai"; import { CheckCircleIcon, @@ -17,6 +18,10 @@ import type { ReactNode } from "react"; export type DynamicToolHeaderProps = { title: string; state: DynamicToolUIPart["state"]; + /** ISO timestamp of when this tool call began, if known. */ + startedAt?: string; + /** ISO timestamp of when this tool call completed, if known. */ + completedAt?: string; className?: string; }; @@ -53,20 +58,30 @@ export const DynamicToolHeader = ({ className, title, state, + startedAt, + completedAt, ...props -}: DynamicToolHeaderProps) => ( - -
- - {title} - {getStatusBadge(state)} -
- -
-); +}: DynamicToolHeaderProps) => { + const duration = useToolDuration(state, startedAt, completedAt); + return ( + +
+ + {title} + {getStatusBadge(state)} + {duration && ( + + {duration} + + )} +
+ +
+ ); +}; diff --git a/apps/frontend/hooks/use-tool-completed-at.ts b/apps/frontend/hooks/use-tool-completed-at.ts new file mode 100644 index 00000000..c7c59d65 --- /dev/null +++ b/apps/frontend/hooks/use-tool-completed-at.ts @@ -0,0 +1,90 @@ +import { useEffect, useState } from "react"; +import { formatDurationMs, formatToolDuration } from "@/lib/utils"; + +const isTerminalState = (state: string): boolean => state.startsWith("output-"); + +const toMs = (iso?: string): number | undefined => { + if (!iso) return undefined; + const t = new Date(iso).getTime(); + return Number.isNaN(t) ? undefined : t; +}; + +/** + * Resolves a tool call's run-duration string for the tool header. + * + * - While the tool is running it shows a live elapsed timer, ticking once a + * second from when the tool was first observed (the server start time isn't + * carried on the streamed message, so we measure on the client). + * - When it turns terminal it freezes: the exact server-measured span if both + * `startedAt`/`completedAt` are persisted (after a chat reload), otherwise + * the client-observed span. + * + * A client clock is only used when the tool was actually seen running this + * session, so reloading a chat (tool already terminal at mount) never shows a + * bogus value — it relies on the server timestamps or shows nothing. + * + * Returns undefined when there's nothing meaningful to show (e.g. a historical + * message that predates duration tracking). + */ +export function useToolDuration( + state: string, + startedAt?: string, + completedAt?: string, +): string | undefined { + const running = !isTerminalState(state); + // All render-visible values are state, never refs or live Date.now() reads + // (upstream's react-hooks rules forbid both during render). Every write is + // deferred into a timer callback — setState synchronously inside an effect + // body is also disallowed, but a timer/interval callback is a permitted site. 
+ const [clientStart, setClientStart] = useState(); + const [clientEnd, setClientEnd] = useState(); + const [elapsedMs, setElapsedMs] = useState(0); + + // While running: record the client-observed start once and tick the elapsed + // time every second. `start` is captured in the effect body (reading + // Date.now() there is fine); the setState calls run in deferred callbacks. + useEffect(() => { + if (!running) return; + const start = Date.now(); + const startTimer = setTimeout( + () => setClientStart((prev) => prev ?? start), + 0, + ); + const id = setInterval(() => setElapsedMs(Date.now() - start), 1000); + return () => { + clearTimeout(startTimer); + clearInterval(id); + }; + }, [running]); + + // First terminal transition after we saw it running: freeze the end span. + // Deferred to a timer callback so it is not a synchronous effect-body write. + useEffect(() => { + if (running || clientStart === undefined) return; + const endTimer = setTimeout( + () => setClientEnd((prev) => prev ?? Date.now()), + 0, + ); + return () => clearTimeout(endTimer); + }, [running, clientStart]); + + // Live elapsed timer while running. `elapsedMs` is 0 until the first tick and + // `clientStart` is set on the next frame, so the very first render returns + // undefined (nothing meaningful to show yet). + if (running) { + if (clientStart === undefined) return undefined; + return formatDurationMs(elapsedMs); + } + + // Terminal: exact server span if available, else the client-observed span. + const serverDuration = formatToolDuration(startedAt, completedAt); + if (serverDuration) return serverDuration; + + const startMs = toMs(startedAt) ?? clientStart; + const endMs = + toMs(completedAt) ?? (clientStart !== undefined ? 
clientEnd : undefined); + if (startMs !== undefined && endMs !== undefined && endMs >= startMs) { + return formatDurationMs(endMs - startMs); + } + return undefined; +} diff --git a/apps/frontend/lib/utils.test.ts b/apps/frontend/lib/utils.test.ts index 53217425..ac8a6b25 100644 --- a/apps/frontend/lib/utils.test.ts +++ b/apps/frontend/lib/utils.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from "vitest"; -import { joinUrl, parseValidationErrors } from "./utils"; +import { formatToolDuration, joinUrl, parseValidationErrors } from "./utils"; describe("joinUrl", () => { it("should join base URL and path", () => { @@ -25,6 +25,34 @@ describe("joinUrl", () => { }); }); +describe("formatToolDuration", () => { + const start = "2026-05-30T12:00:00.000Z"; + const plus = (ms: number) => + new Date(new Date(start).getTime() + ms).toISOString(); + + it("returns undefined when a timestamp is missing", () => { + expect(formatToolDuration(undefined, start)).toBeUndefined(); + expect(formatToolDuration(start, undefined)).toBeUndefined(); + }); + + it("returns undefined for invalid or negative durations", () => { + expect(formatToolDuration("not-a-date", start)).toBeUndefined(); + expect(formatToolDuration(plus(1000), start)).toBeUndefined(); + }); + + it("formats sub-second durations in milliseconds", () => { + expect(formatToolDuration(start, plus(950))).toBe("950ms"); + }); + + it("formats sub-minute durations in seconds with one decimal", () => { + expect(formatToolDuration(start, plus(1200))).toBe("1.2s"); + }); + + it("formats durations over a minute as minutes and seconds", () => { + expect(formatToolDuration(start, plus(63000))).toBe("1m 3s"); + }); +}); + describe("parseValidationErrors", () => { it("should parse validation errors correctly", () => { const errorData = { diff --git a/apps/frontend/lib/utils.ts b/apps/frontend/lib/utils.ts index d7b014f6..b5673274 100644 --- a/apps/frontend/lib/utils.ts +++ b/apps/frontend/lib/utils.ts @@ -18,6 +18,36 @@ export 
function joinUrl(base: string, path: string): string { return `${normalizedBase}${normalizedPath}`; } +/** + * Formats a tool call's run duration from its start/end ISO timestamps. + * Returns undefined when either timestamp is missing or invalid (e.g. an + * in-progress tool, or a historical message persisted before durations were + * tracked) so the UI can simply render nothing. + * + * Output scales with magnitude: `950ms`, `1.2s`, `1m 3s`. + */ +export function formatToolDuration( + startedAt?: string, + completedAt?: string, +): string | undefined { + if (!startedAt || !completedAt) return undefined; + const start = new Date(startedAt).getTime(); + const end = new Date(completedAt).getTime(); + if (Number.isNaN(start) || Number.isNaN(end) || end < start) return undefined; + return formatDurationMs(end - start); +} + +/** Formats an elapsed millisecond span: `950ms`, `1.2s`, `1m 3s`. Returns undefined for negative or non-finite input. Unit thresholds sit at the rounding boundary so a carry never prints `1000ms`, `60.0s` or `1m 60s`. */ +export function formatDurationMs(ms: number): string | undefined { + if (!Number.isFinite(ms) || ms < 0) return undefined; + if (ms < 999.5) return `${Math.round(ms)}ms`; + const seconds = ms / 1000; + if (ms < 59950) return `${seconds.toFixed(1)}s`; + const totalSeconds = Math.round(seconds); + const remSeconds = totalSeconds % 60; + return `${Math.floor(totalSeconds / 60)}m ${remSeconds}s`; +} + export const fetcher = async (input: RequestInfo | URL, init?: RequestInit) => { const res = await fetch(input, { ...init, credentials: "include" }); if (!res.ok) { From 9e3bf4af8afe97b0333a95d50f01fda40ddc22b2 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Thu, 11 Jun 2026 22:06:13 +0200 Subject: [PATCH 03/21] fix(backend): resolve context window when OpenAI-compatible baseUrl ends in /v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vLLM and most OpenAI-compatible providers set baseUrl to "{root}/v1" (the OpenAI SDK needs it that way for chat calls). 
detectOpenAiCompatible appended "/v1/models" to that, producing "{root}/v1/v1/models" → 404 → the window silently fell to the 8192 default and the §H usage ring rendered "unknown". Strip a trailing "/v1" before building the models URL so the probe hits "{root}/v1/models" and reads max_model_len. Added a regression test. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/runs/context-window.test.ts | 22 ++++++++++++++++++++ apps/backend/src/runs/context-window.ts | 6 +++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/apps/backend/src/runs/context-window.test.ts b/apps/backend/src/runs/context-window.test.ts index bc84b978..cac0228d 100644 --- a/apps/backend/src/runs/context-window.test.ts +++ b/apps/backend/src/runs/context-window.test.ts @@ -216,6 +216,28 @@ describe("API auto-detect parsers", () => { expect(out.source).toBe("api"); }); + it("vLLM: a baseUrl already ending in /v1 probes /v1/models, not /v1/v1/models", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + data: [{ id: "qwen36", max_model_len: 262144 }], + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { + id: "v", + providerType: "OpenAI", + baseUrl: "http://localhost:8000/v1", + apiKey: "x", + }, + "qwen36", + ); + expect(out.contextWindow).toBe(262144); + expect(out.source).toBe("api"); + expect(httpGetJson).toHaveBeenCalledWith( + "http://localhost:8000/v1/models", + expect.anything(), + ); + }); + it("official OpenAI (no baseUrl) skips the probe and falls to registry", async () => { const httpGetJson = vi.fn(); const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); diff --git a/apps/backend/src/runs/context-window.ts b/apps/backend/src/runs/context-window.ts index d2390633..430fbec1 100644 --- a/apps/backend/src/runs/context-window.ts +++ b/apps/backend/src/runs/context-window.ts @@ -243,7 +243,11 @@ async function detectOpenAiCompatible( httpGetJson: HttpGetJson, ): Promise | 
undefined> { if (!provider.baseUrl) return undefined; // official OpenAI omits the field - const base = trimSlash(provider.baseUrl); + // baseUrl conventionally ends in "/v1" (the OpenAI SDK needs it that way for + // chat calls), but the models endpoint is "{root}/v1/models" — strip a + // trailing "/v1" first so we don't request "/v1/v1/models" (404 → the window + // silently falls to the default and the usage ring renders "unknown"). + const base = trimSlash(provider.baseUrl).replace(/\/v1$/, ""); const headers = provider.apiKey ? { authorization: `Bearer ${provider.apiKey}` } : undefined; From ff0620bb7e8e296048f33a3ebad8cf053e0ec8c1 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Thu, 11 Jun 2026 22:44:09 +0200 Subject: [PATCH 04/21] feat(backend): stream per-message stats on the finish event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The §I stats popover (i) icon was gated on metadata.stats, which was only stamped post-stream and persisted to the DB — so it appeared a refetch round-trip after the answer finished, lagging the copy/delete/regenerate icons. Emit the stats via messageMetadata on the `finish` event so they ride the final stream chunk and the (i) appears the instant the answer completes. firstTokenAt is captured in streamText.onChunk (fires before finish) instead of the async snapshot drain. A single buildMessageStats() helper feeds both the streamed copy and the post-stream persist stamp (sharing one finishedAt) so live and reloaded stats match. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/runs/agent-runner.ts | 69 +++++++++++++++++++-------- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/apps/backend/src/runs/agent-runner.ts b/apps/backend/src/runs/agent-runner.ts index 19b7c7f1..a929c5d7 100644 --- a/apps/backend/src/runs/agent-runner.ts +++ b/apps/backend/src/runs/agent-runner.ts @@ -527,10 +527,40 @@ export class AgentRunner { const startedAt = new Date().toISOString(); let firstTokenAt: string | undefined; + // Set when the §H/§I stats are first emitted (messageMetadata `finish`), so + // the post-stream persist stamp reuses the same value rather than a slightly + // later one — streamed and reloaded stats then match. + let finishedAt: string | undefined; + + // Single source of truth for the per-message stats, so the live-streamed + // copy (messageMetadata, below) and the persisted copy (applyMessageStats in + // the finally) are identical. Reads the mutable state at call time. + const buildMessageStats = ( + finishedAtValue: string, + ): MessageStats | undefined => { + if (!state.turn) return undefined; + return { + inputTokens: state.stats.inputTokens ?? 0, + outputTokens: state.stats.outputTokens ?? 0, + contextTokens: state.lastStepInputTokens, + startedAt, + firstTokenAt, + finishedAt: finishedAtValue, + contextWindow: state.turn.resolved.contextWindow, + contextWindowIsDefault: state.turn.resolved.contextWindowIsDefault, + }; + }; const result = streamText({ ...modelArgs, onStepFinish: (step) => onStep(step), + // TTFT: stamp the first text token here (fires before the `finish` event), + // so the stats are complete by the time messageMetadata emits them. + onChunk: ({ chunk }) => { + if (!firstTokenAt && chunk.type === "text-delta") { + firstTokenAt = new Date().toISOString(); + } + }, }); // Build the UI message stream and tee it. 
The response body consumes @@ -541,10 +571,22 @@ export class AgentRunner { const uiStream = result.toUIMessageStream({ originalMessages: input.messages, generateMessageId: createIdGenerator({ prefix: "msg", size: 16 }), - messageMetadata: () => - state.turn?.resolved.agentId + // Emit the §H/§I stats with the `finish` event so the client gets them on + // the final stream chunk — the (i) stats action then appears the instant + // the answer completes, not a DB-refetch round-trip later. `start` carries + // only agentId (timing/usage don't exist yet). The post-stream stamp in + // the finally still writes them to the persisted message for reload. + messageMetadata: ({ part }) => { + const agentId = state.turn?.resolved.agentId ? { agentId: state.turn.resolved.agentId } - : undefined, + : undefined; + if (part.type === "finish") { + finishedAt = new Date().toISOString(); + const stats = buildMessageStats(finishedAt); + return stats ? { ...agentId, stats } : agentId; + } + return agentId; + }, onError: (error) => formatStreamError(error), }); @@ -579,9 +621,6 @@ export class AgentRunner { ); }, })) { - if (!firstTokenAt && message.parts?.some((p) => p.type === "text")) { - firstTokenAt = new Date().toISOString(); - } state.messages = [...input.messages, message]; } } catch (err) { @@ -591,20 +630,12 @@ export class AgentRunner { "Server-side UI stream consumer error", ); } finally { - const finishedAt = new Date().toISOString(); + // Reuse the finish-event timestamp when present so the persisted stats + // match what was streamed; fall back if the stream ended without one. + const finishedAtFinal = finishedAt ?? new Date().toISOString(); applyToolCompletions(state.messages, completions); - if (state.turn) { - applyMessageStats(state.messages, { - inputTokens: state.stats.inputTokens ?? 0, - outputTokens: state.stats.outputTokens ?? 
0, - contextTokens: state.lastStepInputTokens, - startedAt, - firstTokenAt, - finishedAt, - contextWindow: state.turn.resolved.contextWindow, - contextWindowIsDefault: state.turn.resolved.contextWindowIsDefault, - }); - } + const stats = buildMessageStats(finishedAtFinal); + if (stats) applyMessageStats(state.messages, stats); let status: RunStatus = "succeeded"; let err: Error | undefined; if (handle.signal.aborted) { From 8142c00037923d4733fd29da0e08c83f0b04f63d Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 10:02:27 +0200 Subject: [PATCH 05/21] =?UTF-8?q?feat(frontend):=20chunk=2011=20=E2=80=94?= =?UTF-8?q?=20ring=20fix,=20(i)=20tooltip,=20compaction=20timeline=20trace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 11a (U5): ring no longer jumps back to pre-compaction value on user Send. Expiry now tracks `assistantMessageCount` instead of `messages.length` so optimistic user-message pushes don't trigger early fallback. 11b: wrap the agent-info (i) button in a Tooltip so hover reveals its purpose without opening the dialog. Shows agent description or "Agent info". 11c (§K): Tier 1 compaction is now visible in the chat timeline. - Backend emits synthetic `compact_context` tool-call + tool-result chunks into the UIMessage stream (via `prependCompactionChunks`) immediately after the `start` event; the existing tool-call expander renders them for free. - `CompactionTrace` threads through Tier1Output → ChatTurn.compactionTrace → agent-runner.stream() — only when a model summary was produced (prune-only turns produce no trace to avoid empty/confusing entries). - `COMPACT_CONTEXT_TOOL_NAME` constant shared across stream producer, strip filter, §J message builder, and frontend display-name mapping. - `stripCompactionTraceParts` removes trace parts before ModelMessage conversion so the provider never sees the phantom tool call on replay. 
- `buildCompactionTraceMessage` builds a standalone assistant message for §J (forced compact endpoint — no live stream to inject into). - `title: "Context compaction"` on the chunk + tool.tsx display-name mapping shows a friendly label instead of the raw underscore name. - `context-usage-ring.tsx`: improve neutral-state tooltip copy. Co-Authored-By: Claude Sonnet 4.6 --- apps/backend/src/routes/chat.ts | 3 + apps/backend/src/runs/agent-runner.test.ts | 153 +++++++++++++++++- apps/backend/src/runs/agent-runner.ts | 112 ++++++++++++- apps/backend/src/runs/compaction.test.ts | 66 ++++++++ apps/backend/src/runs/compaction.ts | 69 +++++++- apps/backend/src/services/chat-execution.ts | 57 ++++++- apps/frontend/components/ai-elements/tool.tsx | 3 + apps/frontend/components/chat.tsx | 62 +++++-- .../components/context-usage-ring.tsx | 6 +- 9 files changed, 506 insertions(+), 25 deletions(-) diff --git a/apps/backend/src/routes/chat.ts b/apps/backend/src/routes/chat.ts index 37038bb8..d2c6ea7c 100644 --- a/apps/backend/src/routes/chat.ts +++ b/apps/backend/src/routes/chat.ts @@ -466,6 +466,9 @@ chat.post( inputTokens: result.estimatedTokens, contextWindow: result.contextWindow, contextWindowIsDefault: result.contextWindowIsDefault, + // §J/11c: the persisted synthetic trace message (when a summary ran), so + // the frontend can append it to the timeline without a full refetch. 
+ traceMessage: result.traceMessage, }); } catch (error) { if (error instanceof NotFoundError) { diff --git a/apps/backend/src/runs/agent-runner.test.ts b/apps/backend/src/runs/agent-runner.test.ts index ea3acc5b..516c0832 100644 --- a/apps/backend/src/runs/agent-runner.test.ts +++ b/apps/backend/src/runs/agent-runner.test.ts @@ -33,11 +33,17 @@ vi.mock("../logger.ts", () => ({ }, })); -import { AgentRunner, withToolTimestamps } from "./agent-runner.ts"; +import { + AgentRunner, + prependCompactionChunks, + stripCompactionTraceParts, + withToolTimestamps, +} from "./agent-runner.ts"; import { buildTier2PrepareStep } from "./compaction.ts"; import type { UIMessageChunk } from "ai"; import { runRegistry, TimeoutError } from "./run-registry.ts"; import type { ResolvedRunPlan, RunInput, RunSink } from "./types.ts"; +import type { PlatypusUIMessage } from "../types.ts"; import type { WorkspaceScope } from "../scope.ts"; type LifecycleEvent = @@ -506,6 +512,151 @@ describe("withToolTimestamps", () => { }); }); +describe("prependCompactionChunks", () => { + const collect = async ( + stream: ReadableStream, + ): Promise => { + const out: UIMessageChunk[] = []; + const reader = stream.getReader(); + for (;;) { + const { done, value } = await reader.read(); + if (done) break; + out.push(value); + } + return out; + }; + + const sourceOf = (chunks: UIMessageChunk[]): ReadableStream => + new ReadableStream({ + start(controller) { + for (const chunk of chunks) controller.enqueue(chunk); + controller.close(); + }, + }); + + it("injects a compact_context tool-call/result pair right after start, before any text", async () => { + const out = await collect( + prependCompactionChunks( + sourceOf([ + { type: "start" }, + { type: "text-start", id: "t" }, + { type: "text-delta", id: "t", delta: "hi" }, + ]), + { messagesDropped: 12, summaryExcerpt: "the user did X" }, + () => "cc1", + ), + ); + + expect(out.map((c) => c.type)).toEqual([ + "start", + "tool-input-available", + 
"tool-output-available", + "text-start", + "text-delta", + ]); + const input = out[1] as Extract< + UIMessageChunk, + { type: "tool-input-available" } + >; + expect(input.toolName).toBe("compact_context"); + expect(input.toolCallId).toBe("cc1"); + const output = out[2] as Extract< + UIMessageChunk, + { type: "tool-output-available" } + >; + expect(output.toolCallId).toBe("cc1"); + expect(output.output).toEqual({ + messagesDropped: 12, + summaryExcerpt: "the user did X", + }); + }); + + it("omits summaryExcerpt when absent", async () => { + const out = await collect( + prependCompactionChunks( + sourceOf([{ type: "start" }]), + { messagesDropped: 3 }, + () => "cc2", + ), + ); + const output = out[2] as Extract< + UIMessageChunk, + { type: "tool-output-available" } + >; + expect(output.output).toEqual({ messagesDropped: 3 }); + }); + + it("injects only once even if multiple start events appear", async () => { + const out = await collect( + prependCompactionChunks( + sourceOf([{ type: "start" }, { type: "start" }]), + { messagesDropped: 1 }, + () => "cc3", + ), + ); + expect(out.filter((c) => c.type === "tool-input-available")).toHaveLength( + 1, + ); + }); +}); + +describe("stripCompactionTraceParts", () => { + const traceMessage = (id: string): PlatypusUIMessage => + ({ + id, + role: "assistant", + parts: [ + { + type: "tool-compact_context", + toolCallId: `${id}-call`, + state: "output-available", + input: { messagesDropped: 2 }, + output: { messagesDropped: 2 }, + }, + ], + }) as unknown as PlatypusUIMessage; + + it("drops a trace-only assistant message entirely (never replayed to the model)", () => { + const messages = [ + { id: "u1", role: "user", parts: [{ type: "text", text: "hi" }] }, + traceMessage("t1"), + ] as unknown as PlatypusUIMessage[]; + + const out = stripCompactionTraceParts(messages); + expect(out.map((m) => m.id)).toEqual(["u1"]); + }); + + it("strips only the trace part from an assistant message with real content", () => { + const messages = [ + 
{ + id: "a1", + role: "assistant", + parts: [ + { + type: "tool-compact_context", + toolCallId: "a1-call", + state: "output-available", + input: {}, + output: {}, + }, + { type: "text", text: "answer" }, + ], + }, + ] as unknown as PlatypusUIMessage[]; + + const out = stripCompactionTraceParts(messages); + expect(out).toHaveLength(1); + expect(out[0].parts.map((p) => p.type)).toEqual(["text"]); + }); + + it("returns the same array reference when nothing to strip", () => { + const messages = [ + { id: "u1", role: "user", parts: [{ type: "text", text: "hi" }] }, + ] as unknown as PlatypusUIMessage[]; + expect(stripCompactionTraceParts(messages)).toBe(messages); + }); +}); + describe("buildTier2PrepareStep", () => { const makeCtx = (triggerTokens = 100) => ({ triggerTokens, diff --git a/apps/backend/src/runs/agent-runner.ts b/apps/backend/src/runs/agent-runner.ts index a929c5d7..95db32de 100644 --- a/apps/backend/src/runs/agent-runner.ts +++ b/apps/backend/src/runs/agent-runner.ts @@ -16,7 +16,11 @@ import { contextOverflowRecoveryMiddleware, isContextOverflowError, } from "./recovery.ts"; -import { buildTier2PrepareStep } from "./compaction.ts"; +import { + buildTier2PrepareStep, + COMPACT_CONTEXT_TOOL_NAME, + type CompactionTrace, +} from "./compaction.ts"; import { loadChatMessages, prepareChatTurn, @@ -98,6 +102,89 @@ export function withToolTimestamps( return { stream: out, completions }; } +/** + * Injects synthetic `compact_context` tool-call + tool-result chunks into a + * UIMessage stream immediately after the `start` event (§K / 11c). Makes Tier + * 1 compaction visible in the chat timeline without a custom renderer — the + * existing tool-call expander handles it automatically. + * + * Exported for unit testing. 
+ */ +export function prependCompactionChunks( + stream: ReadableStream, + trace: CompactionTrace, + generateId: () => string = createIdGenerator({ prefix: "cc", size: 12 }), +): ReadableStream { + const toolCallId = generateId(); + const syntheticChunks: UIMessageChunk[] = [ + { + type: "tool-input-available", + toolCallId, + toolName: COMPACT_CONTEXT_TOOL_NAME, + title: "Context compaction", + input: { messagesDropped: trace.messagesDropped }, + }, + { + type: "tool-output-available", + toolCallId, + output: { + messagesDropped: trace.messagesDropped, + ...(trace.summaryExcerpt + ? { summaryExcerpt: trace.summaryExcerpt } + : {}), + }, + }, + ]; + let injected = false; + return stream.pipeThrough( + new TransformStream({ + transform(chunk, controller) { + controller.enqueue(chunk); + if (!injected && chunk.type === "start") { + injected = true; + for (const c of syntheticChunks) controller.enqueue(c); + } + }, + }), + ); +} + +const COMPACT_CONTEXT_PART_TYPE = `tool-${COMPACT_CONTEXT_TOOL_NAME}`; + +/** + * Removes the synthetic `compact_context` trace parts (§K/11c) from a message + * list before it is converted to ModelMessages. The trace is a UI-only marker + * persisted in the assistant message for the chat timeline; it must NEVER be + * replayed to the provider, which would otherwise see a phantom tool call for a + * tool it was never given (provider rejection / model confusion). An assistant + * message left with no parts after stripping (the §J standalone trace message) + * is dropped entirely rather than sent empty. + * + * Exported for unit testing. 
+ */ +export function stripCompactionTraceParts( + messages: PlatypusUIMessage[], +): PlatypusUIMessage[] { + let changed = false; + const out: PlatypusUIMessage[] = []; + for (const message of messages) { + if ( + message.role !== "assistant" || + !message.parts.some((p) => p.type === COMPACT_CONTEXT_PART_TYPE) + ) { + out.push(message); + continue; + } + changed = true; + const parts = message.parts.filter( + (p) => p.type !== COMPACT_CONTEXT_PART_TYPE, + ); + if (parts.length > 0) out.push({ ...message, parts }); + // else: trace-only message (§J) — drop it from the model payload. + } + return changed ? out : messages; +} + /** Stats stamped on the last assistant message's metadata after each stream (§H/§I). */ export type MessageStats = { /** Run-wide totals across every step (sum) — §I cost popover. */ @@ -486,7 +573,13 @@ export class AgentRunner { // tool-loop step, stream and generate alike — gets one trim-and-retry on // a provider "context too long" rejection. Always on; not gated by §G. model: withOverflowRecovery(state.turn), - messages: await convertToModelMessages(state.turn.stream.messages), + // Strip the UI-only synthetic compact_context trace parts (§K/11c) before + // sending history to the provider — replaying them surfaces a phantom tool + // call for a tool the model was never given. Applied here so both the + // streaming and generate paths (which share modelArgs) are covered. + messages: await convertToModelMessages( + stripCompactionTraceParts(state.turn.stream.messages), + ), system: state.turn.stream.system, tools: state.turn.stream.tools, stopWhen: [stepCountIs(state.turn.stream.maxSteps)], @@ -590,7 +683,20 @@ export class AgentRunner { onError: (error) => formatStreamError(error), }); - const { stream: timedStream, completions } = withToolTimestamps(uiStream); + // §K / 11c: if Tier 1 compaction fired this turn, prepend synthetic + // compact_context tool-call + tool-result chunks so the compaction is + // visible in the chat timeline. 
Injected after the 'start' event so the + // AI SDK builds them into the same assistant message as the response. + const tracedStream: ReadableStream = state.turn + ?.compactionTrace + ? prependCompactionChunks( + uiStream as ReadableStream, + state.turn.compactionTrace, + ) + : (uiStream as ReadableStream); + + const { stream: timedStream, completions } = + withToolTimestamps(tracedStream); const [forResponse, forSnapshot] = timedStream.tee(); // Read the snapshot branch as message snapshots and keep `state.messages` diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index 1e94e581..2c169543 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -413,6 +413,7 @@ describe("compactModelMessages (Tier 2 / recovery)", () => { import { applyTier1Compaction, + buildCompactionTraceMessage, computeBudget, resolveCompactionConfig, invalidateCompaction, @@ -423,6 +424,37 @@ import { type CompactionConfig, } from "./compaction.ts"; +describe("buildCompactionTraceMessage (§J/11c)", () => { + it("builds an assistant message with a completed compact_context tool part", () => { + const msg = buildCompactionTraceMessage( + { messagesDropped: 7, summaryExcerpt: "did things" }, + "msg-abc", + ); + expect(msg.id).toBe("msg-abc"); + expect(msg.role).toBe("assistant"); + expect(msg.parts).toHaveLength(1); + const part = msg.parts[0] as { + type: string; + state: string; + toolCallId: string; + output: unknown; + }; + expect(part.type).toBe("tool-compact_context"); + expect(part.state).toBe("output-available"); + expect(part.toolCallId).toBe("msg-abc-call"); + expect(part.output).toEqual({ + messagesDropped: 7, + summaryExcerpt: "did things", + }); + }); + + it("omits summaryExcerpt from the output when absent", () => { + const msg = buildCompactionTraceMessage({ messagesDropped: 1 }, "msg-x"); + const part = msg.parts[0] as { output: unknown }; + expect(part.output).toEqual({ 
messagesDropped: 1 }); + }); +}); + function storeFromState(state: Partial): FakeStore { return new FakeStore(state); } @@ -545,6 +577,12 @@ describe("applyTier1Compaction", () => { expect(store.state.version).toBe(1); expect(out.messages[0].id).toBe("context-summary"); expect(onEvent).toHaveBeenCalledOnce(); + // §K/11c: a summary ran → a trace is surfaced with the dropped count and a + // summary excerpt. + expect(out.compactionTrace).toEqual({ + messagesDropped: 2, + summaryExcerpt: "SUMMARY", + }); }); it("disabled + not dirty: no compaction even when over the trigger", async () => { @@ -629,6 +667,34 @@ describe("applyTier1Compaction", () => { expect(store.state.compactionDirty).toBe(false); // flag cleared expect(store.state.contextSummary).toBeNull(); // no summary written expect(store.state.version).toBe(1); + // §K/11c: no model summary ran → no trace (would be an empty timeline entry). + expect(out.compactionTrace).toBeUndefined(); + }); + + it("under trigger: no trace surfaced", async () => { + const store = storeFromState({ version: 0 }); + const messages = [uiText("r1", "user", "a")]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: { + inputBudget: 100000, + triggerTokens: 100000, + targetTokens: 50000, + }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(out.compactionTrace).toBeUndefined(); }); }); diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index c01266c0..677c893b 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -835,12 +835,63 @@ export type Tier1Input = { onEvent?: (event: CompactionEvent) => void; }; +export type CompactionTrace = { + /** Number of messages that were folded into the summary. 
*/ + messagesDropped: number; + /** First ~120 chars of the LLM-generated summary. */ + summaryExcerpt?: string; +}; + +/** Tool name for the synthetic compaction-trace tool-call/result pair (§K/11c). + * Shared by the stream-trace producer (agent-runner), the strip filter that + * keeps it out of the model payload, the §J persisted-message builder, and the + * frontend display-name mapping. */ +export const COMPACT_CONTEXT_TOOL_NAME = "compact_context"; + +/** Builds a standalone synthetic assistant message carrying the compaction + * trace as a `compact_context` tool-call/result pair (§J — forced compaction + * has no live stream to inject into, so the trace is persisted as its own + * message instead). The message is always appended ABOVE the watermark, so it + * is never itself summarized; the strip filter keeps it out of the model + * payload on subsequent turns. */ +export function buildCompactionTraceMessage( + trace: CompactionTrace, + id: string, +): PlatypusUIMessage { + return { + id, + role: "assistant", + parts: [ + { + type: `tool-${COMPACT_CONTEXT_TOOL_NAME}`, + toolCallId: `${id}-call`, + state: "output-available", + input: { messagesDropped: trace.messagesDropped }, + output: { + messagesDropped: trace.messagesDropped, + ...(trace.summaryExcerpt + ? { summaryExcerpt: trace.summaryExcerpt } + : {}), + }, + }, + ], + } as unknown as PlatypusUIMessage; +} + export type Tier1Output = { /** The compacted view to send to the model (summary message + recent). */ messages: PlatypusUIMessage[]; /** True when a new summary was produced and persisted this turn. */ compacted: boolean; commit?: CommitResult; + /** + * Present ONLY when a model summary was produced this turn — the user-visible + * "compaction happened" signal (§K/11c). Deliberately undefined for + * prune-only and force-dirty-within-target no-op turns: those drop 0 messages + * and have no excerpt, so a trace would render an empty/confusing timeline + * entry. 
+ */ + compactionTrace?: CompactionTrace; }; /** Splits history at the watermark message id. Returns the messages after it and @@ -970,7 +1021,23 @@ export async function applyTier1Compaction( commit = await pinnedWrite({ dirty: false }); } - return { messages: view, compacted: result.usedModelCall, commit }; + // Only surface a trace when an actual model summary was produced. Prune-only + // and force-dirty-within-target runs drop 0 messages with no excerpt — a + // trace there would be an empty, confusing timeline entry (§K/11c). + const compactionTrace: CompactionTrace | undefined = + result.usedModelCall && result.summaryText + ? { + messagesDropped: result.messagesDropped, + summaryExcerpt: result.summaryText.slice(0, 120), + } + : undefined; + + return { + messages: view, + compacted: result.usedModelCall, + commit, + compactionTrace, + }; } /** diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index fdfca441..61a5f001 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -24,7 +24,7 @@ import { type MemorySummary, } from "./memory-retrieval.ts"; import type { Provider, Skill } from "@platypus/schemas"; -import { generateText, type Tool } from "ai"; +import { createIdGenerator, generateText, type Tool } from "ai"; import { logger } from "../logger.ts"; import { buildMcpTransportConfig } from "./mcp-oauth-provider.ts"; import { inlineFileUrls } from "../storage/utils.ts"; @@ -44,6 +44,7 @@ import { import { applyTier1Compaction, affectedBelowWatermark, + buildCompactionTraceMessage, buildTier2PrepareStep, computeBudget, drizzleCompactionStore, @@ -53,6 +54,7 @@ import { type Budget, type CompactionConfig, type CompactionState, + type CompactionTrace, type Summarize, type Tier2Context, } from "../runs/compaction.ts"; @@ -150,6 +152,12 @@ export type ChatTurn = { presencePenalty?: number; seed?: number; }; + /** + * Set when Tier 1 compaction fired this 
turn (§K / 11c). agent-runner emits + * a synthetic compact_context tool-call + tool-result pair into the stream so + * the compaction is visible in the chat timeline. + */ + compactionTrace?: CompactionTrace; resolved: { agentId?: string; providerId: string; @@ -619,14 +627,20 @@ type ApplyTier1Args = { lastInputTokens?: number; }; +type Tier1IfNeededResult = { + messages: PlatypusUIMessage[]; + compactionTrace?: CompactionTrace; +}; + /** * Reconstructs/advances the compacted view and persists any new summary — all * best-effort. Any throw degrades to the uncompacted messages (recovery §E - * remains the safety net). Returns the message array to send to the model. + * remains the safety net). Returns the messages to send to the model plus an + * optional compactionTrace for the stream trace (§K / 11c). */ async function applyTier1IfNeeded( args: ApplyTier1Args, -): Promise { +): Promise { const { chatId, runtime, messages, rawMessages } = args; try { const store = drizzleCompactionStore; @@ -672,13 +686,16 @@ async function applyTier1IfNeeded( logger.info({ chatId, ...event }, "context-compacted"), }); - return result.messages; + return { + messages: result.messages, + compactionTrace: result.compactionTrace, + }; } catch (error) { logger.error( { error, chatId }, "Tier 1 compaction failed; sending uncompacted history", ); - return messages; + return { messages }; } } @@ -842,7 +859,7 @@ export const prepareChatTurn = async ( // runs (triggers, sub-agents) carry no chat id and have no durable history to // compact (plan M3 — they are Tier 2 only), so send messages uncompacted. const chatId = request.id; - const compactedMessages = chatId + const tier1Result = chatId ? 
await applyTier1IfNeeded({ chatId, runtime: compactionRuntime, @@ -863,7 +880,8 @@ export const prepareChatTurn = async ( | undefined )?.stats?.contextTokens, }) - : inlinedMessages; + : { messages: inlinedMessages }; + const compactedMessages = tier1Result.messages; // Recovery (§E, P4): always wired, even when proactive compaction is off. // Headless runs get trim+retry but no dirty flag (no durable chat row). @@ -916,6 +934,7 @@ export const prepareChatTurn = async ( contextWindow: compactionRuntime.contextWindow, contextWindowIsDefault: compactionRuntime.contextWindowIsDefault, }, + compactionTrace: tier1Result.compactionTrace, recovery, tier2: compactionRuntime.config.compactionEnabled ? { @@ -1369,6 +1388,8 @@ export async function forceCompactChat( estimatedTokens: number; contextWindow: number; contextWindowIsDefault: boolean; + /** §J/11c — the persisted synthetic trace message, when a summary was produced. */ + traceMessage?: PlatypusUIMessage; }> { // Load the chat record (workspace-scoped). const chatRows = await db @@ -1457,9 +1478,31 @@ export async function forceCompactChat( uiMessagesToCountUnits(result.messages, runtime.imageProvider), ); + // §J/11c: a forced compaction has no live stream to inject the trace into, so + // persist it as a standalone synthetic assistant message. Appended after the + // last real message — above the watermark (which already advanced inside + // applyTier1Compaction), so it is never itself summarized. The strip filter + // keeps it out of the model payload on subsequent turns. Only written when a + // model summary was actually produced (result.compactionTrace is undefined + // otherwise — see Tier1Output). 
+ let traceMessage: PlatypusUIMessage | undefined; + if (result.compactionTrace) { + traceMessage = buildCompactionTraceMessage( + result.compactionTrace, + createIdGenerator({ prefix: "msg", size: 16 })(), + ); + await db + .update(chatTable) + .set({ messages: [...messages, traceMessage] }) + .where( + and(eq(chatTable.id, chatId), eq(chatTable.workspaceId, workspaceId)), + ); + } + return { estimatedTokens, contextWindow: runtime.contextWindow, contextWindowIsDefault: runtime.contextWindowIsDefault, + traceMessage, }; } diff --git a/apps/frontend/components/ai-elements/tool.tsx b/apps/frontend/components/ai-elements/tool.tsx index 658ab27f..aebb14b3 100644 --- a/apps/frontend/components/ai-elements/tool.tsx +++ b/apps/frontend/components/ai-elements/tool.tsx @@ -45,6 +45,9 @@ import { CodeBlock } from "./code-block"; export function humanizeToolType(type: string): string { // Strip the "tool-" prefix const name = type.startsWith("tool-") ? type.slice(5) : type; + // Synthetic compaction trace (§K/11c) — render a human label instead of the + // raw, underscore-laden function name. + if (name === "compact_context") return "Context compaction"; // Split on camelCase boundaries const words = name.replace(/([a-z])([A-Z])/g, "$1 $2").split(" "); // Capitalise the first word, lowercase the rest diff --git a/apps/frontend/components/chat.tsx b/apps/frontend/components/chat.tsx index 4c741f0a..a29eac89 100644 --- a/apps/frontend/components/chat.tsx +++ b/apps/frontend/components/chat.tsx @@ -449,11 +449,22 @@ export const Chat = ({ // refreshes the ring immediately (before the next completed message). const [compactPending, setCompactPending] = useState(false); const [isCompacting, setIsCompacting] = useState(false); - // Post-compact estimate, tagged with the message count at compaction time so - // it auto-expires once a new message arrives (the next provider count is - // authoritative). Tagging avoids a set-state-in-effect reset. 
+ // Stable count of assistant messages — unaffected by optimistic user-message + // pushes (11a / U5). Used to tag post-compact estimates so the ring doesn't + // snap back to the old value when the user hits Send. + const assistantMessageCount = useMemo( + () => messages.filter((m) => m.role === "assistant").length, + [messages], + ); + + // Post-compact estimate, tagged with the assistant message count at + // compaction time so it auto-expires once a new assistant message arrives + // (the next provider count is authoritative). Using assistantMessageCount + // instead of messages.length fixes the U5 ring jump: an optimistic user + // message increments messages.length but not assistantMessageCount, so the + // compacted estimate stays valid until the real response lands. const [compacted, setCompacted] = useState<{ - atMessageCount: number; + atAssistantMessageCount: number; tokens: number; } | null>(null); @@ -479,20 +490,39 @@ export const Chat = ({ // the provider's authoritative count. const body = (await res.json().catch(() => ({}))) as { inputTokens?: number; + traceMessage?: PlatypusUIMessage; }; if (typeof body.inputTokens === "number") { setCompacted({ - atMessageCount: messages.length, + atAssistantMessageCount: assistantMessageCount, tokens: body.inputTokens, }); } + // §J/11c: append the persisted compaction-trace message so it shows in the + // timeline immediately. It carries the id the backend persisted, so a + // later SWR revalidation reconciles rather than duplicating it. + if (body.traceMessage) { + const traceMessage = body.traceMessage; + setMessages((prev) => + prev.some((m) => m.id === traceMessage.id) + ? 
prev + : [...prev, traceMessage], + ); + } toast.success("Context compacted"); } catch { toast.error("Compact request failed"); } finally { setIsCompacting(false); } - }, [backendUrl, orgId, workspaceId, chatId, messages.length]); + }, [ + backendUrl, + orgId, + workspaceId, + chatId, + assistantMessageCount, + setMessages, + ]); const handleCompact = useCallback(() => { // Confirm at click time (not after the deferred run fires) so the prompt @@ -705,7 +735,8 @@ export const Chat = ({ )} - - - - - + + + + + + + + + + {selectedAgent.description?.trim() || "Agent info"} + + Date: Fri, 12 Jun 2026 10:24:49 +0200 Subject: [PATCH 06/21] fix(frontend): add hover tooltip to per-message stats (i) button Wraps the existing Popover trigger in a Tooltip using the TooltipTrigger asChild > PopoverTrigger asChild composition pattern. Hover shows compact stats (In/Out/TTFT/Total); click still opens the full popover. Co-Authored-By: Claude Sonnet 4.6 --- apps/frontend/components/chat-message.tsx | 77 +++++++++++++---------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/apps/frontend/components/chat-message.tsx b/apps/frontend/components/chat-message.tsx index 3cd638f9..b709af0a 100644 --- a/apps/frontend/components/chat-message.tsx +++ b/apps/frontend/components/chat-message.tsx @@ -51,6 +51,7 @@ import { LoadSkillTool } from "./load-skill-tool"; import { SubAgentTool } from "./sub-agent-tool"; import { Button } from "./ui/button"; import { Popover, PopoverContent, PopoverTrigger } from "./ui/popover"; +import { Tooltip, TooltipContent, TooltipTrigger } from "./ui/tooltip"; import { formatDurationMs } from "@/lib/utils"; const getToolStartedAt = (part: unknown): string | undefined => { @@ -76,42 +77,52 @@ function MessageStatsPopover({ stats }: { stats: MessageStats }) { new Date(stats.finishedAt).getTime() - new Date(stats.startedAt).getTime(), ); return ( - - - - - -
-

- Response stats -

-

- In:{" "} - {stats.inputTokens.toLocaleString()}{" "} - Out:{" "} - {stats.outputTokens.toLocaleString()} -

- {ttft && ( -

- TTFT: {ttft} + + + + + + + + +

+

+ Response stats

- )} - {total && (

- Total: {total} + In:{" "} + {stats.inputTokens.toLocaleString()}{" "} + Out:{" "} + {stats.outputTokens.toLocaleString()}

- )} -
- - + {ttft && ( +

+ TTFT: {ttft} +

+ )} + {total && ( +

+ Total: {total} +

+ )} +
+
+
+ + In: {stats.inputTokens.toLocaleString()} · Out:{" "} + {stats.outputTokens.toLocaleString()} + {ttft ? ` · TTFT: ${ttft}` : ""} + {total ? ` · Total: ${total}` : ""} + + ); } From 7d91a02b946dc5eeaefc8785cf908277cfceacf2 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 22:51:07 +0200 Subject: [PATCH 07/21] feat: remove per-agent compaction config, use global defaults Chunk 12: drop the per-agent context-compaction override surface. Compaction now runs from DEFAULT_COMPACTION_CONFIG (gated by the COMPACTION_ENABLED kill switch); per-model config lands separately. - schema.ts: drop 6 agent columns (compaction_enabled, trigger_ratio, target_ratio, reserve_ratio, keep_recent_messages, min_prunable_chars) - schemas: drop matching base fields + the targetRatio --- .../drizzle/0046_context_compaction.sql | 8 +- apps/backend/drizzle/meta/0046_snapshot.json | 36 ----- apps/backend/src/db/schema.ts | 9 -- apps/backend/src/runs/compaction.test.ts | 16 --- apps/backend/src/runs/compaction.ts | 47 ------ apps/backend/src/services/chat-execution.ts | 4 +- apps/frontend/components/agent-form.tsx | 134 ------------------ packages/schemas/index.test.ts | 40 ------ packages/schemas/index.ts | 119 +++++----------- 9 files changed, 41 insertions(+), 372 deletions(-) diff --git a/apps/backend/drizzle/0046_context_compaction.sql b/apps/backend/drizzle/0046_context_compaction.sql index e526bc0a..573845c6 100644 --- a/apps/backend/drizzle/0046_context_compaction.sql +++ b/apps/backend/drizzle/0046_context_compaction.sql @@ -1,11 +1,5 @@ -ALTER TABLE "agent" ADD COLUMN "compaction_enabled" boolean;--> statement-breakpoint -ALTER TABLE "agent" ADD COLUMN "trigger_ratio" real;--> statement-breakpoint -ALTER TABLE "agent" ADD COLUMN "target_ratio" real;--> statement-breakpoint -ALTER TABLE "agent" ADD COLUMN "reserve_ratio" real;--> statement-breakpoint -ALTER TABLE "agent" ADD COLUMN "keep_recent_messages" integer;--> statement-breakpoint -ALTER 
TABLE "agent" ADD COLUMN "min_prunable_chars" integer;--> statement-breakpoint ALTER TABLE "chat" ADD COLUMN "context_summary" text;--> statement-breakpoint ALTER TABLE "chat" ADD COLUMN "summary_watermark" text;--> statement-breakpoint ALTER TABLE "chat" ADD COLUMN "compaction_dirty" boolean DEFAULT false NOT NULL;--> statement-breakpoint ALTER TABLE "chat" ADD COLUMN "version" integer DEFAULT 0 NOT NULL;--> statement-breakpoint -ALTER TABLE "provider" ADD COLUMN "model_meta" jsonb; \ No newline at end of file +ALTER TABLE "provider" ADD COLUMN "model_meta" jsonb; diff --git a/apps/backend/drizzle/meta/0046_snapshot.json b/apps/backend/drizzle/meta/0046_snapshot.json index 78402286..cda086e7 100644 --- a/apps/backend/drizzle/meta/0046_snapshot.json +++ b/apps/backend/drizzle/meta/0046_snapshot.json @@ -98,42 +98,6 @@ "primaryKey": false, "notNull": false }, - "compaction_enabled": { - "name": "compaction_enabled", - "type": "boolean", - "primaryKey": false, - "notNull": false - }, - "trigger_ratio": { - "name": "trigger_ratio", - "type": "real", - "primaryKey": false, - "notNull": false - }, - "target_ratio": { - "name": "target_ratio", - "type": "real", - "primaryKey": false, - "notNull": false - }, - "reserve_ratio": { - "name": "reserve_ratio", - "type": "real", - "primaryKey": false, - "notNull": false - }, - "keep_recent_messages": { - "name": "keep_recent_messages", - "type": "integer", - "primaryKey": false, - "notNull": false - }, - "min_prunable_chars": { - "name": "min_prunable_chars", - "type": "integer", - "primaryKey": false, - "notNull": false - }, "tool_set_ids": { "name": "tool_set_ids", "type": "jsonb", diff --git a/apps/backend/src/db/schema.ts b/apps/backend/src/db/schema.ts index 764692dc..2e0c6435 100644 --- a/apps/backend/src/db/schema.ts +++ b/apps/backend/src/db/schema.ts @@ -232,15 +232,6 @@ export const agent = pgTable( seed: t.real("seed"), presencePenalty: t.real("presence_penalty"), frequencyPenalty: t.real("frequency_penalty"), - // 
Per-agent context-compaction config (context-compaction-plan §G). All - // nullable; the runtime applies defaults when unset (true / 0.8 / 0.5 / - // 0.05 / 10 / 2000). Editable surface wired in a later slice. - compactionEnabled: t.boolean("compaction_enabled"), - triggerRatio: t.real("trigger_ratio"), - targetRatio: t.real("target_ratio"), - reserveRatio: t.real("reserve_ratio"), - keepRecentMessages: t.integer("keep_recent_messages"), - minPrunableChars: t.integer("min_prunable_chars"), toolSetIds: t.jsonb("tool_set_ids").$type().default([]), // Array of tool set ids skillIds: t.jsonb("skill_ids").$type().default([]), // Array of skill ids subAgentIds: t.jsonb("sub_agent_ids").$type().default([]), // Array of sub-agent ids diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index 2c169543..33b72de0 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -415,7 +415,6 @@ import { applyTier1Compaction, buildCompactionTraceMessage, computeBudget, - resolveCompactionConfig, invalidateCompaction, affectedBelowWatermark, summaryUIMessage, @@ -465,21 +464,6 @@ const cfg = (over: Partial = {}): CompactionConfig => ({ ...over, }); -describe("resolveCompactionConfig (§G defaults)", () => { - it("returns defaults when overrides are null/undefined", () => { - expect(resolveCompactionConfig(null)).toEqual(DEFAULT_COMPACTION_CONFIG); - }); - it("applies partial overrides, keeping defaults for the rest", () => { - const c = resolveCompactionConfig({ - triggerRatio: 0.9, - compactionEnabled: false, - }); - expect(c.triggerRatio).toBe(0.9); - expect(c.compactionEnabled).toBe(false); - expect(c.targetRatio).toBe(DEFAULT_COMPACTION_CONFIG.targetRatio); - }); -}); - describe("computeBudget (drift C3 — subtract both reserves)", () => { it("subtracts output + safety reserve before applying ratios", () => { const b = computeBudget( diff --git a/apps/backend/src/runs/compaction.ts 
b/apps/backend/src/runs/compaction.ts index 677c893b..ea2a5d3e 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -678,53 +678,6 @@ export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = { minPrunableChars: 2000, }; -/** Agent row fields that override the compaction defaults (all optional). */ -export type CompactionConfigOverrides = { - compactionEnabled?: boolean | null; - triggerRatio?: number | null; - targetRatio?: number | null; - reserveRatio?: number | null; - keepRecentMessages?: number | null; - minPrunableChars?: number | null; -}; - -export function resolveCompactionConfig( - overrides: CompactionConfigOverrides | null | undefined, -): CompactionConfig { - const o = overrides ?? {}; - const pick = (v: T | null | undefined, d: T): T => (v == null ? d : v); - const triggerRatio = pick( - o.triggerRatio, - DEFAULT_COMPACTION_CONFIG.triggerRatio, - ); - let targetRatio = pick(o.targetRatio, DEFAULT_COMPACTION_CONFIG.targetRatio); - // Hysteresis backstop (drift C2): the post-compaction target must stay below - // the trigger or compaction re-fires every turn. The agent create/update zod - // schema already rejects an inverted pair, but a pre-existing/legacy row (or a - // direct DB write) could still carry one — clamp it here so the runtime can - // never thrash. 
- if (targetRatio >= triggerRatio) { - targetRatio = triggerRatio * 0.9; - } - return { - compactionEnabled: pick( - o.compactionEnabled, - DEFAULT_COMPACTION_CONFIG.compactionEnabled, - ), - triggerRatio, - targetRatio, - reserveRatio: pick(o.reserveRatio, DEFAULT_COMPACTION_CONFIG.reserveRatio), - keepRecentMessages: pick( - o.keepRecentMessages, - DEFAULT_COMPACTION_CONFIG.keepRecentMessages, - ), - minPrunableChars: pick( - o.minPrunableChars, - DEFAULT_COMPACTION_CONFIG.minPrunableChars, - ), - }; -} - export type Budget = { inputBudget: number; triggerTokens: number; diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 61a5f001..0154ab85 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -49,7 +49,7 @@ import { computeBudget, drizzleCompactionStore, invalidateCompaction, - resolveCompactionConfig, + DEFAULT_COMPACTION_CONFIG, setCompactionDirty, type Budget, type CompactionConfig, @@ -539,7 +539,7 @@ async function buildCompactionRuntime(args: { }): Promise { const { chatId, provider, resolvedModelId, agent, opened } = args; - const config = resolveCompactionConfig(agent); + const config = { ...DEFAULT_COMPACTION_CONFIG }; // Global kill switch (§G) gates proactive compaction; recovery is unaffected. 
if (process.env.COMPACTION_ENABLED === "false") { config.compactionEnabled = false; diff --git a/apps/frontend/components/agent-form.tsx b/apps/frontend/components/agent-form.tsx index c1acfba6..c2015858 100644 --- a/apps/frontend/components/agent-form.tsx +++ b/apps/frontend/components/agent-form.tsx @@ -152,12 +152,6 @@ const AgentForm = ({ seed: undefined as number | undefined, presencePenalty: undefined as number | undefined, frequencyPenalty: undefined as number | undefined, - compactionEnabled: undefined as boolean | undefined, - triggerRatio: undefined as number | undefined, - targetRatio: undefined as number | undefined, - reserveRatio: undefined as number | undefined, - keepRecentMessages: undefined as number | undefined, - minPrunableChars: undefined as number | undefined, }); const [isSubmitting, setIsSubmitting] = useState(false); const [validationErrors, setValidationErrors] = useState< @@ -209,12 +203,6 @@ const AgentForm = ({ toolSetIds: agent.toolSetIds || [], skillIds: agent.skillIds || [], subAgentIds: agent.subAgentIds || [], - compactionEnabled: agent.compactionEnabled ?? undefined, - triggerRatio: agent.triggerRatio ?? undefined, - targetRatio: agent.targetRatio ?? undefined, - reserveRatio: agent.reserveRatio ?? undefined, - keepRecentMessages: agent.keepRecentMessages ?? undefined, - minPrunableChars: agent.minPrunableChars ?? undefined, }); if (agent.avatarUrl) { setAvatarPreviewUrl(agent.avatarUrl); @@ -339,12 +327,6 @@ const AgentForm = ({ toolSetIds: formData.toolSetIds, skillIds: formData.skillIds, subAgentIds: formData.subAgentIds, - compactionEnabled: formData.compactionEnabled, - triggerRatio: formData.triggerRatio, - targetRatio: formData.targetRatio, - reserveRatio: formData.reserveRatio, - keepRecentMessages: formData.keepRecentMessages, - minPrunableChars: formData.minPrunableChars, }; const url = agentId @@ -914,122 +896,6 @@ const AgentForm = ({ /> - -
-

Context compaction

- - Override per-agent compaction behaviour. Leave blank to use - defaults (trigger 80%, target 50%, reserve 5%, keep last 10 - messages, prune outputs >2000 chars). Target ratio must stay - below trigger ratio. Has no effect when compaction is disabled - globally (COMPACTION_ENABLED). - - - - setFormData((prev) => ({ - ...prev, - compactionEnabled: checked, - })) - } - disabled={isSubmitting || readOnly} - /> - - Enable compaction - - - - - - Trigger ratio (0–1) - - - handleFloatChange("triggerRatio", e.target.value) - } - disabled={isSubmitting || readOnly} - /> - - - - Target ratio (0–1) - - - handleFloatChange("targetRatio", e.target.value) - } - disabled={isSubmitting || readOnly} - /> - - - - Reserve ratio (0–1) - - - handleFloatChange("reserveRatio", e.target.value) - } - disabled={isSubmitting || readOnly} - /> - - - - Keep recent messages - - - handleNumberChange("keepRecentMessages", e.target.value) - } - disabled={isSubmitting || readOnly} - /> - - - - Min prunable chars - - - handleNumberChange("minPrunableChars", e.target.value) - } - disabled={isSubmitting || readOnly} - /> - - -
diff --git a/packages/schemas/index.test.ts b/packages/schemas/index.test.ts index 592943d6..c4915856 100644 --- a/packages/schemas/index.test.ts +++ b/packages/schemas/index.test.ts @@ -389,46 +389,6 @@ describe("Chat compaction state (context-compaction §C)", () => { }); }); -describe("Agent compaction config (context-compaction §G)", () => { - const base = { - id: "789", - workspaceId: "456", - providerId: "provider-123", - name: "Test Agent", - description: "A test agent", - modelId: "gpt-4", - createdAt: new Date(), - updatedAt: new Date(), - }; - - it("is valid with no compaction config (defaults applied in code)", () => { - expect(agentSchema.safeParse(base).success).toBe(true); - }); - - it("accepts a full compaction config", () => { - const result = agentSchema.safeParse({ - ...base, - compactionEnabled: true, - triggerRatio: 0.8, - targetRatio: 0.5, - reserveRatio: 0.05, - keepRecentMessages: 10, - minPrunableChars: 2000, - }); - expect(result.success).toBe(true); - }); - - it("rejects a ratio above 1", () => { - const result = agentSchema.safeParse({ ...base, triggerRatio: 1.2 }); - expect(result.success).toBe(false); - }); - - it("rejects a negative keepRecentMessages", () => { - const result = agentSchema.safeParse({ ...base, keepRecentMessages: -1 }); - expect(result.success).toBe(false); - }); -}); - describe("Provider Create Schema", () => { const baseProvider = { organizationId: "org-123", diff --git a/packages/schemas/index.ts b/packages/schemas/index.ts index 4ea61096..7dbdcd5a 100644 --- a/packages/schemas/index.ts +++ b/packages/schemas/index.ts @@ -205,15 +205,6 @@ const agentBaseSchema = z.object({ seed: z.number().optional(), presencePenalty: z.number().optional(), frequencyPenalty: z.number().optional(), - // Per-agent context-compaction config (context-compaction-plan §G). All - // optional; the runtime applies defaults when unset. Editable surface (adding - // these to agentCreate/Update picks + the form) lands in a later slice. 
- compactionEnabled: z.boolean().optional(), - triggerRatio: z.number().min(0).max(1).optional(), - targetRatio: z.number().min(0).max(1).optional(), - reserveRatio: z.number().min(0).max(1).optional(), - keepRecentMessages: z.number().int().min(1).optional(), - minPrunableChars: z.number().int().nonnegative().optional(), toolSetIds: z.array(z.string()).optional(), skillIds: z.array(z.string()).optional(), subAgentIds: z.array(z.string()).optional(), @@ -238,78 +229,44 @@ export const agentSchema = agentBaseSchema.refine( export type Agent = z.infer; -// Hysteresis guard (context-compaction-plan §C2 / drift C2): the post-compaction -// target must sit BELOW the trigger, otherwise compaction re-fires every turn -// (the Cline #5616 thrash). Per-field bounds are 0..1; this enforces the -// relationship. Only checked when BOTH are supplied (either may be omitted to -// fall back to the runtime default). -const compactionRatioOrder = (data: { - triggerRatio?: number; - targetRatio?: number; -}) => - data.triggerRatio == null || - data.targetRatio == null || - data.targetRatio < data.triggerRatio; - -const compactionRatioOrderIssue = { - message: "targetRatio must be less than triggerRatio", - path: ["targetRatio"], -}; - -export const agentCreateSchema = agentBaseSchema - .pick({ - workspaceId: true, - providerId: true, - name: true, - description: true, - systemPrompt: true, - modelId: true, - maxSteps: true, - temperature: true, - topP: true, - topK: true, - seed: true, - presencePenalty: true, - frequencyPenalty: true, - toolSetIds: true, - skillIds: true, - subAgentIds: true, - inputPlaceholder: true, - compactionEnabled: true, - triggerRatio: true, - targetRatio: true, - reserveRatio: true, - keepRecentMessages: true, - minPrunableChars: true, - }) - .refine(compactionRatioOrder, compactionRatioOrderIssue); - -export const agentUpdateSchema = agentBaseSchema - .pick({ - providerId: true, - name: true, - description: true, - systemPrompt: true, - modelId: true, - 
maxSteps: true, - temperature: true, - topP: true, - topK: true, - seed: true, - presencePenalty: true, - frequencyPenalty: true, - toolSetIds: true, - skillIds: true, - subAgentIds: true, - inputPlaceholder: true, - compactionEnabled: true, - triggerRatio: true, - targetRatio: true, - reserveRatio: true, - keepRecentMessages: true, - minPrunableChars: true, - }) - .refine(compactionRatioOrder, compactionRatioOrderIssue); +export const agentCreateSchema = agentBaseSchema.pick({ + workspaceId: true, + providerId: true, + name: true, + description: true, + systemPrompt: true, + modelId: true, + maxSteps: true, + temperature: true, + topP: true, + topK: true, + seed: true, + presencePenalty: true, + frequencyPenalty: true, + toolSetIds: true, + skillIds: true, + subAgentIds: true, + inputPlaceholder: true, +}); + +export const agentUpdateSchema = agentBaseSchema.pick({ + providerId: true, + name: true, + description: true, + systemPrompt: true, + modelId: true, + maxSteps: true, + temperature: true, + topP: true, + topK: true, + seed: true, + presencePenalty: true, + frequencyPenalty: true, + toolSetIds: true, + skillIds: true, + subAgentIds: true, + inputPlaceholder: true, +}); // Skill From 2242801cb1f1b7f57c38e5d78db70af65a78dd45 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 23:05:10 +0200 Subject: [PATCH 08/21] chore(backend): drop dead agent binding from buildCompactionRuntime Per-agent compaction config was removed in 625ff96; the `agent` parameter threaded into buildCompactionRuntime is now unused. Remove it from the args type, the destructure, and all three call sites. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/services/chat-execution.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 0154ab85..2a7e71b7 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -534,10 +534,9 @@ async function buildCompactionRuntime(args: { chatId?: string; provider: Provider; resolvedModelId: string; - agent: AgentRow | null; opened: ReturnType; }): Promise { - const { chatId, provider, resolvedModelId, agent, opened } = args; + const { chatId, provider, resolvedModelId, opened } = args; const config = { ...DEFAULT_COMPACTION_CONFIG }; // Global kill switch (§G) gates proactive compaction; recovery is unaffected. @@ -845,7 +844,6 @@ export const prepareChatTurn = async ( chatId: request.id, provider, resolvedModelId, - agent: agent ?? null, opened, }); @@ -1318,7 +1316,6 @@ const loadSubAgents = async ( chatId: sa.id, provider: resolved.provider, resolvedModelId: sa.modelId, - agent: sa, opened: resolved.opened, }); if (!runtime.config.compactionEnabled) return; @@ -1446,7 +1443,6 @@ export async function forceCompactChat( chatId, provider, resolvedModelId, - agent, opened, }); From da2c15993b4f2a82fa3369da3108589efe9d16d6 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 23:06:43 +0200 Subject: [PATCH 09/21] feat(backend): env overrides for global compaction ceiling After Chunk 12 compaction config is global (DEFAULT_COMPACTION_CONFIG + COMPACTION_ENABLED kill switch). Add optional env overrides for the ceiling ratios so a test deployment can lower the trigger and exercise auto-compaction without filling a large context window. Unset/blank/invalid env values fall back to the built-in defaults, so production behavior is unchanged. 
Knobs: COMPACTION_TRIGGER_RATIO, COMPACTION_TARGET_RATIO, COMPACTION_RESERVE_RATIO, COMPACTION_KEEP_RECENT, COMPACTION_MIN_PRUNABLE_CHARS. Documented in .env.example. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/.env.example | 17 ++++++++++++++++- apps/backend/src/services/chat-execution.ts | 21 +++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 0c728bab..a2113b8f 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -49,4 +49,19 @@ PLATYPUS_SANDBOX_DOCKER_ENABLED=false # PLATYPUS_SANDBOX_DOCKER_ALLOWED_NETWORKS=shared-services,public-tools # Frontend URL for generating resource links in tool responses -FRONTEND_URL=http://localhost:3001 \ No newline at end of file +FRONTEND_URL=http://localhost:3001 + +# Context compaction (ADR-0009, context-compaction-plan §G). +# Compaction behavior is global; window/output size stays per-model. +# COMPACTION_ENABLED=false disables proactive compaction (recovery still runs). +# COMPACTION_ENABLED=true +# +# Optional overrides for the global ceiling. Unset = built-in defaults +# (trigger 0.8, target 0.5, reserve 0.05, keepRecent 10, minPrunable 2000). +# Lower the trigger to exercise auto-compaction on test deployments. +# Keep target < trigger or compaction re-fires every turn. +# COMPACTION_TRIGGER_RATIO=0.8 +# COMPACTION_TARGET_RATIO=0.5 +# COMPACTION_RESERVE_RATIO=0.05 +# COMPACTION_KEEP_RECENT=10 +# COMPACTION_MIN_PRUNABLE_CHARS=2000 \ No newline at end of file diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 2a7e71b7..64e27e7b 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -543,6 +543,27 @@ async function buildCompactionRuntime(args: { if (process.env.COMPACTION_ENABLED === "false") { config.compactionEnabled = false; } + // Optional env overrides for the global ceiling (§G). 
Unset/blank/invalid → + // the DEFAULT_COMPACTION_CONFIG value stands, so production behavior is + // unchanged. Intended for tuning the trigger on test deployments without a + // code change. Keep targetRatio < triggerRatio or compaction re-fires every + // turn (the thrash trap). + const numEnv = (raw: string | undefined): number | undefined => { + if (raw == null || raw === "") return undefined; + const n = Number(raw); + return Number.isFinite(n) ? n : undefined; + }; + config.triggerRatio = + numEnv(process.env.COMPACTION_TRIGGER_RATIO) ?? config.triggerRatio; + config.targetRatio = + numEnv(process.env.COMPACTION_TARGET_RATIO) ?? config.targetRatio; + config.reserveRatio = + numEnv(process.env.COMPACTION_RESERVE_RATIO) ?? config.reserveRatio; + config.keepRecentMessages = + numEnv(process.env.COMPACTION_KEEP_RECENT) ?? config.keepRecentMessages; + config.minPrunableChars = + numEnv(process.env.COMPACTION_MIN_PRUNABLE_CHARS) ?? + config.minPrunableChars; // RV7d: resolve both windows concurrently (they are independent). 
const taskModelId = provider.taskModelId || resolvedModelId; From 55fc2f044ecb15293d804eae08714b1b36cf0def Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 20:55:42 +0200 Subject: [PATCH 10/21] chore(backend): add compaction.check debug log at trigger decision point --- apps/backend/src/runs/compaction.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index ea2a5d3e..9fdf9218 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -899,6 +899,25 @@ export async function applyTier1Compaction( forceCompact || (config.compactionEnabled && projected >= budget.triggerTokens); + logger.info( + { + metric: "compaction.check", + chatId: input.chatId, + compactionEnabled: config.compactionEnabled, + projected, + triggerTokens: budget.triggerTokens, + targetTokens: budget.targetTokens, + inputBudget: budget.inputBudget, + triggered, + forceCompact, + messageTokens, + priorSummaryTokens, + overheadTokens: input.overheadTokens ?? 0, + lastInputTokens: input.lastInputTokens, + }, + "compaction.check", + ); + if (!triggered) { return { messages: baseView, compacted: false }; } From 269a8be4ea66c851e73e9e411e84157fd9350d07 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 22:15:28 +0200 Subject: [PATCH 11/21] chore(docs): add Hermes/Codex prior-art + 2026-06-12 field re-survey to compaction plan Survey of Hermes Agent, Codex CLI, Claude Code, Cline confirms the shipped design already implements real-token feeding (C1), input-tokens-only window (F1), reserve-carve trigger (C3 = Codex 90% cap), and two-layer proactive+ recovery (P4). Documents the Hermes token-floor (#14690) as a do-not-copy anti-pattern and defers three optional adds (message-count valve, maxOutput reserve floor, model-aware aggressiveness). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- context-compaction-plan.md | 1388 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1388 insertions(+) create mode 100644 context-compaction-plan.md diff --git a/context-compaction-plan.md b/context-compaction-plan.md new file mode 100644 index 00000000..7294bce7 --- /dev/null +++ b/context-compaction-plan.md @@ -0,0 +1,1388 @@ +# Plan: Chat Context Compaction & Usage Indicator + +Status: **chunks 1-11 implemented (ALL DONE)** (1-2 reviewed 2026-06-09; chunks 3-5 landed 2026-06-10; chunks 6-8 landed 2026-06-11; chunk 11 landed 2026-06-12; see §Code review 2026-06-10) · Branch target: `feature/context-compaction` + +> This doc is the spec to implement against, not a proposal. Sections A–J are the +> design. The **Drift log & code-review checklist** at the bottom records every +> flaw found during review and the trap to re-check once code exists — read it +> before coding and again at PR time. Do not re-derive the happy path and skip +> the failure modes; they are written down precisely so we do not drift into them +> twice. + +## Implementation status & code review — chunks 1-2 (reviewed 2026-06-09) + +Chunks **1** (window resolution + single estimator + schema) and **2** (compaction +module + `writeWatermark` CAS + Tier 1) are landed on `feature/context-compaction` +(post main v1.95.0 merge). Backend tests: 1037 pass; chunk 1-2 unit tests: +context-window 20, token-estimate 14, compaction (CAS/budget/pairing/invalidation) +all green. Source `tsc --noEmit` clean for these files. This section is the +**start point for chunk 3** — read it before coding. + +### Solid / verified + +- **CAS durable writer (P3 / R1 / T10)** — the hardest part is correct and + well-tested. Single versioned writer (`commitWatermark`→`casWrite`, + `compaction.ts:84-160`); all three mutations (advance / dirty-clear / C4 reset) + route through it; loser decides by **version** not watermark value; one-retry- + then-skip, no livelock. 
No field write bypasses it. +- **C2 hysteresis, C3 budget, C4 invalidation, M1 map-reduce primitive, T7 + summarizer fallback** — VERIFIED in `compaction.ts`. +- **C4 wiring** — ~~VERIFIED~~ **OVERTURNED by the 2026-06-10 review (RV1).** + The mechanism is wired but the comparison baseline is destroyed before it runs + (ChatSink.onStart overwrite) and the two sides are canonicalized differently + (inlined URLs, jsonb key order). See §Code review 2026-06-10, RV1. +- **Schema / migration / zod / lazy-rollout** — VERIFIED. Columns additive + + nullable/defaulted; migration `0046_context_compaction.sql` matches schema; + `modelMeta` optional in all variants; `contextSummary`/`summaryWatermark` kept + out of chatSubmit/chatUpdate (server-managed); no eager backfill job. +- **Tier 1 gating (plan M3)** — VERIFIED. `request.id ? applyTier1IfNeeded : skip`; + triggers (`{agentId,search}`, no id) and sub-agents (bypass `prepareChatTurn`) + skip Tier 1; best-effort try/catch never breaks a turn (P4). +- **`compactModelMessages` Tier 2 adapter** — fully implemented + tested (NOT a + stub). Recovery (chunk 3) and Tier 2 (chunk 4) can call it directly. + +### Chunk 3 (Recovery + C1/M2) — landed 2026-06-10 + +- **§E recovery** — new `runs/recovery.ts`. `isContextOverflowError` (400/413 + + per-provider body regex: OpenAI/vLLM, Anthropic, Google, Bedrock — drift T9, + fixture-tested). `contextOverflowRecoveryMiddleware` wraps the model via + `wrapLanguageModel` in BOTH `streamText` and `generateText` (agent-runner), so + every step of a tool loop gets detect → `setCompactionDirty` (flag persisted on + DETECTION, before retry outcome) → trim via **`compactModelMessages`** (T3, no + bespoke trim; system head pinned; keep-recent halved, floor 2) → retry once. + Second failure surfaces "Conversation too large… start a new chat" via + `formatStreamError`. The V3 prompt is passed to `compactModelMessages` + directly (structurally compatible shape) — no converter, no second trimmer. 
+ `setCompactionDirty` goes through `commitWatermark` (P3); no-op when already + dirty. Headless runs get trim+retry but no dirty flag (no chat row). +- **C1 fix (partial — overhead path)** — `estimateOverheadTokens(systemPrompt, +tools)` in token-estimate.ts (char/4 of system prompt + each tool's name, + description, `asSchema(...).jsonSchema`; flat 200/tool fallback). Threaded as + `Tier1Input.overheadTokens`; the trigger projection + (`projectTier1Tokens`) now counts it, and the compaction target is reduced by + it (`targetTokens − overhead`) so hysteresis (C2) still holds. `log.warn` when + overhead alone ≥ target (compaction would re-fire each turn). + **C1 second half — DONE 2026-06-11.** `prepareChatTurn` now threads + `lastInputTokens` from the last assistant message's + `metadata.stats.contextTokens` (stamped by `applyMessageStats`, §H) into + `applyTier1IfNeeded`. `projectTier1Tokens` takes + `max(charBased, lastInputTokens)` (not additive — `charBased` is the whole + unsummarized view, so adding would double-count history); cold-start margin + applies only when it is absent (turn 1). C1 fully closed. +- **M2 fixed** — `COLD_START_MARGIN = 1.15` applied to the whole char-based + projection whenever no provider baseline exists; dropped when + `lastInputTokens` is present. +- **Defect 4 fixed** — `summarizerWindow` now resolved (task-model window → + `computeBudget(...).inputBudget`) in `buildCompactionRuntime` and threaded to + Tier 1 and recovery; M1 map-reduce is live in the wired flow. +- **Defect 9 fixed** — token-estimate header no longer claims per-turn provider + counts. +- **Refactor** — `buildCompactionRuntime` (chat-execution) resolves window / + config / budget / summarizer once per turn, never throws (falls back to the + 8192 default), and is shared by Tier 1 and the recovery middleware; `ChatTurn` + gained a required `recovery: RecoveryContext` field consumed by agent-runner. 
+- Tests: backend suite 1068 pass (was 1037) — recovery matrix + middleware + retry/dirty/failure paths, trim boundary safety, projection C1/M2 cases, + `setCompactionDirty`, overhead estimator. Source tsc clean; eslint 0 errors. + +## Chunk 3a — RV1-RV4 fixes (landed 2026-06-10) + +All 4 critical defects resolved. Tests: 1068 pass (unchanged count). tsc clean on +source files. Key changes: + +- **RV1** — `stableStringify` exported from `token-estimate.ts`; `affectedBelowWatermark` + now uses it instead of `JSON.stringify` (jsonb key-order stability). C4 baseline + fixed: `agent-runner.stream()` reads `loadChatMessages(id)` BEFORE `sink.onStart` + overwrites the row, threads as `priorMessages` through `prepare()` → + `prepareChatTurn()` → `applyTier1IfNeeded()`. C4 comparison now uses + `rawMessages` (pre-`inlineFileUrls`) so file URLs match on both sides. +- **RV2** — Submit handler in `routes/chat.ts` verifies `data.id` belongs to + `scope.workspaceId` (SELECT + 404 if workspace mismatch) before any run starts. +- **RV3** — `force?: boolean` added to both `UICompactOptions` and `ModelCompactOptions`; + no-op estimate gate skipped when `force:true`. `applyTier1Compaction` passes + `force: forceCompact` to `compactUIMessages` (dirty-forced path). Recovery's + `trimOverflowingPrompt` passes `force: true` to `compactModelMessages`. +- **RV4** — Empty-prefix guard added before Stage 2 in `compactUIMessages`: when + `prefix.length === 0` (history ≤ keepRecentMessages), return the pruned-recent + without calling `summarize` and without committing a `watermark:null` + non-null + summary (which would orphan the summary every turn). + +## Chunk 3b — RV5-RV7 fixes (landed 2026-06-10) + +All 3 HIGH non-blocking defects resolved. Tests: 1068 pass (unchanged count). tsc clean. 
Key changes: + +- **RV5** — `content`-type tool results: `pruneModelMessage` now soft-trims text items and replaces + media with `[N media item(s)]` placeholders; `renderModelMessages` extracts text from `content` + items so the summarizer sees their content. Both paths covered by `// RV5:` inline markers. +- **RV6** — Recovery target overhead: `RecoveryContext.targetTokens` is now set to + `Math.max(0, budget.targetTokens − overheadTokens)` in `chat-execution.ts` + (mirrors the overhead-adjusted target Tier 1 already used). +- **RV7** — Context-window resolution family (all four sub-items): + - (a) `litellm-registry.ts` populated with full registry covering OpenAI, Anthropic, Bedrock + (Anthropic + Meta Llama + Amazon Titan/Nova + Mistral), Mistral direct, Meta Llama direct, + and Qwen. Wired as `loadBuiltinRegistry` on the process-wide `contextWindowResolver`. + - (b) Family heuristic uses boundary-safe `startsWith(key + "-"|"."|":"|"/")` — `gpt-4.5-preview` + no longer silently resolves via the stale `gpt-4` entry. + - (c) `contextWindowResolver.evict(providerId)` called in both `routes/provider.ts` PUT handlers + on `modelMeta` change. + - (d) `defaultHttpGetJson` uses `AbortSignal.timeout(5000)`; `#inflight` map prevents cold-cache + stampede; the two `resolve` calls in `buildCompactionRuntime` are run in parallel + (`Promise.all`). Note: default-source results are still cached for the full TTL (old defect 6 / + MED priority — open, tracked separately). + +## Code review 2026-06-10 — full-branch review of chunks 1-3 (RV1-RV7 ALL FIXED) + +Multi-angle adversarial review of all compaction code (7 finder angles, every +candidate independently verified against the source). Every finding below is +CONFIRMED unless marked otherwise. **RV1-RV4 were blocking** and are now fixed. 
+**RV5-RV7 HIGH non-blocking fixes landed 2026-06-10 (chunk 3b).** + +### Critical + +- **RV1 — C4 invalidation broken in BOTH directions; durable summary likely never + survives a turn in prod.** (`chat-execution.ts` `applyTier1IfNeeded` / + `chat-sink.ts` `onStart` / `compaction.ts` `affectedBelowWatermark`) + - _Missed edits:_ `AgentRunner.stream` awaits `sink.onStart` **before** + `prepareChatTurn`, and `ChatSink.onStart` overwrites `chat.messages` with the + just-submitted history — so `loadPersistedMessages` reads back the edited + submission and the C4 check compares the edit against itself. An **in-place + edit below the watermark is never detected**: the model gets the stale + summary and the edited message is dropped from the view. (Truncate-and- + regenerate accidentally still invalidates via the watermark-gone fallback.) + - _Spurious invalidation:_ the incoming side is post-`inlineFileUrls` + (`data:` URLs) while the persisted side holds `http /files/…` — any chat + with a file at/below the watermark **invalidates + fully re-summarizes every + turn** (one wasted summarize model call per turn, forever). Additionally + `chat.messages` is **jsonb** (Postgres re-orders object keys), so the + `JSON.stringify` byte-equality very likely diverges for ALL chats after one + write→read round trip — the incremental summary never survives. + - _Fix direction:_ capture the pre-overwrite history (read the row BEFORE + onStart overwrites, or have onStart return the previous messages), compare + the **un-inlined** submission, and use semantic equality (id + extracted + text/tool content, or the SDK's `isDeepEqualData`) instead of + `JSON.stringify` byte-equality. Also consider a content digest persisted at + compaction time to avoid the per-turn full-history read (see RV9). 
+- **RV2 — cross-tenant compaction writes via unvalidated `request.id` + (security).** (`routes/chat.ts` submit handler; `compaction.ts` + `drizzleCompactionStore`) The submit route never verifies the body `id` + belongs to the caller's workspace (every other chat route filters + `id AND workspaceId`; submit does no chat-row lookup at all). The compaction + store, `loadPersistedMessages`, `invalidateCompaction`, `setCompactionDirty` + are keyed by `chat.id` only. A workspace-A owner submitting `id` = a + workspace-B chat id can clear B's summary/watermark, set B's dirty flag, and + CAS-write a summary derived from A's messages onto B's row (integrity, not + read-exfiltration; requires knowing B's chat id). _Fix:_ verify `request.id` + belongs to `scope.workspaceId` before the run starts (mirror the other chat + routes), and/or scope the store's queries by workspaceId. +- **RV3 — recovery + dirty-forced compaction trust the estimator that already + failed → permanent fail loop.** (`compaction.ts` no-op branches in both + compactors; `recovery.ts` `trimOverflowingPrompt`) Both compactors return the + messages **unchanged** when the char/4 estimate is ≤ target — but recovery + only runs after the provider has REJECTED the prompt. With a >2× under-count + (CJK ≈1 token/char; assistant `reasoning` parts excluded from counting AND + from pruning/summarizing — they ARE wire payload in the V3 prompt), the retry + resends a byte-identical prompt and deterministically fails; next turn the + dirty flag forces Tier 1, which no-ops and **clears the flag without + shrinking** → overflow → dirty again, every turn. _Fix:_ recovery (and + dirty-forced Tier 1) must force-trim past the estimate gate — e.g. a `force` + option on the compactors that skips the no-op branch and/or scales the + target down when invoked post-rejection; count reasoning parts in the + ModelMessage adapter (the "reasoning is UI-only" assumption in §B is wrong + for the V3 prompt path). 
+- **RV4 — small-history compaction clobbers the watermark and orphans the + summary.** (`compaction.ts` `compactUIMessages` boundary=0 path + + `applyTier1Compaction` commit) When the over-target history has ≤ + `keepRecentMessages` messages (one huge paste; or `effectiveTarget≈0` from + overhead), prefix=[] → a **wasted summarize call over an empty transcript** → + commit of `{summary, watermark: null}`. A pre-existing watermark is + overwritten with null; `viewAfterWatermark` ignores `contextSummary` when the + watermark is null, so the summary is orphaned, the previously-summarized + prefix reappears in the view, and the cycle repeats each turn. _Fix:_ skip + Stage 2 when the prefix is empty (return the no-op shape; optionally prune + inside `recent` for oversized tool outputs), and never commit + `watermark: null` together with a non-null summary. + +### High (all FIXED 2026-06-10 chunk 3b) + +- **RV5 — `content`-type tool results (standard MCP output) never pruned and + invisible to the summarizer.** ~~(`compaction.ts` `pruneModelMessage` handles + only text/json variants; `renderModelMessages` renders `content` as `""`)~~ + **FIXED:** `pruneModelMessage` soft-trims text items + media placeholder; + `renderModelMessages` extracts text items from `content` outputs. +- **RV6 — recovery target ignores per-turn overhead.** ~~(`chat-execution.ts` + RecoveryContext gets raw `budget.targetTokens`; Tier 1 uses `target − overhead`)~~ + **FIXED:** `RecoveryContext.targetTokens = Math.max(0, budget.targetTokens − overheadTokens)`. 
+- **RV7 — context-window resolution family.** ~~(`context-window.ts`)~~ + (a) ~~prod registry still empty~~ **FIXED:** `litellm-registry.ts` vendored with + full OpenAI/Anthropic/Bedrock/Mistral/Llama/Qwen coverage; + (b) ~~raw `startsWith` heuristic~~ **FIXED:** boundary-safe separators + (`"-"`, `"."`, `":"`, `"/"`) prevent `gpt-4.5-preview` → `gpt-4` resolution; + (c) ~~`evict` called by zero routes~~ **FIXED:** `contextWindowResolver.evict(providerId)` + wired in `routes/provider.ts` PUT handler; + (d) ~~no timeout / no single-flight~~ **FIXED:** `AbortSignal.timeout(5000)` + + `#inflight` Map + `Promise.all` the two resolve calls. + (e) ~~default-source full-TTL cache (old defect 6, MED)~~ **FIXED 2026-06-11:** + `source:"default"` results get `DEFAULT_SOURCE_CACHE_TTL_MS` (60 s) instead of + the hour, so a registry MISS / transient API blip no longer pins 8192. + +### Medium / low (RV8-RV10 — FIXED 2026-06-11, chunk 10) + +- **RV8 — `finalize` in the snapshot-consumer's `finally` could mark a broken + run "succeeded".** ~~(`agent-runner.ts`)~~ **FIXED:** the snapshot loop now + captures the stream error (both the `readUIMessageStream` `onError` callback + and the surrounding `catch` set `streamError`); the `finally` finalizes + `"failed"` with that error unless the run was aborted/cancelled. (Origin note + retained for the upstream-PR exclusion list: introduced by the tool-timestamps + commits `3851da6`/`b97312f`, not the compaction chunks.) +- **RV9 — hot-path waste.** ~~3-4 full-history estimation passes; full base64 + decode per image; tool schemas re-serialized every turn~~ **FIXED (partial):** + Tier 1 computes the unsummarized-view estimate **once** and threads it as + `knownEstimate` into `compactUIMessages` (mirrors the Tier 2 `knownEstimate` + from chunk 4); `bytesFromUrl` decodes only a 64 KB prefix for header parsing; + `estimateOverheadTokens` memoizes each tool's serialized-schema length in a + `WeakMap` keyed by the schema object. 
**Deliberately deferred:** the digest-based + C4 check — the full-prefix compare is already correct (RV1 landed), so this is + pure optimization of a correct path; revisit only if the per-turn JSONB + read+stringify shows up in profiling. +- **RV10 — cleanup minors. FIXED:** `MODEL_BOUND_UI_PART_TYPES` now has the + promised test (membership assertion); `toolResultOutputText` collapsed to the + two real behaviours (`execution-denied` reason vs `value`), removing the dead + `default`; the two `commitWatermark` closures in `applyTier1Compaction` share + one `pinnedWrite` helper; the orphaned `invalidateCompaction` jsdoc above + `affectedBelowWatermark` removed; JPEG walker skips `0xFF` fill bytes + `0xFF00` + stuffing and treats TEM (`0x01`) as standalone. **Not done (cosmetic):** + `bytesFromUrl` still duplicates storage/utils' private `parseDataUrl` — left as + is; merging them couples the estimator to the storage layer for no behaviour + change. + +### Re-affirmed solid by this review + +CAS writer/loser logic (P3/R1/T10), budget math (C3), tool-pairing boundaries, +the synthetic `context-summary` message (server-side only — never leaks to +persistence/frontend; cannot become the watermark), recovery middleware +single-retry semantics and summarizer non-recursion (fresh unwrapped task +model), Tier-1 skip for headless runs (M3), kill-switch wiring (§G) with the +documented dirty-forces-compaction exception (intent, has a test — though §G's +wording "disables ALL proactive compaction" should gain a sentence noting the +recovery hand-off still summarizes). + +### Defects to fix (ordered by impact) + +> 2026-06-10 note: still-open items below are subsumed by the §Code review +> 2026-06-10 list — defect 2 → RV7(a), 5 → RV7(c), 6 → RV7(e), 7 → RV5, +> 11's heuristic item → RV7(b). Track them there. + +1. **C1 — trigger under-counts (HIGH).
_FIXED: overhead half 2026-06-10; `lastInputTokens` half 2026-06-11 — threaded from last assistant message `metadata.stats.contextTokens` in `chat-execution.ts`._** `compaction.ts:719` + `projected = estimate(afterWatermark) + priorSummaryTokens` — omits the prior + turn's provider `usage.inputTokens` AND the system prompt / tool schemas / skill + payload sent every turn. `Tier1Input` has no `lastInputTokens` field; the call + site (`chat-execution.ts:537`) passes none. This is the live-test under-count + (8888 real vs ~986 estimated) — trigger can silently never fire on tool-bearing + agents; only recovery (chunk 3) catches the overflow. Same root issue as the + **§Open trigger-estimator scope** note below. +2. **Empty litellm registry + alias map in prod (HIGH, compounding).** + `context-window.ts:285,389` — the production singleton injects an empty registry + loader and empty alias map, so every non-API provider (OpenAI/Anthropic/Bedrock) + resolves to `DEFAULT_CONTEXT_WINDOW = 8192`. The budget math is therefore + wrong-defaulted for those providers today. Must vendor litellm + `model_prices_and_context_window.json` + build the alias map and wire them in. +3. **M2 — first-turn ×1.15 margin absent (MED). _FIXED 2026-06-10._** + `compaction.ts:719` applies no + cold-start inflation; a char/4 under-count can keep turn-1 from triggering. +4. **`summarizerWindow` not threaded (MED). _FIXED 2026-06-10._** + `chat-execution.ts:537` calls + `applyTier1Compaction` without `summarizerWindow`, so the M1 map-reduce path is + dead in the wired flow — a large cold-start/imported history can overflow the + summarizer call itself. +5. **T5 evict not wired (MED).** `routes/provider.ts:126` updates a provider + (incl. `modelMeta`) without `contextWindowResolver.evict(providerId)`; window + cache serves stale values until TTL. +6. **Window cache pins transient failures (MED). 
_FIXED 2026-06-11 (RV7e)._** + default/MISS results now get a 60 s `DEFAULT_SOURCE_CACHE_TTL_MS`, not the hour. +7. **Latent T2 violation (LOW).** `token-estimate.ts` `case "content"` + `stableStringify`s tool-output base64 image bytes into char/4 text. No current + tool emits this shape; fix before any tool returns `content`-type media. + _Still open — see note below._ +8. **Latent T1 divergence (LOW). _Test added 2026-06-11._** The model adapter now + has explicit per-variant tool-result-output coverage (text/json/content/ + execution-denied), which is the shape a custom `toModelOutput` emits. The + exact UI-vs-Model equality (T1) holds for SDK-converted messages; a tool whose + `toModelOutput` reshapes the payload remains a bounded, documented divergence. +9. **Doc bug. _FIXED 2026-06-10._** `token-estimate.ts` header claims "every later turn uses the real + provider count" — false; char/4 is used every turn (ties to C1). Fix the comment + when C1 is plumbed. +10. **Observability metrics absent. _FIXED 2026-06-11._** No metrics infra exists + (pino only), so emitted as structured `metric:`-tagged log lines, greppable / + dashboardable: `cas.conflict` (commitWatermark), `context_window.fell_to_default` / + `litellm.key_miss` (context-window), `compaction.fired` (Tier 1), + `summarize.latency_ms` (summarize wrapper), `recovery.overflow_detected` / + `recovery.retry` / `recovery.failed` (recovery middleware). +11. **Low. _FIXED 2026-06-11 (partial):_** key-boundary heuristic done (RV7b); + Bedrock-ARN path now also tries lowercased candidates (`context-window.ts`); + dead `default: return ""` in the output switch removed (switch collapsed). + +### Drift-checklist deltas (vs the table at the bottom) + +`C1` → **VERIFIED** (overhead + margin 2026-06-10; `lastInputTokens` threaded 2026-06-11). `M2` → **VERIFIED**. `T3` → **VERIFIED** (producer landed +in chunk 3). `T9` → **VERIFIED**.
`R4` → **PARTIAL** (window present & correctly +unfixed, but the gating `cas.conflict` metric is missing). `C4` → **BROKEN** +(2026-06-10 review, RV1 — baseline overwritten + byte-equality false +positives/negatives). `T1` → **PARTIAL** (reasoning parts ARE wire payload in +the V3 prompt path but excluded — RV3). Everything else listed +above → VERIFIED. `T5` → module hook present, **PUT-handler call missing** (RV7c). + +### Chunk 3 (Recovery) — hand-off is clean + +`applyTier1Compaction` already honors `state.compactionDirty` as a force-trigger and +clears it inside the same CAS write. Chunk 3 only needs the **producer** in +`agent-runner.ts`: `isContextOverflowError` (per-provider 400/413 body matrix, drift +T9), retry-once via `compactModelMessages` (NOT a bespoke trim, drift T3), and set +`compactionDirty=true` through `commitWatermark`. Recommend folding the **C1 fix** +(thread prior-turn `usage.inputTokens` + system/tool payload into the projection) +into the same chunk or the §H usage-metadata chunk, since recovery makes provider +`usage` available — without C1, recovery is the only thing standing between a +tool-bearing agent and a hard overflow. + +### Branch & upstream-PR hygiene + +**Decided 2026-06-09. Three roles, two live branches:** + +- **`feature/context-compaction` = compaction DEV branch.** Sits on the fork/deploy + lineage (off v1.90.0, later merged with main v1.95.0), so it carries + non-compaction commits. It is **NOT** a clean upstream PR base. +- **`deploy/fresh` = TEST/deploy target.** The test server tracks the `deploy/fresh` + **name** (`/srv/platypus`, compose project `platypus`; rebuilds rename back to it). + It also carries **deploy-runtime fixes that must NOT live in feature or the PR** + (MCP OAuth quirks, host routing). Do **not** deploy `feature` directly — it lacks + those fixes and deploying it would regress MCP OAuth + host-based URL routing. 
+- **Upstream PR branch = one-time throwaway.** Cherry-pick compaction-only commits + onto current upstream `main` at PR time. Never PR `feature` directly. + +**Test cycle:** `git checkout deploy/fresh` → `git merge feature/context-compaction` +→ deploy. Cheap (shared lineage). The old compaction on `deploy/fresh` is +**superseded automatically** by the merge (chat-execution resolves to feature's +version) — nothing to remove by hand. A small `chat-execution.ts` conflict is +expected (deploy/fresh's `request.id` Tier-1 call vs feature's gated +`ChatTurnRequest`); resolve to feature's version. + +**Why not collapse to one branch:** `deploy/fresh` holds deploy-runtime commits +feature deliberately omits (see below); folding them into feature would re-pollute +it and force a server reconfig. Keeping two branches is the lower-friction choice. + +**Deploy/fresh-only commits feature must NOT absorb** (deploy-runtime; keep on +deploy/fresh, exclude from PR): `9ad424b`, `5c2fd38`, `b9e0172`, `455a390`, +`b24f623`, `852a54a` (6 MCP OAuth runtime fixes — client_secret_post, host +rewrites, skip resource-origin check, sync auth binding); `e26d95b` +(backendUrl-from-Host); `4daed7a`, `43171b1` (deploy/fresh's own main merges). + +**EXCLUDE from the upstream PR** (fork/deploy-only or unrelated features — they +predate the compaction work and must not leak into the diff): + +- `e3ccf25` — `compose.yaml` deploy local-build edit (pure deploy) +- `d4cd6f2` — backendUrl-from-Host (fork deploy hack) +- `cdef399` — MCP auto-refresh on 401 + scoped quirks (separate feature) +- `7320000`, `5cfb882` — configurable agent-run timeouts (separate feature) +- `b1daa88` — deploy/fresh main(v1.90) merge commit +- `759aae1` — fork docs (PROJECT.md / CLAUDE.md fork refs) +- `b97312f`, `c18c18d`, `d194edc`, `51f69af`, `0737a6a`, `3851da6` — tool-call + duration / timestamps. **Borderline:** plan §I reuses this. 
Include ONLY if §I + (per-message stats) ships in the same PR; otherwise it is a separate feature. + +**INCLUDE** (compaction): `68cf725` (foundation+Tier 1), `d1d699e` (migration), +`e19029c` + the chunk-1-2 fix/plan commits from this session, plus chunks 3-9. +Note: the `0047_context_compaction` migration will need **renumbering** to match +upstream `main`'s migration sequence at cherry-pick time. + +## Goal + +Stop chats from hard-failing when message history exceeds a model's context +window. Three capabilities: + +1. **Proactive compaction** — summarize old history before the window fills. +2. **Recovery** — catch context-overflow errors from providers and recover. +3. **Visibility** — a context-usage indicator (ring) next to the model selector. + +Applies to top-level chats **and** sub-agents (both run through the shared +`agent-runner` / `ToolLoopAgent`, so implementing once covers both). + +--- + +## Design principles (read first — these are load-bearing) + +- **P1 — Compaction is a VIEW, not a DELETE.** The watermark + summary change + _what is sent to the model_, never _what is stored_. Raw messages stay in the + DB untouched. Consequences: forced compaction (§J) is **not** data loss — the + user can still read full history; only the model payload is compacted. A future + "expand summary" UI is free because originals persist. Never hard-delete a + summarized message. +- **P2 — One estimator.** Token counting lives in exactly one function over one + neutral structure (`CountUnit[]`). Tier 1 (UIMessages) and Tier 2 + (ModelMessages) both normalize into it. Divergence is impossible by + construction, not monitored. (See drift T1.) +- **P3 — One durable writer.** All mutations of compaction state + (`summaryWatermark`, `contextSummary`, `compactionDirty`) go through a single + versioned CAS function `writeWatermark`. No other code path writes these + fields. (See drift R1.) 
+- **P4 — Recovery is the net, proactive compaction is the plan.** The overflow + catch (§E) must stay on even if proactive compaction is globally disabled. It + is the last line, not a risk surface. + +--- + +## Key facts established during research + +- AI SDK (`ai@6.0.191`) reports real token usage **after** each call: + `usage.inputTokens` / `outputTokens` / `totalTokens` + (`apps/backend/src/runs/agent-runner.ts:148-194`). This is the primary, + provider-accurate signal driving compaction — no pre-call counting needed + except on the very first turn. +- AI SDK exposes **no** context-window metadata on the model interface, and + there is **no** built-in pre-call tokenizer. +- AI SDK `prepareStep` hook (`ai/dist/index.d.ts:960-1023`) runs before each + step of an in-flight response and can rewrite the `messages` sent to the + model — this is how we compact _within_ a single response. Receives + **ModelMessages** (post-`convertToModelMessages`). +- `prepareChatTurn` (`chat-execution.ts:430`) holds **UIMessages** + (`turn.stream.messages`); conversion to ModelMessages happens later at + `agent-runner.ts:360` (`await convertToModelMessages(...)`). **Tier 1 and + Tier 2 therefore operate on different message shapes — see drift T1.** +- Provider context-window availability: + - Google (`inputTokenLimit`), OpenRouter (`context_length`), + vLLM/OpenAI-compatible (`max_model_len`) — **available via API**. + - OpenAI, Anthropic, Bedrock — **not** via API; need lookup table / manual. +- Sub-agents run as tools (`apps/backend/src/tools/sub-agent.ts:56-159`) with + fresh history (only a `task` string), each its own `ToolLoopAgent`. +- Model call sites: `streamText` `agent-runner.ts:358-397`, + `generateText` `agent-runner.ts:543-584`. +- Error handling today only covers auth/rate-limit/5xx + (`agent-runner.ts:636-657`) — no context-overflow handling. +- Frontend selector + `(i)` icon: `apps/frontend/components/chat.tsx:561-593` + inside `PromptInputTools`. 
No progress/ring component exists yet. Token usage + is **not** currently streamed to the client per message. +- `inlineFileUrls` (`chat-execution.ts:524`) fetches file/image bytes and inlines + them into messages. It does **not** decode image dimensions today (see drift T2). +- `messageMetadata` callback (`agent-runner.ts:408`) fires at message **start**, + before timing/usage exist — cannot carry stats. Stamp at the + `applyToolCompletions` point (`:443`) instead (see §I). + +--- + +## Design + +### A. Context-window resolution + +New module `apps/backend/src/runs/context-window.ts`. + +`resolveContextWindow(provider, modelId): Promise<number>` resolution order: + +1. **Manual override** — per-model entry in provider config (see schema below). +2. **API auto-detect** by provider type (cached per provider+model): + - Google: `GET {baseUrl}/v1beta/models/{modelId}` → `inputTokenLimit`. + - OpenRouter: `GET {baseUrl}/api/v1/models` → match id → `context_length`. + - OpenAI-type: `GET {baseUrl}/v1/models` → if entry has `max_model_len` + (vLLM and most OpenAI-compatible servers expose it) use it; official + OpenAI omits it → fall through. +3. **litellm model registry** (replaces a homegrown table) — vendor/fetch + litellm's `model_prices_and_context_window.json` (MIT, community-maintained). + Each entry has `max_input_tokens` / `max_output_tokens`. Covers OpenAI / + Anthropic / Bedrock families that don't expose the window via API. + - **Key normalization (drift T4):** registry keys don't match our + `resolvedModelId` 1:1. Lookup order: + `exact(modelId) → strip provider prefix ("openai/") → lowercase → alias map → family heuristic → MISS`. + Maintain a small alias map for Bedrock ARNs, Azure deployment names, vLLM + custom names. `log.warn` on every MISS (it falls to default — must be visible). +4. **Conservative default** — `DEFAULT_CONTEXT_WINDOW = 8192`. `log.warn` on every + fall-to-default.
When the window is default/unknown the **ring renders neutral** + (§H), never a guessed green→red ramp. + +Detection results cached in-memory (per provider id + model id) with a TTL. +**Cache invalidation (drift T5):** editing a `modelMeta` override must +`cache.evict(providerId)` **immediately** in the provider PUT handler — do not +wait for TTL. TTL is only a backstop for API-detected drift. Also resolve +`maxOutputTokens` the same way (registry `max_output_tokens` / API) — needed for +the budget math in §C. + +#### Schema change (per-model, not per-provider) + +A single per-provider number is wrong: one provider serves many models with +different windows. Store a per-model map. + +- DB: add `modelMeta` JSONB column to the `provider` table + (`apps/backend/src/db/schema.ts`), shape: + `{ "<modelId>": { contextWindow?: number, maxOutputTokens?: number } }`. +- Zod: extend provider schema in `packages/schemas/index.ts` (full / create / + update variants) with optional `modelMeta`. +- Apply via `pnpm drizzle-kit-push` (DDL only — additive nullable column, safe). +- UI (later): provider edit form shows resolved window per enabled model with an + editable override field. + +### B. Token estimation (cold start only) — the single estimator (P2) + +`apps/backend/src/runs/token-estimate.ts`. + +One function over one neutral structure — **no per-tier estimator**: + +```ts +const MODEL_BOUND: PartType[] = ["text", "tool-call", "tool-result", "file", "image"]; +// reasoning / source / step-start / data-* are UI-only — they never reach the +// model and MUST be excluded on both sides (drift T1). + +type CountUnit = { role: Role; text: string; nonText: NonTextPart[] }; + +function toCountUnits(m: UIMessage): CountUnit[] // Tier 1 adapter +function toCountUnits(m: ModelMessage): CountUnit[] // Tier 2 adapter +const estimateTokens = (units: CountUnit[]): number // char/4 text + modality table +``` + +- char/4 applies to **text parts only**. Never char/4 a base64 image.
+- **Modality table (drift T2)** for non-text parts: + - `anthropic: (w,h) => ceil(w*h/750)` + - `openai: (w,h,detail) => detail==="low" ? 85 : tile85(w,h)` — **detail is + usually unset → assume `high`** (over-count beats overflow). + - `default: () => 1200` (conservative). + - Dimensions via **cheap header parse** (PNG IHDR / JPEG SOF marker, ~32 bytes — + no full decode) when bytes are in hand; bare URL or parse failure → `default` + constant. Not "free": one buffer read per image, cold-start only. +- Used **only** on the first turn before any provider `usage` exists; every later + turn uses the real `usage.inputTokens`. +- **Tier 1 estimate runs AFTER `inlineFileUrls` (drift T2)** so the payload is + real, not a pre-inline underestimate. +- **Divergence feedback loop (drift T2):** on turn 2, compare the cold-start + estimate vs real `usage.inputTokens`; `log.warn` when `|est−real|/real > 0.5` + with model + part breakdown. That signal tunes the image constants over time. +- (Optional future: Anthropic `/v1/messages/count_tokens` for exact Claude counts.) + +### C. Tier 1 — cross-turn compaction (durable) + +Runs in `prepareChatTurn` (`chat-execution.ts:524-549`) before a response starts. +Operates on durable chat history (**UIMessages**). Remember **P1: this is a view +over history; raw messages are never deleted.** + +**Budget math** (not a raw window ratio — fixes drift C3): + +``` +inputBudget = contextWindow − maxOutputReserve − safetyReserve + (safetyReserve = reserveRatio × contextWindow, default 0.05, per LibreChat; + maxOutputReserve from resolved maxOutputTokens) +triggerTokens = triggerRatio × inputBudget (triggerRatio default 0.8) +targetTokens = targetRatio × inputBudget (targetRatio default 0.5) +``` + +**Trigger** (drift C1 — must count what _this_ turn adds, not just the last +response): `projected = lastInputTokens + estimateTokens(messagesSinceWatermarkOrLastTurn)`. 
+First turn: `projected = estimateTokens(allMessages) × 1.15` (char/4 safety +margin, drift M2). Compact when `projected >= triggerTokens`. + +**Hysteresis** (drift C2 — the Cline #5616 thrash failure): compaction must reduce +the conversation to `<= targetTokens`, well below the trigger, so it does NOT +re-fire next turn. Trigger ratio (0.8) and target ratio (0.5) are deliberately +distinct. + +Compaction (`apps/backend/src/runs/compaction.ts`) — staged, cheap-first +(LibreChat pattern). **Two adapters, shared leaf primitives** (P2): +`compactUIMessages` (Tier 1) and `compactModelMessages` (Tier 2 + recovery) both +call `estimateTokens` / `summarizePrefix` / `pickKeepBoundary`. Pairing rule +differs by shape: + +- Tier 1 / UIMessage: an assistant message carrying tool-invocation parts is + **atomic** — never split, never drop its paired result. +- Tier 2 / ModelMessage: keep assistant + following `role:"tool"` messages + together. + +Stages: + +- Pin the system prompt. +- Keep the last `keepRecentMessages` (default ~10) verbatim; never split a + tool-call / tool-result pair across the boundary. +- **Stage 1 — prune (no model call):** in the older prefix, degrade bulky tool / + RAG results — soft-trim to head+tail, then replace with a placeholder + (`[tool result elided]`) for results over `minPrunableChars`. Often enough to + reach `targetTokens` without a summarization call. +- **Stage 2 — summarize:** if still above target, summarize the older prefix + with the **task model** into one synthetic summary message. + - **Model fallback (drift T7):** `provider.taskModelId → resolvedModelId (main)`. + `log` which model summarized + token cost. + - **Chunked / map-reduce** when the prefix exceeds the summarizer's own window + (drift M1 — cold-start on a large imported history). +- Output: `[system, summaryMessage, ...pruned/keptRecent]`. 
+- **Fail loud:** emit a visible transcript event (`context-compacted`, + "Summarized N earlier messages") rather than silently mutating. + +**Persistence + watermark** — all writes through `writeWatermark` (P3, drift R1): + +- Add to chat/run record: `contextSummary: text`, `summaryWatermark: int` + (id/index of last summarized message), `compactionDirty: boolean default false` + (drift T3), `version: int default 0` (drift R1 — CAS token). +- Each turn, summarize only messages _after_ the watermark and fold into the + existing summary, then advance the watermark — incremental. +- **The single versioned CAS writer (P3, drift R1):** + + ```ts + // EVERY mutation — advance | C4 reset | dirty-clear — goes through this. + async function writeWatermark( + chatId, + expectVersion, + patch /* {watermark?, summary?, dirty?} */, + ) { + const res = await db + .update(chat) + .set({ ...patch, version: expectVersion + 1 }) + .where(and(eq(chat.id, chatId), eq(chat.version, expectVersion))); + return res.rowCount === 1; // false = conflict → re-read, decide by VERSION not watermark value + } + ``` + +- **Loser behavior on CAS conflict (drift T10):** re-read the row. If + `version` moved and the watermark now covers my prefix → **SKIP** (winner + already compacted; safe no-op) **and clear dirty**. Else retry **once**; second + conflict → SKIP + `log.warn(contended)`. No recompute-loop, no livelock. +- **Invalidation (drift C4 + R1):** if any message at/below `summaryWatermark` is + edited/deleted/regenerated, the summary is stale. The edit/delete/regenerate + handler calls `writeWatermark` to **bump version + clear `contextSummary` + + reset watermark** to the last unaffected message — all in one CAS write. Because + the loser compares **version** (not watermark value), a compaction racing an + invalidation sees a conflict and re-reads the reset state — it can never write a + stale summary over mutated history. 
Branch/regenerate that forks below the + watermark resets it on the new branch. + +### D. Tier 2 — intra-turn compaction (in-memory) + +For a single response with many tool/sub-agent calls that bloats the window +mid-loop. Uses `prepareStep` on both `streamText` and `generateText`. +Operates on **ModelMessages** via `compactModelMessages`. + +`prepareStep({ messages, stepNumber, steps })`: + +- Estimate current step tokens (same `estimateTokens`, P2); if `>= threshold`: + - Summarize **old completed** tool results (steps several back), keep recent + steps verbatim, preserve call/result pairing. + - Return `{ messages: compacted }`. +- **Only fire when genuinely near limit** (drift m3 — mid-step summary adds + latency; don't run it every step). + +**Not persisted.** `prepareStep` edits are throwaway per-call — the SDK keeps its +own canonical message list and returns the _full_ messages in +`result.response.messages`, which commit to history as normal. Next turn, Tier 1 +folds that finished (still-bloated) turn into the durable summary. Tier 2 only +keeps a heavy response executable; Tier 1 owns durable state. + +### E. Recovery — context-overflow error handling (P4) + +In `agent-runner.ts` (around `formatStreamError`, `:636-657`): + +- `isContextOverflowError(err)` — `APICallError.isInstance(err)` AND + (`statusCode` in {400, 413}) AND body matches + `/context length|context_length_exceeded|prompt is too long|too many tokens|maximum context/i`. +- On detect mid-run: + 1. In-memory aggressive trim via **`compactModelMessages`** with a smaller + `keepRecentMessages` (drift T3 — reuse the Tier 2 adapter, **no bespoke + trim**, or T1 divergence returns). + 2. Retry the call **once**. + 3. Persist `compactionDirty = true` via `writeWatermark` (a small standalone + UPDATE, independent of the stream's finalize — drift T3). Recovery **never** + writes summary/watermark directly; it only flags. + 4. 
If retry still fails, surface: "Conversation too large even after trimming — + start a new chat or reduce attachments." (No infinite retry.) +- **Durable compaction happens on the NEXT `prepareChatTurn`** (drift T3 — + chosen path, not "finalize-or-next"): it sees `compactionDirty`, forces Tier 1 + before building messages, and clears the flag inside the same CAS write that + advances the watermark. `compactionDirty` is **persisted** so a crashed/swapped + worker still resumes correctly. + +### F. Wiring for sub-agents + +`ToolLoopAgent` constructor takes `contextWindow` + `maxOutputTokens` + +compaction config. Sub-agent tool creation (`sub-agent.ts:56-159`) already builds +a `ToolLoopAgent` per sub-agent — resolve each sub-agent model's window and pass +it through. + +**Tier 2 only (drift M3):** sub-agents start fresh each invocation (only a `task` +string, no cross-turn history), so there is no durable history for Tier 1 to +compact. Just pass the window so `prepareStep` (Tier 2) fires if a sub-agent's own +tool loop bloats. Recovery (§E) covers them too since `agent-runner` is shared. + +### G. Config surface + kill switch + +Per-agent (and/or per-workspace) optional fields, with sane defaults: + +- `compactionEnabled` (default true) +- `triggerRatio` (default 0.8), `targetRatio` (default 0.5), + `reserveRatio` (default 0.05), `keepRecentMessages` (default 10), + `minPrunableChars` (default ~2000) + +Add to agent schema (`packages/schemas`, agent table) — optional, defaulted. + +**Global kill switch:** env `COMPACTION_ENABLED` (default true) disables all +proactive compaction (Tier 1 + Tier 2) in prod without a deploy. **Recovery (§E) +ignores this flag** — it is the safety net (P4). + +### H. Frontend context-usage indicator (the ring) + +1. **Backend emits usage to client.** On run finish, include + `{ inputTokens, contextWindow }` in the streamed message metadata. 
Today usage + is run-level only (`types.ts:8-13`); surface last input-token count + resolved + window per assistant message. +2. **New component** `apps/frontend/components/context-usage-ring.tsx` — small + SVG/conic-gradient ring, fill = `inputTokens / contextWindow`. Color ramps + (green → amber ≥0.7 → red ≥0.9). **Neutral grey, no fill %, when the window is + unknown/default** (drift T6). Wrapped in `Tooltip`. +3. **Placement** — in `PromptInputTools` between the model selector and the `(i)` + info icon (`chat.tsx:574-575`). +4. **Data source (drift U1):** resolve the window from the **currently selected + model** (frontend already holds it in `PromptInputTools`; expose via the + provider/model metadata API / `modelMeta` map), NOT from the last assistant + message's metadata — else the ring shows the previous model's window after a + model switch. Fill = `lastInputTokens / selectedModelWindow`. +5. **Tooltip label is REQUIRED, not optional (drift U2/m2):** + `Last response: N / W (NN%) · current input not yet counted`. The ring reflects + the last response, not the unsent composer input — say so. (Projected-input arc + is deferred, see Open.) + +### I. Per-message stats popover (next to Regenerate) + +An `(i)` action under each assistant response showing input tokens, output +tokens, TTFT, and total generation time. Hover = tooltip, click = popover. + +**Reuse the existing tool-call duration mechanism** (commits c18c18d / b97312f) — +`withToolTimestamps` + `applyToolCompletions` (`agent-runner.ts:63-120`), +`useToolDuration` + `formatDurationMs` (`hooks/use-tool-completed-at.ts`, +`lib/utils.ts:29-49`). + +Backend (`agent-runner.ts`): + +- Capture `startedAt` at run start, `firstTokenAt` at the first text-delta chunk, + `finishedAt` when the stream finalizes (the `applyToolCompletions` point, `:443`). +- Capture token usage from `onFinish` / `totalUsage` (`:148-194`). 
+- Stamp onto `message.metadata` **at the `applyToolCompletions` point, NOT the + `messageMetadata` callback** (which fires at message start before timing/usage + exist). Shape: + `metadata.stats = { inputTokens, outputTokens, startedAt, firstTokenAt, finishedAt }`. + +Frontend: + +- New `MessageAction` info icon in `chat-message.tsx:335-378`, beside Regenerate, + rendered when `metadata.stats` exists. +- Content: `Input: N · Output: N`, `TTFT: formatDurationMs(firstTokenAt − startedAt)`, + `Total: formatDurationMs(finishedAt − startedAt)`. +- TTFT/total are **server-measured**. Optional client-observed "Round-trip" line + from the `useChat` send timestamp. Reuse `formatDurationMs`; mirror the + client-observed fallback in `useToolDuration` for in-flight messages. + +### J. Clickable ring — compact on demand + +Make the §H ring actionable. **Remember P1: this compacts the model view, not the +stored history — it is not destructive in the data sense.** + +- **Hover** — tooltip with percentage filled (already in H). +- **Click** — request compaction. If a response is generating, defer until the + current message finishes, then run; if idle, run immediately. + +Backend endpoint `POST /chats/:id/compact` (`routes/chat.ts`): + +- Runs Tier 1 compaction once **regardless of threshold** (force), persists via + `writeWatermark`, returns the new resolved usage (`inputTokens` estimate after + compaction + `contextWindow`) so the ring refreshes immediately. +- Reuses the Tier 1 `compaction.ts` module. + +Frontend: + +- Ring `onClick` → if `status === "streaming"`, set a pending flag and fire on the + chat's finish callback; else call now. +- **Pending-while-streaming visual (drift U4):** ring shows a pending badge + + tooltip "will compact when response finishes", and is **disabled** (no + re-click). On finish → spinner → updated fill from the response. 
+- **Confirm default-ON when the drop is significant (drift U3):** + `messagesDropped > keepRecentMessages` OR estimated reduction `> 30%` of history + → confirm. Below that → immediate, no prompt. (Confirm is UX courtesy; per P1 no + data is destroyed regardless.) + +--- + +## File-by-file change list + +Backend: + +- `apps/backend/src/runs/context-window.ts` — **new**: window resolution + API + auto-detect + litellm registry w/ key normalization + cache + evict hook. +- `apps/backend/src/runs/token-estimate.ts` — **new**: single `estimateTokens` + + `toCountUnits` adapters + image modality table + header-parse dims. +- `apps/backend/src/runs/compaction.ts` — **new**: `compactUIMessages` (Tier 1) + + `compactModelMessages` (Tier 2 + recovery) + shared leaf primitives + + `writeWatermark` CAS. +- `apps/backend/src/runs/agent-runner.ts` — `prepareStep` (Tier 2) on + `streamText`/`generateText`; `isContextOverflowError` + retry-once recovery + (reuses `compactModelMessages`, sets `compactionDirty`); pass `contextWindow` + through; emit usage metadata; capture `startedAt`/`firstTokenAt`/`finishedAt` + + usage and stamp `metadata.stats` at the `applyToolCompletions` point (§I). +- `apps/backend/src/services/chat-execution.ts` — Tier 1 in `prepareChatTurn` + (after `inlineFileUrls`); resolve window; check/clear `compactionDirty`; all + state writes via `writeWatermark`. +- `apps/backend/src/routes/chat.ts` — `POST /chats/:id/compact` (§J). +- `apps/backend/src/routes/provider.ts` — `cache.evict(providerId)` on modelMeta + update (drift T5). +- `apps/backend/src/tools/sub-agent.ts` — pass per-sub-agent window/config. +- `apps/backend/src/db/schema.ts` — `provider.modelMeta` JSONB; chat/run + `contextSummary` + `summaryWatermark` + `compactionDirty` + `version`; agent + compaction fields. +- message edit/delete/regenerate handlers — call `writeWatermark` to invalidate + (version bump + clear summary + reset watermark) (drift C4/R1). 
+ +Schemas: + +- `packages/schemas/index.ts` — provider `modelMeta`; agent compaction fields; + message-metadata usage + stats shape for the frontend. + +Frontend: + +- `apps/frontend/components/context-usage-ring.tsx` — **new**: ring (window from + selected model), hover tooltip, clickable force-compact, pending/disabled state. +- `apps/frontend/components/chat.tsx` — render ring; resolve selected-model window; + wire `onClick` → compact endpoint (defer while streaming). +- `apps/frontend/components/chat-message.tsx` — `(i)` stats `MessageAction` (§I). +- `apps/frontend/hooks/use-message-stats.ts` — **new** (optional): client-observed + timing fallback for in-flight messages. + +Migration: + +- Additive nullable columns via `pnpm drizzle-kit-push` (dev). Prod via + `scripts/migrate.ts`. +- **Lazy rollout (item 3):** existing chats get `version=0`, null summary, + `compactionDirty=false`. They do **NOT** eagerly compact on deploy — Tier 1 only + fires on each chat's next turn. **Do not add a backfill job** "to be safe" — it + would create a thundering herd of summarize calls that lazy rollout avoids. + +--- + +## Observability (item 4 — the design is only as good as the prod signal) + +**Landed 2026-06-11.** No metrics infra exists in the backend (pino logging +only), so each signal is emitted as a structured `metric:`-tagged log line — +greppable today, trivially shipped to a counter later. Emitted: + +- `compaction.fired` (Tier 1) with `tier` / `tokensBefore` / `tokensAfter` / + `messagesDropped`. ✅ +- `summarize.latency_ms` with `latencyMs` + `taskModelId` + `usage` (the model + is in the same line). ✅ +- `recovery.overflow_detected`, `recovery.retry`, `recovery.failed`. ✅ +- `context_window.fell_to_default` + `litellm.key_miss` (drift T4/T6). ✅ +- `cas.conflict` (per lost CAS + on contended-skip) — **decides whether the R4 + efficiency note ever needs fixing**. 
✅ + +Still log-only (no dedicated metric): `estimate_vs_real.divergence` (drift T2 +feedback loop) — deferred with the T2 image-constant tuning work. + +--- + +## Tests + +- `context-window`: resolution order; API parse for Google / OpenRouter / vLLM; + litellm hits **incl. key normalization + Bedrock ARN / Azure / MISS→default** + (drift T4); default fallback; cache evict-on-override (drift T5). +- `token-estimate`: char/4 text-only bounds; **`MODEL_BOUND` filter — UI-only + parts excluded**; **`estimate(toCountUnits(ui)) === estimate(toCountUnits(convert(ui)))` + exact on the filtered set** (drift T1); image modality table (constant, not + char/4) + header-parse dims + missing-dims fallback (drift T2); estimate runs + after inline. +- `compaction`: preserves tool-call/result pairing **both UIMessage and + ModelMessage shapes**; respects `keepRecentMessages`; incremental watermark + folding; prune Stage 1 reaches target without a model call when possible; + hysteresis — output `<= targetTokens`, does not re-fire next turn; chunked + summarize over an oversized prefix; **summarizer model fallback** when + `taskModelId` unset (drift T7). +- `budget`: window − output reserve − safety reserve; trigger counts new + (unsummarized) messages, not just last response (drift C1). +- `writeWatermark` / CAS (drift R1/T10): concurrent writers — one wins + (`rowCount===1`), loser re-reads, decides by **version** not watermark value; + loser SKIPs + clears dirty when winner advanced; one-retry-then-skip, no + livelock; **invalidation reset bumps version and a racing compaction sees a + conflict** (never writes stale summary over mutated history). +- `watermark-invalidation`: editing/deleting a message ≤ watermark clears summary, + resets watermark, bumps version (drift C4). 
+- `agent-runner`: `prepareStep` trims old tool results only, fires only near limit + (drift m3); overflow-error detection true/false matrix **across per-provider + error bodies** (OpenAI / Anthropic / Google-vLLM fixtures, drift T9); retry-once + **reuses `compactModelMessages`** (drift T3); sets `compactionDirty`; then clean + failure. +- `recovery-persistence` (drift T3): after recovery, next `prepareChatTurn` sees + `compactionDirty`, forces Tier 1, clears flag — and the next turn does **not** + re-overflow; recovery never writes summary directly. +- Integration: synthetic long history → Tier 1 compacts + persists; injected 400 + overflow → recovery retries + succeeds; sub-agent inherits window (Tier 2 only). +- `message-stats`: `startedAt`/`firstTokenAt`/`finishedAt` captured in order; + metadata stamped at `applyToolCompletions`, not message-start; TTFT/total format. +- `compact-endpoint`: force-compaction advances watermark (via `writeWatermark`) + and returns refreshed usage; defers/queues while a run is mid-stream. +- Frontend: ring window comes from selected model not last-message metadata + (drift U1); neutral state on unknown window (drift T6); pending/disabled while + streaming (drift U4); confirm fires above the §J threshold (drift U3). + +--- + +## Sequencing + +1. Window resolution + single estimator + schema (`modelMeta`, `version`, + `compactionDirty`, summary/watermark) — foundation. **✅ DONE** (open defects: + empty prod registry, T5 evict, cache-pins-default — see Review §). +2. Compaction module + `writeWatermark` CAS + Tier 1 (cross-turn, persist). + **✅ DONE** (open defects: C1 trigger under-count, M2 margin, summarizerWindow + not threaded — see Review §). +3. Recovery (overflow detect + retry-once + dirty flag). **✅ DONE 2026-06-10** + (C1 overhead fix + M2 margin + summarizerWindow threading folded in; the + `lastInputTokens` half of C1 moves to step 6 — see Chunk 3 §). + 3a. **Review-fix chunk (RV1-RV4). ✅ DONE 2026-06-10.** + 3b. 
**Review-fix chunk (RV5-RV7). ✅ DONE 2026-06-10.** All HIGH non-blocking + defects resolved — content-type pruner/renderer, recovery overhead target, + litellm registry populated, heuristic boundary-safe, evict wired, timeout + + single-flight in context-window resolver. +4. Tier 2 (`prepareStep`, in-memory). **✅ DONE 2026-06-10** `buildTier2PrepareStep` + wired into both `streamText` and `generateText`; fires when accumulated + ModelMessages exceed `triggerTokens` (drift m3); uses shared + `compactModelMessages` adapter (drift T3); null when kill switch off. + `Tier2Context` on `ChatTurn` threads config from `buildCompactionRuntime`. + Tests: 1074 pass; tsc source clean. + - **Chunk 4 review fix (2026-06-10):** Tier 2 trigger/target now subtract + `overheadTokens` (RV6 extended to Tier 2 — the prepareStep estimate sees + ModelMessages only, but system prompt + tool schemas consume the same + window; without this, a large overhead lets the payload exceed the budget + before Tier 2 fires). `compactModelMessages` gained `knownEstimate` so the + prepareStep trigger estimate is reused instead of recomputed (RV9). Tests + strengthened: summarize-on-fire + pairing-safety asserted, empty-prefix + no-op asserts `undefined`. +5. Sub-agent wiring (Tier 2 only). **✅ DONE 2026-06-10** `Tier2Context` + `buildTier2PrepareStep` moved to `compaction.ts` (no-cycle); `createSubAgentTool` gains `prepareStep?`; `createSubAgentTools` gains `prepareStepFn?`; `loadSubAgents` resolves per-sub-agent compaction runtime + builds prepareStep map. Tests: 1077 pass; tsc source clean. +6. Frontend usage metadata + ring (§H). **✅ DONE 2026-06-10** `CompactionRuntime` + `ChatTurn.resolved` carry `contextWindow` + `contextWindowIsDefault` (from resolved window source). `applyMessageStats` stamps `metadata.stats = { inputTokens, outputTokens, contextTokens, startedAt, firstTokenAt, finishedAt, contextWindow, contextWindowIsDefault }` on last assistant message at `applyToolCompletions` point. 
New `GET /:providerId/context-window?modelId=X` endpoint returns resolved window (null when source = "default", drift T6). `MessageStats` schema in `@platypus/schemas`. New `ContextUsageRing` component (SVG donut, green/amber/red ramp, neutral when unknown, required tooltip, drift T6/U2). Ring placed in `PromptInputTools` between search and model selector; `contextWindowData` SWR-fetched per selected model (drift U1). Tests: 1077 pass; source tsc clean. + - **✅ Code review for chunk 6 (2026-06-11).** One critical bug fixed: `inputTokens`/`outputTokens` from `accumulateStepStats` are the run-wide SUM across steps; feeding the summed input into the ring over-counts on multi-step tool loops (5-step loop → reported ≈ sum-of-all-prompts, pegging the ring red >100% when real fill ~37%). **Fix:** added `contextTokens` = last step's `usage.inputTokens` (peak context fullness, tracked in `streamText.onStepFinish`); the ring uses `contextTokens`, the §I cost popover keeps the summed `inputTokens`/`outputTokens`. Also: `ContextUsageRing` prop `inputTokens`→`usedTokens`; frontend `as any` casts replaced with typed `MessageStats`; removed dead `Tier2Context` import in `agent-runner.ts`. Documented trade-offs left as-is: numerator (last response's model) vs denominator (selected model) mismatch after a model switch is intentional (drift U1); `generate()` headless path stamps no stats (no UI); TTFT = first text part, excludes leading reasoning (matches §I wording). +7. Per-message stats popover (§I). **✅ DONE 2026-06-11** `MessageStatsPopover` in `chat-message.tsx`: info icon (lucide `InfoIcon`) in `MessageActions` for all assistant messages with `metadata.stats`; popover shows In/Out token counts (run-wide sums), TTFT (when `firstTokenAt` present), Total elapsed. Uses `formatDurationMs` from `lib/utils`. tsc clean; 1077 tests pass. +8. Clickable ring → compact endpoint (§J). 
**✅ DONE 2026-06-11** `POST /chats/:id/compact` runs force-Tier-1 via new `forceCompactChat` helper in `chat-execution.ts`; returns token estimate + context window so ring refreshes immediately. Frontend: `onClick` with defer-while-streaming + pending badge (drift U4); confirm dialog above threshold (drift U3); ring keyboard-accessible; hooks hoisted above early returns (rules-of-hooks fix). Tests: 1081 pass; tsc clean. +9. Per-agent config surface + `COMPACTION_ENABLED` kill switch. **✅ DONE 2026-06-11** DB + Zod schemas already had per-agent fields (`compactionEnabled`, `triggerRatio`, `targetRatio`, `reserveRatio`, `keepRecentMessages`, `minPrunableChars`); `resolveCompactionConfig` + `buildCompactionRuntime` already wired them; global `COMPACTION_ENABLED=false` kill switch in `chat-execution.ts`. Added compaction fields to `agentCreateSchema` / `agentUpdateSchema` (so routes pass through) and "Context compaction" section in `agent-form.tsx` Advanced settings. Backend 1081 pass; tsc clean. + - **✅ Code review for chunk 9 (2026-06-11).** One MEDIUM + minors fixed. The + editable surface newly exposed the C2 thrash hole (a user/API could set + `targetRatio >= triggerRatio` → compaction re-fires every turn). **Fix:** + (1) `agentCreateSchema`/`agentUpdateSchema` gained a zod `.refine` rejecting + an inverted pair (checked only when both supplied; error on `targetRatio`); + (2) `resolveCompactionConfig` clamps `targetRatio → triggerRatio * 0.9` as a + runtime backstop for legacy/direct-write rows. 
Minors: `keepRecentMessages` + base schema tightened `.nonnegative()`→`.min(1)` (0 keep-recent breaks + pairing; form already enforced min=1); form description now states the + target < trigger constraint. + +[NOTE: text is missing here — everything between "target <" and the `useMemo` callback below was lost in extraction, presumably including the chunk 10 notes and the opening of §11a.] + +```tsx +const assistantMessageCount = useMemo( + () => messages.filter((m) => m.role === "assistant").length, + [messages], +); + +const [compacted, setCompacted] = useState<{ + atAssistantMessageCount: number; + tokens: number; +} | null>(null); + +// at compact time: +setCompacted({ + atAssistantMessageCount: assistantMessageCount, + tokens: body.inputTokens, +}); + +// ring usedTokens expression: +compacted?.atAssistantMessageCount === assistantMessageCount + ? compacted.tokens + : lastAssistantStats?.contextTokens; +``` + +A new user message does NOT change `assistantMessageCount` → compacted stays +valid. When the next assistant response lands, `assistantMessageCount` increments +→ compacted expires → ring reads the fresh `lastAssistantStats.contextTokens`. + +**Files:** `apps/frontend/components/chat.tsx` only (3 small edits). + +#### 11b. (i) icon: add mouseover tooltip + +**Current state.** The `Info` button at `chat.tsx:741-743` is a bare +`DialogTrigger` — no tooltip, click-only. The user has to click to discover what +it does. + +**Fix.** Wrap the existing `DialogTrigger`/`PromptInputButton` in a `Tooltip` +so hover shows a label without opening the dialog. Click still opens the dialog +(unchanged). Use the same `delayDuration={500}` as the ring. Tooltip text: +`"Agent info"` (or a one-line agent description if `selectedAgent.description` +is non-empty). Pattern: + +```tsx +<Dialog> + <Tooltip delayDuration={500}> + <TooltipTrigger asChild> + <DialogTrigger asChild> + <PromptInputButton> + <Info /> + </PromptInputButton> + </DialogTrigger> + </TooltipTrigger> + <TooltipContent> + {selectedAgent.description?.trim() || "Agent info"} + </TooltipContent> + </Tooltip> + {/* existing DialogContent unchanged */} +</Dialog> +``` + +Note: `TooltipTrigger asChild` wrapping `DialogTrigger asChild` is a safe +Radix composition — Radix merges event handlers via slot; the `onClick` from +`DialogTrigger` and the hover callbacks from `TooltipTrigger` coexist on the +same underlying `PromptInputButton` element.
+ +**Files:** `apps/frontend/components/chat.tsx` only (reshape the existing JSX +block, no new imports needed — `Tooltip`/`TooltipTrigger`/`TooltipContent` +already imported). + +#### 11c. Compaction chat trace (new §K) + +**Goal.** Make compaction visible inside the chat timeline — not just via the +ring. Two states: + +1. **Active / in-flight** — "compaction is happening right now" (between the + user hitting Send and the first response token arriving). +2. **Historical** — "compaction happened here" (visible in the scrollback, + including the LLM-generated summary since that IS a model call). + +**Mental model.** Compaction is a model call forced by the system on the +user's behalf — structurally equivalent to a tool call initiated by the +assistant. It therefore maps naturally to the **existing tool-call UI**: +emit it as a synthetic `compact_context` tool-call + tool-result pair in the +stream before the actual response. No new rendering component needed — the +existing tool-call expander handles both active ("in flight") and historical +states for free. No fake user message injected; no custom banner. + +**Why the backend needs to emit an event.** Tier 1 runs inside +`prepareChatTurn` (server-side, before streaming). The frontend has no other +channel to distinguish "compaction ran before this response" from normal +response latency. §C already prescribes `context-compacted` as a fail-loud +stream event; this chunk wires that emission as a tool-call pair. + +**Backend change.** After a successful Tier 1 compaction in +`applyTier1Compaction`, emit a synthetic tool-call + tool-result into the +AI-SDK `dataStream` before the first assistant text part. Use the SDK's +`writeData` / `writeTool*` primitives (exact API depends on AI SDK version — +check `dataStream` surface in `chat-execution.ts`). 
Logical shape: + +```ts +// tool-call part +{ toolCallId: "<uuid>", toolName: "compact_context", args: { messagesSummarized: N } } + +// tool-result part +{ + toolCallId: "<uuid>", + toolName: "compact_context", + result: { + messagesDropped: N, + summaryExcerpt: string | undefined, // first ~120 chars of LLM summary; absent for Stage-1-only (prune, no model call) + } +} +``` + +`summaryExcerpt` carries the first ~120 chars of the LLM-generated summary. +This IS the model's own words — not a risk, and it gives users transparency +into what was retained. Omit when compaction ran Stage 1 only (prune, no +model call). + +**Frontend — no new component.** The existing tool-call renderer in +`chat-message.tsx` already handles `tool-call` + `tool-result` parts. The +`compact_context` call will render like any other tool invocation: + +- While streaming: shows "compact_context" with a spinner (active indicator). +- After complete: collapses to the tool-call expander showing args + + result (including `summaryExcerpt` when present). + +The result persists in `UIMessage.parts` (AI SDK durable storage) → appears +in scrollback automatically. + +**What the user sees:** + +``` +▶ compact_context [expandable] + ↳ messagesDropped: 34 + summary: "The user has been working on the Platypus + monorepo, specifically the context-compaction + feature…" +────────────────────────────────────────────── +[actual assistant response to user's question] +``` + +**Forced compaction (§J, ring click).** The `POST /chats/:id/compact` +endpoint runs outside of a normal streaming turn — no `dataStream` available. +Instead, after compaction succeeds the backend **persists a synthetic +assistant message** directly into the chat's message list (same DB write path +as real messages). Shape: role `assistant`, parts = `[tool-call, tool-result]` +for `compact_context`, no text content.
The frontend refreshes the message +list after the POST resolves (SWR revalidation or optimistic append from the +response body) — the new message appears in the scrollback exactly like any +other tool-call exchange. The existing ring spinner + toast remain; the +synthetic message is the persistent trace. + +**C4 / watermark safety — two paths:** + +- **Tier 1 trace** — emitted parts live in `UIMessage.parts` of the following + assistant message (stream data only, not a separate DB row). Do NOT affect + message IDs, watermark comparisons, or C4 logic. +- **§J trace** — IS a real DB message row. Must be written with a message ID + that is **above** the current `summaryWatermark` so it is never itself + summarized. The existing `writeWatermark` CAS is not involved (the + watermark already advanced during the compaction); just insert with a + timestamp after the last real message. C4 invalidation only triggers on + edits/deletes at/below the watermark — this new row is always above it, so + no risk. + +**Files:** + +- `apps/backend/src/runs/compaction.ts` — return compaction result metadata + (`messagesDropped`, `summaryExcerpt`) from `applyTier1Compaction` (or add + an optional `dataStream` param to emit directly). +- `apps/backend/src/services/chat-execution.ts` — after + `applyTier1IfNeeded`, if compaction ran, emit the tool-call + tool-result + pair into `dataStream`. +- `packages/schemas/index.ts` — no change needed (tool-call parts already + in the union). +- `apps/frontend/components/chat-message.tsx` — no change needed (existing + tool-call renderer handles `compact_context` automatically). Optionally: + add a display-name entry so it shows "Context compaction" instead of the + raw function name. + +**Tests:** + +- Backend: `applyTier1Compaction` returns `{ messagesDropped, summaryExcerpt? }`; + no emission when compaction does not fire (below trigger). 
+- Integration: stream from a compacted turn contains a `tool-call` part with + `toolName: "compact_context"` before any `text` part. + +**Sequencing.** 11a and 11b are trivial — do them first (one commit each). +11c has a backend+frontend surface; implement backend emission first (easy +to verify via the stream in DevTools), then verify existing tool-call UI +renders it without frontend changes. + +#### Chunk 11 — code review fixes (landed 2026-06-12) + +Review of the first 11c cut surfaced one correctness defect + three gaps; all fixed. + +- **RV11 (HIGH) — synthetic trace was replayed to the provider.** The + `compact_context` tool part is persisted into the assistant message (for + scrollback) and was therefore re-converted by `convertToModelMessages` and + sent to the model on every later turn — a phantom tool call for a tool not in + `tools` (provider-rejection / model-confusion risk). **Fix:** + `stripCompactionTraceParts` removes the part at both `convertToModelMessages` + call sites (`agent-runner.ts` stream + generate); a trace-only message (§J) is + dropped entirely so no empty assistant message is sent. The part still + persists for the timeline; it just never reaches the model. +- **RV12 (MED) — trace emitted for no-op turns.** `compactionTrace` was built + whenever `triggered`, but `messagesDropped` is 0 with no excerpt on prune-only + and force-dirty-within-target runs → empty timeline entry. **Fix:** + `compactionTrace` is now `undefined` unless an actual model summary ran + (`usedModelCall && summaryText`). +- **RV13 (MED) — §J forced-compaction trace was unimplemented.** Ring-click + compaction produced no timeline trace (only the auto path did). 
**Fix:** + `forceCompactChat` persists a standalone synthetic assistant message via + `buildCompactionTraceMessage` (above the watermark; stripped from the model + payload like the Tier-1 trace), returns it from `POST /chats/:id/compact`, and + the frontend appends it (id-dedup so SWR revalidation reconciles, no + duplicate). +- **RV14 (LOW) — tests + display name.** Added `prependCompactionChunks`, + `stripCompactionTraceParts`, `buildCompactionTraceMessage`, and trace-gating + tests (backend suite 1096 pass). `humanizeToolType` maps `compact_context` → + "Context compaction". + +--- + +## Open / deferred decisions + +- **OpenAI-compatible as a separate provider type** — not required (auto-detect + probes `max_model_len` regardless of label). Deferred. +- **Persisting Tier 2** — deferred; revisit only if storing tool outputs verbatim + is itself a problem. +- **Anthropic exact token counting** via `/v1/messages/count_tokens` — optional + accuracy upgrade; deferred. +- **Projected-input arc on the ring (drift U2)** — char/4 of composer text added + as a faint arc. Deferred; the honest tooltip label ships instead. +- **CAS contention optimization (drift R4)** — under a contended chat, the + version is read → summarize (seconds) → CAS write, so the version can be stale + by write time → wasted summarize (not corruption; loser skips safely). Bounded + by one-retry-then-skip. **Do NOT fix now.** Gated on the `cas.conflict` metric + (now emitted, chunk 10); if it shows repeated waste, move the version read to + just-before-write or take a short advisory lock for the summarize window. + +### Deliberately NOT done in chunk 10 (2026-06-11) — with reasons + +The RV7e-RV10 + observability sweep closed the review backlog; these four were +left undone **on purpose**, not missed: + +- **RV9 digest-based C4 check** — once a watermark exists, C4 reads the full + `messages` JSONB row and `stableStringify`-compares the whole prefix every + turn. 
The compare is already **correct** (RV1 landed); a content digest would + only make it cheaper. Pure optimization of a correct path → revisit only if the + per-turn read+stringify shows up in profiling, and fold it into any future + C4 rework rather than touching the correctness path now. +- **defect 7 — `content`-type tool output base64 → char/4** (`token-estimate.ts`). + The `content` tool-result variant `stableStringify`s media bytes into the + char/4 blob. Fixing it **symmetrically** (so estimate(UI) === estimate(Model) + still holds — the load-bearing P2/T1 invariant) requires extracting media into + `nonText` on BOTH adapters, where the UI side stores `output` as untyped + `unknown`. The risk to the tested invariant outweighs the benefit: **no current + tool emits `content`-type media**. Fix before the first tool that does. +- **`bytesFromUrl` vs storage/utils `parseDataUrl` duplication** — merging them + couples the estimator to the storage layer for zero behaviour change. Left as + two small private regexes. +- **`estimate_vs_real.divergence` metric** (drift T2 feedback loop) — deferred + with the image-constant tuning work it feeds; still log-only. +- **Trigger estimator scope — FIXED (drift C1).** Originally flagged from live + test 2026-06-03; confirmed unfixed in chunk 2 by the 2026-06-09 review. **Both + prescribed paths now landed:** + 1. **DONE 2026-06-10** — `estimateOverheadTokens` adds the system prompt + tool + schemas to the projection (`projectTier1Tokens`) and subtracts them from the + compaction target (the ~986-vs-8888 gap was dominated by tool schemas). + 2. **DONE 2026-06-11** — the ADR-prescribed prior-turn provider baseline is + wired: `prepareChatTurn` threads `lastInputTokens` from the last assistant + message's `metadata.stats.contextTokens`; `projectTier1Tokens` returns + `max(charBased, lastInputTokens)` so turns ≥ 2 are floored by the real + provider count instead of trusting char/4. 
+ + The Qwen3.6 / vLLM under-count (provider 8888 vs estimate ~986) is closed: the + projection now sees both the tool-schema overhead and the prior-turn provider + count, so it no longer blows past the trigger silently. + +--- + +## Drift log & code-review checklist + +Every issue found across 4 review rounds, the resolution, and **the exact thing +to re-verify once the code exists.** Round trajectory: R1 design holes → R2 +second-order effects → R3 a third-order race → R4 zero correctness findings (one +telemetry-gated note). This is the anti-regression list — check it at PR time. + +| ID | Issue | Resolution | ✅ Verify in code | +| ------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| **C1** | Trigger only counted last response, not what this turn adds | `projected = lastInputTokens + estimate(newMsgs)` | Trigger sums unsummarized new messages, not just last `usage` | +| **C2** | Compacting to the trigger ratio re-fires next turn (Cline #5616 thrash) | Hysteresis: target 0.5 ≠ trigger 0.8 | Post-compaction output `<= targetTokens`; a follow-up turn does not re-compact | +| **C3** | Raw window ratio ignored output + safety headroom | `inputBudget = window − maxOutputReserve − safetyReserve` | Budget subtracts both reserves before ratios | +| **C4** | Edit/delete/regenerate below watermark → stale summary | Invalidate via `writeWatermark`: version bump + clear summary + reset watermark | Every edit/delete/regenerate handler calls `writeWatermark`; forking below watermark resets on new branch | +| **M1** | Cold-start on huge imported history exceeds summarizer's own window | Chunked / map-reduce summarize | Prefix larger 
than summarizer window is chunked, not sent whole | +| **M2** | First-turn char/4 underestimate | `× 1.15` margin + recovery net | First-turn projection applies the margin | +| **M3** | "Both tiers apply to sub-agents" was wrong | Sub-agents = Tier 2 only (no durable history) | Sub-agent path wires Tier 2 + window only, no Tier 1 | +| **T1** | Tier 1 (UIMessage) and Tier 2 (ModelMessage) measured by different estimators → divergence | **One** `estimateTokens` over `CountUnit[]`, two adapters; `MODEL_BOUND` filter excludes UI-only parts both sides | No second estimator exists; equality test passes exactly on filtered set; UI-only parts (reasoning/source/step/data) never counted | +| **T2** | char/4 on base64 images is meaningless; ordering vs inline unclear | Modality table (anthropic/openai/default), header-parse dims w/ constant fallback, detail→high; estimate AFTER `inlineFileUrls`; divergence `log.warn` | No char/4 on image bytes; Tier 1 runs post-inline; missing dims → 1200; turn-2 divergence logged | +| **T3** | Recovery compaction vs "single durable writer"; finalize-mid-error ambiguous | Recovery does in-memory trim via `compactModelMessages` + sets persisted `compactionDirty`; durable write on NEXT `prepareChatTurn` only | Recovery never writes summary/watermark directly; `compactionDirty` is a DB column; recovery trim calls the Tier 2 adapter, not a bespoke trim | +| **T4** | litellm registry keys don't match our model IDs | Normalization chain + alias map + `log.warn` on MISS | Lookup tries exact→strip-prefix→lower→alias→family; Bedrock ARN / Azure resolve or log a miss | +| **T5** | Window cache stale after override edit | `cache.evict(providerId)` in provider PATCH, immediate | Editing modelMeta busts cache without waiting TTL | +| **T6** | 8192 default silently over-compacts | `log.warn` on default; ring renders **neutral**, no false ramp | Fall-to-default is logged; ring is grey/no-% when window unknown | +| **T7** | `taskModelId` may be unset | 
Fallback `taskModelId → main`; log model + cost | Summarizer falls back to main model; no crash on unset | +| **T8** | char/4 underestimates CJK/JSON | Accepted; margin + real-usage handoff + recovery net | (No code; documented as text-only heuristic) | +| **T9** | One synthetic 400 doesn't cover per-provider error bodies | Fixture set: OpenAI / Anthropic / Google-vLLM | `isContextOverflowError` matrix tests real per-provider phrasings | +| **T10** | CAS rejects stale write but loser behavior undefined → livelock risk | Re-read; if winner advanced → SKIP+clear-dirty; else retry once then SKIP | Loser never recompute-loops; terminal state is skip; decides by version | +| **R1** | Loser-skip assumed monotonic watermark; C4 reset moves it backward → stale write back door | All writes (advance/reset/dirty) through one versioned CAS; loser compares **version** not watermark value | Single `writeWatermark`; invalidation bumps version; no path mutates these fields outside it | +| **U1** | Ring showed previous model's window after a model switch | Resolve window from **selected** model, not last-message metadata | Ring reads selected-model window from `modelMeta`, refreshes on switch | +| **U2** | Ring lags pending composer input | Required tooltip label "current input not yet counted"; arc deferred | Tooltip text present and unmistakable | +| **U3** | Forced-compact confirm too soft | Confirm default-ON when drop significant (`>keepRecent` or `>30%`) | Threshold confirm wired; (P1: not destructive anyway) | +| **U4** | No feedback for defer-while-streaming click | Pending badge + disabled ring + "will compact on finish" tooltip | Ring disables + shows pending state between click and finish | +| **R4** | CAS read→summarize→write window wastes summarize under contention | Accepted, **not fixed**; gated on `cas.conflict` metric | `cas.conflict` metric emitted; no premature lock added | +| **P1** | (principle) compaction misread as data loss | View-not-delete: raw messages 
persist | No code path hard-deletes a summarized message | + +--- + +## Appendix: prior art & review + +### Prior art (open-source tools surveyed) + +| Tool | Strategy | Window source | Threshold | Pitfall | +| ------------ | ---------------------------------------- | --------------------------------------------- | ------------------------------------------------------- | ------------------------------------------------ | +| Open WebUI | none (BYO filter), errors out | `num_ctx` (Ollama only) | n/a | silent overspend on API providers | +| LibreChat | **both**: prune tool results → summarize | `maxContextTokens` (yaml) | trigger on prune; `reserveRatio` 0.05 | ignored on some endpoints | +| LangGraph | `trim_messages` vs `SummarizationNode` | you supply | you set | trim breaks tool pairs | +| llama.cpp | context-shift or HTTP 400 | `--ctx-size`, `--keep N` | off by default | infinite shift loop / hard 400 | +| Ollama | silent clip | `num_ctx` (default 2048) | clips | silent token loss | +| Cline | **summarize at %** | reads window − model buffer | `autoCondenseThreshold` (0-1) | **thrash (cline #5616)** | +| Claude Code | **summarize at %** | reads window, live meter | ~83.5%, `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE` | ~33k buffer reserved for compact | +| Codex CLI | prune-from-memory → summarize | `ctx − min(maxOut,20k)` | `effective − 13k`; **hard 90% cap** | freezes near threshold (#19116) | +| Hermes Agent | prune tool results → summarize | provider metadata + `context_length` override | **0.50** primary + **0.85** hygiene net + 400-msg valve | token **floor** breaks sub-floor models (#14690) | +| OpenRouter | middle-out truncate (gateway) | `/models.context_length` | on overflow | drops middle silently | +| litellm | `trim_messages` (trim_ratio 0.75) | **model registry JSON** | ratio | orphaned tool-call msgs | + +Borrowed: litellm registry (§A), `reserveRatio` headroom (§C), prune-then-summarize +staging (§C), hysteresis vs thrash (§C), fail-loud event (§C), live 
usage meter (§H). + +Sources: Open WebUI context-window docs + discussions #4983/#6402; LibreChat +summarization/model_specs/token_usage docs; LangGraph add-memory docs; llama.cpp +server README + issues #17284/#3969; Cline auto-compact docs + issue #5616; +litellm token_usage/message_trimming docs + `model_prices_and_context_window.json`; +OpenRouter models + message-transforms docs; vLLM engine args; Codex CLI compaction +docs + issues #11805/#19116; Claude Code auto-compact env override + issue #41818; +Hermes Agent context-compression docs + issues #12626/#14690. + +### Field re-survey (2026-06-12) — verified complete vs Hermes / Codex / Claude Code / Cline + +Re-checked the shipped implementation against the four agents above. **Every +"gap" the survey suggested is already implemented** — the design is at or ahead +of the field: + +- **Real provider-token feeding** — already done (C1). `projectTier1Tokens` + returns `max(charBased, lastInputTokens)`; the estimate is the cold-start + fallback (turn 1, no prior `usage`) and the stale-tail top-up, not a + replacement. Strictly safer than Hermes (which only falls back to estimate). +- **Input-tokens-only window** — already done (F1). `windowFromRegistryEntry` + trusts only `max_input_tokens`, never litellm's `max_tokens` (the output cap). +- **Reserve carve / "90% cap" equivalent** — already done (C3). The trigger is a + fraction of `inputBudget = window − maxOutputReserve − safetyReserve`, so even + `triggerRatio = 1.0` (the schema max) fires below the raw window. Codex's hard + 90% clamp is structural here, not a separate clamp. +- **Two-layer (proactive + always-on recovery)** — P4; matches Hermes + primary+hygiene and Claude's auto-compact+buffer. + +**The token-FLOOR anti-pattern (Hermes #14690) — explicitly DO NOT copy.** +Hermes clamps the _trigger_ up with `max(ctx·pct, 64000)`, which exceeds the real +window on any sub-64k model → compaction never fires → silent overflow. 
Our +fraction-of-`inputBudget` trigger is inherently safe at any window size; a floor +belongs only on the _window fallback_ (`detected ?? DEFAULT`, never +`max(detected, FLOOR)`), which §A already does. + +**Genuinely-absent, deferred (optional — not bugs):** + +- **Message-count force-compact valve** (Hermes 400-msg `hygiene_hard_message_limit`). + A count-based backstop independent of the token estimate — catches a blown + estimator that the recovery net would otherwise have to absorb. Cheap; consider + if the `estimate_vs_real.divergence` signal ever shows the estimator drifting. +- **`maxOutputReserve` floor when max-output is unknown.** `computeBudget` reserves + `maxOutputTokens ?? min(4096, 0.25·ctx)`; a reasoning model with a large real + output but no resolved `max_output_tokens` could under-reserve. Recovery covers + it today; revisit only if overflow-on-output shows up for such a model. +- **Model-aware aggressiveness** (Cline trims 75% on small windows vs 50%). Marginal; + our fixed `targetRatio` is adequate. Deferred. + +### Review change log (applied to this doc) + +- **C1–C4, M1–M3** — see drift table. +- **A** litellm registry replaces homegrown lookup table. +- **T1–T10, R1, R4, U1–U4** — round 2-4 findings, see drift table. +- **P1–P4** — design principles extracted from the review consensus. +- Added: prune-before-summarize Stage 1, fail-loud `context-compacted` event, + Observability section, global kill switch, lazy-rollout note, ADR (written: + `docs/adr/0009-context-compaction.md` capturing the _why_ — two tiers, + view-not-delete, CAS-on-version, char/4-not-tokenizer). 
From 9604bdd9a2b10833e4a8d488acf1d47f645b645e Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Fri, 12 Jun 2026 23:58:17 +0200 Subject: [PATCH 12/21] =?UTF-8?q?chore(docs):=20plan=20Chunk=2013=20?= =?UTF-8?q?=E2=80=94=20compaction=20reliability=20+=20prompt=20overhaul?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Capture the 2026-06-12 live test-server findings: per-step-timeout bug that kills pre-stream compaction (150s summarize > 120s watchdog), the 8,631-token runaway, and the lost-turn root cause. Add the four fixes (heartbeat, maxOutputTokens cap + concise prompt, stream-open-before-compaction with live Running status, abortSignal), the prior-art summarization-prompt survey (Claude Code / Codex / OpenCode / Hermes), the proposed replacement prompt, and the decided-against provider-model-selector. Co-Authored-By: Claude Opus 4.8 (1M context) --- context-compaction-plan.md | 209 +++++++++++++++++++++++++++++++++++-- 1 file changed, 203 insertions(+), 6 deletions(-) diff --git a/context-compaction-plan.md b/context-compaction-plan.md index 7294bce7..7c64e893 100644 --- a/context-compaction-plan.md +++ b/context-compaction-plan.md @@ -703,18 +703,25 @@ tool loop bloats. Recovery (§E) covers them too since `agent-runner` is shared. ### G. Config surface + kill switch -Per-agent (and/or per-workspace) optional fields, with sane defaults: +**SUPERSEDED 2026-06-12 — going global+per-model (see Chunk 12).** The per-agent +fields below shipped in chunk 10 but are being removed: no surveyed tool +(Hermes/Codex/Claude/Cline) exposes per-agent compaction tuning, and the ratios +self-normalize to the model window so per-agent variance buys nothing measurable. 
-- `compactionEnabled` (default true) -- `triggerRatio` (default 0.8), `targetRatio` (default 0.5), +~~Per-agent optional fields, with sane defaults:~~ + +- ~~`compactionEnabled` (default true)~~ +- ~~`triggerRatio` (default 0.8), `targetRatio` (default 0.5), `reserveRatio` (default 0.05), `keepRecentMessages` (default 10), - `minPrunableChars` (default ~2000) + `minPrunableChars` (default ~2000)~~ -Add to agent schema (`packages/schemas`, agent table) — optional, defaulted. +The runtime now uses `DEFAULT_COMPACTION_CONFIG` for all agents; window/output +size stays per-model via the §A resolver (`provider.modelMeta` override). **Global kill switch:** env `COMPACTION_ENABLED` (default true) disables all proactive compaction (Tier 1 + Tier 2) in prod without a deploy. **Recovery (§E) -ignores this flag** — it is the safety net (P4). +ignores this flag** — it is the safety net (P4). After Chunk 12 this env flag is +the ONLY compaction toggle (the per-agent `compactionEnabled` is gone). ### H. Frontend context-usage indicator (the ring) @@ -1214,6 +1221,196 @@ Review of the first 11c cut surfaced one correctness defect + three gaps; all fi --- +## Chunk 12 — remove per-agent compaction config, go global+per-model (planned, decided 2026-06-12) + +**Decision.** Drop ALL per-agent compaction tuning shipped in chunk 10. Compaction +behavior becomes global (`DEFAULT_COMPACTION_CONFIG` + the `COMPACTION_ENABLED` +env kill switch); only window/output **size** stays per-model via the §A resolver. + +**Why.** The 2026-06-12 field re-survey: no surveyed agent (Hermes, Codex CLI, +Claude Code, Cline) exposes per-agent compaction knobs — all use global config + +per-model window. Trigger/target are fractions of an already model-normalized +`inputBudget`, so per-agent variance is speculative generality. The agent-edit +form clutter is real cost for a feature ~100% of agents leave at default (every +agent on the test server has all six columns NULL). 
+ +**Trade-off (accepted).** Removing per-agent `compactionEnabled` loses the ability +to disable compaction for a single agent (e.g. an exact-recall code/legal agent +where lossy summarization corrupts output). Mitigation: the global +`COMPACTION_ENABLED` env still exists, and recovery (§E, P4) keeps such an agent +from hard-failing on overflow regardless. If a real need for single-agent opt-out +appears, revisit as a **per-model or per-workspace** flag — NOT per-agent. + +**Change list.** + +- `packages/schemas/index.ts` — remove `compactionEnabled`, `triggerRatio`, + `targetRatio`, `reserveRatio`, `keepRecentMessages`, `minPrunableChars` from + `agentSchema` + the `agentCreate`/`agentUpdate` picks; delete the + `compactionRatioOrder` refinement (+ its `index.test.ts` cases). +- `apps/backend/src/db/schema.ts` — drop the six `agent` columns. +- New migration — `ALTER TABLE "agent" DROP COLUMN IF EXISTS ...` ×6. `IF EXISTS` + because divergent-lineage server DBs (see deploy notes) may not have all six; + destructive but safe — the columns hold only tuning overrides, NULL in practice. +- `apps/backend/src/runs/compaction.ts` — `resolveCompactionConfig` returns + `DEFAULT_COMPACTION_CONFIG` unconditionally; delete `CompactionConfigOverrides` + and the per-agent merge. Keep `DEFAULT_COMPACTION_CONFIG` + `computeBudget`. +- `apps/backend/src/services/chat-execution.ts` — drop the `agent` argument to + `resolveCompactionConfig`; keep the `COMPACTION_ENABLED` env override. +- `apps/frontend` — remove the six compaction fields from the agent-edit form. + +**Verify.** Agent create/update no longer accepts the six fields; chat still +compacts using defaults; `COMPACTION_ENABLED=false` still disables proactively; +recovery still fires when proactive is off; migrate is idempotent on a DB missing +some columns. 
+ +--- + +## Chunk 13 — compaction reliability + prompt overhaul (planned, 2026-06-12) + +Chunk 12 shipped (`625ff96` + dead-`agent`-binding cleanup + env-override knobs +`da2c159`). Live test-server run on a single-vLLM provider (`qwen36`, lowered +ceiling via `.env`: trigger 0.2 / target 0.1 / keepRecent 4 / minPrunable 500) +surfaced a turn-killing bug + several prompt/UX gaps. All findings + fixes below. + +### Observed bug — per-step timeout kills pre-stream compaction + +Live log evidence (chat `hur61ZR79koiHQysBDS2o`): + +- Trigger fired (`projected` 63,511 > `triggerTokens` 48,988). +- `summarize` ran **149,955 ms** on `qwen36`, **input 6,178 → output 8,631 tokens** + (the summary was LONGER than its input — degenerate expansion, not compression). +- The run's **per-step stall timeout (120,000 ms) fired at ~120 s** → `level:50 +"Run timed out" kind:"step"` → run aborted ~30 s **before** summarize returned. +- `summarize` ignored the abort, finished at 150 s, committed the watermark → + `context-compacted` logged (dropped 9, 63,511 → 14,785). But the turn was already + dead → **no model answer streamed, and the turn's assistant message was lost.** + +Root cause: Tier-1 `summarize()` is a long blocking call that runs **inside +`prepareChatTurn`, before the response stream opens**, and does **not bump the +per-step stall timer**. The 120 s watchdog treats it as a stalled step and kills +the run. + +Why the turn vanished from the chat: two separate writes. The durable +summary/watermark is a CAS write on the **chat row** (survived — later turns have +the summary, persisted value is a clean **770-char / ~193-token** structured +summary). The turn's **assistant message** (answer + the synthetic `compact_context` +trace part) only persists via the **response stream**, which never opened. So the +chat-row state advanced but the visible turn was lost. 
+ +Note: the 8,631-token runaway was the timed-out turn and was **discarded** — the +persisted summary is the good 193-token one. So `qwen36` _can_ summarize tightly; +the 8,631 was a pathological one-off. Confirms a `maxOutputTokens` ceiling loses +no context in normal operation (healthy output is ~10× below a 2k ceiling). + +### Fixes (in priority order) + +1. **Heartbeat during summarize (CRITICAL).** Compaction is legitimate long work, + not a stall. Ping `onActivity` / bump the per-step timer on an interval while + `summarize` runs so the 120 s watchdog keeps resetting. Directly stops the + spurious kill. (`buildCompactionRuntime` already has `onActivity` in scope via + the turn; thread it into the summarize wrapper, tick ~every 10 s.) + +2. **`maxOutputTokens` ceiling (~2,000) + "be concise" prompt instruction.** Pure + safety backstop against the runaway — NOT a blind truncation. The prompt asks + the model to compress _to fit_ (length target); the ceiling only catches a + degenerate run. Proven safe: real summaries are ~193 tokens. Also log + `finishReason === "length"` so we know if the cap ever bit. + +3. **Open the response stream BEFORE compaction (bigger refactor).** Today the + synthetic `compact_context` chunks are injected post-hoc by `prependCompactionChunks` + as a paired `tool-input-available` + `tool-output-available` — i.e. already + "Completed", emitted only after the (already-open-too-late) model stream's + `start`. To show live **Pending → Running → Completed** AND keep the HTTP / + playit-tunnel connection alive during the wait (a second timeout vector): + - Split the cheap trigger decision (`projectTier1Tokens` vs `triggerTokens`, no + LLM) from the expensive `summarize`, so we know to emit the pending chunk + before paying for summarize. 
+ - Build a prelude stream: `start` + `tool-input-available(compact_context)` → + await summarize → `tool-output-available` → concat the model stream (suppress/ + merge the model's own `start` so the synthetic part + answer share one message + id). Replaces the post-hoc `prependCompactionChunks` injection. + - Move the compacted-messages await out of `prepareChatTurn` into the stream step. + - Frontend is **already done** — `tool.tsx` renders `input-available` = "Running" + (pulsing clock), `output-available` = "Completed". Zero frontend change. + - Error path: if `summarize` throws after the pending chunk shows, resolve the + tool part to `output-error` — do not leave it stuck "Running". Tier-1 stays + best-effort (fail → proceed uncompacted) but must close the tool part. + - Preserve invariants: `stripCompactionTraceParts` + snapshot persistence expect + a well-formed input+output pair; tee/snapshot drain must see prelude chunks. + +4. **Pass `abortSignal` into `summarize` (minor correctness).** Today summarize + burned 30 s + a full LLM call _after_ the turn was dead. Make it cancellable so + a real abort stops it. Fold into #3. + +### Prompt overhaul + +Current prompt (`chat-execution.ts` ~L578) is an unstructured one-liner with **no +length instruction** (the runaway's root). Prior-art survey of real summarization +prompts (2026-06-12): + +- **Claude Code** — heaviest: chronological analysis + **9 sections** (Primary + Request & Intent, Key Technical Concepts, Files & Code, Errors & fixes, Problem + Solving, All user messages, Pending Tasks, Current Work, Optional Next Step); + security-relevant instructions preserved **verbatim**. +- **Codex CLI** — handoff-oriented: _"You are performing a CONTEXT CHECKPOINT + COMPACTION. Create a handoff summary for another LLM that will resume the task."_ + - 4 sections (progress & decisions · context/constraints/prefs · what remains · + critical data/refs). 
Prepends a **resume prefix** next turn (_"Another language + model started to solve this problem and produced a summary…"_) so the resumer + builds on prior work instead of restarting. Issue #14347: sections **reduce loss + over repeated compactions**. +- **OpenCode** — 6 sections (done · WIP · files · next steps · user requests/ + constraints · decisions & rationale). +- **Hermes Agent** — weakest: just _"Summarize these conversation turns concisely"_ + → `[CONTEXT SUMMARY]: `, positional keep (first 3 + last 4 turns), Gemini + Flash aux model. Open issue #499 proposes **copying Codex's structured handoff** + — Hermes is behind us, not ahead. + +Gaps vs prior art: (a) no length instruction → the runaway; (b) no section +structure → erodes across repeated re-compactions (we feed the prior summary back +in via `priorSummaryTokens`); (c) no "build on prior work" framing (our +`summaryUIMessage` prefix `[Summary of earlier conversation]` is just a label). + +**Proposed replacement system prompt** (handoff + sections + concise + integrate- +prior; pairs with the #2 ceiling): + +``` +You are performing a context checkpoint compaction. Another instance of this +assistant will resume using ONLY your summary plus the most recent messages — +earlier history will be gone. Write a dense markdown handoff under these +headings (omit one only if truly empty): + +- **Intent & open requests** — what the user wants, the latest explicit request, pending tasks. +- **Decisions & facts** — conclusions, confirmed values/IDs/paths, constraints and user + preferences (preserve any security-relevant instruction verbatim). +- **Files & tools touched** — what was read/changed and why. +- **Current state & next step** — where things stand and the immediate next action. + +If a prior summary appears in the history, integrate it — don't drop facts it +captured. Be concise: aim under ~1500 tokens. Output only the summary. 
+``` + +### Deferred / decided-against here + +- **Selectable compaction model in provider UI** — DECIDED NO. The compaction + model already = `provider.taskModelId` (same-provider only: summarize runs through + the chat provider's own client `opened.languageModel(taskModelId)`). On a single + vLLM (`modelIds` has one entry) there is no other model to pick, so a dropdown is + a no-op. `workspace.taskModelProviderId` routes _other_ task work (tag/title) to a + different provider but compaction does NOT use it. A separate fast compaction + endpoint would need new wiring (route summarize through a task provider's client) + + multi-model infra (2nd provider or a LiteLLM gateway) — not worth it now. + +### Verify (Chunk 13) + +Compaction no longer trips the per-step timeout (heartbeat); a slow summarize +streams a Running tool part instead of a blank turn; summary output is bounded +(`finishReason` logged if capped); the compacted turn's assistant message + trace +persist even when summarize is slow; the new prompt yields structured, concise, +multi-compaction-stable summaries. + +--- + ## Open / deferred decisions - **OpenAI-compatible as a separate provider type** — not required (auto-detect From 212220f623d0fa2e0bf894e5211a53469eb20cd8 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Sat, 13 Jun 2026 08:18:23 +0200 Subject: [PATCH 13/21] =?UTF-8?q?feat(backend):=20chunk=2013=20=E2=80=94?= =?UTF-8?q?=20summarize=20heartbeat=20+=20output=20ceiling=20+=20prompt=20?= =?UTF-8?q?overhaul?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix 1 (CRITICAL): thread `onActivity` into `buildCompactionRuntime` and tick it every 10 s via `setInterval` while `generateText` runs inside the `summarize` closure. The 120 s per-step stall watchdog now sees regular activity during a slow summarize call instead of silence, and no longer kills the run before the summary is committed. 
Fix 2: add `maxOutputTokens: 2000` ceiling to the summarize `generateText` call as a hard backstop against the runaway-expansion failure mode observed in the live test (8,631-token output for a 6,178-token input). Log `finishReason` and emit a `warn` if the ceiling is hit so the event is visible in prod logs. Replace the unstructured one-liner system prompt with a structured four-section handoff prompt (Intent / Decisions / Files / Next step) that survives repeated re-compactions and includes an explicit "aim under ~1500 tokens" length target. Fix 3 (stream-before-compaction) deferred — heartbeat addresses the immediate timeout bug; the live-indicator refactor is a larger architectural change. Co-Authored-By: Claude Sonnet 4.6 --- apps/backend/src/services/chat-execution.ts | 84 ++++++++++++++++----- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 64e27e7b..c1cee0c6 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -530,13 +530,26 @@ type CompactionRuntime = { * resolution falls back to the conservative default so recovery (P4) always * has a working configuration. */ +/** Safety ceiling on summarizer output (Chunk 13, Fix 2). Prevents a runaway + * model from producing a summary longer than its input. Healthy summaries on + * a 6k-token prefix are ~150-200 tokens; 2000 is a generous ceiling. */ +const SUMMARIZE_MAX_OUTPUT_TOKENS = 2000; + +/** Heartbeat interval while the summarizer runs (Chunk 13, Fix 1). Resets the + * per-step stall watchdog so a slow summarize call is not misidentified as a + * frozen run and killed before it returns. 
*/ +const SUMMARIZE_HEARTBEAT_INTERVAL_MS = 10_000; + async function buildCompactionRuntime(args: { chatId?: string; provider: Provider; resolvedModelId: string; opened: ReturnType; + /** When present, called every ~10 s during `summarize` to keep the per-step + * stall watchdog alive (Chunk 13, Fix 1). */ + onActivity?: () => void; }): Promise { - const { chatId, provider, resolvedModelId, opened } = args; + const { chatId, provider, resolvedModelId, opened, onActivity } = args; const config = { ...DEFAULT_COMPACTION_CONFIG }; // Global kill switch (§G) gates proactive compaction; recovery is unaffected. @@ -594,23 +607,55 @@ async function buildCompactionRuntime(args: { // when unset (drift T7). generateText is one-shot, no tools. const summarize = async (text: string): Promise => { const startedAt = Date.now(); - const { text: summary, usage } = await generateText({ - model: opened.languageModel(taskModelId), - system: - "You compress conversation history for context reuse. Produce a dense summary capturing decisions made, facts established, files/tools touched, open questions, and the user's intent. Drop pleasantries and redundancy. Output only the summary.", - prompt: text, - }); - logger.info( - { - metric: "summarize.latency_ms", - latencyMs: Date.now() - startedAt, - chatId, - taskModelId, - usage, - }, - "context compaction summarize", - ); - return summary; + // Fix 1 (Chunk 13): keep the per-step stall watchdog alive while the + // summarizer runs. Tier-1 compaction is legitimate long work, not a stall; + // without this ping the 120 s watchdog fires and kills the run. + const heartbeat = onActivity + ? 
setInterval(onActivity, SUMMARIZE_HEARTBEAT_INTERVAL_MS) + : null; + try { + const result = await generateText({ + model: opened.languageModel(taskModelId), + // Fix 2 (Chunk 13): structured handoff prompt — sections reduce loss + // across repeated re-compactions (Codex CLI pattern); explicit concise + // instruction + "aim under ~1500 tokens" pairs with the output ceiling. + system: `You are performing a context checkpoint compaction. Another instance of this assistant will resume using ONLY your summary plus the most recent messages — earlier history will be gone. Write a dense markdown handoff under these headings (omit one only if truly empty): + +- **Intent & open requests** — what the user wants, the latest explicit request, pending tasks. +- **Decisions & facts** — conclusions, confirmed values/IDs/paths, constraints and user preferences (preserve any security-relevant instruction verbatim). +- **Files & tools touched** — what was read/changed and why. +- **Current state & next step** — where things stand and the immediate next action. + +If a prior summary appears in the history, integrate it — don't drop facts it captured. Be concise: aim under ~1500 tokens. Output only the summary.`, + prompt: text, + // Fix 2 (Chunk 13): hard ceiling prevents a degenerate run from + // producing a summary longer than its input. Healthy summaries on a + // 6k-token prefix are ~150-200 tokens; 2000 is a generous backstop. 
+ maxOutputTokens: SUMMARIZE_MAX_OUTPUT_TOKENS, + }); + const { text: summary, usage, finishReason } = result; + logger.info( + { + metric: "summarize.latency_ms", + latencyMs: Date.now() - startedAt, + chatId, + taskModelId, + usage, + finishReason, + hitOutputCeiling: finishReason === "length", + }, + "context compaction summarize", + ); + if (finishReason === "length") { + logger.warn( + { chatId, taskModelId, maxTokens: SUMMARIZE_MAX_OUTPUT_TOKENS }, + "summarize hit maxOutputTokens ceiling — summary may be truncated", + ); + } + return summary; + } finally { + if (heartbeat !== null) clearInterval(heartbeat); + } }; return { @@ -866,6 +911,9 @@ export const prepareChatTurn = async ( provider, resolvedModelId, opened, + // Thread the activity callback so the summarizer heartbeat can bump the + // per-step stall watchdog (Chunk 13, Fix 1). + onActivity: onActivity ? () => onActivity() : undefined, }); // Per-turn overhead: system prompt + tool schemas, sent on every turn but From 48cb9cf85b7768e0a5ef51c5cec8370dbd398222 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Sat, 13 Jun 2026 09:01:10 +0200 Subject: [PATCH 14/21] fix(backend): harden chunk 13 summarizer per review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review findings on the chunk 13 summarize changes: - Reorder the handoff prompt sections most-critical-first (Intent → Current state & next step → Decisions → Files) so a truncation at the maxOutputTokens ceiling drops file/tool detail rather than the resume-critical intent and next step. Add a front-load instruction. - Thread the run's AbortSignal through prepareChatTurn and buildCompactionRuntime into the summarizer generateText call. The Fix 1 heartbeat suppresses the per-step stall watchdog during summarize, so without this a hung call would leak until the 10 min per-run timeout; now a cancel/timeout aborts it. 
- Drop the redundant onActivity wrapper at the call site (the optional event param already satisfies the () => void heartbeat signature). - Reconcile the conflicting token-count comments (~150-200 vs ~1500) around the output ceiling. - Add tests for the summarize closure: abort signal + ceiling + ordered prompt threading, the finishReason === length warn path, and the heartbeat tick/clear lifecycle. Export buildCompactionRuntime for test. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/runs/agent-runner.ts | 3 + .../src/services/chat-execution.test.ts | 114 ++++++++++++++++++ apps/backend/src/services/chat-execution.ts | 47 ++++++-- 3 files changed, 154 insertions(+), 10 deletions(-) diff --git a/apps/backend/src/runs/agent-runner.ts b/apps/backend/src/runs/agent-runner.ts index 95db32de..5309b48e 100644 --- a/apps/backend/src/runs/agent-runner.ts +++ b/apps/backend/src/runs/agent-runner.ts @@ -403,6 +403,7 @@ export class AgentRunner { frontendUrl?: string, onActivity?: (event?: ToolActivityEvent) => void, priorMessages?: PlatypusUIMessage[], + signal?: AbortSignal, ): Promise { return prepareChatTurn({ orgId: scope.orgId, @@ -415,6 +416,7 @@ export class AgentRunner { runMode: scope.principal.kind === "user" ? "interactive" : "headless", onActivity, priorMessages, + signal, }); } @@ -521,6 +523,7 @@ export class AgentRunner { params.frontendUrl, onActivity, priorMessages, + handle.signal, ); } catch (error) { const err = error instanceof Error ? error : new Error(String(error)); diff --git a/apps/backend/src/services/chat-execution.test.ts b/apps/backend/src/services/chat-execution.test.ts index 68bc212a..fd332a9a 100644 --- a/apps/backend/src/services/chat-execution.test.ts +++ b/apps/backend/src/services/chat-execution.test.ts @@ -56,14 +56,27 @@ vi.mock("@ai-sdk/mcp", () => ({ auth: vi.fn(), })); +// Partial mock of "ai": only `generateText` is replaced (used by the compaction +// summarizer). 
`createIdGenerator` and the rest stay real via importActual. +const { mockGenerateText } = vi.hoisted(() => ({ + mockGenerateText: vi.fn(), +})); +vi.mock("ai", async (importActual) => { + const actual = await importActual(); + return { ...actual, generateText: mockGenerateText }; +}); + import { prepareChatTurn, + buildCompactionRuntime, NotFoundError, ValidationError, createToolHeartbeat, shouldInjectNativeSearch, } from "./chat-execution.ts"; import { createInMemoryChatTurnQueries } from "./chat-execution.test-fixtures.ts"; +import { logger } from "../logger.ts"; +import { contextWindowResolver } from "../runs/context-window.ts"; const baseProvider = { id: "p1", @@ -646,4 +659,105 @@ describe("chat-execution", () => { ).toBe(true); }); }); + + describe("buildCompactionRuntime summarize (Chunk 13 / review Fix B)", () => { + const buildRuntime = (signal?: AbortSignal, onActivity?: () => void) => + buildCompactionRuntime({ + chatId: "chat-1", + provider: baseProvider as never, + resolvedModelId: "gpt-4", + opened: { + languageModel: vi.fn(() => ({ modelId: "task-model" })), + } as never, + onActivity, + signal, + }); + + beforeEach(() => { + mockGenerateText.mockReset(); + vi.spyOn(contextWindowResolver, "resolve").mockResolvedValue({ + contextWindow: 128_000, + maxOutputTokens: 4096, + source: "registry", + } as never); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + it("threads the abort signal, output ceiling, and ordered prompt into generateText", async () => { + mockGenerateText.mockResolvedValue({ + text: "SUMMARY", + usage: {}, + finishReason: "stop", + }); + const controller = new AbortController(); + const runtime = await buildRuntime(controller.signal); + + const out = await runtime.summarize("history text"); + + expect(out).toBe("SUMMARY"); + expect(mockGenerateText).toHaveBeenCalledTimes(1); + const arg = mockGenerateText.mock.calls[0][0]; + expect(arg.maxOutputTokens).toBe(2000); + 
expect(arg.abortSignal).toBe(controller.signal); + expect(arg.prompt).toBe("history text"); + expect(arg.system).toContain("context checkpoint compaction"); + // Sections ordered most-critical-first so truncation drops the tail + // (file/tool detail), not intent or next step. + const intentIdx = arg.system.indexOf("Intent & open requests"); + const nextStepIdx = arg.system.indexOf("Current state & next step"); + const filesIdx = arg.system.indexOf("Files & tools touched"); + expect(intentIdx).toBeGreaterThanOrEqual(0); + expect(nextStepIdx).toBeGreaterThan(intentIdx); + expect(filesIdx).toBeGreaterThan(nextStepIdx); + }); + + it("warns but still returns the summary when the output ceiling is hit", async () => { + mockGenerateText.mockResolvedValue({ + text: "TRUNCATED", + usage: {}, + finishReason: "length", + }); + const warn = vi.spyOn(logger, "warn").mockImplementation(() => {}); + const runtime = await buildRuntime(); + + const out = await runtime.summarize("x"); + + expect(out).toBe("TRUNCATED"); + expect(warn).toHaveBeenCalledWith( + expect.objectContaining({ maxTokens: 2000 }), + expect.stringContaining("maxOutputTokens ceiling"), + ); + }); + + it("bumps onActivity on each heartbeat tick while summarize runs, then stops", async () => { + let resolveGen: (v: unknown) => void = () => {}; + mockGenerateText.mockImplementation( + () => + new Promise((resolve) => { + resolveGen = resolve; + }), + ); + const onActivity = vi.fn(); + const runtime = await buildRuntime(undefined, onActivity); + + vi.useFakeTimers(); + const pending = runtime.summarize("x"); + + // Two heartbeat intervals (10 s each) elapse mid-call. + await vi.advanceTimersByTimeAsync(25_000); + expect(onActivity).toHaveBeenCalledTimes(2); + + resolveGen({ text: "S", usage: {}, finishReason: "stop" }); + await pending; + + // Interval cleared in the finally block — no further bumps. 
+ onActivity.mockClear(); + await vi.advanceTimersByTimeAsync(30_000); + expect(onActivity).not.toHaveBeenCalled(); + }); + }); }); diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index c1cee0c6..724970d0 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -224,6 +224,13 @@ export type PrepareChatTurnInput = { * check falls back to a DB read that now returns the post-overwrite state. */ priorMessages?: PlatypusUIMessage[]; + /** + * Run abort signal from the run registry. Threaded into the compaction + * summarizer so a cancelled or timed-out run aborts the in-flight + * `generateText` (review Fix B). Optional: callers without a registry-backed + * run (tests, ad-hoc) omit it and the summarize call simply runs uncancelled. + */ + signal?: AbortSignal; }; /** @@ -531,8 +538,8 @@ type CompactionRuntime = { * has a working configuration. */ /** Safety ceiling on summarizer output (Chunk 13, Fix 2). Prevents a runaway - * model from producing a summary longer than its input. Healthy summaries on - * a 6k-token prefix are ~150-200 tokens; 2000 is a generous ceiling. */ + * model from producing a summary longer than its input. The system prompt + * targets ~1500 tokens; this 2000 backstop only trips on a degenerate run. */ const SUMMARIZE_MAX_OUTPUT_TOKENS = 2000; /** Heartbeat interval while the summarizer runs (Chunk 13, Fix 1). Resets the @@ -540,7 +547,7 @@ const SUMMARIZE_MAX_OUTPUT_TOKENS = 2000; * frozen run and killed before it returns. */ const SUMMARIZE_HEARTBEAT_INTERVAL_MS = 10_000; -async function buildCompactionRuntime(args: { +export async function buildCompactionRuntime(args: { chatId?: string; provider: Provider; resolvedModelId: string; @@ -548,8 +555,13 @@ async function buildCompactionRuntime(args: { /** When present, called every ~10 s during `summarize` to keep the per-step * stall watchdog alive (Chunk 13, Fix 1). 
*/ onActivity?: () => void; + /** Run abort signal, threaded into the summarizer `generateText` so a + * cancelled / per-run-timed-out run aborts the call instead of leaking it + * past the heartbeat-suppressed per-step watchdog (review Fix B). */ + signal?: AbortSignal; }): Promise { - const { chatId, provider, resolvedModelId, opened, onActivity } = args; + const { chatId, provider, resolvedModelId, opened, onActivity, signal } = + args; const config = { ...DEFAULT_COMPACTION_CONFIG }; // Global kill switch (§G) gates proactive compaction; recovery is unaffected. @@ -619,19 +631,28 @@ async function buildCompactionRuntime(args: { // Fix 2 (Chunk 13): structured handoff prompt — sections reduce loss // across repeated re-compactions (Codex CLI pattern); explicit concise // instruction + "aim under ~1500 tokens" pairs with the output ceiling. - system: `You are performing a context checkpoint compaction. Another instance of this assistant will resume using ONLY your summary plus the most recent messages — earlier history will be gone. Write a dense markdown handoff under these headings (omit one only if truly empty): + // Sections are ordered most-critical-first: if the output is truncated + // at the ceiling (finishReason === "length"), the tail that drops is + // the least resume-critical (file/tool detail), not intent or next step. + system: `You are performing a context checkpoint compaction. Another instance of this assistant will resume using ONLY your summary plus the most recent messages — earlier history will be gone. Write a dense markdown handoff under these headings, in this order (omit one only if truly empty). Front-load the most important facts within each section — if you run long, later detail may be cut: - **Intent & open requests** — what the user wants, the latest explicit request, pending tasks. +- **Current state & next step** — where things stand and the immediate next action. 
- **Decisions & facts** — conclusions, confirmed values/IDs/paths, constraints and user preferences (preserve any security-relevant instruction verbatim). - **Files & tools touched** — what was read/changed and why. -- **Current state & next step** — where things stand and the immediate next action. If a prior summary appears in the history, integrate it — don't drop facts it captured. Be concise: aim under ~1500 tokens. Output only the summary.`, prompt: text, // Fix 2 (Chunk 13): hard ceiling prevents a degenerate run from - // producing a summary longer than its input. Healthy summaries on a - // 6k-token prefix are ~150-200 tokens; 2000 is a generous backstop. + // producing a summary longer than its input. Healthy summaries run a + // few hundred tokens; the ~1500-token prompt target leaves headroom + // under this 2000 backstop, which only trips on a degenerate run. maxOutputTokens: SUMMARIZE_MAX_OUTPUT_TOKENS, + // Fix B (review): thread the run's abort signal so a cancelled or + // per-run-timed-out run actually aborts this call. The heartbeat above + // keeps the per-step watchdog from firing, so without this a hung + // summarize would otherwise run until the 10 min per-run timeout. + abortSignal: signal, }); const { text: summary, usage, finishReason } = result; logger.info( @@ -807,6 +828,7 @@ export const prepareChatTurn = async ( frontendUrl, runMode = "interactive", onActivity, + signal, } = input; const workspace = await queries.getWorkspace(workspaceId); @@ -912,8 +934,13 @@ export const prepareChatTurn = async ( resolvedModelId, opened, // Thread the activity callback so the summarizer heartbeat can bump the - // per-step stall watchdog (Chunk 13, Fix 1). - onActivity: onActivity ? () => onActivity() : undefined, + // per-step stall watchdog (Chunk 13, Fix 1). `onActivity` accepts an + // optional event, so it satisfies the `() => void` heartbeat signature + // directly — the interval invokes it with no event (timer-only bump). 
+ onActivity, + // Thread the abort signal so a cancelled/timed-out run aborts summarize + // instead of leaking past the heartbeat-suppressed watchdog (review Fix B). + signal, }); // Per-turn overhead: system prompt + tool schemas, sent on every turn but From 797af60728e5c9f641bbfe5ab1edfcd25236952a Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Sun, 14 Jun 2026 00:07:33 +0200 Subject: [PATCH 15/21] fix(backend): prune large tool results in kept recent messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 2 compaction returned recent messages verbatim, so large tool results (e.g. MCP dumps) in the kept window dominated tokensAfter and prevented reaching targetTokens. Prune recent tool outputs with a higher threshold (minRecentPrunableChars, default minPrunableChars*5) and apply it across every over-target return path — including the empty-prefix / null-watermark bail where the whole history fits within keepRecentMessages. Warn when the post-compaction estimate still exceeds 2x target. Raise summarizer output ceiling 2000 -> 4000 to catch models that ignore the 1500-token prompt limit. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/.env.example | 6 +- apps/backend/src/runs/compaction.test.ts | 88 +++++++++++++++++++ apps/backend/src/runs/compaction.ts | 48 +++++++++- .../src/services/chat-execution.test.ts | 4 +- apps/backend/src/services/chat-execution.ts | 17 ++-- 5 files changed, 148 insertions(+), 15 deletions(-) diff --git a/apps/backend/.env.example b/apps/backend/.env.example index a2113b8f..1756e6b3 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -57,11 +57,13 @@ FRONTEND_URL=http://localhost:3001 # COMPACTION_ENABLED=true # # Optional overrides for the global ceiling. Unset = built-in defaults -# (trigger 0.8, target 0.5, reserve 0.05, keepRecent 10, minPrunable 2000). 
+# (trigger 0.8, target 0.5, reserve 0.05, keepRecent 10, minPrunable 2000, +# minRecentPrunable 10000). # Lower the trigger to exercise auto-compaction on test deployments. # Keep target < trigger or compaction re-fires every turn. # COMPACTION_TRIGGER_RATIO=0.8 # COMPACTION_TARGET_RATIO=0.5 # COMPACTION_RESERVE_RATIO=0.05 # COMPACTION_KEEP_RECENT=10 -# COMPACTION_MIN_PRUNABLE_CHARS=2000 \ No newline at end of file +# COMPACTION_MIN_PRUNABLE_CHARS=2000 +# COMPACTION_MIN_RECENT_PRUNABLE_CHARS=10000 \ No newline at end of file diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index 33b72de0..467c752f 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -15,6 +15,7 @@ import { type CompactionState, type WatermarkPatch, } from "./compaction.ts"; +import { logger } from "../logger.ts"; import type { ModelMessage } from "ai"; import type { PlatypusUIMessage } from "../types.ts"; @@ -309,6 +310,93 @@ describe("compactUIMessages (Tier 1)", () => { }); expect(summarize.mock.calls.length).toBeGreaterThan(1); }); + + it("Stage 2 prunes large tool results in kept (recent) messages", async () => { + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(12000)), // big tool result in recent + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, // 12000-char output exceeds threshold + }); + expect(res.usedModelCall).toBe(true); + expect(res.keptMessages).toHaveLength(2); // r1 + r2 + // Tool result in r1 should be trimmed (soft-trim produces head+tail, not full string) + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(typeof toolPart?.output).toBe("string"); + 
expect((toolPart?.output as string).length).toBeLessThan(12000); + }); + + it("Stage 2 does not prune recent tool results below minRecentPrunableChars", async () => { + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(3000)), // below threshold of 20000 + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 20000, // threshold above 3000 → no pruning + }); + expect(res.usedModelCall).toBe(true); + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + // Output unchanged — 3000 chars below threshold + expect(toolPart?.output).toBe("X".repeat(3000)); + }); + + it("prunes large recent tool results when the prefix is empty (no summary)", async () => { + // Whole history fits within keepRecentMessages (2) but a huge tool result + // pushes it over target. boundary=0 → empty prefix → no model call, but the + // outlier in recent must still be trimmed (Finding 1 gap). 
+ const msgs = [ + uiTool("r1", "X".repeat(12000)), // big tool result, no prefix to summarize + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + }); + expect(res.usedModelCall).toBe(false); // empty prefix → no summarize + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(typeof toolPart?.output).toBe("string"); + expect((toolPart?.output as string).length).toBeLessThan(12000); + }); + + it("warns when Stage 2 result still exceeds 2× targetTokens after pruning", async () => { + const warn = vi.spyOn(logger, "warn").mockReturnValue(undefined); + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + // recent messages are huge text (not tool), cannot be pruned + uiText("r1", "user", "R".repeat(8000)), + uiText("r2", "assistant", "S".repeat(8000)), + ]; + await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 50, // recent alone is ~4000 tokens → well over 2×50 + }); + expect(warn).toHaveBeenCalledWith( + expect.objectContaining({ targetTokens: 50 }), + expect.stringContaining("recent messages exceed target"), + ); + warn.mockRestore(); + }); }); describe("compactModelMessages (Tier 2 / recovery)", () => { diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index 9fdf9218..11d3bb6e 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -272,6 +272,9 @@ export type UICompactOptions = { targetTokens: number; keepRecentMessages: number; minPrunableChars: number; + /** Threshold for pruning tool results in kept (recent) messages after Stage 2. + * Defaults to minPrunableChars * 5 when omitted. 
*/ + minRecentPrunableChars?: number; imageProvider?: ImageProvider; /** Existing durable summary to fold the new prefix into (incremental). */ priorSummary?: string | null; @@ -395,6 +398,31 @@ export async function compactUIMessages( }; } + // Past this point we are over target. Prune large tool results in the kept + // (recent) messages — these stay in the model view, so extreme outliers (e.g. + // large MCP tool dumps) bloat tokensAfter and prevent reaching targetTokens. + // Computed once and reused by every over-target return below (the empty-prefix + // / null-watermark bail and the Stage 2 summarize path) so a history that fits + // entirely within keepRecentMessages still gets its outliers trimmed. + const recentThreshold = + opts.minRecentPrunableChars ?? opts.minPrunableChars * 5; + const prunedRecent = recent.map( + (m) => pruneUIMessage(m, recentThreshold).message, + ); + + const warnIfOverTarget = (afterEstimate: number) => { + if (afterEstimate > opts.targetTokens * 2) { + logger.warn( + { + afterEstimate, + targetTokens: opts.targetTokens, + keepRecentMessages: opts.keepRecentMessages, + }, + "compaction fired but recent messages exceed target — keepRecentMessages may be locking in large tool results", + ); + } + }; + // RV4: nothing to summarize when the prefix is empty (history fits within // keepRecentMessages). Also bail when the boundary message has no id — we // cannot anchor a watermark there, and committing a watermark:null + @@ -404,13 +432,16 @@ export async function compactUIMessages( const watermarkId = prefix.length > 0 ? (prefix[prefix.length - 1].id ?? null) : null; if (prefix.length === 0 || watermarkId === null) { + const kept = [...prunedPrefix, ...prunedRecent]; + const afterEstimate = estimate(kept) + priorTokens; + warnIfOverTarget(afterEstimate); return { - keptMessages: prunedAll, + keptMessages: kept, summaryText: opts.priorSummary ?? 
null, watermarkId: null, messagesDropped: 0, usedModelCall: false, - estimatedTokens: estimate(prunedAll) + priorTokens, + estimatedTokens: afterEstimate, }; } @@ -422,13 +453,16 @@ export async function compactUIMessages( opts.summarizerWindow, ); + const afterEstimate = estimate(prunedRecent) + textTokens(summaryText); + warnIfOverTarget(afterEstimate); + return { - keptMessages: recent, + keptMessages: prunedRecent, summaryText, watermarkId, messagesDropped: prefix.length, usedModelCall: true, - estimatedTokens: estimate(recent) + textTokens(summaryText), + estimatedTokens: afterEstimate, }; } @@ -667,6 +701,10 @@ export type CompactionConfig = { reserveRatio: number; keepRecentMessages: number; minPrunableChars: number; + /** Threshold for pruning tool results in the kept (recent) messages after + * Stage 2 summarization. Higher than minPrunableChars — we trim extreme + * outliers (e.g. huge MCP tool dumps) without destroying useful context. */ + minRecentPrunableChars: number; }; export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = { @@ -676,6 +714,7 @@ export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = { reserveRatio: 0.05, keepRecentMessages: 10, minPrunableChars: 2000, + minRecentPrunableChars: 10000, }; export type Budget = { @@ -939,6 +978,7 @@ export async function applyTier1Compaction( targetTokens: effectiveTarget, keepRecentMessages: config.keepRecentMessages, minPrunableChars: config.minPrunableChars, + minRecentPrunableChars: config.minRecentPrunableChars, imageProvider, priorSummary, summarize: input.summarize, diff --git a/apps/backend/src/services/chat-execution.test.ts b/apps/backend/src/services/chat-execution.test.ts index fd332a9a..304646f2 100644 --- a/apps/backend/src/services/chat-execution.test.ts +++ b/apps/backend/src/services/chat-execution.test.ts @@ -701,7 +701,7 @@ describe("chat-execution", () => { expect(out).toBe("SUMMARY"); expect(mockGenerateText).toHaveBeenCalledTimes(1); const arg = 
mockGenerateText.mock.calls[0][0]; - expect(arg.maxOutputTokens).toBe(2000); + expect(arg.maxOutputTokens).toBe(4000); expect(arg.abortSignal).toBe(controller.signal); expect(arg.prompt).toBe("history text"); expect(arg.system).toContain("context checkpoint compaction"); @@ -728,7 +728,7 @@ describe("chat-execution", () => { expect(out).toBe("TRUNCATED"); expect(warn).toHaveBeenCalledWith( - expect.objectContaining({ maxTokens: 2000 }), + expect.objectContaining({ maxTokens: 4000 }), expect.stringContaining("maxOutputTokens ceiling"), ); }); diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 724970d0..35e836d5 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -539,8 +539,9 @@ type CompactionRuntime = { */ /** Safety ceiling on summarizer output (Chunk 13, Fix 2). Prevents a runaway * model from producing a summary longer than its input. The system prompt - * targets ~1500 tokens; this 2000 backstop only trips on a degenerate run. */ -const SUMMARIZE_MAX_OUTPUT_TOKENS = 2000; + * hard-limits to 1500 tokens; this 4000 backstop catches models that ignore + * the instruction (e.g. qwen36 on large tool-heavy inputs). */ +const SUMMARIZE_MAX_OUTPUT_TOKENS = 4000; /** Heartbeat interval while the summarizer runs (Chunk 13, Fix 1). Resets the * per-step stall watchdog so a slow summarize call is not misidentified as a @@ -589,6 +590,9 @@ export async function buildCompactionRuntime(args: { config.minPrunableChars = numEnv(process.env.COMPACTION_MIN_PRUNABLE_CHARS) ?? config.minPrunableChars; + config.minRecentPrunableChars = + numEnv(process.env.COMPACTION_MIN_RECENT_PRUNABLE_CHARS) ?? + config.minRecentPrunableChars; // RV7d: resolve both windows concurrently (they are independent). 
const taskModelId = provider.taskModelId || resolvedModelId; @@ -641,12 +645,11 @@ export async function buildCompactionRuntime(args: { - **Decisions & facts** — conclusions, confirmed values/IDs/paths, constraints and user preferences (preserve any security-relevant instruction verbatim). - **Files & tools touched** — what was read/changed and why. -If a prior summary appears in the history, integrate it — don't drop facts it captured. Be concise: aim under ~1500 tokens. Output only the summary.`, +If a prior summary appears in the history, integrate it — don't drop facts it captured. Be concise: hard limit 1500 tokens maximum. Output only the summary.`, prompt: text, - // Fix 2 (Chunk 13): hard ceiling prevents a degenerate run from - // producing a summary longer than its input. Healthy summaries run a - // few hundred tokens; the ~1500-token prompt target leaves headroom - // under this 2000 backstop, which only trips on a degenerate run. + // Fix 2 (Chunk 13): hard ceiling prevents a runaway model from + // producing a summary longer than its input. Prompt hard-limits to + // 1500 tokens; 4000 backstop catches models that ignore the instruction. maxOutputTokens: SUMMARIZE_MAX_OUTPUT_TOKENS, // Fix B (review): thread the run's abort signal so a cancelled or // per-run-timed-out run actually aborts this call. 
The heartbeat above From 9d0348087b663fb1b93c93ab7b8e90b214beadc8 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Sun, 14 Jun 2026 01:05:32 +0200 Subject: [PATCH 16/21] =?UTF-8?q?chore(docs):=20plan=20Chunk=2014=20?= =?UTF-8?q?=E2=80=94=20kept-message=20tool-result=20handling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three tasks: (1) Tier 1 recent-trim safety (64232d8 shipped; option D overflow-gate + exempt-newest remaining), (2) context-editing-style prune of kept tool results (needs review + bigger plan; session iteration captured), (3) ingestion cap for oversized MCP/sub-agent results (upstream-issue candidate, marked not filed). Co-Authored-By: Claude Opus 4.8 (1M context) --- context-compaction-plan.md | 117 +++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/context-compaction-plan.md b/context-compaction-plan.md index 7c64e893..8532efa1 100644 --- a/context-compaction-plan.md +++ b/context-compaction-plan.md @@ -1411,6 +1411,123 @@ multi-compaction-stable summaries. --- +## Chunk 14 — kept-message tool-result handling (planned, 2026-06-14) + +Origin: live event on the test server — compaction fired but missed target badly +(`tokensAfter=75226` vs `targetTokens≈24000`). Cause: 4 recent messages were +massive mempalace (MCP) tool-result JSON dumps. Step 1 (drop prefix) already +collapses prefix tool results via `softTrim(out, 200)` in `renderUIMessages`, so +the summarizer only saw ~2.5k input tokens — but `keptMessages: recent` was +returned **verbatim**, so the bulky kept results dominated `tokensAfter`. + +### The two-tier reality (established this session) + +- **Tier 2 (`compactModelMessages`, `prepareStep`)** runs _between tool steps in + one stream_. It prunes **prefix only**; `recent` is passed **verbatim** + ([compaction.ts](apps/backend/src/runs/compaction.ts) `[summaryModelMessage(...), ...recent]`). 
+ So current-stream tool results are never trimmed mid-stream. **Caveat:** the + keep-window is the last `keepRecentMessages` _messages_ counted flat, and in + ModelMessage form a tool call + its result are two messages — so in a >5-tool + stream the _early_ results of the same stream scroll into the prefix and DO get + summarized mid-stream. Tier 2 protects the tail, not the whole stream. +- **Tier 1 (`compactUIMessages`)** runs _between turns_. The just-finished + stream's tool results are the newest messages → land in `recent`. This is the + only place a result the user is actively asking about can be trimmed. + +### Mapping to Anthropic's shipped mechanisms (claude-api skill, 2026-06-14) + +The Anthropic API converged on **three** complementary layers; Platypus today has +only the summarize leg: + +1. **Compaction** — summarize earlier context near the window limit (beta + `compact-2026-01-12`). ≈ Platypus Tier 1/Tier 2. +2. **Context editing (`clear_tool_uses`)** — _prune_ stale tool results + thinking + blocks at configurable thresholds, keeping conversation structure; "keeps the + transcript lean **without summarizing**." Platypus does NOT have this. → Task 2. +3. **Ingestion offload** — Managed Agents auto-offloads any MCP tool result + over 100K tokens to a sandbox file, returning a truncated preview + path. An + ingestion cap, not compaction — the only real fix for "one result too big to + fit the window at all." → Task 3. + +### Task 1 — make Tier 1 recent-trim safe (READY; partially shipped) + +**Shipped 2026-06-14 (`64232d8`, on `test/compaction-clean-deploy`, deployed):** +recent tool results pruned via `pruneUIMessage(m, minRecentPrunableChars)` (default +`minPrunableChars * 5` = 10000 chars) across every over-target return path +including the empty-prefix / null-watermark bail; warns when post-compaction +estimate > `targetTokens * 2`; env override `COMPACTION_MIN_RECENT_PRUNABLE_CHARS`; +summarizer output ceiling 2000 → 4000. 
+ +**Still to do — adopt "option D" (overflow-gate + exempt newest)** so we stop +gutting active data for a _soft_-target miss: + +- Missing `targetTokens` (0.5) is cheap — it's a hysteresis goal; the call still + succeeds as long as `recent < inputBudget`. The hard wall is `inputBudget` + (window − output reserve − safety, from `computeBudget`). +- Gate recent-trim on `estimate(recent)+summary > inputBudget` (call would + actually overflow), NOT on the soft target. Below the wall, leave `recent` + intact — full fidelity, just a missed hysteresis target that re-compacts next + turn (cheap; empty-prefix path makes no summarizer call). +- Always exempt the single newest message regardless. +- Requires threading `inputBudget` into `UICompactOptions`. +- This is strictly better than the alternatives we weighed (B: exempt newest N — + still over-trims under the test box's 0.1 target; A: raise threshold to 100k — + reduces frequency only; C: revert — loses the pathological-dump guard). The test + server's `COMPACTION_TARGET_RATIO=0.1` manufactures the worst case; prod 0.5 + rarely trips it. + +### Task 2 — context-editing-style prune of kept tool results (NEEDS REVIEW + bigger plan) + +The user's ideal model (decided 2026-06-14): in-stream, only compact near the +ceiling so the stream doesn't stop (Tier 2 already does this); **at stream end, +strip bulky tool results so the next turn doesn't start at 40k** (NEW); then +threshold-compact as chat continues (Tier 1 already does this). This is exactly +Anthropic's **context editing (`clear_tool_uses`)** — prune, don't summarize. + +Design notes captured this session (to expand before implementing): + +- The current "keep last N _messages_ verbatim" heuristic is backwards: in a + stream with more than 10 tool calls the keep-window fills with raw tool results while the small, + high-value conversational text (question + answer) gets summarized into the + prefix. 
A type-aware policy is better: **keep conversational text + the newest + turn's results verbatim; compress older bulky results to placeholders.** +- Implement as a **model-view transform**, not destruction — full result stays in + the DB/UI (display/audit); only the lean version is _sent_ to the model each + subsequent turn. Platypus already separates stored messages from the model view + (watermark + summary reconstruction), so this fits. +- Keep tool call↔result structural validity: replace the result _content_ with a + short placeholder marker (`[mempalace result, 40k chars — elided after turn N]`), + don't drop the message (providers reject an orphaned call). Size-gate it — tiny + results (`"OK"`) gain nothing from a placeholder. +- The unavoidable tradeoff: eager stripping loses the immediate "based on those + results, do X" follow-up. Options: (a) strip immediately + agent re-calls if + needed; (b) placeholder-with-summary so the model can re-fetch intelligently + (recommended); (c) one-turn grace (keep the just-finished turn's results one + more turn). User's "next turn must not carry 40k" leans toward (b). +- Anthropic's `clear_tool_uses` is **threshold-triggered** with configurable + thresholds, not strictly "every stream end" — decide whether to mirror that or + go eager at each turn boundary. + +This is a redesign of the compaction unit (messages → type-aware policies keyed +off turn boundaries), bigger than Task 1, and overlaps Task 3. Review before +building. + +### Task 3 — ingestion cap for oversized MCP / sub-agent results (UPSTREAM ISSUE — file so we don't forget) + +No tier can fix a _single_ result larger than the window by trimming other +messages. Today: sandbox tools self-cap (ADR-0002, `truncated` flag), but **MCP +tools (mempalace) and sub-agent returns have no cap** — one oversized result +overflows all tiers (Tier 2/recovery never prune `recent`) → provider reject → +error. 
The fix is an **ingestion cap at tool-result storage time** (mirror the +sandbox/ADR-0002 pattern and Anthropic's 100K MCP offload): truncate/offload the +oversized result, set a `truncated`-style marker, tell the model to narrow / +re-fetch. Lives in tool wrapping + agent-runner, not compaction. + +**Action: file as an upstream issue on `willdady/platypus`** (per the issue-tracker +skill). Marked here so it isn't lost; not yet filed. + +--- + ## Open / deferred decisions - **OpenAI-compatible as a separate provider type** — not required (auto-detect From f4649103b697ee77bc7eed8ea3806a75902da0d4 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Sun, 14 Jun 2026 09:25:21 +0200 Subject: [PATCH 17/21] =?UTF-8?q?fix(backend):=20chunk=2014=20option=20D?= =?UTF-8?q?=20=E2=80=94=20overflow-gate=20recent-trim=20+=20exempt=20newes?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 1 recent (kept) tool results are now trimmed only when the kept view would breach the hard window wall (inputBudget), not on a soft targetTokens (hysteresis) miss. A soft miss is left at full fidelity and re-compacts next turn. The single newest message is always exempt. - UICompactOptions.inputBudget threaded from applyTier1Compaction as max(0, budget.inputBudget - overheadTokens) (mirrors effectiveTarget). - keepRecentWithinWall applied on both over-target paths (Stage 2 + empty prefix); omitted budget falls back to always-trim (pre-option-D guard). Review fixes: - Re-gate the over-target warning to afterEstimate > inputBudget; the old target*2 heuristic fired on every healthy compaction under a low target ratio. Falls back to target*2 when no wall is supplied. - keepRecentWithinWall returns the recent estimate to avoid a double pass. Tests: 57 pass (option-D verbatim/trim/empty-prefix/no-warn cases). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/runs/compaction.test.ts | 94 ++++++++++++++++++++- apps/backend/src/runs/compaction.ts | 100 +++++++++++++++++++---- context-compaction-plan.md | 35 ++++++-- 3 files changed, 204 insertions(+), 25 deletions(-) diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index 467c752f..dbb794b4 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -377,7 +377,28 @@ describe("compactUIMessages (Tier 1)", () => { expect((toolPart?.output as string).length).toBeLessThan(12000); }); - it("warns when Stage 2 result still exceeds 2× targetTokens after pruning", async () => { + it("option D: keeps recent VERBATIM in the empty-prefix path when within inputBudget", async () => { + // Whole history fits within keepRecentMessages (2) → empty prefix, no model + // call. Over the soft target but under the wall → outlier must stay untouched. + const msgs = [ + uiTool("r1", "X".repeat(12000)), + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + inputBudget: 100000, // wall far above → no recent trim + }); + expect(res.usedModelCall).toBe(false); + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(toolPart?.output).toBe("X".repeat(12000)); // untouched + }); + + it("warns (no wall) when Stage 2 result still exceeds 2× targetTokens after pruning", async () => { const warn = vi.spyOn(logger, "warn").mockReturnValue(undefined); const msgs = [ uiText("p1", "user", "P".repeat(4000)), @@ -390,13 +411,82 @@ describe("compactUIMessages (Tier 1)", () => { ...baseOpts, summarize: noopSummarize, targetTokens: 50, // recent alone is ~4000 tokens → well over 2×50 + // no inputBudget → warn falls back to the 
target*2 heuristic }); expect(warn).toHaveBeenCalledWith( expect.objectContaining({ targetTokens: 50 }), - expect.stringContaining("recent messages exceed target"), + expect.stringContaining("recent messages exceed the window"), ); warn.mockRestore(); }); + + it("option D: does NOT warn on a soft-target miss when recent is under the wall", async () => { + const warn = vi.spyOn(logger, "warn").mockReturnValue(undefined); + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiText("r1", "user", "R".repeat(8000)), + uiText("r2", "assistant", "S".repeat(8000)), + ]; + await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 50, // way over target... + inputBudget: 100000, // ...but well under the hard wall → no warn + }); + expect(warn).not.toHaveBeenCalled(); + warn.mockRestore(); + }); + + it("option D: keeps recent tool results VERBATIM when within inputBudget", async () => { + // Over the soft target (300) so Stage 2 fires, but the kept view (summary + + // recent) stays under the hard wall → recent must NOT be trimmed. 
+ const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(12000)), // ~3000 tokens in recent + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + inputBudget: 100000, // wall far above the kept view → no recent trim + }); + expect(res.usedModelCall).toBe(true); + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(toolPart?.output).toBe("X".repeat(12000)); // untouched + }); + + it("option D: trims recent (except newest) when the kept view breaches inputBudget", async () => { + // Two big tool results in recent; the kept view breaches the wall → trim the + // older one, exempt the single newest message even though it is bulky. + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(12000)), // older recent → trimmed + uiTool("r2", "Y".repeat(12000)), // newest → exempt + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + inputBudget: 100, // wall well below the kept view → trim + }); + expect(res.usedModelCall).toBe(true); + const out = (i: number) => + ( + res.keptMessages[i].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined + )?.output; + expect((out(0) as string).length).toBeLessThan(12000); // r1 trimmed + expect(out(1)).toBe("Y".repeat(12000)); // r2 (newest) exempt + }); }); describe("compactModelMessages (Tier 2 / recovery)", () => { diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index 11d3bb6e..55586820 100644 --- a/apps/backend/src/runs/compaction.ts +++ 
b/apps/backend/src/runs/compaction.ts @@ -275,6 +275,18 @@ export type UICompactOptions = { /** Threshold for pruning tool results in kept (recent) messages after Stage 2. * Defaults to minPrunableChars * 5 when omitted. */ minRecentPrunableChars?: number; + /** + * The HARD window wall (Chunk 14 Task 1, option D): the kept view's tokens + * above which the call would actually overflow (already net of per-turn + * overhead by the caller). Recent (kept) tool results are trimmed ONLY when + * the kept view breaches this wall — a mere `targetTokens` (hysteresis) miss + * is cheap (it re-compacts next turn) and is not worth gutting active data the + * user is asking about. The single newest message is always exempt regardless. + * When omitted, recent results are always trimmed once over target (the + * pre-option-D behaviour) — safer than never trimming for callers that cannot + * supply the wall. + */ + inputBudget?: number; imageProvider?: ImageProvider; /** Existing durable summary to fold the new prefix into (incremental). */ priorSummary?: string | null; @@ -398,27 +410,69 @@ export async function compactUIMessages( }; } - // Past this point we are over target. Prune large tool results in the kept - // (recent) messages — these stay in the model view, so extreme outliers (e.g. - // large MCP tool dumps) bloat tokensAfter and prevent reaching targetTokens. - // Computed once and reused by every over-target return below (the empty-prefix - // / null-watermark bail and the Stage 2 summarize path) so a history that fits - // entirely within keepRecentMessages still gets its outliers trimmed. + // Past this point we are over target. Recent (kept) messages stay in the model + // view, so extreme outliers (e.g. large MCP tool dumps) bloat tokensAfter. + // Option D (Chunk 14 Task 1): trim them ONLY when the kept view would breach + // the hard window wall (`inputBudget`); a soft `targetTokens` miss is left at + // full fidelity and just re-compacts next turn (cheap). 
The newest message is + // always exempt — it is the data the current turn is actively about. const recentThreshold = opts.minRecentPrunableChars ?? opts.minPrunableChars * 5; - const prunedRecent = recent.map( - (m) => pruneUIMessage(m, recentThreshold).message, - ); + const pruneRecentExemptNewest = ( + msgs: PlatypusUIMessage[], + ): { messages: PlatypusUIMessage[]; changed: boolean } => { + let changed = false; + const messages = msgs.map((m, i) => { + if (i === msgs.length - 1) return m; // newest always exempt + const pruned = pruneUIMessage(m, recentThreshold); + if (pruned.changed) changed = true; + return pruned.message; + }); + return { messages, changed }; + }; + // Decides whether to keep `recent` verbatim or trim it (option D). Returns the + // kept messages and their token estimate (reused for `afterEstimate` so the + // recent set is never re-estimated). `fixedTokens` is the kept view's NON-recent + // part (pruned prefix and/or folded summary). When `inputBudget` is omitted the + // wall is unknown → always trim once over target (pre-option-D guard). + const keepRecentWithinWall = ( + fixedTokens: number, + recentMsgs: PlatypusUIMessage[], + ): { messages: PlatypusUIMessage[]; recentTokens: number } => { + const recentTokens = estimate(recentMsgs); + if ( + opts.inputBudget !== undefined && + fixedTokens + recentTokens <= opts.inputBudget + ) { + return { messages: recentMsgs, recentTokens }; // within wall — full fidelity + } + const trimmed = pruneRecentExemptNewest(recentMsgs); + // Nothing prunable (no tool outputs over threshold) → reuse the estimate. + return { + messages: trimmed.messages, + recentTokens: trimmed.changed ? estimate(trimmed.messages) : recentTokens, + }; + }; - const warnIfOverTarget = (afterEstimate: number) => { - if (afterEstimate > opts.targetTokens * 2) { + // Warn only when the kept view still breaches the HARD wall after trimming — + // i.e. 
recent genuinely couldn't be brought under the window (one oversized + // result; Task 3 ingestion-cap territory). Post-option-D a soft `targetTokens` + // miss is by design (recent kept verbatim below the wall), so it is NOT a + // warning. Falls back to the old `target * 2` heuristic when no wall is supplied. + const warnIfOverWall = (afterEstimate: number) => { + const over = + opts.inputBudget !== undefined + ? afterEstimate > opts.inputBudget + : afterEstimate > opts.targetTokens * 2; + if (over) { logger.warn( { afterEstimate, targetTokens: opts.targetTokens, + inputBudget: opts.inputBudget, keepRecentMessages: opts.keepRecentMessages, }, - "compaction fired but recent messages exceed target — keepRecentMessages may be locking in large tool results", + "compaction fired but recent messages exceed the window — a single oversized tool result may be uncompactable (see ingestion cap)", ); } }; @@ -432,9 +486,11 @@ export async function compactUIMessages( const watermarkId = prefix.length > 0 ? (prefix[prefix.length - 1].id ?? null) : null; if (prefix.length === 0 || watermarkId === null) { - const kept = [...prunedPrefix, ...prunedRecent]; - const afterEstimate = estimate(kept) + priorTokens; - warnIfOverTarget(afterEstimate); + const prunedPrefixTokens = estimate(prunedPrefix) + priorTokens; + const keptRecent = keepRecentWithinWall(prunedPrefixTokens, recent); + const kept = [...prunedPrefix, ...keptRecent.messages]; + const afterEstimate = prunedPrefixTokens + keptRecent.recentTokens; + warnIfOverWall(afterEstimate); return { keptMessages: kept, summaryText: opts.priorSummary ?? 
null, @@ -453,11 +509,13 @@ export async function compactUIMessages( opts.summarizerWindow, ); - const afterEstimate = estimate(prunedRecent) + textTokens(summaryText); - warnIfOverTarget(afterEstimate); + const summaryTokens = textTokens(summaryText); + const keptRecent = keepRecentWithinWall(summaryTokens, recent); + const afterEstimate = keptRecent.recentTokens + summaryTokens; + warnIfOverWall(afterEstimate); return { - keptMessages: prunedRecent, + keptMessages: keptRecent.messages, summaryText, watermarkId, messagesDropped: prefix.length, @@ -974,8 +1032,14 @@ export async function applyTier1Compaction( ); } + // The hard wall the kept view must fit under (option D), net of the per-turn + // overhead compaction cannot shrink — mirrors how effectiveTarget adjusts the + // soft target. Recent tool results are trimmed only when this is breached. + const effectiveInputBudget = Math.max(0, budget.inputBudget - overheadTokens); + const result = await compactUIMessages(afterWatermark, { targetTokens: effectiveTarget, + inputBudget: effectiveInputBudget, keepRecentMessages: config.keepRecentMessages, minPrunableChars: config.minPrunableChars, minRecentPrunableChars: config.minRecentPrunableChars, diff --git a/context-compaction-plan.md b/context-compaction-plan.md index 8532efa1..0b5f3cab 100644 --- a/context-compaction-plan.md +++ b/context-compaction-plan.md @@ -1458,18 +1458,43 @@ including the empty-prefix / null-watermark bail; warns when post-compaction estimate > `targetTokens * 2`; env override `COMPACTION_MIN_RECENT_PRUNABLE_CHARS`; summarizer output ceiling 2000 → 4000. -**Still to do — adopt "option D" (overflow-gate + exempt newest)** so we stop +**DONE 2026-06-14 — "option D" (overflow-gate + exempt newest) shipped** so we stop gutting active data for a _soft_-target miss: - Missing `targetTokens` (0.5) is cheap — it's a hysteresis goal; the call still succeeds as long as `recent < inputBudget`. 
The hard wall is `inputBudget` (window − output reserve − safety, from `computeBudget`). -- Gate recent-trim on `estimate(recent)+summary > inputBudget` (call would - actually overflow), NOT on the soft target. Below the wall, leave `recent` +- Recent-trim is now gated on `estimate(recent)+summary > inputBudget` (call would + actually overflow), NOT on the soft target. Below the wall, `recent` is left intact — full fidelity, just a missed hysteresis target that re-compacts next turn (cheap; empty-prefix path makes no summarizer call). -- Always exempt the single newest message regardless. -- Requires threading `inputBudget` into `UICompactOptions`. +- The single newest message is always exempt regardless (`pruneRecentExemptNewest`). +- Implementation: `UICompactOptions.inputBudget` added; `keepRecentWithinWall` + helper in `compactUIMessages` applied on both over-target return paths (Stage 2 + and the empty-prefix bail); `applyTier1Compaction` threads + `effectiveInputBudget = max(0, budget.inputBudget − overheadTokens)` (mirrors + `effectiveTarget`). When `inputBudget` is omitted (recovery/tests) it falls + back to always-trim (pre-option-D guard). +- **Caveat (not a full close): newest-exempt narrows the prior code's accidental + coverage of the single-oversized-result case.** Old code trimmed ALL recent incl. + the newest; option D keeps the newest verbatim. So if the NEWEST single message + alone exceeds the wall (e.g. a 40k mempalace dump as the last message), no tier + trims it — recovery also prunes prefix-only — and the turn hard-errors ("start a + new chat"). That is the Task 3 ingestion-cap gap, unsolved by any tier today; + option D does not introduce it but no longer accidentally masks it. The origin + event (4 big results) is fixed: option D trims 3, the newest stays, and the + result fits **as long as the newest < wall**.
+- **Review fixes (2026-06-14, post-implementation review):** + - The over-target warning was re-gated from `afterEstimate > targetTokens * 2` + to `afterEstimate > inputBudget` (`warnIfOverWall`). Post-option-D a soft + target miss is by design (recent kept verbatim below the wall), so the old + `target*2` warn fired on every healthy compaction under a low target ratio + (test box `0.1`). It now fires only when recent genuinely can't fit the window + (the Task 3 case). Falls back to `target*2` when no wall is supplied. + - `keepRecentWithinWall` now returns the recent token estimate so `afterEstimate` + never re-estimates the recent set (avoids the double char/4 pass). + - Tests: 57 pass (option-D Stage-2 verbatim + trim-except-newest; empty-prefix + verbatim; soft-miss-no-warn; the no-wall warn case renamed). - This is strictly better than the alternatives we weighed (B: exempt newest N — still over-trims under the test box's 0.1 target; A: raise threshold to 100k — reduces frequency only; C: revert — loses the pathological-dump guard). The test From 32b186e810347a665dfc194bcc5624810c784722 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Sun, 14 Jun 2026 22:23:47 +0200 Subject: [PATCH 18/21] =?UTF-8?q?feat(backend):=20chunk=2014=20task=202=20?= =?UTF-8?q?=E2=80=94=20Stage=200=20context=20editing=20(tool-result=20elis?= =?UTF-8?q?ion)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit editToolResults + elidedToolPlaceholder: pure view transform eliding OLD bulky tool-result bodies to a self-describing placeholder before the Tier 1 trigger, so a leaned view can skip summarization. Recency by tool-result count, newest message exempt, size-gated; no durable state (P1). Post-review hardening: grow-guard (never elide when placeholder >= output; no prompt inflation / negative reclaim) + idempotency guard, both decided up front into a Map so the rewrite runs only on real work and never copies on a no-op. 
3 config fields + 3 env overrides. +10 tests; runs/ suite 193 pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/runs/compaction.test.ts | 228 ++++++++++++++++++ apps/backend/src/runs/compaction.ts | 177 +++++++++++++- apps/backend/src/services/chat-execution.ts | 11 + context-compaction-plan.md | 254 +++++++++++++++++--- 4 files changed, 632 insertions(+), 38 deletions(-) diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index dbb794b4..dcb58781 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -9,6 +9,8 @@ import { commitWatermark, compactUIMessages, compactModelMessages, + editToolResults, + elidedToolPlaceholder, pickKeepBoundary, softTrim, type CompactionStore, @@ -1085,3 +1087,229 @@ describe("setCompactionDirty (§E recovery producer, drift T3)", () => { expect(store.state.summaryWatermark).toBe("m7"); }); }); + +// --- Chunk 14 Task 2: Stage 0 context editing --------------------------- + +/** Tool message with a named tool and arbitrary output. 
*/ +const toolMsg = ( + id: string, + name: string, + output: unknown, +): PlatypusUIMessage => + ({ + id, + role: "assistant", + parts: [ + { + type: `tool-${name}`, + toolCallId: `${id}-call`, + state: "output-available", + input: { q: "x" }, + output, + }, + ], + }) as unknown as PlatypusUIMessage; + +const bigOut = (n = 200) => "D".repeat(n); +const outputOf = (m: PlatypusUIMessage) => + (m.parts[0] as { output?: unknown }).output; + +describe("editToolResults (Stage 0 — context editing)", () => { + const opts = { keepRecentToolResults: 1, minEditableToolChars: 100 }; + + it("elides OLD bulky results past the keep-window; keeps recent + all text", () => { + const messages = [ + toolMsg("t1", "search", bigOut()), + uiText("u1", "user", "carry on"), + toolMsg("t2", "search", bigOut()), + toolMsg("t3", "search", bigOut()), + ]; + const res = editToolResults(messages, opts); + // 3 results, keep last 1 (t3) → t1, t2 are candidates and both bulky. + expect(res.resultsElided).toBe(2); + expect(outputOf(res.messages[0])).toBe( + elidedToolPlaceholder("search", 200), + ); + expect(outputOf(res.messages[2])).toBe( + elidedToolPlaceholder("search", 200), + ); + expect(outputOf(res.messages[3])).toBe(bigOut()); // t3 within keep-window + expect(res.messages[1]).toBe(messages[1]); // text untouched (same ref) + }); + + it("keeps results within keepRecentToolResults verbatim", () => { + const messages = [ + toolMsg("t1", "f", bigOut()), + toolMsg("t2", "f", bigOut()), + toolMsg("t3", "f", bigOut()), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 2, + minEditableToolChars: 100, + }); + expect(res.resultsElided).toBe(1); // only t1 + expect(outputOf(res.messages[0])).toBe(elidedToolPlaceholder("f", 200)); + expect(outputOf(res.messages[1])).toBe(bigOut()); + expect(outputOf(res.messages[2])).toBe(bigOut()); + }); + + it("exempts the newest message even with keepRecentToolResults=0", () => { + const messages = [ + toolMsg("t1", "f", bigOut()), + 
toolMsg("t2", "f", bigOut()), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 100, + }); + expect(res.resultsElided).toBe(1); // t1 only; t2 is the newest message + expect(outputOf(res.messages[0])).toBe(elidedToolPlaceholder("f", 200)); + expect(outputOf(res.messages[1])).toBe(bigOut()); + }); + + it("size gate: leaves results at/under minEditableToolChars untouched", () => { + const messages = [ + toolMsg("small", "f", bigOut(50)), // ≤ gate + toolMsg("big", "f", bigOut(200)), // > gate + uiText("u1", "user", "tail"), // newest, so both tools are candidates + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 100, + }); + expect(res.resultsElided).toBe(1); + expect(outputOf(res.messages[0])).toBe(bigOut(50)); // small kept + expect(outputOf(res.messages[1])).toBe(elidedToolPlaceholder("f", 200)); + }); + + it("pairing: keeps the tool-call part, swaps only the output body", () => { + const messages = [ + toolMsg("t1", "search", bigOut()), + uiText("u1", "user", "x"), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 100, + }); + const part = res.messages[0].parts[0] as Record<string, unknown>; + expect(part.type).toBe("tool-search"); + expect(part.toolCallId).toBe("t1-call"); + expect(part.input).toEqual({ q: "x" }); + expect(part.state).toBe("output-available"); + expect(part.output).toBe(elidedToolPlaceholder("search", 200)); + }); + + it("is deterministic/monotonic: feeding the edited view back elides nothing new", () => { + const messages = [ + toolMsg("t1", "f", bigOut()), + toolMsg("t2", "f", bigOut()), + uiText("u1", "user", "tail"), + ]; + const first = editToolResults(messages, opts); + expect(first.resultsElided).toBeGreaterThan(0); + const second = editToolResults(first.messages, opts); + expect(second.resultsElided).toBe(0); + expect(second.messages).toBe(first.messages); // stable ⇒ cache-friendly + }); + + it("grow-guard:
never elides when the placeholder would be longer than the output", () => { + // Tiny gate picks a result just over it, but shorter than the ~140-char + // placeholder ⇒ eliding would inflate the prompt. Must skip (no negative + // reclaim, no churn, no-op identity). + const shortOut = "D".repeat(30); // > gate 10, < placeholder length + const messages = [ + toolMsg("t1", "f", shortOut), + uiText("u1", "user", "tail"), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 10, + }); + expect(res.resultsElided).toBe(0); + expect(res.charsReclaimed).toBe(0); + expect(res.messages).toBe(messages); + }); + + it("no-op identity: returns the same array reference when nothing qualifies", () => { + const messages = [ + toolMsg("t1", "f", bigOut(50)), // under gate + uiText("u1", "user", "hi"), + ]; + const res = editToolResults(messages, opts); + expect(res.resultsElided).toBe(0); + expect(res.charsReclaimed).toBe(0); + expect(res.messages).toBe(messages); + }); +}); + +describe("applyTier1Compaction — Stage 0 avoids summarization (Task 2)", () => { + const hugeTool = (id: string) => toolMsg(id, "dump", "Z".repeat(8000)); + // High minPrunableChars so Stage 1 prefix-pruning does NOT rescue the no-edit + // case — it must reach Stage 2 (the model call) to make Stage 0's avoidance of + // it the real discriminator. + const editCfg = cfg({ + keepRecentToolResults: 1, + minEditableToolChars: 100, + keepRecentMessages: 2, + minPrunableChars: 100000, + }); + // Trigger sits between the post-edit size (~one big tool left) and the + // pre-edit size (~two big tools). 
+ const budget: Budget = { + inputBudget: 100000, + triggerTokens: 3000, + targetTokens: 1500, + }; + const state: CompactionState = { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }; + const messages = () => [ + hugeTool("bt1"), + hugeTool("bt2"), + uiText("r1", "user", "ok"), + uiText("r2", "assistant", "done"), + ]; + + it("elides the old dump, drops under trigger, skips the model call", async () => { + const summarize = vi.fn(async () => "SUMMARY"); + const out = await applyTier1Compaction({ + chatId: "c", + messages: messages(), + state, + budget, + config: editCfg, + imageProvider: "default", + summarize, + store: storeFromState({ version: 0 }), + }); + expect(summarize).not.toHaveBeenCalled(); + expect(out.compacted).toBe(false); + // Stage 0 still leaned the view: the old dump (bt1) is a placeholder, the + // recent dump (bt2, within keep) stays verbatim. + expect(outputOf(out.messages[0])).toBe(elidedToolPlaceholder("dump", 8000)); + expect(outputOf(out.messages[1])).toBe("Z".repeat(8000)); + }); + + it("without context editing the same chat triggers summarization", async () => { + const summarize = vi.fn(async () => "SUMMARY"); + const out = await applyTier1Compaction({ + chatId: "c", + messages: messages(), + state, + budget, + config: cfg({ + contextEditingEnabled: false, + keepRecentMessages: 2, + minPrunableChars: 100000, + }), + imageProvider: "default", + summarize, + store: storeFromState({ version: 0 }), + }); + expect(summarize).toHaveBeenCalledOnce(); + expect(out.compacted).toBe(true); + }); +}); diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index 55586820..af3d4cbc 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -241,6 +241,133 @@ function pruneUIMessage( : { message, changed }; } +/** + * Placeholder body for an elided tool result (Chunk 14 Task 2 — context editing). 
+ * LLM-AGNOSTIC: Platypus may run small/weak background models, so the string is + * EXPLICIT and self-describing. A terse marker ("[Old tool result content + * cleared]") assumes the model infers it can re-call the tool; a small model may + * not. Names the tool + elided size so the model can decide to re-run it, and is + * short enough that Stage 1 / option-D never re-trim it. + */ +const ELIDED_PLACEHOLDER_PREFIX = '[Tool result for "'; + +export function elidedToolPlaceholder(toolName: string, chars: number): string { + return `${ELIDED_PLACEHOLDER_PREFIX}${toolName}" omitted to save context (${chars} chars). The full result is still available — call the tool again with the same input if you need it.]`; +} + +export type EditToolResultsOptions = { + /** Exempt the last N tool results (most recent) from elision. */ + keepRecentToolResults: number; + /** Only elide a tool result whose serialized output exceeds this many chars. */ + minEditableToolChars: number; +}; + +export type EditToolResultsResult = { + messages: PlatypusUIMessage[]; + resultsElided: number; + /** Net chars removed (original output length − placeholder length), for metrics. */ + charsReclaimed: number; +}; + +/** + * Stage 0 (Chunk 14 Task 2 — context editing; Anthropic `clear_tool_uses` + * equivalent): replaces the `output` of OLD bulky tool-result parts with a short + * placeholder, keeping the tool part itself (pairing) and ALL text parts intact. + * Pure + deterministic — no model call, recomputed from raw messages each turn by + * recency, so it needs no durable state (P1: raw `chat.messages` is untouched, the + * full result stays for UI/audit). + * + * Recency is by COUNT of tool results (we have no clean turn id): the last + * `keepRecentToolResults` results are exempt, and the newest message is exempt + * regardless (same invariant as option D, Task 1). 
A result is elided only when + * its serialized `output` exceeds `minEditableToolChars` — the size gate ≈ + * Anthropic's `clear_at_least`, so trivial results never churn the prompt cache. + * + * Monotonic + deterministic ⇒ cache-friendly: a result is elided the turn it ages + * past the keep-window and stays elided. Returns the SAME array reference when + * nothing qualified, so callers can skip a re-estimate. + */ +export function editToolResults( + messages: PlatypusUIMessage[], + opts: EditToolResultsOptions, +): EditToolResultsResult { + // Enumerate every tool-result-bearing part in order so "keep the last N" is a + // simple tail slice. A single message can carry several tool parts. + const toolResultLocs: Array<{ mi: number; pi: number }> = []; + messages.forEach((m, mi) => { + (m.parts ?? []).forEach((part, pi) => { + const ap = part as { type: string; output?: unknown }; + const isTool = ap.type === "dynamic-tool" || ap.type.startsWith("tool-"); + if (isTool && ap.output !== undefined) toolResultLocs.push({ mi, pi }); + }); + }); + + // Candidates for elision = all but the last `keepRecentToolResults`; the newest + // MESSAGE is exempt regardless (decision 5 / option-D invariant). Decide the + // FULL elision policy here (recency + size gate + idempotency + grow-guard) and + // record the precomputed placeholder, so the rewrite map below fires only when + // there is real work — and never allocates a copy for a pure no-op. + const keepFrom = Math.max( + 0, + toolResultLocs.length - opts.keepRecentToolResults, + ); + const newestMessageIndex = messages.length - 1; + const elideAt = new Map(); // "mi:pi" -> placeholder + let charsReclaimed = 0; + for (let k = 0; k < keepFrom; k++) { + const loc = toolResultLocs[k]; + if (loc.mi === newestMessageIndex) continue; // newest message exempt + const ap = (messages[loc.mi].parts ?? [])[loc.pi] as { + type: string; + output?: unknown; + toolName?: string; + }; + const serialized = + typeof ap.output === "string" ? 
ap.output : JSON.stringify(ap.output); + // Size gate (≈ clear_at_least): leave trivial results untouched — no churn. + if (serialized.length <= opts.minEditableToolChars) continue; + // Idempotency guard: never re-elide our own placeholder. At the default gate + // (50k) the ~150-char placeholder is far below it, but a misconfigured tiny + // gate would otherwise re-elide it every turn. Keeps this monotonic. + if ( + typeof ap.output === "string" && + ap.output.startsWith(ELIDED_PLACEHOLDER_PREFIX) + ) { + continue; + } + const toolName = + ap.type === "dynamic-tool" + ? (ap.toolName ?? "unknown") + : ap.type.slice("tool-".length); + const placeholder = elidedToolPlaceholder(toolName, serialized.length); + // Grow-guard: a tiny gate could pick a result shorter than the placeholder; + // eliding would INFLATE the prompt (negative reclaim). Skip — never grow. + if (placeholder.length >= serialized.length) continue; + elideAt.set(`${loc.mi}:${loc.pi}`, placeholder); + charsReclaimed += serialized.length - placeholder.length; + } + + // Nothing truly qualified ⇒ return the original reference so callers skip the + // re-estimate (cache-friendly no-op) and we allocate no copy. + if (elideAt.size === 0) { + return { messages, resultsElided: 0, charsReclaimed: 0 }; + } + + const out = messages.map((m, mi) => { + const parts = m.parts ?? []; + if (!parts.some((_, pi) => elideAt.has(`${mi}:${pi}`))) return m; + const newParts = parts.map((part, pi) => { + const placeholder = elideAt.get(`${mi}:${pi}`); + if (placeholder === undefined) return part; + const ap = part as { output?: unknown }; + return { ...ap, output: placeholder }; + }); + return { ...m, parts: newParts } as PlatypusUIMessage; + }); + + return { messages: out, resultsElided: elideAt.size, charsReclaimed }; +} + /** Builds a readable transcript of UIMessages for the summarizer. 
*/ function renderUIMessages(messages: PlatypusUIMessage[]): string { return messages @@ -763,6 +890,14 @@ export type CompactionConfig = { * Stage 2 summarization. Higher than minPrunableChars — we trim extreme * outliers (e.g. huge MCP tool dumps) without destroying useful context. */ minRecentPrunableChars: number; + /** Stage 0 context editing (Chunk 14 Task 2): elide OLD bulky tool results to a + * placeholder before the trigger check, so a leaned view can avoid summarizing + * entirely. Gated alongside the COMPACTION_ENABLED kill switch. */ + contextEditingEnabled: boolean; + /** Stage 0: exempt the last N tool results from elision (recency, by count). */ + keepRecentToolResults: number; + /** Stage 0: only elide a tool result whose serialized output exceeds this. */ + minEditableToolChars: number; }; export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = { @@ -773,6 +908,12 @@ export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = { keepRecentMessages: 10, minPrunableChars: 2000, minRecentPrunableChars: 10000, + contextEditingEnabled: true, + keepRecentToolResults: 4, + // 50k chars ≈ 12.5k tokens — matches LibreChat's minPrunableToolChars, the only + // direct per-result char-gate analog. High enough to spare medium results (less + // cache churn) while still catching the ~160k-char mempalace dump. + minEditableToolChars: 50000, }; export type Budget = { @@ -975,15 +1116,45 @@ export async function applyTier1Compaction( const { afterWatermark, priorSummary } = viewAfterWatermark(messages, state); const priorSummaryTokens = priorSummary ? textTokens(priorSummary) : 0; + // Stage 0 — context editing (Chunk 14 Task 2): elide OLD bulky tool results to + // placeholders BEFORE the trigger projection, so a leaned view can drop under + // the trigger and skip summarization entirely. Pure/deterministic, no durable + // state (P1). Gated by the COMPACTION_ENABLED kill switch (recovery stays the + // net, P4) AND the per-feature `contextEditingEnabled`. 
Returns the same array + // reference when nothing qualified, so the no-op case re-estimates nothing. + // NB (plan decision 7): the elided placeholders also flow into the prefix that + // Stage 2 would summarize, so a summarized result keeps only its placeholder — + // an accepted fidelity trade-off (a 40k dump's head+tail is poor summary fodder + // and the raw stays in the DB). + const contextEditing = + config.compactionEnabled && config.contextEditingEnabled + ? editToolResults(afterWatermark, { + keepRecentToolResults: config.keepRecentToolResults, + minEditableToolChars: config.minEditableToolChars, + }) + : { messages: afterWatermark, resultsElided: 0, charsReclaimed: 0 }; + const editedView = contextEditing.messages; + if (contextEditing.resultsElided > 0) { + logger.info( + { + metric: "context_edited", + chatId: input.chatId, + resultsElided: contextEditing.resultsElided, + charsReclaimed: contextEditing.charsReclaimed, + }, + "context_edited", + ); + } + const inject = (summary: string | null, msgs: PlatypusUIMessage[]) => summary ? [summaryUIMessage(summary), ...msgs] : msgs; // The view that would be sent if we did nothing more this turn. - const baseView = inject(priorSummary, afterWatermark); + const baseView = inject(priorSummary, editedView); const overheadTokens = input.overheadTokens ?? 0; // RV9: compute the char/4 pass over the unsummarized view once and reuse it // for both the trigger projection and compactUIMessages' no-op gate. - const messageTokens = estimate(afterWatermark); + const messageTokens = estimate(editedView); const projected = projectTier1Tokens({ messageTokens, priorSummaryTokens, @@ -1037,7 +1208,7 @@ export async function applyTier1Compaction( // soft target. Recent tool results are trimmed only when this is breached. 
const effectiveInputBudget = Math.max(0, budget.inputBudget - overheadTokens); - const result = await compactUIMessages(afterWatermark, { + const result = await compactUIMessages(editedView, { targetTokens: effectiveTarget, inputBudget: effectiveInputBudget, keepRecentMessages: config.keepRecentMessages, diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 35e836d5..a23b7ed6 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -593,6 +593,17 @@ export async function buildCompactionRuntime(args: { config.minRecentPrunableChars = numEnv(process.env.COMPACTION_MIN_RECENT_PRUNABLE_CHARS) ?? config.minRecentPrunableChars; + // Stage 0 context editing (Chunk 14 Task 2). Disabled via + // COMPACTION_CONTEXT_EDITING_ENABLED=false; recency/size gates tunable. + if (process.env.COMPACTION_CONTEXT_EDITING_ENABLED === "false") { + config.contextEditingEnabled = false; + } + config.keepRecentToolResults = + numEnv(process.env.COMPACTION_KEEP_RECENT_TOOL_RESULTS) ?? + config.keepRecentToolResults; + config.minEditableToolChars = + numEnv(process.env.COMPACTION_MIN_EDITABLE_TOOL_CHARS) ?? + config.minEditableToolChars; // RV7d: resolve both windows concurrently (they are independent). const taskModelId = provider.taskModelId || resolvedModelId; diff --git a/context-compaction-plan.md b/context-compaction-plan.md index 0b5f3cab..f1435ccc 100644 --- a/context-compaction-plan.md +++ b/context-compaction-plan.md @@ -1501,41 +1501,225 @@ gutting active data for a _soft_-target miss: server's `COMPACTION_TARGET_RATIO=0.1` manufactures the worst case; prod 0.5 rarely trips it. 
-### Task 2 — context-editing-style prune of kept tool results (NEEDS REVIEW + bigger plan) - -The user's ideal model (decided 2026-06-14): in-stream, only compact near the -ceiling so the stream doesn't stop (Tier 2 already does this); **at stream end, -strip bulky tool results so the next turn doesn't start at 40k** (NEW); then -threshold-compact as chat continues (Tier 1 already does this). This is exactly -Anthropic's **context editing (`clear_tool_uses`)** — prune, don't summarize. - -Design notes captured this session (to expand before implementing): - -- The current "keep last N _messages_ verbatim" heuristic is backwards: in a - > 10-tool stream the keep-window fills with raw tool results while the small, - > high-value conversational text (question + answer) gets summarized into the - > prefix. A type-aware policy is better: **keep conversational text + the newest - > turn's results verbatim; compress older bulky results to placeholders.** -- Implement as a **model-view transform**, not destruction — full result stays in - the DB/UI (display/audit); only the lean version is _sent_ to the model each - subsequent turn. Platypus already separates stored messages from the model view - (watermark + summary reconstruction), so this fits. -- Keep tool call↔result structural validity: replace the result _content_ with a - short placeholder marker (`[mempalace result, 40k chars — elided after turn N]`), - don't drop the message (providers reject an orphaned call). Size-gate it — tiny - results (`"OK"`) gain nothing from a placeholder. -- The unavoidable tradeoff: eager stripping loses the immediate "based on those - results, do X" follow-up. Options: (a) strip immediately + agent re-calls if - needed; (b) placeholder-with-summary so the model can re-fetch intelligently - (recommended); (c) one-turn grace (keep the just-finished turn's results one - more turn). User's "next turn must not carry 40k" leans toward (b). 
-- Anthropic's `clear_tool_uses` is **threshold-triggered** with configurable - thresholds, not strictly "every stream end" — decide whether to mirror that or - go eager at each turn boundary. - -This is a redesign of the compaction unit (messages → type-aware policies keyed -off turn boundaries), bigger than Task 1, and overlaps Task 3. Review before -building. +### Task 2 — context editing: recency-based tool-result pruning, no summary (DONE 2026-06-14) + +**Shipped.** `editToolResults` + `elidedToolPlaceholder` in `compaction.ts`, wired +as Stage 0 in `applyTier1Compaction` (after `viewAfterWatermark`, before the +trigger projection); 3 new `CompactionConfig` fields + defaults +(`contextEditingEnabled=true`, `keepRecentToolResults=4`, +`minEditableToolChars=50000`); 3 env overrides in `buildCompactionRuntime` +(`COMPACTION_CONTEXT_EDITING_ENABLED`, `COMPACTION_KEEP_RECENT_TOOL_RESULTS`, +`COMPACTION_MIN_EDITABLE_TOOL_CHARS`). No schema / agent-runner / frontend change. +Gated by the `COMPACTION_ENABLED` kill switch AND `contextEditingEnabled`; metric +`context_edited` logged when `resultsElided > 0`. Implementation notes: + +- **`minEditableToolChars` defaulted to 50000, not the initial 10000** — 10k (≈2.5k + tokens) was well below every shipped tool; 50k matches LibreChat and still + catches the ~160k-char mempalace dump while sparing medium results (less churn). +- **Idempotency guard added** — `editToolResults` skips a result whose output + already starts with the placeholder prefix, so a misconfigured tiny gate cannot + re-elide its own placeholder (monotonic for any gate, not just the 50k default). +- **Grow-guard added (post-review 2026-06-14)** — skip when + `placeholder.length >= serialized.length`: a tiny gate could pick a result shorter than the +~140-char placeholder, where eliding would INFLATE the prompt (negative +`charsReclaimed`). 
Restructured so the full elision policy (recency + size + +idempotency + grow) is decided up front into a `Map<"mi:pi", placeholder>`; the + rewrite map runs only when real work exists and never allocates a copy on a pure + no-op. New test: grow-guard (placeholder longer than output ⇒ no-op identity). +- **Placeholder is explicit/self-describing** (LLM-agnostic — small background + models): `[Tool result for "<toolName>" omitted to save context (<chars> chars). The +full result is still available — call the tool again with the same input if you +need it.]`. Not a terse copied marker. +- **Plan decision 7 (accepted fidelity loss)**: elided placeholders also flow into + any prefix Stage 2 later summarizes — accepted (a huge dump's head+tail is poor + summary fodder; raw stays in DB). +- Tests: +10 Task-2 cases (elide-old / keep-within-window / newest-exempt / + size-gate / pairing-survives / determinism+monotonic / grow-guard / + no-op-identity / Stage-0-avoids-summarization / off-control); compaction suite + 67 pass. tsc: 0 new errors (260 pre-existing in unrelated test mocks); lint: 0 + new errors. + +#### Original spec (for reference) + +The user's ideal model (2026-06-14): in-stream, compact only near the ceiling so +the stream doesn't stop (Tier 2 ✅); at stream end strip bulky tool results so the +next turn doesn't start at 40k (NEW — this task); then threshold-compact as the +chat continues (Tier 1 ✅). This is Anthropic's **context editing +(`clear_tool_uses`)** — prune tool-result bodies, do NOT summarize. 
+ +#### Field research that fixed the design (2026-06-14, cited) + +Every shipped "prune-not-summarize" implementation converged on the **same +mechanics** — adopt them, don't reinvent: + +| Product | Trigger | What it prunes | Keep / protect | Placeholder | Pairing | +| ---------------------------------------- | --------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | ----------------------------------------- | +| **Anthropic `clear_tool_uses_20250919`** | `trigger`, default **100k input tokens** | tool **results** only (calls kept unless `clear_tool_inputs`) | `keep` = **3** most-recent tool uses; `exclude_tools`; `clear_at_least` (cache-break guard) | inserts a placeholder (exact string undocumented) | yes — keeps `tool_use`, swaps result body | +| **Hermes** | **0.50** agent + **0.85**/400-msg hygiene | tool results only; dedups, strips images | `protect_last_n=20`, `protect_first_n=3`; never-prune list | `"[Old tool output cleared to save context space]"` | yes | +| **Codex (remote)** | model window, oldest-first, stop when it fits | `FunctionCallOutput` / tool outputs — rewrites body | stops as soon as history fits | `"Output exceeded the available model context and was truncated"` | yes — keeps `call_id` | +| **Claude Code microcompact** | "when context grows long" | old tool results (prune) before auto-compact (summarize) | recent context + current file/task state | `[Old tool result content cleared]` (community-reported) | yes | +| **LibreChat `contextPruning`** | gated, off by default | tool-result content; soft (head+tail) → hard (clear) | `keepLastAssistants=3`; `minPrunableToolChars` **50000** | hard: `"[Old tool result content cleared]"` | content-only | + +Takeaways baked into the spec below: **prune-then-summarize staging** (we already 
+have it); **never delete the tool-call block — swap only the result body** (keeps +`call_id`/pairing valid, no orphan rejection); **keep a recent tail by COUNT of +tool results** (not by turn — we have no turn id); **size-gate** the edit (cache +guard ≈ `clear_at_least`); **visible placeholder** so the model knows to re-call. +NB: Anthropic's literal default placeholder string is **not documented** — do not +copy `"[cleared to save context]"` from the cookbook (it is demo code). + +#### Architectural decisions (converged) + +1. **A pure view transform, no new durable state.** Unlike summary/watermark + (which need CAS + a DB column because they are LLM-derived and must persist), + context editing is **deterministic and recomputed from raw messages each turn** + by recency. **No schema change, no CAS, no `version` bump.** Sibling in spirit + to `stripCompactionTraceParts` ([agent-runner.ts:165](apps/backend/src/runs/agent-runner.ts#L165)), + the existing "stored-but-altered-for-the-wire" transform. P1 holds trivially: + raw `chat.messages` is untouched; the full result stays for UI/audit. +2. **Runs as Stage 0 inside `applyTier1Compaction`, before the summarize trigger + decision** — NOT in agent-runner. Rationale: it is chat-only (durable history, + recency across the whole chat), and running it _before_ the trigger lets a lean + view AVOID summarization entirely (cheaper, the whole point). agent-runner's + strip stays for the synthetic trace (a different concern, all paths). +3. **Recency by COUNT of tool results, not turn-grouping.** Matches Anthropic + `keep` / Hermes `protect_last_n`. We have no clean turn id; do not invent one. +4. **Monotonic + deterministic ⇒ cache-friendly.** A result is elided the turn it + ages past the keep-window and stays elided. One cache break per result as it + scrolls out — far cheaper than re-summarizing. The size gate is our + `clear_at_least`: trivial results are never touched, so no churn. +5. 
**Newest message always exempt** — same invariant as option D (Task 1). +6. **Scope line vs Task 3 (explicit).** Task 2 handles **accumulation of older + bulky results**. A single result too large to afford _even as the newest_ is + **Task 3** (ingestion cap at storage time). They overlap by design: Task 2's + keep-window is the one-turn grace for "based on those results, do X"; Task 3 + caps the pathological single dump. Task 2 does **not** fully deliver "next turn + carries no 40k" for a 40k _newest_ result — that is Task 3. +7. **Accepted fidelity loss on the summarize path (2026-06-14).** Stage 0 runs on + `afterWatermark` _before_ the trigger; when the trigger still fires anyway, + `compactUIMessages` summarizes the **prefix** — which Stage 0 has already + reduced to `[… result omitted …]` placeholders for old bulky results. So the + summarizer never sees that result's content (vs Stage 1's head+tail soft-trim, + which leaves ~1000 chars of gist). **This is strictly more lossy than today on + the summarize path, and accepted:** a 40k MCP/JSON dump's head+tail is not + useful summary fodder either, the raw stays in the DB (P1) for UI/audit, and the + common case is Stage 0 dropping the view under the trigger so no summary fires + at all. If this ever bites a real case, the mitigation is to run Stage 0 only on + the kept/recent region and let the about-to-be-summarized prefix keep its + soft-trim — not done now. + +#### Implementation spec + +**New in `compaction.ts`:** + +```ts +// Placeholder body for an elided tool result. Names the tool + elided size so the +// model can decide to re-run it. Short enough that Stage 1 / option-D never re-trim it. +// LLM-AGNOSTIC: Platypus may run small/weak background models, so the string must be +// EXPLICIT and self-describing — do NOT copy a terse marker (LibreChat's +// "[Old tool result content cleared]") that assumes the model infers it can re-call. 
+// Canonical form: +// `[Tool result for "<toolName>" omitted to save context (<chars> chars). The full +// result is still available — call the tool again with the same input if you +// need it.]` +function elidedToolPlaceholder(toolName: string, chars: number): string; + +// Stage 0: replace the `output` of OLD bulky tool-result parts with a placeholder. +// Pure, deterministic, no model call. Keeps the tool part (pairing); text parts +// untouched. Returns { messages, resultsElided, charsReclaimed } (unchanged array +// identity when nothing qualified, so callers can skip a re-estimate). +export function editToolResults( + messages: PlatypusUIMessage[], + opts: { + keepRecentToolResults: number; // exempt the last N tool results (default 4) + minEditableToolChars: number; // only elide results larger than this (default 50000) + }, +): { + messages: PlatypusUIMessage[]; + resultsElided: number; + charsReclaimed: number; +}; +``` + +Policy inside `editToolResults`: + +- Walk messages; collect indices of tool-result-bearing parts (`dynamic-tool` / + `tool-*` with `output !== undefined`), in order. +- The last `keepRecentToolResults` of them are exempt (verbatim). The message at + the end of the array (newest) is exempt regardless (decision 5). +- For each remaining tool result whose serialized `output` length + `> minEditableToolChars`, replace `output` with `elidedToolPlaceholder(name, len)`. + Tool calls/inputs and all text parts are left intact. +- Operate on shallow copies (P1). Reuse the `pruneUIMessage` shape-walking style. + +**Wiring in `applyTier1Compaction`** (after `viewAfterWatermark`, before the +trigger projection): + +- `const edited = config.contextEditingEnabled ? editToolResults(afterWatermark, …) : { messages: afterWatermark, … }` +- Use `edited.messages` everywhere `afterWatermark` is used downstream (projection, + `compactUIMessages`, the injected view). 
+- Log `metric: "context_edited"` with `{ resultsElided, charsReclaimed }` when + `resultsElided > 0` (no per-turn noise when it's a no-op). +- This shrinks `messageTokens` before the trigger check ⇒ summarize fires less. + No behavior change when nothing qualifies (array identity preserved). +- Gated by the existing `COMPACTION_ENABLED` kill switch (P4: recovery still the + net) AND a per-feature `contextEditingEnabled`. + +**Config (`CompactionConfig` + `DEFAULT_COMPACTION_CONFIG` + env in +`buildCompactionRuntime`):** + +- `contextEditingEnabled: boolean` (default `true`) — env + `COMPACTION_CONTEXT_EDITING_ENABLED=false` to disable. +- `keepRecentToolResults: number` (default `4`) — env + `COMPACTION_KEEP_RECENT_TOOL_RESULTS`. +- `minEditableToolChars: number` (default `50000`) — env + `COMPACTION_MIN_EDITABLE_TOOL_CHARS`. Matches LibreChat's `minPrunableToolChars` + (50k chars ≈ 12.5k tokens), the only direct per-result char-gate analog in the + field survey. Deliberately higher than option-D's `minRecentPrunableChars` + (10k) — Stage 0 only elides genuinely huge dumps (the 40k-token ≈ 160k-char + mempalace case), sparing medium results to minimize cache churn. 10k was too + aggressive vs every shipped tool. + +**Touches to current implementation (yes — confirmed):** + +- `compaction.ts`: new `editToolResults` + placeholder helper; Stage 0 call in + `applyTier1Compaction`; 3 new `CompactionConfig` fields + defaults. +- `chat-execution.ts` `buildCompactionRuntime`: 3 env overrides (mirror the + existing `numEnv` / kill-switch pattern at the COMPACTION\_\* block). +- No schema change. No agent-runner change (the leaned view flows through the + existing reconstruction → `stripCompactionTraceParts` → `convertToModelMessages` + path unchanged). No frontend change (full result still stored/displayed). 
+ +**Out of scope (deferred, noted so we don't drift):** + +- Tier 2 / sub-agent context editing — Tier 2 already prunes its prefix + intra-turn; could adopt the same recency placeholder in `compactModelMessages` + later. Not now. +- Placeholder-WITH-summary (re-fetch hint that includes an LLM mini-summary of the + elided result) — needs a model call, belongs with Stage 2; the name+size + placeholder is enough for the model to re-call. Defer. +- A UI "tool result elided" timeline marker — log/metric only for now (a per-turn + UI entry would be noisy). + +**Tests (`compaction.test.ts`):** + +- elides an OLD bulky result beyond the keep-window; keeps recent + all text. +- result within `keepRecentToolResults` kept verbatim. +- newest message exempt even when bulky. +- result ≤ `minEditableToolChars` untouched (size gate). +- pairing: the tool-call part survives; only `output` changes. +- determinism/monotonicity: feeding the edited view back elides nothing new + (stable prefix ⇒ cache-friendly). +- integration: a chat that was over the summarize trigger drops under it after + Stage 0 ⇒ `usedModelCall === false` (summarization avoided). +- no-op identity: nothing qualifies ⇒ returns the same array reference. + +#### Open question for the user before coding + +- **RESOLVED 2026-06-14.** `keepRecentToolResults` = **4**, `minEditableToolChars` + = **50000** (raised from the initial 10k proposal — see config note above; 10k + was well below every shipped tool, 50k matches LibreChat and still catches the + mempalace dump). Both env-overridable so prod can tune without a deploy. 
### Task 3 — ingestion cap for oversized MCP / sub-agent results (UPSTREAM ISSUE — file so we don't forget) From 724d87403ccac00dd248b04bea487f0d407461f4 Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Mon, 15 Jun 2026 00:01:12 +0200 Subject: [PATCH 19/21] fix(backend): chunk 14 compaction hardening (A1-A6, B-F1-F8) Defensive fixes to context compaction surfaced by chunk 14 review: - A1: projectTier1Tokens treats lastInputTokens<=0 as no-baseline (usage-less gateways persist contextTokens=0); findLastInputTokens skips trace messages and zero-count turns symmetrically. - A2: no-dimension image token fallback uses pessimistic per-provider ceilings (Anthropic 1600 / OpenAI-high 2000) instead of flat 1200. - A6: cap output reserve at half the window so an input-scoped contextWindow minus a large max_output_tokens cannot collapse inputBudget and thrash. - B-F1/C2: range-validate compaction env overrides (warn + default on out-of-range); restore target --- apps/backend/src/runs/compaction.test.ts | 20 +++ apps/backend/src/runs/compaction.ts | 45 +++++- apps/backend/src/runs/token-estimate.test.ts | 20 ++- apps/backend/src/runs/token-estimate.ts | 18 +++ apps/backend/src/services/chat-execution.ts | 160 ++++++++++++++++--- apps/backend/src/tools/sub-agent.ts | 35 +++- context-compaction-plan.md | 61 ++++--- 7 files changed, 297 insertions(+), 62 deletions(-) diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index dcb58781..adc28692 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -659,6 +659,14 @@ describe("computeBudget (drift C3 — subtract both reserves)", () => { const b = computeBudget(10000, undefined, cfg({ reserveRatio: 0.05 })); expect(b.inputBudget).toBe(7000); // 10000 - min(4096, 2500) - 500 }); + + it("caps the output reserve at half the window so inputBudget can't collapse (A6)", () => { + // A bogus registry entry where max_output >= 
the input-scoped window would + // otherwise drive inputBudget toward 1 and thrash. The cap keeps it sane. + const b = computeBudget(10000, 20000, cfg({ reserveRatio: 0.05 })); + // reserve capped at 5000 (half), safety 500 → 10000 - 5000 - 500 = 4500. + expect(b.inputBudget).toBe(4500); + }); }); const bigText = (id: string, role: "user" | "assistant") => @@ -1001,6 +1009,18 @@ describe("projectTier1Tokens (drift C1/M2)", () => { }), ).toBe(100); }); + + it("treats a 0 provider count as no baseline and keeps the margin (A1)", () => { + // Usage-less providers persist contextTokens=0; a bare `== null` check would + // skip the margin AND no-op the max(), leaving the raw char/4 with no buffer. + expect( + projectTier1Tokens({ + messageTokens: 100, + priorSummaryTokens: 0, + lastInputTokens: 0, + }), + ).toBe(Math.ceil(100 * COLD_START_MARGIN)); + }); }); describe("applyTier1Compaction — overhead in the trigger (C1)", () => { diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index af3d4cbc..12e01a21 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -932,8 +932,17 @@ export function computeBudget( maxOutputTokens: number | undefined, config: CompactionConfig, ): Budget { - const maxOutputReserve = + const rawOutputReserve = maxOutputTokens ?? Math.min(4096, Math.floor(contextWindow * 0.25)); + // Cap the output reservation at half the window (A6). litellm's + // `max_input_tokens` (which feeds `contextWindow`) is already input-scoped for + // some providers, so subtracting a large `max_output_tokens` again can collapse + // `inputBudget` toward 1 — making trigger/target ≈ 0 and thrashing. Capping + // keeps the (otherwise-safe) over-reservation from degenerating. 
+ const maxOutputReserve = Math.min( + rawOutputReserve, + Math.floor(contextWindow * 0.5), + ); const safetyReserve = Math.floor(config.reserveRatio * contextWindow); const inputBudget = Math.max( 1, @@ -972,7 +981,13 @@ export function projectTier1Tokens(args: { }): number { const charBased = args.messageTokens + args.priorSummaryTokens + (args.overheadTokens ?? 0); - if (args.lastInputTokens == null) { + // Treat a non-positive count as "no baseline" (A1): some OpenAI-compatible / + // vLLM gateways omit `usage.inputTokens`, which we persist as + // `contextTokens = 0`. A bare `== null` check would let that 0 slip through — + // skipping the cold-start margin AND no-op-ing the `max()` below — leaving the + // raw char/4 projection with no safety buffer on EVERY turn for those + // providers. Falling back to the margin keeps the conservative over-count. + if (args.lastInputTokens == null || args.lastInputTokens <= 0) { return Math.ceil(charBased * COLD_START_MARGIN); } // Two independent estimates of this turn's payload: `charBased` is a fresh @@ -1232,6 +1247,13 @@ export async function applyTier1Compaction( // recompute (R4 — the wasted summarize is bounded, never corrupting). The // version-pinning gate is shared so both write paths decide identically. const capturedVersion = state.version; + // On a version mismatch we skip as "covered" WITHOUT clearing dirty. Plan T10 + // says "winner advanced → SKIP + clear-dirty", but that is only safe when the + // winner actually compacted. A concurrent invalidateCompaction also advances + // the version yet leaves dirty set on purpose (it resets the summary, it does + // not shrink history) — clearing dirty here would then drop the forced + // compaction the overflow demanded. Leaving dirty set is strictly safe: worst + // case is one extra compaction next turn. (Intentional deviation from T10.) 
const pinnedWrite = (patch: WatermarkPatch) => commitWatermark(input.store, input.chatId, (latest) => latest.version === capturedVersion @@ -1241,6 +1263,15 @@ export async function applyTier1Compaction( let commit: CommitResult | undefined; if (result.usedModelCall) { + // Same-basis before/after for the user-visible reduction (B-F7): both are + // char/4 message estimates plus the per-turn overhead. The trigger + // `projected` mixes in the provider's `lastInputTokens` floor and is NOT + // comparable to the message-only post estimate, so reporting it as "before" + // overstated the drop. Computed only on the model-call path (the only place + // these are reported). + const tokensBefore = messageTokens + priorSummaryTokens + overheadTokens; + const tokensAfter = result.estimatedTokens + overheadTokens; + commit = await pinnedWrite({ summary: result.summaryText, watermark: result.watermarkId, @@ -1251,8 +1282,10 @@ export async function applyTier1Compaction( metric: "compaction.fired", tier: 1, chatId: input.chatId, - tokensBefore: projected, - tokensAfter: result.estimatedTokens, + tokensBefore, + tokensAfter, + // Keep the raw trigger projection for correlation with compaction.check. + projected, messagesDropped: result.messagesDropped, }, "compaction.fired", @@ -1260,8 +1293,8 @@ export async function applyTier1Compaction( input.onEvent?.({ type: "context-compacted", messagesDropped: result.messagesDropped, - tokensBefore: projected, - tokensAfter: result.estimatedTokens, + tokensBefore, + tokensAfter, }); } else if (state.compactionDirty) { // Forced by recovery but pruning/within-target sufficed: just clear the flag. 
diff --git a/apps/backend/src/runs/token-estimate.test.ts b/apps/backend/src/runs/token-estimate.test.ts index ebac3e74..d30c6551 100644 --- a/apps/backend/src/runs/token-estimate.test.ts +++ b/apps/backend/src/runs/token-estimate.test.ts @@ -105,11 +105,25 @@ describe("modality table (drift T2 — never char/4 an image)", () => { expect(estimateTokens(noDims)).toBe(85); }); - it("missing dimensions fall to the conservative default", () => { - const units: CountUnit[] = [ + it("missing dimensions use a pessimistic per-provider ceiling (A2)", () => { + // Providers with a real per-image cost would be UNDER-counted by the flat + // 1200 default when bytes/dims are unavailable (hosted URL), so they fall to + // a pessimistic ceiling near each provider's post-resize max instead. + const anthropic: CountUnit[] = [ { role: "user", text: "", nonText: [{ provider: "anthropic" }] }, ]; - expect(estimateTokens(units)).toBe(DEFAULT_NONTEXT_TOKENS); + expect(estimateTokens(anthropic)).toBe(1600); + + const openaiHigh: CountUnit[] = [ + { role: "user", text: "", nonText: [{ provider: "openai" }] }, + ]; + expect(estimateTokens(openaiHigh)).toBe(2000); + + // The unknown ("default") provider keeps the conservative flat default. + const unknown: CountUnit[] = [ + { role: "user", text: "", nonText: [{ provider: "default" }] }, + ]; + expect(estimateTokens(unknown)).toBe(DEFAULT_NONTEXT_TOKENS); }); it("unknown provider falls to the conservative default", () => { diff --git a/apps/backend/src/runs/token-estimate.ts b/apps/backend/src/runs/token-estimate.ts index 0913cd87..74eed36b 100644 --- a/apps/backend/src/runs/token-estimate.ts +++ b/apps/backend/src/runs/token-estimate.ts @@ -47,6 +47,21 @@ export const DEFAULT_NONTEXT_TOKENS = 1200; /** OpenAI's flat cost for a `detail: "low"` image, independent of size. */ const OPENAI_LOW_DETAIL_TOKENS = 85; +/** + * No-dimension fallbacks for providers with a real per-image cost (A2). 
When the + * bytes are absent (hosted http(s) URL — and note `inlineFileUrls` turns every + * stored attachment into one) or the header can't be parsed, we have no pixels + * to plug into the formula. The flat {@link DEFAULT_NONTEXT_TOKENS} (1200) + * under-counts a large image on these providers, defeating "over-count beats + * overflow" exactly where it matters. Use a pessimistic value near each + * provider's effective per-image ceiling after its own resize: + * - Anthropic resizes to ≤1.15 MP ⇒ ~1600 tokens max. + * - OpenAI high-detail tiling tops out a few thousand; 2000 is a safe ceiling + * for the common ≤2048² case. + */ +const ANTHROPIC_NO_DIMS_TOKENS = 1600; +const OPENAI_HIGH_NO_DIMS_TOKENS = 2000; + /** * The provider families with a known image-cost formula. Everything else maps * to `"default"` and pays the conservative flat cost. @@ -96,9 +111,12 @@ function nonTextTokens(part: NonTextPart): number { if (width == null || height == null) { // Dimensions unknown. OpenAI low-detail has a flat cost even without dims; + // providers with a real per-image cost get a pessimistic ceiling (A2); // everything else falls to the conservative default. 
if (provider === "openai" && detail === "low") return OPENAI_LOW_DETAIL_TOKENS; + if (provider === "anthropic") return ANTHROPIC_NO_DIMS_TOKENS; + if (provider === "openai") return OPENAI_HIGH_NO_DIMS_TOKENS; return DEFAULT_NONTEXT_TOKENS; } diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index fc38d56a..8eb387d1 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -3,7 +3,7 @@ import { type MCPClient, } from "@ai-sdk/mcp"; import { openProvider } from "./provider.ts"; -import { and, eq, or, inArray } from "drizzle-orm"; +import { and, eq, or, inArray, sql } from "drizzle-orm"; import { db } from "../index.ts"; import { agent as agentTable, @@ -522,6 +522,28 @@ export async function loadChatMessages( return (rows[0]?.messages as PlatypusUIMessage[] | null) ?? []; } +/** + * Newest-first scan for the last assistant message carrying a POSITIVE + * provider-reported `contextTokens` (the §H stat). Skips two messages that would + * otherwise shadow the real baseline: + * - the §J standalone trace message (assistant role, no `metadata.stats`) — C2; + * - a turn from a usage-less provider stamped `contextTokens = 0` — A1. + * Either would make the Tier 1 projection drop the corrective baseline (and, for + * the 0 case, the cold-start margin too). 
+ */ +function findLastInputTokens( + messages: PlatypusUIMessage[], +): number | undefined { + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i].role !== "assistant") continue; + const ct = ( + messages[i].metadata as { stats?: { contextTokens?: number } } | undefined + )?.stats?.contextTokens; + if (typeof ct === "number" && ct > 0) return ct; + } + return undefined; +} + /** * Everything the compaction machinery needs that is resolved once per turn: * the budget (from the resolved context window), the effective config, the @@ -582,36 +604,103 @@ export async function buildCompactionRuntime(args: { // unchanged. Intended for tuning the trigger on test deployments without a // code change. Keep targetRatio < triggerRatio or compaction re-fires every // turn (the thrash trap). - const numEnv = (raw: string | undefined): number | undefined => { + // Reads + RANGE-VALIDATES a numeric env override (A3/B-F1). An out-of-range or + // non-finite value is rejected (warn + fall back to the default) rather than + // silently applied: the old `Number.isFinite`-only check let `0` and negatives + // through, so `COMPACTION_KEEP_RECENT=0` summarized the current message away + // and `COMPACTION_TRIGGER_RATIO=0` fired on empty chats. + const numEnv = ( + name: string, + raw: string | undefined, + opts: { min?: number; max?: number; integer?: boolean } = {}, + ): number | undefined => { if (raw == null || raw === "") return undefined; - const n = Number(raw); - return Number.isFinite(n) ? 
n : undefined; + let n = Number(raw); + let invalid = !Number.isFinite(n); + if (!invalid && opts.integer) n = Math.floor(n); + if (!invalid && opts.min !== undefined && n < opts.min) invalid = true; + if (!invalid && opts.max !== undefined && n > opts.max) invalid = true; + if (invalid) { + logger.warn( + { env: name, raw, ...opts }, + "ignoring out-of-range compaction env override; using default", + ); + return undefined; + } + return n; }; + const RATIO = { min: 0.01, max: 1 }; config.triggerRatio = - numEnv(process.env.COMPACTION_TRIGGER_RATIO) ?? config.triggerRatio; + numEnv( + "COMPACTION_TRIGGER_RATIO", + process.env.COMPACTION_TRIGGER_RATIO, + RATIO, + ) ?? config.triggerRatio; config.targetRatio = - numEnv(process.env.COMPACTION_TARGET_RATIO) ?? config.targetRatio; + numEnv( + "COMPACTION_TARGET_RATIO", + process.env.COMPACTION_TARGET_RATIO, + RATIO, + ) ?? config.targetRatio; config.reserveRatio = - numEnv(process.env.COMPACTION_RESERVE_RATIO) ?? config.reserveRatio; + numEnv("COMPACTION_RESERVE_RATIO", process.env.COMPACTION_RESERVE_RATIO, { + min: 0, + max: 0.9, + }) ?? config.reserveRatio; config.keepRecentMessages = - numEnv(process.env.COMPACTION_KEEP_RECENT) ?? config.keepRecentMessages; + numEnv("COMPACTION_KEEP_RECENT", process.env.COMPACTION_KEEP_RECENT, { + min: 1, + integer: true, + }) ?? config.keepRecentMessages; config.minPrunableChars = - numEnv(process.env.COMPACTION_MIN_PRUNABLE_CHARS) ?? - config.minPrunableChars; + numEnv( + "COMPACTION_MIN_PRUNABLE_CHARS", + process.env.COMPACTION_MIN_PRUNABLE_CHARS, + { + min: 1, + integer: true, + }, + ) ?? config.minPrunableChars; config.minRecentPrunableChars = - numEnv(process.env.COMPACTION_MIN_RECENT_PRUNABLE_CHARS) ?? - config.minRecentPrunableChars; + numEnv( + "COMPACTION_MIN_RECENT_PRUNABLE_CHARS", + process.env.COMPACTION_MIN_RECENT_PRUNABLE_CHARS, + { min: 1, integer: true }, + ) ?? config.minRecentPrunableChars; // Stage 0 context editing (Chunk 14 Task 2). 
Disabled via // COMPACTION_CONTEXT_EDITING_ENABLED=false; recency/size gates tunable. if (process.env.COMPACTION_CONTEXT_EDITING_ENABLED === "false") { config.contextEditingEnabled = false; } config.keepRecentToolResults = - numEnv(process.env.COMPACTION_KEEP_RECENT_TOOL_RESULTS) ?? - config.keepRecentToolResults; + numEnv( + "COMPACTION_KEEP_RECENT_TOOL_RESULTS", + process.env.COMPACTION_KEEP_RECENT_TOOL_RESULTS, + { min: 0, integer: true }, + ) ?? config.keepRecentToolResults; config.minEditableToolChars = - numEnv(process.env.COMPACTION_MIN_EDITABLE_TOOL_CHARS) ?? - config.minEditableToolChars; + numEnv( + "COMPACTION_MIN_EDITABLE_TOOL_CHARS", + process.env.COMPACTION_MIN_EDITABLE_TOOL_CHARS, + { min: 1, integer: true }, + ) ?? config.minEditableToolChars; + + // Hysteresis backstop (C2 / B-F1): target must stay below trigger or + // compaction re-fires every turn. The chunk-9 runtime clamp was lost when + // chunk-12 deleted resolveCompactionConfig; restore it here so an operator who + // sets COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO still runs safely. + if (config.targetRatio >= config.triggerRatio) { + const clamped = config.triggerRatio * 0.9; + logger.warn( + { + targetRatio: config.targetRatio, + triggerRatio: config.triggerRatio, + clamped, + }, + "COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO; clamping target to triggerRatio*0.9 (C2 hysteresis)", + ); + config.targetRatio = clamped; + } // RV7d: resolve both windows concurrently (they are independent). const taskModelId = provider.taskModelId || resolvedModelId; @@ -986,15 +1075,10 @@ export const prepareChatTurn = async ( // Pre-overwrite baseline threaded from agent-runner (RV1). priorMessages: input.priorMessages, overheadTokens, - // Prior turn's provider-reported input token count (C1 / §H): the last - // assistant message carries metadata.stats.contextTokens (stamped by - // applyMessageStats) — the corrective baseline for the Tier 1 trigger - // projection on turns ≥ 2. 
Absent on turn 1 → cold-start margin applies. - lastInputTokens: ( - messages.findLast((m) => m.role === "assistant")?.metadata as - | { stats?: { contextTokens?: number } } - | undefined - )?.stats?.contextTokens, + // Prior turn's provider-reported input token count (C1 / §H): the + // corrective baseline for the Tier 1 trigger projection on turns ≥ 2. + // Absent on turn 1 → cold-start margin applies. + lastInputTokens: findLastInputTokens(messages), }) : { messages: inlinedMessages }; const compactedMessages = tier1Result.messages; @@ -1429,6 +1513,10 @@ const loadSubAgents = async ( string, import("ai").PrepareStepFunction >(); + // Per-sub-agent overflow recovery (C1/B-F3). Built ALWAYS — recovery (P4) is + // the net even when the §G kill switch disables proactive compaction, exactly + // as on the main path. Tier 2 (below) is the only part gated by the switch. + const subAgentRecoveries = new Map(); await Promise.all( subAgentRecords.map(async (sa) => { try { @@ -1441,6 +1529,17 @@ const loadSubAgents = async ( resolvedModelId: sa.modelId, opened: resolved.opened, }); + // Recovery net first (not gated by compactionEnabled). No markDirty — + // sub-agents have no durable chat row to flag. 
+ subAgentRecoveries.set(sa.id, { + chatId: sa.id, + imageProvider: runtime.imageProvider, + targetTokens: Math.max(0, runtime.budget.targetTokens), + keepRecentMessages: runtime.config.keepRecentMessages, + minPrunableChars: runtime.config.minPrunableChars, + summarize: runtime.summarize, + summarizerWindow: runtime.summarizerWindow, + }); if (!runtime.config.compactionEnabled) return; const tier2: Tier2Context = { triggerTokens: Math.max(0, runtime.budget.triggerTokens), @@ -1486,6 +1585,7 @@ const loadSubAgents = async ( }, onProgress, (id) => subAgentPrepareSteps.get(id), + (id) => subAgentRecoveries.get(id), ); return { subAgents, subAgentTools, subAgentMcpClients }; @@ -1610,9 +1710,17 @@ export async function forceCompactChat( result.compactionTrace, createIdGenerator({ prefix: "msg", size: 16 })(), ); + // Atomic jsonb append (A4/B-F8): concatenate at the DB rather than overwrite + // the whole column from the in-memory `messages` snapshot loaded earlier. + // The route guards with runRegistry.has(chatId), but a run that registers in + // the has()→write window — or a second concurrent POST /compact — would + // otherwise be clobbered by this stale array. `||` appends to whatever is + // stored now, so no concurrently-written messages are lost. 
await db .update(chatTable) - .set({ messages: [...messages, traceMessage] }) + .set({ + messages: sql`coalesce(${chatTable.messages}, '[]'::jsonb) || ${JSON.stringify([traceMessage])}::jsonb`, + }) .where( and(eq(chatTable.id, chatId), eq(chatTable.workspaceId, workspaceId)), ); diff --git a/apps/backend/src/tools/sub-agent.ts b/apps/backend/src/tools/sub-agent.ts index bf4b6f71..2e1a5a21 100644 --- a/apps/backend/src/tools/sub-agent.ts +++ b/apps/backend/src/tools/sub-agent.ts @@ -2,12 +2,17 @@ import { stepCountIs, tool, ToolLoopAgent, + wrapLanguageModel, type LanguageModel, type PrepareStepFunction, type Tool, } from "ai"; import { z } from "zod"; import { logger } from "../logger.ts"; +import { + contextOverflowRecoveryMiddleware, + type RecoveryContext, +} from "../runs/recovery.ts"; /** * Single source of truth for the sub-agent delegation tool name. @@ -52,6 +57,15 @@ interface SubAgentToolOptions { onProgress?: () => void; /** Tier 2 in-turn compaction callback (§D, drift M3). Null when compaction disabled. */ prepareStep?: PrepareStepFunction; + /** + * Context-overflow recovery (§E, P4) for the sub-agent's own model calls. + * Sub-agents run a ToolLoopAgent OUTSIDE the parent run's recovery-wrapped + * model, so without this their only overflow protection is Tier 2 — which + * fires late (its trigger omits the sub-agent's tool/prompt overhead) and has + * no net behind it. Wrapping here gives every sub-agent step one trim+retry, + * matching the main path (C1/B-F3). `markDirty` is omitted (no chat row). + */ + recovery?: RecoveryContext; } /** @@ -72,12 +86,29 @@ export const createSubAgentTool = (options: SubAgentToolOptions) => { maxSteps = 50, onProgress, prepareStep, + recovery, } = options; const toolName = subAgentToolName({ name }); + // Wrap the sub-agent model with the overflow-recovery middleware (C1/B-F3) so + // a step that overflows gets one trim+retry instead of hard-failing the task. 
+ // Guard on `typeof model !== "string"`: `wrapLanguageModel` needs a model + // INSTANCE, and `LanguageModel` permits a bare string id. The factory returns + // an instance today, but a string would otherwise throw here and the catch in + // `createSubAgentTools` would silently drop the whole sub-agent — so degrade to + // the unwrapped model instead. The remaining cast only reconciles the + // V2/V3 instance union (wrapLanguageModel accepts both at runtime). + const recoveredModel: LanguageModel = + recovery && typeof model !== "string" + ? wrapLanguageModel({ + model: model as Parameters[0]["model"], + middleware: contextOverflowRecoveryMiddleware(recovery), + }) + : model; + const agent = new ToolLoopAgent({ - model, + model: recoveredModel, instructions: systemPrompt || `You are a specialized sub-agent named "${name}". Complete the task you are given thoroughly and accurately.`, @@ -198,6 +229,7 @@ export const createSubAgentTools = async ( ) => Promise>, onProgress?: () => void, prepareStepFn?: (id: string) => PrepareStepFunction | undefined, + recoveryFn?: (id: string) => RecoveryContext | undefined, ): Promise> => { const tools: Record = {}; @@ -223,6 +255,7 @@ export const createSubAgentTools = async ( maxSteps: subAgent.maxSteps || 50, onProgress, prepareStep: prepareStepFn?.(subAgent.id), + recovery: recoveryFn?.(subAgent.id), }); tools[toolName] = tool; diff --git a/context-compaction-plan.md b/context-compaction-plan.md index f1435ccc..218acfed 100644 --- a/context-compaction-plan.md +++ b/context-compaction-plan.md @@ -1304,6 +1304,15 @@ no context in normal operation (healthy output is ~10× below a 2k ceiling). ### Fixes (in priority order) +> **Status (B-F2):** Fix 1 (heartbeat), Fix 2 (`maxOutputTokens` ceiling + prompt), +> and Fix 4 (`abortSignal`) are **SHIPPED** in `buildCompactionRuntime`. Fix 3 +> (open the response stream before compaction) is **NOT done** — deferred as the +> bigger refactor. 
The trace therefore still renders "Completed" (post-hoc +> `prependCompactionChunks`), not live Pending→Running. The HTTP/tunnel-drop +> vector during an in-`prepareChatTurn` summarize is largely neutralized by +> run/connection decoupling (issue #113: runs persist server-side regardless of +> the socket), so this is a UX/liveness gap, not data loss. + 1. **Heartbeat during summarize (CRITICAL).** Compaction is legitimate long work, not a stall. Ping `onActivity` / bump the per-step timer on an interval while `summarize` runs so the 120 s watchdog keeps resetting. Directly stops the @@ -1802,32 +1811,32 @@ to re-verify once the code exists.** Round trajectory: R1 design holes → R2 second-order effects → R3 a third-order race → R4 zero correctness findings (one telemetry-gated note). This is the anti-regression list — check it at PR time. -| ID | Issue | Resolution | ✅ Verify in code | -| ------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| **C1** | Trigger only counted last response, not what this turn adds | `projected = lastInputTokens + estimate(newMsgs)` | Trigger sums unsummarized new messages, not just last `usage` | -| **C2** | Compacting to the trigger ratio re-fires next turn (Cline #5616 thrash) | Hysteresis: target 0.5 ≠ trigger 0.8 | Post-compaction output `<= targetTokens`; a follow-up turn does not re-compact | -| **C3** | Raw window ratio ignored output + safety headroom | `inputBudget = window − maxOutputReserve − safetyReserve` | Budget subtracts both reserves before ratios | -| **C4** | Edit/delete/regenerate below watermark → stale summary | Invalidate via `writeWatermark`: version bump + clear summary 
+ reset watermark | Every edit/delete/regenerate handler calls `writeWatermark`; forking below watermark resets on new branch | -| **M1** | Cold-start on huge imported history exceeds summarizer's own window | Chunked / map-reduce summarize | Prefix larger than summarizer window is chunked, not sent whole | -| **M2** | First-turn char/4 underestimate | `× 1.15` margin + recovery net | First-turn projection applies the margin | -| **M3** | "Both tiers apply to sub-agents" was wrong | Sub-agents = Tier 2 only (no durable history) | Sub-agent path wires Tier 2 + window only, no Tier 1 | -| **T1** | Tier 1 (UIMessage) and Tier 2 (ModelMessage) measured by different estimators → divergence | **One** `estimateTokens` over `CountUnit[]`, two adapters; `MODEL_BOUND` filter excludes UI-only parts both sides | No second estimator exists; equality test passes exactly on filtered set; UI-only parts (reasoning/source/step/data) never counted | -| **T2** | char/4 on base64 images is meaningless; ordering vs inline unclear | Modality table (anthropic/openai/default), header-parse dims w/ constant fallback, detail→high; estimate AFTER `inlineFileUrls`; divergence `log.warn` | No char/4 on image bytes; Tier 1 runs post-inline; missing dims → 1200; turn-2 divergence logged | -| **T3** | Recovery compaction vs "single durable writer"; finalize-mid-error ambiguous | Recovery does in-memory trim via `compactModelMessages` + sets persisted `compactionDirty`; durable write on NEXT `prepareChatTurn` only | Recovery never writes summary/watermark directly; `compactionDirty` is a DB column; recovery trim calls the Tier 2 adapter, not a bespoke trim | -| **T4** | litellm registry keys don't match our model IDs | Normalization chain + alias map + `log.warn` on MISS | Lookup tries exact→strip-prefix→lower→alias→family; Bedrock ARN / Azure resolve or log a miss | -| **T5** | Window cache stale after override edit | `cache.evict(providerId)` in provider PATCH, immediate | Editing modelMeta busts 
cache without waiting TTL | -| **T6** | 8192 default silently over-compacts | `log.warn` on default; ring renders **neutral**, no false ramp | Fall-to-default is logged; ring is grey/no-% when window unknown | -| **T7** | `taskModelId` may be unset | Fallback `taskModelId → main`; log model + cost | Summarizer falls back to main model; no crash on unset | -| **T8** | char/4 underestimates CJK/JSON | Accepted; margin + real-usage handoff + recovery net | (No code; documented as text-only heuristic) | -| **T9** | One synthetic 400 doesn't cover per-provider error bodies | Fixture set: OpenAI / Anthropic / Google-vLLM | `isContextOverflowError` matrix tests real per-provider phrasings | -| **T10** | CAS rejects stale write but loser behavior undefined → livelock risk | Re-read; if winner advanced → SKIP+clear-dirty; else retry once then SKIP | Loser never recompute-loops; terminal state is skip; decides by version | -| **R1** | Loser-skip assumed monotonic watermark; C4 reset moves it backward → stale write back door | All writes (advance/reset/dirty) through one versioned CAS; loser compares **version** not watermark value | Single `writeWatermark`; invalidation bumps version; no path mutates these fields outside it | -| **U1** | Ring showed previous model's window after a model switch | Resolve window from **selected** model, not last-message metadata | Ring reads selected-model window from `modelMeta`, refreshes on switch | -| **U2** | Ring lags pending composer input | Required tooltip label "current input not yet counted"; arc deferred | Tooltip text present and unmistakable | -| **U3** | Forced-compact confirm too soft | Confirm default-ON when drop significant (`>keepRecent` or `>30%`) | Threshold confirm wired; (P1: not destructive anyway) | -| **U4** | No feedback for defer-while-streaming click | Pending badge + disabled ring + "will compact on finish" tooltip | Ring disables + shows pending state between click and finish | -| **R4** | CAS 
read→summarize→write window wastes summarize under contention | Accepted, **not fixed**; gated on `cas.conflict` metric | `cas.conflict` metric emitted; no premature lock added | -| **P1** | (principle) compaction misread as data loss | View-not-delete: raw messages persist | No code path hard-deletes a summarized message | +| ID | Issue | Resolution | ✅ Verify in code | +| ------- | ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **C1** | Trigger only counted last response, not what this turn adds | `projected = lastInputTokens + estimate(newMsgs)` | Trigger sums unsummarized new messages, not just last `usage` | +| **C2** | Compacting to the trigger ratio re-fires next turn (Cline #5616 thrash) | Hysteresis: target 0.5 ≠ trigger 0.8. Chunk-12 removed the schema fields + `resolveCompactionConfig` clamp; the runtime `targetRatio→triggerRatio*0.9` clamp now lives in `buildCompactionRuntime` after the env overrides (B-F1). 
| Post-compaction output `<= targetTokens`; a follow-up turn does not re-compact; an inverted `COMPACTION_TARGET_RATIO`/`COMPACTION_TRIGGER_RATIO` env pair is clamped (warn) not honored | +| **C3** | Raw window ratio ignored output + safety headroom | `inputBudget = window − maxOutputReserve − safetyReserve` | Budget subtracts both reserves before ratios | +| **C4** | Edit/delete/regenerate below watermark → stale summary | Invalidate via `writeWatermark`: version bump + clear summary + reset watermark | Every edit/delete/regenerate handler calls `writeWatermark`; forking below watermark resets on new branch | +| **M1** | Cold-start on huge imported history exceeds summarizer's own window | Chunked / map-reduce summarize | Prefix larger than summarizer window is chunked, not sent whole | +| **M2** | First-turn char/4 underestimate | `× 1.15` margin + recovery net | First-turn projection applies the margin | +| **M3** | "Both tiers apply to sub-agents" was wrong | Sub-agents = Tier 2 only (no durable history) | Sub-agent path wires Tier 2 + window only, no Tier 1 | +| **T1** | Tier 1 (UIMessage) and Tier 2 (ModelMessage) measured by different estimators → divergence | **One** `estimateTokens` over `CountUnit[]`, two adapters; `MODEL_BOUND` filter excludes UI-only parts both sides | No second estimator exists; equality test passes exactly on filtered set; UI-only parts (reasoning/source/step/data) never counted | +| **T2** | char/4 on base64 images is meaningless; ordering vs inline unclear | Modality table (anthropic/openai/default), header-parse dims w/ constant fallback, detail→high; estimate AFTER `inlineFileUrls`; divergence `log.warn` | No char/4 on image bytes; Tier 1 runs post-inline; missing dims → 1200; turn-2 divergence logged | +| **T3** | Recovery compaction vs "single durable writer"; finalize-mid-error ambiguous | Recovery does in-memory trim via `compactModelMessages` + sets persisted `compactionDirty`; durable write on NEXT `prepareChatTurn` only | 
Recovery never writes summary/watermark directly; `compactionDirty` is a DB column; recovery trim calls the Tier 2 adapter, not a bespoke trim | +| **T4** | litellm registry keys don't match our model IDs | Normalization chain + alias map + `log.warn` on MISS | Lookup tries exact→strip-prefix→lower→alias→family; Bedrock ARN / Azure resolve or log a miss | +| **T5** | Window cache stale after override edit | `cache.evict(providerId)` in provider PATCH, immediate | Editing modelMeta busts cache without waiting TTL | +| **T6** | 8192 default silently over-compacts | `log.warn` on default; ring renders **neutral**, no false ramp | Fall-to-default is logged; ring is grey/no-% when window unknown | +| **T7** | `taskModelId` may be unset | Fallback `taskModelId → main`; log model + cost | Summarizer falls back to main model; no crash on unset | +| **T8** | char/4 underestimates CJK/JSON | Accepted; margin + real-usage handoff + recovery net | (No code; documented as text-only heuristic) | +| **T9** | One synthetic 400 doesn't cover per-provider error bodies | Fixture set: OpenAI / Anthropic / Google-vLLM | `isContextOverflowError` matrix tests real per-provider phrasings | +| **T10** | CAS rejects stale write but loser behavior undefined → livelock risk | Re-read; if winner advanced → SKIP; else retry once then SKIP. NOTE: the covered-skip deliberately does NOT clear dirty (B-F6) — a concurrent `invalidateCompaction` also advances the version but leaves dirty set on purpose, so clearing it on a covered skip could drop a forced compaction. Leaving dirty is strictly safe (≤1 extra compaction next turn). 
| Loser never recompute-loops; terminal state is skip; decides by version; covered-skip leaves dirty for the next turn | +| **R1** | Loser-skip assumed monotonic watermark; C4 reset moves it backward → stale write back door | All writes (advance/reset/dirty) through one versioned CAS; loser compares **version** not watermark value | Single `writeWatermark`; invalidation bumps version; no path mutates these fields outside it | +| **U1** | Ring showed previous model's window after a model switch | Resolve window from **selected** model, not last-message metadata | Ring reads selected-model window from `modelMeta`, refreshes on switch | +| **U2** | Ring lags pending composer input | Required tooltip label "current input not yet counted"; arc deferred | Tooltip text present and unmistakable | +| **U3** | Forced-compact confirm too soft | Confirm default-ON when drop significant (`>keepRecent` or `>30%`) | Threshold confirm wired; (P1: not destructive anyway) | +| **U4** | No feedback for defer-while-streaming click | Pending badge + disabled ring + "will compact on finish" tooltip | Ring disables + shows pending state between click and finish | +| **R4** | CAS read→summarize→write window wastes summarize under contention | Accepted, **not fixed**; gated on `cas.conflict` metric | `cas.conflict` metric emitted; no premature lock added | +| **P1** | (principle) compaction misread as data loss | View-not-delete: raw messages persist | No code path hard-deletes a summarized message | --- From 041f69758a0ab85a291de2e1dadf491053f0e58f Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Mon, 15 Jun 2026 10:57:23 +0200 Subject: [PATCH 20/21] fix(backend): harden compaction per final ADR-0012 review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map-reduce summarizer no longer re-overflows: summarizePrefix checks the folded size (prior summary + framing) and recurses the reduce step; chunks split on message 
boundaries via packSegments instead of arbitrary char slices. Force-compact confirm now gates on significance (ADR-0012 §Force-compact): /compact returns tokensBefore/messagesDropped/keepRecentMessages and the context-window endpoint exposes keepRecentMessages so the client confirms only when the drop is significant, else runs immediately. Token estimator counts errorText on output-error UI parts, restoring the UI/Model adapter count equality (§One estimator). Sub-agent recovery/Tier 2 subtract per-sub-agent overhead. Extracted resolveCompactionConfig so the runtime and the context-window route share one config source. Also: drop fabricated Qwen3-72B registry key, rename ADR 0009→0012 (resolves collision with the invitation ADR-0009), refresh stale plan/0009 references, ring amber via CSS var with hex fallback, add cross-tenant 404 submit test, remove shipped internal plan doc. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/.env.example | 2 +- apps/backend/src/db/schema.ts | 8 +- apps/backend/src/routes/chat.test.ts | 34 +- apps/backend/src/routes/chat.ts | 11 +- apps/backend/src/routes/provider.ts | 14 +- apps/backend/src/runs/agent-runner.test.ts | 4 +- apps/backend/src/runs/agent-runner.ts | 51 +- apps/backend/src/runs/compaction.test.ts | 56 +- apps/backend/src/runs/compaction.ts | 362 ++-- apps/backend/src/runs/context-window.test.ts | 14 +- apps/backend/src/runs/context-window.ts | 35 +- apps/backend/src/runs/litellm-registry.ts | 1 - apps/backend/src/runs/recovery.test.ts | 18 +- apps/backend/src/runs/recovery.ts | 29 +- apps/backend/src/runs/token-estimate.test.ts | 23 +- apps/backend/src/runs/token-estimate.ts | 71 +- .../src/services/chat-execution.test.ts | 2 +- apps/backend/src/services/chat-execution.ts | 333 +-- apps/backend/src/tools/sub-agent.test.ts | 4 +- apps/backend/src/tools/sub-agent.ts | 8 +- apps/frontend/components/chat.tsx | 43 +- .../components/context-usage-ring.tsx | 15 +- context-compaction-plan.md | 1920 
----------------- docs/adr/0009-context-compaction.md | 136 -- docs/adr/0012-context-compaction.md | 385 ++++ packages/schemas/index.ts | 14 +- 26 files changed, 1047 insertions(+), 2546 deletions(-) delete mode 100644 context-compaction-plan.md delete mode 100644 docs/adr/0009-context-compaction.md create mode 100644 docs/adr/0012-context-compaction.md diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 1756e6b3..b8197606 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -51,7 +51,7 @@ PLATYPUS_SANDBOX_DOCKER_ENABLED=false # Frontend URL for generating resource links in tool responses FRONTEND_URL=http://localhost:3001 -# Context compaction (ADR-0009, context-compaction-plan §G). +# Context compaction (ADR-0012 §Config & kill switch). # Compaction behavior is global; window/output size stays per-model. # COMPACTION_ENABLED=false disables proactive compaction (recovery still runs). # COMPACTION_ENABLED=true diff --git a/apps/backend/src/db/schema.ts b/apps/backend/src/db/schema.ts index 5f7e06a1..52015213 100644 --- a/apps/backend/src/db/schema.ts +++ b/apps/backend/src/db/schema.ts @@ -67,7 +67,7 @@ export const provider = pgTable( memoryExtractionModelId: t.text("memory_extraction_model_id").notNull(), embeddingModelId: t.text("embedding_model_id"), embeddingDimensions: t.integer("embedding_dimensions"), - // Per-model context-window / output overrides (context-compaction-plan §A). + // Per-model context-window / output overrides (ADR-0012 §Window resolution). // Keyed by model id; resolveContextWindow consults this before API/registry. modelMeta: t .jsonb("model_meta") @@ -169,10 +169,10 @@ export const chat = pgTable( presencePenalty: t.real("presence_penalty"), frequencyPenalty: t.real("frequency_penalty"), - // Context-compaction state (docs/adr/0009). All additive nullable/defaulted. - // P1 view-not-delete: these change what is sent to the model, never the + // Context-compaction state (docs/adr/0012). 
All additive nullable/defaulted. + // View-not-delete (ADR-0012 §View, not delete): these change what is sent to the model, never the // stored `messages`. `summaryWatermark` = message id of the last summarized - // message. All mutations go through the single versioned CAS writer (P3/R1); + // message. All mutations go through the single versioned CAS writer (ADR-0012 §One durable writer); // `version` is its compare-and-swap token. contextSummary: t.text("context_summary"), summaryWatermark: t.text("summary_watermark"), diff --git a/apps/backend/src/routes/chat.test.ts b/apps/backend/src/routes/chat.test.ts index 7c42d420..43b3161e 100644 --- a/apps/backend/src/routes/chat.test.ts +++ b/apps/backend/src/routes/chat.test.ts @@ -27,7 +27,7 @@ vi.mock("../services/chat-execution.ts", () => { return { prepareChatTurn: mockPrepareChatTurn, forceCompactChat: mockForceCompactChat, - // loadChatMessages is called by agent-runner before onStart (RV1 baseline). + // loadChatMessages is called by agent-runner before onStart (ADR-0012 §Summary invalidation baseline). loadChatMessages: vi.fn().mockResolvedValue([]), ValidationError, NotFoundError, @@ -252,7 +252,7 @@ describe("Chat Routes", () => { mockDb.limit.mockResolvedValueOnce([ { ownerId: "user-1", organizationId: "org-1" }, ]); // requireWorkspaceAccess - mockDb.limit.mockResolvedValueOnce([{ workspaceId: "ws-1" }]); // RV2 chat workspace check + mockDb.limit.mockResolvedValueOnce([{ workspaceId: "ws-1" }]); // ADR-0012 §Consequences (cross-tenant safety) chat workspace check // ChatSink.onStart upserts the chat row with status=running before // prepareChatTurn runs. 
Returning a non-empty array skips the insert @@ -293,6 +293,36 @@ describe("Chat Routes", () => { expect(res.status).toBe(200); expect(await res.text()).toBe("stream"); }); + + it("returns 404 when the submitted chat id belongs to another workspace (ADR-0012 §Consequences cross-tenant safety)", async () => { + mockSession({ + id: "user-1", + name: "Test User", + email: "test@example.com", + }); + mockDb.limit.mockResolvedValueOnce([{ role: "member" }]); // requireOrgAccess + mockDb.limit.mockResolvedValueOnce([ + { ownerId: "user-1", organizationId: "org-1" }, + ]); // requireWorkspaceAccess + // Cross-tenant check: the chat exists but in a DIFFERENT workspace. + mockDb.limit.mockResolvedValueOnce([{ workspaceId: "ws-other" }]); + + const res = await app.request(baseUrl, { + method: "POST", + body: JSON.stringify({ + id: "chat-1", + workspaceId, + providerId: "p1", + modelId: "m1", + messages: [{ role: "user", content: "hello" }], + }), + headers: { "Content-Type": "application/json" }, + }); + + expect(res.status).toBe(404); + // The run must never start — no compaction-store mutation on another tenant's chat. + expect(mockPrepareChatTurn).not.toHaveBeenCalled(); + }); }); describe("DELETE /:chatId", () => { diff --git a/apps/backend/src/routes/chat.ts b/apps/backend/src/routes/chat.ts index 5e05ff68..b0129ad4 100644 --- a/apps/backend/src/routes/chat.ts +++ b/apps/backend/src/routes/chat.ts @@ -148,7 +148,7 @@ chat.post( const scope = c.get("workspaceScope")!; const data = c.req.valid("json"); - // RV2: verify the submitted chat id (if any) belongs to this workspace. + // ADR-0012 §Consequences (cross-tenant safety): verify the submitted chat id (if any) belongs to this workspace. // Without this check a workspace-A user could supply a workspace-B chat id // and corrupt B's compaction state via the unscoped store writes. 
if (data.id) { @@ -451,7 +451,7 @@ chat.post( const workspaceId = c.req.param("workspaceId")!; // Reject if a run is currently in flight — the frontend defers the click - // until streaming finishes (drift U4), but guard here as a belt-and-suspenders + // until streaming finishes (ADR-0012 §Force-compact on demand), but guard here as a belt-and-suspenders // check to avoid CAS races with an in-progress writer. if (runRegistry.has(chatId)) { return c.json( @@ -464,9 +464,14 @@ chat.post( const result = await forceCompactChat(chatId, workspaceId, orgId); return c.json({ inputTokens: result.estimatedTokens, + // ADR-0012 §Force-compact on demand: the client confirms only when the drop + // is significant (messagesDropped > keepRecentMessages OR reduction > 30%). + tokensBefore: result.tokensBefore, + messagesDropped: result.messagesDropped, + keepRecentMessages: result.keepRecentMessages, contextWindow: result.contextWindow, contextWindowIsDefault: result.contextWindowIsDefault, - // §J/11c: the persisted synthetic trace message (when a summary ran), so + // ADR-0012 §Compaction trace in the timeline: the persisted synthetic trace message (when a summary ran), so // the frontend can append it to the timeline without a full refetch. 
traceMessage: result.traceMessage, }); diff --git a/apps/backend/src/routes/provider.ts b/apps/backend/src/routes/provider.ts index 49a16334..74170291 100644 --- a/apps/backend/src/routes/provider.ts +++ b/apps/backend/src/routes/provider.ts @@ -8,6 +8,7 @@ import { eq, and } from "drizzle-orm"; import { handleEmbeddingConfigChange } from "../services/embedding-invalidation.ts"; import { dedupeArray } from "../utils.ts"; import { contextWindowResolver } from "../runs/context-window.ts"; +import { resolveCompactionConfig } from "../services/chat-execution.ts"; import { requireAuth } from "../middleware/authentication.ts"; import { requireOrgAccess, @@ -136,8 +137,8 @@ provider.put( ) .returning(); - // RV7c: bust the cached context window so a modelMeta override takes effect - // immediately rather than waiting out the 1-hour TTL (drift T5). + // ADR-0012 §Window resolution (caching & eviction): bust the cached context window so a modelMeta override takes effect + // immediately rather than waiting out the 1-hour TTL (ADR-0012 §Window resolution (caching & eviction)). contextWindowResolver.evict(providerId); return c.json(record[0], 200); @@ -178,9 +179,9 @@ provider.delete( /** * Returns the resolved context window for a specific model on this provider - * (§H ring, drift U1). Uses the cached resolver — fast for repeated calls. + * (ADR-0012 §Context-usage ring). Uses the cached resolver — fast for repeated calls. * Returns `{ contextWindow: null }` when the window fell to the conservative - * default so the frontend can render the ring neutral (drift T6). + * default so the frontend can render the ring neutral (ADR-0012 §Context-usage ring). */ provider.get( "/:providerId/context-window", @@ -212,6 +213,11 @@ provider.get( ? resolved.contextWindow : null, source: resolved?.source ?? "default", + // ADR-0012 §Force-compact on demand: the client gates the confirm dialog on + // the drop being significant. 
messagesDropped ≈ total − keepRecent, so + // "messagesDropped > keepRecent" ⟺ "total > 2 × keepRecent" — a pre-run + // proxy computable client-side from the message count. + keepRecentMessages: resolveCompactionConfig().keepRecentMessages, }); }, ); diff --git a/apps/backend/src/runs/agent-runner.test.ts b/apps/backend/src/runs/agent-runner.test.ts index 0161aa83..1fc2dd45 100644 --- a/apps/backend/src/runs/agent-runner.test.ts +++ b/apps/backend/src/runs/agent-runner.test.ts @@ -950,7 +950,7 @@ describe("buildTier2PrepareStep", () => { return msgs; }; - it("returns undefined when messages are below triggerTokens (drift m3)", async () => { + it("returns undefined when messages are below triggerTokens (ADR-0012 §Sub-agents)", async () => { const fn = buildTier2PrepareStep(makeCtx(10_000)); const result = await callStep(fn, shortMessages); expect(result).toBeUndefined(); @@ -972,7 +972,7 @@ describe("buildTier2PrepareStep", () => { expect(out[1]?.role).not.toBe("tool"); }); - it("returns undefined when prefix is empty (no-op, drift m3 / RV4)", async () => { + it("returns undefined when prefix is empty (no-op, ADR-0012 §Sub-agents)", async () => { // Two messages, keepRecentMessages 4 → no prefix to summarize → // compactModelMessages drops nothing → prepareStep returns undefined so the // SDK proceeds unchanged, and the summarizer is never called. diff --git a/apps/backend/src/runs/agent-runner.ts b/apps/backend/src/runs/agent-runner.ts index d9976e13..51efe020 100644 --- a/apps/backend/src/runs/agent-runner.ts +++ b/apps/backend/src/runs/agent-runner.ts @@ -104,7 +104,7 @@ export function withToolTimestamps( /** * Injects synthetic `compact_context` tool-call + tool-result chunks into a - * UIMessage stream immediately after the `start` event (§K / 11c). Makes Tier + * UIMessage stream immediately after the `start` event (ADR-0012 §Compaction trace in the timeline). 
Makes Tier * 1 compaction visible in the chat timeline without a custom renderer — the * existing tool-call expander handles it automatically. * @@ -152,12 +152,12 @@ export function prependCompactionChunks( const COMPACT_CONTEXT_PART_TYPE = `tool-${COMPACT_CONTEXT_TOOL_NAME}`; /** - * Removes the synthetic `compact_context` trace parts (§K/11c) from a message + * Removes the synthetic `compact_context` trace parts (ADR-0012 §Compaction trace in the timeline) from a message * list before it is converted to ModelMessages. The trace is a UI-only marker * persisted in the assistant message for the chat timeline; it must NEVER be * replayed to the provider, which would otherwise see a phantom tool call for a * tool it was never given (provider rejection / model confusion). An assistant - * message left with no parts after stripping (the §J standalone trace message) + * message left with no parts after stripping (the ADR-0012 §Force-compact on demand standalone trace message) * is dropped entirely rather than sent empty. * * Exported for unit testing. @@ -180,18 +180,18 @@ export function stripCompactionTraceParts( (p) => p.type !== COMPACT_CONTEXT_PART_TYPE, ); if (parts.length > 0) out.push({ ...message, parts }); - // else: trace-only message (§J) — drop it from the model payload. + // else: trace-only message (ADR-0012 §Force-compact on demand) — drop it from the model payload. } return changed ? out : messages; } -/** Stats stamped on the last assistant message's metadata after each stream (§H/§I). */ +/** Stats stamped on the last assistant message's metadata after each stream (ADR-0012 §Context-usage ring / §Per-message stats). */ export type MessageStats = { - /** Run-wide totals across every step (sum) — §I cost popover. */ + /** Run-wide totals across every step (sum) — ADR-0012 §Per-message stats cost popover. */ inputTokens: number; outputTokens: number; /** - * Input tokens of the LAST model call = peak context fullness — §H ring. 
+ * Input tokens of the LAST model call = peak context fullness — ADR-0012 §Context-usage ring. * NOT the run-wide sum (which over-counts on multi-step tool loops). */ contextTokens: number; @@ -206,7 +206,7 @@ export type MessageStats = { * Stamps per-run stats (token counts, timing, resolved context window) onto * the last assistant message's `metadata.stats` in place. Applied at the same * point as {@link applyToolCompletions} so both mutations happen before the - * sink persists the final state (§H/§I). + * sink persists the final state (ADR-0012 §Context-usage ring / §Per-message stats). */ function applyMessageStats( messages: PlatypusUIMessage[], @@ -374,7 +374,7 @@ type RunState = { terminated: boolean; /** * Input tokens reported by the most recent model step = peak context - * fullness for the §H ring. Tracked separately from `stats.inputTokens`, + * fullness for the ADR-0012 §Context-usage ring. Tracked separately from `stats.inputTokens`, * which is the run-wide SUM and over-counts multi-step tool loops. */ lastStepInputTokens: number; @@ -448,8 +448,8 @@ export class AgentRunner { }) { const { scope, input, sink } = params; - // RV1: snapshot the DB state BEFORE onStart overwrites it so - // applyTier1IfNeeded has the correct C4 baseline. Only interactive chats + // ADR-0012 §Summary invalidation: snapshot the DB state BEFORE onStart overwrites it so + // applyTier1IfNeeded has the correct edit-detection baseline. Only interactive chats // carry a `request.id`; headless runs (triggers, sub-agents) have none. const priorMessages = input.request.id ? await loadChatMessages(input.request.id).catch((err) => { @@ -457,7 +457,7 @@ export class AgentRunner { // which cannot detect edits below the watermark — log the degradation.
logger.warn( { err, chatId: input.request.id }, - "RV1: failed to snapshot prior messages; C4 edit-detection degraded this turn", + "ADR-0012 §Summary invalidation: failed to snapshot prior messages; ADR-0012 §Summary invalidation edit-detection degraded this turn", ); return undefined; }) @@ -572,11 +572,11 @@ export class AgentRunner { // an `undefined` value identically, and the streaming path has always // passed them this way in production. const modelArgs = { - // Recovery middleware (§E, P4): every model call — first call and every + // Recovery middleware (ADR-0012 §Recovery): every model call — first call and every // tool-loop step, stream and generate alike — gets one trim-and-retry on - // a provider "context too long" rejection. Always on; not gated by §G. + // a provider "context too long" rejection. Always on; not gated by ADR-0012 §Config & kill switch. model: withOverflowRecovery(state.turn), - // Strip the UI-only synthetic compact_context trace parts (§K/11c) before + // Strip the UI-only synthetic compact_context trace parts (ADR-0012 §Compaction trace in the timeline) before // sending history to the provider — replaying them surfaces a phantom tool // call for a tool the model was never given. Applied here so both the // streaming and generate paths (which share modelArgs) are covered. @@ -587,7 +587,7 @@ export class AgentRunner { tools: state.turn.stream.tools, stopWhen: [stepCountIs(state.turn.stream.maxSteps)], abortSignal: handle.signal, - // Tier 2 (§D): in-turn compaction before each step when the live window + // Tier 2 (ADR-0012 §Tier 2): in-turn compaction before each step when the live window // nears the limit. Undefined when the turn has no Tier 2 runtime. prepareStep: state.turn.tier2 ? 
buildTier2PrepareStep(state.turn.tier2) @@ -623,7 +623,7 @@ export class AgentRunner { const startedAt = new Date().toISOString(); let firstTokenAt: string | undefined; - // Set when the §H/§I stats are first emitted (messageMetadata `finish`), so + // Set when the ADR-0012 §Context-usage ring / §Per-message stats are first emitted (messageMetadata `finish`), so // the post-stream persist stamp reuses the same value rather than a slightly // later one — streamed and reloaded stats then match. let finishedAt: string | undefined; @@ -667,7 +667,7 @@ export class AgentRunner { const uiStream = result.toUIMessageStream({ originalMessages: input.messages, generateMessageId: createIdGenerator({ prefix: "msg", size: 16 }), - // Emit the §H/§I stats with the `finish` event so the client gets them on + // Emit the ADR-0012 §Context-usage ring / §Per-message stats with the `finish` event so the client gets them on // the final stream chunk — the (i) stats action then appears the instant // the answer completes, not a DB-refetch round-trip later. `start` carries // only agentId (timing/usage don't exist yet). The post-stream stamp in @@ -686,7 +686,7 @@ export class AgentRunner { onError: (error) => formatStreamError(error), }); - // §K / 11c: if Tier 1 compaction fired this turn, prepend synthetic + // ADR-0012 §Compaction trace in the timeline: if Tier 1 compaction fired this turn, prepend synthetic // compact_context tool-call + tool-result chunks so the compaction is // visible in the chat timeline. Injected after the 'start' event so the // AI SDK builds them into the same assistant message as the response. 
@@ -710,9 +710,9 @@ export class AgentRunner { // // finalize is called here (not in toUIMessageStream's onFinish) so that // state.messages reflects the fully-drained stream — including the tool - // `completedAt` timestamps and §H/§I stats applied below — before the sink + // `completedAt` timestamps and ADR-0012 §Context-usage ring / §Per-message stats applied below — before the sink // persists it. - // RV8: an error chunk (model/tool failure surfaced via formatStreamError) or + // An error chunk (model/tool failure surfaced via formatStreamError) or // an internal stream fault ends the for-await without throwing, because // readUIMessageStream defaults terminateOnError=false. Capture it so the // finally finalizes "failed" instead of silently persisting a partial @@ -757,7 +757,7 @@ export class AgentRunner { } } else if (streamError !== undefined) { // The stream errored (model/tool rejection or internal fault) but did - // not abort — record the run as failed rather than succeeded (RV8). + // not abort — record the run as failed rather than succeeded. status = "failed"; err = streamError instanceof Error @@ -841,10 +841,9 @@ export class AgentRunner { } /** - * Wraps the turn's model with the context-overflow recovery middleware (§E, - * P4): every model call — first call and every tool-loop step, stream and + * Wraps the turn's model with the context-overflow recovery middleware (ADR-0012 §Recovery): every model call — first call and every tool-loop step, stream and * generate alike — gets one trim-and-retry on a provider "context too long" - * rejection. Always on; the §G kill switch does not gate it. + * rejection. Always on; the ADR-0012 §Config & kill switch does not gate it. 
*/ const withOverflowRecovery = (turn: ChatTurn): LanguageModel => wrapLanguageModel({ @@ -866,7 +865,7 @@ const formatStreamError = (error: unknown): string => { if (LoadAPIKeyError.isInstance(error)) { return "AI provider API key is missing or not configured."; } - // Reaching here means recovery (§E) already trimmed and retried once and the + // Reaching here means recovery (ADR-0012 §Recovery) already trimmed and retried once and the // provider still rejected the prompt — surface the actionable dead end. if (isContextOverflowError(error)) { return "Conversation too large for the model's context window even after trimming — start a new chat or reduce attachments."; diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index adc28692..2321fdbc 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -61,7 +61,7 @@ class FakeStore implements CompactionStore { } } -describe("casWrite — version-gated CAS (P3/R1)", () => { +describe("casWrite — version-gated CAS (ADR-0012 §One durable writer)", () => { it("applies and bumps version when the expected version matches", async () => { const store = new FakeStore({ version: 3 }); const won = await store.casWrite("c", 3, { summary: "s", watermark: "m1" }); @@ -92,7 +92,7 @@ describe("casWrite — version-gated CAS (P3/R1)", () => { }); }); -describe("commitWatermark — loser logic (drift T10/R1)", () => { +describe("commitWatermark — loser logic (ADR-0012 §One durable writer)", () => { it("applies a write on an uncontended commit", async () => { const store = new FakeStore({ version: 2 }); const res = await commitWatermark(store, "c", () => ({ @@ -274,7 +274,7 @@ describe("compactUIMessages (Tier 1)", () => { expect(res.estimatedTokens).toBeLessThanOrEqual(300); }); - it("does NOT re-fire next turn: feeding the result back is a no-op (C2)", async () => { + it("does NOT re-fire next turn: feeding the result back is a no-op (ADR-0012 
§Tier 1 (hysteresis))", async () => { const msgs = [ uiText("p1", "user", "P".repeat(4000)), uiText("p2", "assistant", "Q".repeat(4000)), @@ -297,7 +297,7 @@ describe("compactUIMessages (Tier 1)", () => { expect(second.messagesDropped).toBe(0); }); - it("map-reduces an oversized prefix (drift M1)", async () => { + it("map-reduces an oversized prefix (ADR-0012 §Tier 1 (summarizer model & map-reduce))", async () => { const summarize = vi.fn(noopSummarize); const msgs = [ uiText("p1", "user", "Z".repeat(4000)), // ~1000 tokens of transcript @@ -379,7 +379,7 @@ describe("compactUIMessages (Tier 1)", () => { expect((toolPart?.output as string).length).toBeLessThan(12000); }); - it("option D: keeps recent VERBATIM in the empty-prefix path when within inputBudget", async () => { + it("ADR-0012 §Hard window wall: keeps recent VERBATIM in the empty-prefix path when within inputBudget", async () => { // Whole history fits within keepRecentMessages (2) → empty prefix, no model // call. Over the soft target but under the wall → outlier must stay untouched. const msgs = [ @@ -422,7 +422,7 @@ describe("compactUIMessages (Tier 1)", () => { warn.mockRestore(); }); - it("option D: does NOT warn on a soft-target miss when recent is under the wall", async () => { + it("ADR-0012 §Hard window wall: does NOT warn on a soft-target miss when recent is under the wall", async () => { const warn = vi.spyOn(logger, "warn").mockReturnValue(undefined); const msgs = [ uiText("p1", "user", "P".repeat(4000)), @@ -440,7 +440,7 @@ describe("compactUIMessages (Tier 1)", () => { warn.mockRestore(); }); - it("option D: keeps recent tool results VERBATIM when within inputBudget", async () => { + it("ADR-0012 §Hard window wall: keeps recent tool results VERBATIM when within inputBudget", async () => { // Over the soft target (300) so Stage 2 fires, but the kept view (summary + // recent) stays under the hard wall → recent must NOT be trimmed. 
const msgs = [ @@ -463,7 +463,7 @@ describe("compactUIMessages (Tier 1)", () => { expect(toolPart?.output).toBe("X".repeat(12000)); // untouched }); - it("option D: trims recent (except newest) when the kept view breaches inputBudget", async () => { + it("ADR-0012 §Hard window wall: trims recent (except newest) when the kept view breaches inputBudget", async () => { // Two big tool results in recent; the kept view breaches the wall → trim the // older one, exempt the single newest message even though it is bulky. const msgs = [ @@ -549,7 +549,7 @@ describe("compactModelMessages (Tier 2 / recovery)", () => { expect(roles[toolIdx - 1]).toBe("assistant"); }); - it("force bypasses BOTH no-op gates so recovery never retries byte-identically (RV3)", async () => { + it("force bypasses BOTH no-op gates so recovery never retries byte-identically (ADR-0012 §Recovery)", async () => { // Estimator says we are within target AND nothing is prunable (small, // non-bulky messages). Without force both the whole-message gate and the // post-prune gate would no-op → recovery would retry the exact same prompt @@ -570,7 +570,7 @@ describe("compactModelMessages (Tier 2 / recovery)", () => { expect(res.messages).not.toBe(msgs); }); - it("force with an empty prefix is a no-op, not a prompt-growing summary (RV4 model-side)", async () => { + it("force with an empty prefix is a no-op, not a prompt-growing summary (ADR-0012 §Tier 1, model-side)", async () => { // recent alone exceeds keepRecentMessages → prefix is empty. Summarizing // nothing would ADD a synthetic message and grow the prompt, never // converging. Surface the overflow instead. 
@@ -603,7 +603,7 @@ import { type CompactionConfig, } from "./compaction.ts"; -describe("buildCompactionTraceMessage (§J/11c)", () => { +describe("buildCompactionTraceMessage (ADR-0012 §Force-compact on demand)", () => { it("builds an assistant message with a completed compact_context tool part", () => { const msg = buildCompactionTraceMessage( { messagesDropped: 7, summaryExcerpt: "did things" }, @@ -644,7 +644,7 @@ const cfg = (over: Partial = {}): CompactionConfig => ({ ...over, }); -describe("computeBudget (drift C3 — subtract both reserves)", () => { +describe("computeBudget (ADR-0012 §Tier 1 (budget math) — subtract both reserves)", () => { it("subtracts output + safety reserve before applying ratios", () => { const b = computeBudget( 10000, @@ -660,7 +660,7 @@ describe("computeBudget (drift C3 — subtract both reserves)", () => { expect(b.inputBudget).toBe(7000); // 10000 - min(4096, 2500) - 500 }); - it("caps the output reserve at half the window so inputBudget can't collapse (A6)", () => { + it("caps the output reserve at half the window so inputBudget can't collapse (ADR-0012 §Tier 1 (budget math))", () => { // A bogus registry entry where max_output >= the input-scoped window would // otherwise drive inputBudget toward 1 and thrash. The cap keeps it sane. const b = computeBudget(10000, 20000, cfg({ reserveRatio: 0.05 })); @@ -749,7 +749,7 @@ describe("applyTier1Compaction", () => { expect(store.state.version).toBe(1); expect(out.messages[0].id).toBe("context-summary"); expect(onEvent).toHaveBeenCalledOnce(); - // §K/11c: a summary ran → a trace is surfaced with the dropped count and a + // ADR-0012 §Compaction trace in the timeline: a summary ran → a trace is surfaced with the dropped count and a // summary excerpt. 
expect(out.compactionTrace).toEqual({ messagesDropped: 2, @@ -783,7 +783,7 @@ describe("applyTier1Compaction", () => { expect(store.casCalls).toBe(0); }); - it("dirty forces compaction even when proactive is disabled (P4 recovery hand-off)", async () => { + it("dirty forces compaction even when proactive is disabled (ADR-0012 §Recovery is the net recovery hand-off)", async () => { const store = storeFromState({ version: 0, compactionDirty: true }); const messages = [ bigText("p1", "user"), @@ -839,7 +839,7 @@ describe("applyTier1Compaction", () => { expect(store.state.compactionDirty).toBe(false); // flag cleared expect(store.state.contextSummary).toBeNull(); // no summary written expect(store.state.version).toBe(1); - // §K/11c: no model summary ran → no trace (would be an empty timeline entry). + // ADR-0012 §Compaction trace in the timeline: no model summary ran → no trace (would be an empty timeline entry). expect(out.compactionTrace).toBeUndefined(); }); @@ -870,7 +870,7 @@ describe("applyTier1Compaction", () => { }); }); -describe("invalidateCompaction (drift C4)", () => { +describe("invalidateCompaction (ADR-0012 §Summary invalidation)", () => { const ordered = ["m1", "m2", "m3", "m4"]; it("resets summary + watermark when a message at/below the watermark changes", async () => { @@ -883,7 +883,7 @@ describe("invalidateCompaction (drift C4)", () => { expect(res.status).toBe("applied"); expect(store.state.summaryWatermark).toBeNull(); expect(store.state.contextSummary).toBeNull(); - expect(store.state.version).toBe(6); // bumped so a racing compaction loses (R1) + expect(store.state.version).toBe(6); // bumped so a racing compaction loses (ADR-0012 §One durable writer) }); it("is a no-op when the edit is entirely above the watermark", async () => { @@ -915,7 +915,7 @@ describe("invalidateCompaction (drift C4)", () => { }); }); -describe("affectedBelowWatermark (C4 divergence detection)", () => { +describe("affectedBelowWatermark (ADR-0012 §Summary invalidation 
divergence detection)", () => { const persisted = [ uiText("m1", "user", "one"), uiText("m2", "assistant", "two"), @@ -964,7 +964,7 @@ describe("affectedBelowWatermark (C4 divergence detection)", () => { }); }); -// --- Chunk 3: C1/M2 trigger projection + recovery dirty-flag producer ----- +// --- ADR-0012 §Tier 1 (trigger projection) / §Token estimation (cold-start margin): trigger projection + recovery dirty-flag producer ----- import { projectTier1Tokens, @@ -972,14 +972,14 @@ import { COLD_START_MARGIN, } from "./compaction.ts"; -describe("projectTier1Tokens (drift C1/M2)", () => { - it("applies the cold-start margin when no provider baseline exists (M2)", () => { +describe("projectTier1Tokens (ADR-0012 §Tier 1 (trigger projection) / §Token estimation (cold-start margin))", () => { + it("applies the cold-start margin when no provider baseline exists (ADR-0012 §Token estimation (cold-start margin))", () => { expect( projectTier1Tokens({ messageTokens: 100, priorSummaryTokens: 0 }), ).toBe(Math.ceil(100 * COLD_START_MARGIN)); }); - it("counts the per-turn overhead toward the trigger (C1)", () => { + it("counts the per-turn overhead toward the trigger (ADR-0012 §Tier 1 (trigger projection))", () => { expect( projectTier1Tokens({ messageTokens: 100, @@ -1010,7 +1010,7 @@ describe("projectTier1Tokens (drift C1/M2)", () => { ).toBe(100); }); - it("treats a 0 provider count as no baseline and keeps the margin (A1)", () => { + it("treats a 0 provider count as no baseline and keeps the margin (ADR-0012 §Tier 1 (trigger projection))", () => { // Usage-less providers persist contextTokens=0; a bare `== null` check would // skip the margin AND no-op the max(), leaving the raw char/4 with no buffer. 
expect( @@ -1023,7 +1023,7 @@ describe("projectTier1Tokens (drift C1/M2)", () => { }); }); -describe("applyTier1Compaction — overhead in the trigger (C1)", () => { +describe("applyTier1Compaction — overhead in the trigger (ADR-0012 §Tier 1 (trigger projection))", () => { it("fires on system/tool overhead even when messages alone are under trigger", async () => { const store = storeFromState({ version: 0 }); // ~4 tokens of messages — far under the 50-token trigger on their own. @@ -1080,7 +1080,7 @@ describe("applyTier1Compaction — overhead in the trigger (C1)", () => { }); }); -describe("setCompactionDirty (§E recovery producer, drift T3)", () => { +describe("setCompactionDirty (ADR-0012 §Recovery producer)", () => { it("sets the flag through the CAS writer", async () => { const store = storeFromState({ version: 3 }); const res = await setCompactionDirty(store, "c"); @@ -1108,7 +1108,7 @@ describe("setCompactionDirty (§E recovery producer, drift T3)", () => { }); }); -// --- Chunk 14 Task 2: Stage 0 context editing --------------------------- +// --- ADR-0012 §Stage 0 — context editing --------------------------- /** Tool message with a named tool and arbitrary output. 
*/ const toolMsg = ( @@ -1262,7 +1262,7 @@ describe("editToolResults (Stage 0 — context editing)", () => { }); }); -describe("applyTier1Compaction — Stage 0 avoids summarization (Task 2)", () => { +describe("applyTier1Compaction — Stage 0 avoids summarization (ADR-0012 §Stage 0 — context editing)", () => { const hugeTool = (id: string) => toolMsg(id, "dump", "Z".repeat(8000)); // High minPrunableChars so Stage 1 prefix-pruning does NOT rescue the no-edit // case — it must reach Stage 2 (the model call) to make Stage 0's avoidance of diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index 12e01a21..74d543ea 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -1,13 +1,13 @@ /** - * Context compaction (context-compaction-plan §C/§D, ADR-0009). + * Context compaction (ADR-0012 §Tier 1 / §Tier 2). * * This module owns durable compaction state and the message-shaping primitives. - * Slice 2a (this section) is the **single durable writer** (principle P3): every + * Slice 2a (this section) is the **single durable writer** (principle ADR-0012 §One durable writer): every * mutation of `summaryWatermark` / `contextSummary` / `compactionDirty` flows * through {@link CompactionStore.casWrite}, a version-gated compare-and-swap. * - * Why versioned CAS and not "compare the watermark value" (drift R1): history - * edits (§C invalidation) move the watermark **backward**. A loser that compared + * Why versioned CAS and not "compare the watermark value" (ADR-0012 §One durable writer): history + * edits (ADR-0012 §Tier 1 invalidation) move the watermark **backward**. A loser that compared * watermark values could mistake a reset for "not yet advanced" and write a stale * summary over mutated history. Deciding by `version` removes the monotonicity * assumption entirely — any concurrent mutation bumps the version, so a racing @@ -58,7 +58,7 @@ export type CompactionStore = { * Version-gated compare-and-swap. 
Applies `patch` and sets * `version = expectVersion + 1` **only if** the row's current version still * equals `expectVersion`. Returns true iff exactly one row was updated - * (i.e. this writer won). The single durable writer (P3). + * (i.e. this writer won). The single durable writer (ADR-0012 §One durable writer). */ casWrite( chatId: string, @@ -118,7 +118,7 @@ export type WatermarkDecision = | { kind: "skip"; reason: "no-op" | "covered" }; /** - * The single entry point for mutating compaction state (P3, drift T10). + * The single entry point for mutating compaction state (ADR-0012 §One durable writer). * * Reads the current state, asks `decide` what to do, and CAS-writes it. On a * CAS conflict it re-reads and retries the decision **once**; a second conflict @@ -150,8 +150,8 @@ export async function commitWatermark( if (won) return { status: "applied", version: state.version + 1 }; // Lost the CAS — a concurrent writer moved the version. Loop to re-read and // re-decide. The decision compares VERSION (via the re-read), not watermark - // values, so a backward watermark reset cannot be misread (R1). The metric - // gates whether the R4 read→summarize→write contention note ever needs a fix. + // values, so a backward watermark reset cannot be misread (ADR-0012 §One durable writer). The metric + // gates whether the read→summarize→write contention note ever needs a fix. logger.info( { metric: "cas.conflict", chatId, attempt, version: state.version }, "cas.conflict", @@ -173,7 +173,7 @@ export async function commitWatermark( // Stage 2 — summarize the older prefix into one synthetic summary (model call). // `compactUIMessages` (Tier 1, durable) and `compactModelMessages` (Tier 2 + // recovery, throwaway) differ only in message shape and the tool-pairing rule. -// Token counting is the ONE estimator from token-estimate.ts (P2). +// Token counting is the ONE estimator from token-estimate.ts (ADR-0012 §One estimator). 
// =========================================================================== /** Summarizes a transcript into a compact paragraph. Injected (the task model). */ @@ -199,7 +199,7 @@ export function softTrim(text: string, keepEachSide = 500): string { /** * Picks the index splitting `prefix = [0, boundary)` from `recent = [boundary, * total)`. Starts at `total - keepRecent`, then walks backward while the - * boundary is unsafe so a tool-call/result pair is never split (drift in §C). + * boundary is unsafe so a tool-call/result pair is never split (ADR-0012 §Tier 1). */ export function pickKeepBoundary( total: number, @@ -216,7 +216,7 @@ export function pickKeepBoundary( /** * Prunes bulky tool-result outputs in a UIMessage in place on a shallow copy. * The tool part is kept (never dropped — the assistant tool message is atomic, - * §C); only its `output` is soft-trimmed. Returns the (possibly) pruned message. + * ADR-0012 §Tier 1); only its `output` is soft-trimmed. Returns the (possibly) pruned message. */ function pruneUIMessage( message: PlatypusUIMessage, @@ -242,12 +242,12 @@ function pruneUIMessage( } /** - * Placeholder body for an elided tool result (Chunk 14 Task 2 — context editing). + * Placeholder body for an elided tool result (ADR-0012 §Stage 0 — context editing). * LLM-AGNOSTIC: Platypus may run small/weak background models, so the string is * EXPLICIT and self-describing. A terse marker ("[Old tool result content * cleared]") assumes the model infers it can re-call the tool; a small model may * not. Names the tool + elided size so the model can decide to re-run it, and is - * short enough that Stage 1 / option-D never re-trim it. + * short enough that Stage 1 / the hard window wall never re-trim it. 
*/ const ELIDED_PLACEHOLDER_PREFIX = '[Tool result for "'; @@ -270,16 +270,16 @@ export type EditToolResultsResult = { }; /** - * Stage 0 (Chunk 14 Task 2 — context editing; Anthropic `clear_tool_uses` + * Stage 0 (ADR-0012 §Stage 0 — context editing; Anthropic `clear_tool_uses` * equivalent): replaces the `output` of OLD bulky tool-result parts with a short * placeholder, keeping the tool part itself (pairing) and ALL text parts intact. * Pure + deterministic — no model call, recomputed from raw messages each turn by - * recency, so it needs no durable state (P1: raw `chat.messages` is untouched, the + * recency, so it needs no durable state (ADR-0012 §View, not delete: raw `chat.messages` is untouched, the * full result stays for UI/audit). * * Recency is by COUNT of tool results (we have no clean turn id): the last * `keepRecentToolResults` results are exempt, and the newest message is exempt - * regardless (same invariant as option D, Task 1). A result is elided only when + * regardless (same invariant as ADR-0012 §Hard window wall). A result is elided only when * its serialized `output` exceeds `minEditableToolChars` — the size gate ≈ * Anthropic's `clear_at_least`, so trivial results never churn the prompt cache. * @@ -303,7 +303,7 @@ export function editToolResults( }); // Candidates for elision = all but the last `keepRecentToolResults`; the newest - // MESSAGE is exempt regardless (decision 5 / option-D invariant). Decide the + // MESSAGE is exempt regardless (ADR-0012 §Hard window wall invariant). Decide the // FULL elision policy here (recency + size gate + idempotency + grow-guard) and // record the precomputed placeholder, so the rewrite map below fires only when // there is real work — and never allocates a copy for a pure no-op. @@ -369,29 +369,30 @@ export function editToolResults( } /** Builds a readable transcript of UIMessages for the summarizer. 
*/ -function renderUIMessages(messages: PlatypusUIMessage[]): string { - return messages - .map((m) => { - const text = (m.parts ?? []) - .map((p) => { - const ap = p as { type: string; text?: string; output?: unknown }; - if (ap.type === "text") return ap.text ?? ""; - if (ap.type === "dynamic-tool" || ap.type.startsWith("tool-")) { - const out = - typeof ap.output === "string" - ? ap.output - : ap.output !== undefined - ? JSON.stringify(ap.output) - : ""; - return `[tool ${ap.type}] ${softTrim(out, 200)}`; - } - return ""; - }) - .filter(Boolean) - .join("\n"); - return `${m.role}: ${text}`; - }) - .join("\n\n"); +/** Renders each message to its own transcript string (one entry per message), so + * the map-reduce summarizer can chunk on message boundaries and never split a + * single message mid-content (ADR-0012 §Tier 1 map-reduce). */ +function renderUIMessageList(messages: PlatypusUIMessage[]): string[] { + return messages.map((m) => { + const text = (m.parts ?? []) + .map((p) => { + const ap = p as { type: string; text?: string; output?: unknown }; + if (ap.type === "text") return ap.text ?? ""; + if (ap.type === "dynamic-tool" || ap.type.startsWith("tool-")) { + const out = + typeof ap.output === "string" + ? ap.output + : ap.output !== undefined + ? JSON.stringify(ap.output) + : ""; + return `[tool ${ap.type}] ${softTrim(out, 200)}`; + } + return ""; + }) + .filter(Boolean) + .join("\n"); + return `${m.role}: ${text}`; + }); } export type UICompactOptions = { @@ -403,14 +404,14 @@ export type UICompactOptions = { * Defaults to minPrunableChars * 5 when omitted. */ minRecentPrunableChars?: number; /** - * The HARD window wall (Chunk 14 Task 1, option D): the kept view's tokens + * The HARD window wall (ADR-0012 §Hard window wall): the kept view's tokens * above which the call would actually overflow (already net of per-turn * overhead by the caller). 
Recent (kept) tool results are trimmed ONLY when * the kept view breaches this wall — a mere `targetTokens` (hysteresis) miss * is cheap (it re-compacts next turn) and is not worth gutting active data the * user is asking about. The single newest message is always exempt regardless. * When omitted, recent results are always trimmed once over target (the - * pre-option-D behaviour) — safer than never trimming for callers that cannot + * behaviour predating ADR-0012 §Hard window wall) — safer than never trimming for callers that cannot * supply the wall. */ inputBudget?: number; @@ -418,17 +419,17 @@ export type UICompactOptions = { /** Existing durable summary to fold the new prefix into (incremental). */ priorSummary?: string | null; summarize: Summarize; - /** Token budget of one summarize call; larger prefixes are map-reduced (M1). */ + /** Token budget of one summarize call; larger prefixes are map-reduced (ADR-0012 §Tier 1 (summarizer model & map-reduce)). */ summarizerWindow?: number; /** * Bypass the no-op estimate gate and force compaction even when char/4 says - * we are within budget. Used for dirty-forced Tier 1 (§E/RV3): recovery sets + * we are within budget. Used for dirty-forced Tier 1 (ADR-0012 §Recovery): recovery sets * the dirty flag AFTER a provider rejection, so the estimator already failed; * re-using it as the no-op gate causes an infinite overflow→dirty→no-op loop. */ force?: boolean; /** - * Pre-computed estimate of `messages` (RV9). The caller's trigger projection + * Pre-computed estimate of `messages`. The caller's trigger projection * already ran the char/4 pass over this exact set, so reuse it instead of * re-estimating the full history a second time on the hot path. */ @@ -444,16 +445,52 @@ export type UICompactionResult = { watermarkId: string | null; messagesDropped: number; usedModelCall: boolean; - /** Post-compaction estimate incl. the summary — should be ≤ targetTokens (C2). */ + /** Post-compaction estimate incl. 
the summary — should be ≤ targetTokens (ADR-0012 §Tier 1 (hysteresis)). */ estimatedTokens: number; }; /** * Summarizes a prefix transcript, map-reducing when it exceeds the summarizer's - * own window (drift M1 — a huge cold-start history can't be sent whole). + * own window (ADR-0012 §Tier 1 (summarizer model & map-reduce) — a huge cold-start history can't be sent whole). + */ +/** + * Packs per-message transcript segments into chunks that each fit `windowTokens`, + * splitting only on MESSAGE boundaries — never mid-message. A lone segment larger + * than the window (a single oversized message) is char-sliced as a last resort, + * which is unavoidable for one message that cannot fit whole. */ +function packSegments(segments: string[], windowTokens: number): string[] { + const chunks: string[] = []; + let cur = ""; + const flush = () => { + if (cur) { + chunks.push(cur); + cur = ""; + } + }; + for (const seg of segments) { + if (textTokens(seg) > windowTokens) { + flush(); + const charBudget = windowTokens * CHARS_PER_TOKEN; + for (let i = 0; i < seg.length; i += charBudget) { + chunks.push(seg.slice(i, i + charBudget)); + } + continue; + } + const next = cur ? `${cur}\n\n${seg}` : seg; + if (textTokens(next) > windowTokens) { + flush(); + cur = seg; + } else { + cur = next; + } + } + flush(); + return chunks; +} + async function summarizePrefix( - prefixText: string, + segments: string[], priorSummary: string | null | undefined, summarize: Summarize, summarizerWindow: number | undefined, @@ -461,27 +498,39 @@ async function summarizePrefix( const fold = (prior: string | null | undefined, body: string) => prior ? `Previous summary:\n${prior}\n\nNewer messages:\n${body}` : body; - if (!summarizerWindow || textTokens(prefixText) <= summarizerWindow) { - return summarize(fold(priorSummary, prefixText)); + // Single pass when everything — prior summary AND fold framing included — + // fits the window. 
Checking the *folded* size (not the bare body) closes the + // gap where a large prior summary overflowed an otherwise-fitting prefix. + const joined = segments.join("\n\n"); + if ( + !summarizerWindow || + textTokens(fold(priorSummary, joined)) <= summarizerWindow + ) { + return summarize(fold(priorSummary, joined)); } - // Map-reduce: chunk the prefix by character budget, summarize each, then - // summarize the concatenated chunk summaries folded with the prior summary. - const charBudget = summarizerWindow * CHARS_PER_TOKEN; - const chunks: string[] = []; - for (let i = 0; i < prefixText.length; i += charBudget) { - chunks.push(prefixText.slice(i, i + charBudget)); - } + // Map: summarize each window-sized chunk (message-boundary aligned). + const chunks = packSegments(segments, summarizerWindow); const chunkSummaries: string[] = []; for (const chunk of chunks) chunkSummaries.push(await summarize(chunk)); - return summarize(fold(priorSummary, chunkSummaries.join("\n"))); + + // Reduce: the joined chunk summaries (+ prior) can THEMSELVES exceed the window + // when there are many chunks, so recurse rather than summarizing them whole — + // the reduce step must never re-overflow (ADR-0012 §Tier 1 map-reduce). Each + // pass shrinks the segment count, so this converges. + return summarizePrefix( + chunkSummaries, + priorSummary, + summarize, + summarizerWindow, + ); } /** * Tier 1 (durable) compaction over UIMessages. Stage 1 prunes; if that reaches * the target, no model call is made and the prefix stays (lighter). Otherwise * Stage 2 summarizes the prefix into one synthetic summary and drops it from the - * model view. Raw messages are never mutated by the caller (P1 — this returns a + * model view. Raw messages are never mutated by the caller (ADR-0012 §View, not delete — this returns a * view). 
*/ export async function compactUIMessages( @@ -493,15 +542,15 @@ export async function compactUIMessages( const estimate = (msgs: PlatypusUIMessage[]) => estimateTokens(uiMessagesToCountUnits(msgs, provider)); - // RV9: reuse the caller's already-computed estimate of `messages` rather than + // Reuse the caller's already-computed estimate of `messages` rather than // re-running the full char/4 pass on the hot path. const initialEstimate = opts.knownEstimate ?? estimate(messages); // No-op when already within target (incl. the existing summary). This is what - // makes a follow-up turn after compaction NOT re-fire (hysteresis, C2). + // makes a follow-up turn after compaction NOT re-fire (hysteresis, ADR-0012 §Tier 1 (hysteresis)). // Bypassed when `force` is set — recovery sets the dirty flag AFTER a provider // rejection, so the estimator already proved wrong; using it as a no-op gate - // causes an infinite overflow→dirty→no-op loop (RV3). + // causes an infinite overflow→dirty→no-op loop (ADR-0012 §Recovery). if (!opts.force && initialEstimate + priorTokens <= opts.targetTokens) { return { keptMessages: messages, @@ -539,7 +588,7 @@ export async function compactUIMessages( // Past this point we are over target. Recent (kept) messages stay in the model // view, so extreme outliers (e.g. large MCP tool dumps) bloat tokensAfter. - // Option D (Chunk 14 Task 1): trim them ONLY when the kept view would breach + // The hard window wall (ADR-0012 §Hard window wall): trim them ONLY when the kept view would breach // the hard window wall (`inputBudget`); a soft `targetTokens` miss is left at // full fidelity and just re-compacts next turn (cheap). The newest message is // always exempt — it is the data the current turn is actively about. @@ -557,11 +606,11 @@ export async function compactUIMessages( }); return { messages, changed }; }; - // Decides whether to keep `recent` verbatim or trim it (option D). 
Returns the + // Decides whether to keep `recent` verbatim or trim it (ADR-0012 §Hard window wall). Returns the // kept messages and their token estimate (reused for `afterEstimate` so the // recent set is never re-estimated). `fixedTokens` is the kept view's NON-recent // part (pruned prefix and/or folded summary). When `inputBudget` is omitted the - // wall is unknown → always trim once over target (pre-option-D guard). + // wall is unknown → always trim once over target (guard predating ADR-0012 §Hard window wall). const keepRecentWithinWall = ( fixedTokens: number, recentMsgs: PlatypusUIMessage[], @@ -583,7 +632,7 @@ export async function compactUIMessages( // Warn only when the kept view still breaches the HARD wall after trimming — // i.e. recent genuinely couldn't be brought under the window (one oversized - // result; Task 3 ingestion-cap territory). Post-option-D a soft `targetTokens` + // result; ingestion-cap territory). Under ADR-0012 §Hard window wall a soft `targetTokens` // miss is by design (recent kept verbatim below the wall), so it is NOT a // warning. Falls back to the old `target * 2` heuristic when no wall is supplied. const warnIfOverWall = (afterEstimate: number) => { @@ -604,7 +653,7 @@ export async function compactUIMessages( } }; - // RV4: nothing to summarize when the prefix is empty (history fits within + // ADR-0012 §Tier 1: nothing to summarize when the prefix is empty (history fits within // keepRecentMessages). Also bail when the boundary message has no id — we // cannot anchor a watermark there, and committing a watermark:null + // non-null summary would orphan the summary (viewAfterWatermark ignores @@ -630,7 +679,7 @@ export async function compactUIMessages( // Stage 2 — summarize the pruned prefix into one synthetic summary. 
const summaryText = await summarizePrefix( - renderUIMessages(prunedPrefix), + renderUIMessageList(prunedPrefix), opts.priorSummary, opts.summarize, opts.summarizerWindow, @@ -682,7 +731,7 @@ function pruneModelMessage( }; } } - // RV5: @ai-sdk/mcp emits {type:"content"} for essentially every MCP tool + // ADR-0012 §Tier 1 (Stage 1 prune): @ai-sdk/mcp emits {type:"content"} for essentially every MCP tool // result. Without this branch Stage 1 reclaims zero tokens from the bulkiest // payloads and their text is invisible to the summarizer. if (output.type === "content" && Array.isArray(output.value)) { @@ -713,40 +762,39 @@ function pruneModelMessage( return { ...message, content }; } -function renderModelMessages(messages: ModelMessage[]): string { - return messages - .map((m) => { - if (typeof m.content === "string") return `${m.role}: ${m.content}`; - const text = m.content - .map((p) => { - if (p.type === "text") return p.text; - if (p.type === "tool-call") return `[tool-call ${p.toolName}]`; - if (p.type === "tool-result") { - const o = p.output; - let v: string; - if (o.type === "text" || o.type === "error-text") { - v = o.value; - } else if (o.type === "json" || o.type === "error-json") { - v = JSON.stringify(o.value); - } else if (o.type === "content") { - // RV5: extract text items from content-type MCP output (RV5). - type ContentItem = { type: string; text?: string }; - v = (o.value as ContentItem[]) - .filter((i) => i.type === "text") - .map((i) => i.text ?? "") - .join("\n"); - } else { - v = ""; - } - return `[tool-result] ${softTrim(v, 200)}`; +/** Per-message transcript strings (one entry per message). See renderUIMessageList. 
*/ +function renderModelMessageList(messages: ModelMessage[]): string[] { + return messages.map((m) => { + if (typeof m.content === "string") return `${m.role}: ${m.content}`; + const text = m.content + .map((p) => { + if (p.type === "text") return p.text; + if (p.type === "tool-call") return `[tool-call ${p.toolName}]`; + if (p.type === "tool-result") { + const o = p.output; + let v: string; + if (o.type === "text" || o.type === "error-text") { + v = o.value; + } else if (o.type === "json" || o.type === "error-json") { + v = JSON.stringify(o.value); + } else if (o.type === "content") { + // ADR-0012 §Tier 1 (Stage 1 prune): extract text items from content-type MCP output. + type ContentItem = { type: string; text?: string }; + v = (o.value as ContentItem[]) + .filter((i) => i.type === "text") + .map((i) => i.text ?? "") + .join("\n"); + } else { + v = ""; } - return ""; - }) - .filter(Boolean) - .join("\n"); - return `${m.role}: ${text}`; - }) - .join("\n\n"); + return `[tool-result] ${softTrim(v, 200)}`; + } + return ""; + }) + .filter(Boolean) + .join("\n"); + return `${m.role}: ${text}`; + }); } /** A synthetic summary as a model message. User-role + clear framing is the most @@ -823,7 +871,7 @@ export async function compactModelMessages( pruneModelMessage(m, opts.minPrunableChars), ); const prunedAll = [...prunedPrefix, ...recent]; - // Force-guarded like gate 1 (RV3): when recovery forces a trim the provider + // Force-guarded like gate 1 (ADR-0012 §Recovery): when recovery forces a trim the provider // already rejected this prompt, so the estimator proved wrong — re-trusting // it here would return a byte-identical prompt and burn the single retry. 
if (!opts.force && estimate(prunedAll) <= opts.targetTokens) { @@ -835,7 +883,7 @@ export async function compactModelMessages( }; } - // RV4 (model-side): nothing to summarize when the prefix is empty (recent + // ADR-0012 §Tier 1 (model-side): nothing to summarize when the prefix is empty (recent // alone exceeds keepRecentMessages). Summarizing an empty prefix would add a // synthetic message and GROW the prompt — never converges. Surface the // overflow instead (recovery retries once, then propagates). @@ -850,7 +898,7 @@ export async function compactModelMessages( // Stage 2 — summarize the pruned prefix into one synthetic message. const summaryText = await summarizePrefix( - renderModelMessages(prunedPrefix), + renderModelMessageList(prunedPrefix), null, opts.summarize, opts.summarizerWindow, @@ -870,15 +918,15 @@ export async function compactModelMessages( // `applyTier1Compaction` is the durable, cross-turn entry point invoked from // `prepareChatTurn`. It is dependency-injected (store + summarizer) so it is // unit-testable without standing up the full turn machinery. It: -// 1. Reconstructs the compacted VIEW from persisted state every turn (P1) — +// 1. Reconstructs the compacted VIEW from persisted state every turn (ADR-0012 §View, not delete) — // drop messages up to the watermark, re-inject the stored summary. // 2. Triggers a fresh compaction when the projected size crosses the trigger -// ratio, OR when `compactionDirty` forces it (recovery hand-off, §E). +// ratio, OR when `compactionDirty` forces it (recovery hand-off, ADR-0012 §Recovery). // 3. Persists any new summary/watermark + clears dirty via the single CAS -// writer (P3), the loser skipping safely on contention (R4). +// writer (ADR-0012 §One durable writer), the loser skipping safely on contention. // =========================================================================== -/** Resolved per-turn compaction config (§G), defaults applied. 
*/ +/** Resolved per-turn compaction config (ADR-0012 §Config & kill switch), defaults applied. */ export type CompactionConfig = { compactionEnabled: boolean; triggerRatio: number; @@ -890,7 +938,7 @@ export type CompactionConfig = { * Stage 2 summarization. Higher than minPrunableChars — we trim extreme * outliers (e.g. huge MCP tool dumps) without destroying useful context. */ minRecentPrunableChars: number; - /** Stage 0 context editing (Chunk 14 Task 2): elide OLD bulky tool results to a + /** Stage 0 context editing (ADR-0012 §Stage 0 — context editing): elide OLD bulky tool results to a * placeholder before the trigger check, so a leaned view can avoid summarizing * entirely. Gated alongside the COMPACTION_ENABLED kill switch. */ contextEditingEnabled: boolean; @@ -923,7 +971,7 @@ export type Budget = { }; /** - * Budget math (drift C3): the trigger/target are fractions of the INPUT budget — + * Budget math (ADR-0012 §Tier 1 (budget math)): the trigger/target are fractions of the INPUT budget — * the window minus the output reservation and a safety headroom — not of the raw * window. When the resolved max output is unknown, reserve a conservative slice. */ @@ -934,7 +982,7 @@ export function computeBudget( ): Budget { const rawOutputReserve = maxOutputTokens ?? Math.min(4096, Math.floor(contextWindow * 0.25)); - // Cap the output reservation at half the window (A6). litellm's + // Cap the output reservation at half the window (ADR-0012 §Tier 1 (budget math)). litellm's // `max_input_tokens` (which feeds `contextWindow`) is already input-scoped for // some providers, so subtracting a large `max_output_tokens` again can collapse // `inputBudget` toward 1 — making trigger/target ≈ 0 and thrashing. 
Capping @@ -956,22 +1004,22 @@ export function computeBudget( } /** - * First-turn safety margin on the char/4 projection (drift M2): char/4 + * First-turn safety margin on the char/4 projection (ADR-0012 §Token estimation (cold-start margin)): char/4 * under-counts CJK, dense JSON, and tool chatter, and on a cold start there is * no provider-reported `usage.inputTokens` to correct it. */ export const COLD_START_MARGIN = 1.15; /** - * The Tier 1 trigger projection (drift C1): what THIS turn is about to put on + * The Tier 1 trigger projection (ADR-0012 §Tier 1 (trigger projection)): what THIS turn is about to put on * the wire, not just the stored messages. `overheadTokens` carries the * estimated system prompt + tool schemas + skill payload — invisible to a * message-only estimate but sent to the model on every turn (the observed * live-test gap: provider reported 8888 input tokens vs ~986 message-only). * `lastInputTokens` is the provider-reported count from the prior turn — the - * corrective baseline for turns ≥ 2 (threaded in the §H usage-metadata chunk). + * corrective baseline for turns ≥ 2 (threaded in the ADR-0012 §Context-usage ring usage-metadata chunk). * When it is absent the whole char/4 projection is inflated by - * {@link COLD_START_MARGIN} (M2). + * {@link COLD_START_MARGIN} (ADR-0012 §Token estimation (cold-start margin)). */ export function projectTier1Tokens(args: { messageTokens: number; @@ -981,7 +1029,7 @@ export function projectTier1Tokens(args: { }): number { const charBased = args.messageTokens + args.priorSummaryTokens + (args.overheadTokens ?? 0); - // Treat a non-positive count as "no baseline" (A1): some OpenAI-compatible / + // Treat a non-positive count as "no baseline" (ADR-0012 §Tier 1 (trigger projection)): some OpenAI-compatible / // vLLM gateways omit `usage.inputTokens`, which we persist as // `contextTokens = 0`. 
A bare `== null` check would let that 0 slip through — // skipping the cold-start margin AND no-op-ing the `max()` below — leaving the @@ -1010,7 +1058,7 @@ export function summaryUIMessage(text: string): PlatypusUIMessage { } as PlatypusUIMessage; } -/** Fail-loud event so the transcript shows compaction happened (§C). */ +/** Fail-loud event so the transcript shows compaction happened (ADR-0012 §Tier 1). */ export type CompactionEvent = { type: "context-compacted"; messagesDropped: number; @@ -1020,7 +1068,7 @@ export type CompactionEvent = { export type Tier1Input = { chatId: string; - /** Full durable history (post-`inlineFileUrls`, drift T2). */ + /** Full durable history (post-`inlineFileUrls`, ADR-0012 §Token estimation). */ messages: PlatypusUIMessage[]; state: CompactionState; budget: Budget; @@ -1031,12 +1079,12 @@ export type Tier1Input = { summarizerWindow?: number; /** * Estimated tokens of the per-turn payload that is NOT in `messages` — - * system prompt, tool schemas, skill list (drift C1). Counted toward the + * system prompt, tool schemas, skill list (ADR-0012 §Tier 1 (trigger projection)). Counted toward the * trigger and subtracted from the compaction target (compaction cannot - * shrink it, so hysteresis must leave room for it — C2). + * shrink it, so hysteresis must leave room for it — ADR-0012 §Tier 1 (hysteresis)). */ overheadTokens?: number; - /** Provider-reported `usage.inputTokens` from the prior turn (C1, via §H). */ + /** Provider-reported `usage.inputTokens` from the prior turn (ADR-0012 §Tier 1 (trigger projection), via ADR-0012 §Context-usage ring). */ lastInputTokens?: number; onEvent?: (event: CompactionEvent) => void; }; @@ -1048,14 +1096,14 @@ export type CompactionTrace = { summaryExcerpt?: string; }; -/** Tool name for the synthetic compaction-trace tool-call/result pair (§K/11c). +/** Tool name for the synthetic compaction-trace tool-call/result pair (ADR-0012 §Compaction trace in the timeline). 
* Shared by the stream-trace producer (agent-runner), the strip filter that - * keeps it out of the model payload, the §J persisted-message builder, and the + * keeps it out of the model payload, the ADR-0012 §Force-compact on demand persisted-message builder, and the * frontend display-name mapping. */ export const COMPACT_CONTEXT_TOOL_NAME = "compact_context"; /** Builds a standalone synthetic assistant message carrying the compaction - * trace as a `compact_context` tool-call/result pair (§J — forced compaction + * trace as a `compact_context` tool-call/result pair (ADR-0012 §Force-compact on demand — forced compaction * has no live stream to inject into, so the trace is persisted as its own * message instead). The message is always appended ABOVE the watermark, so it * is never itself summarized; the strip filter keeps it out of the model @@ -1092,7 +1140,7 @@ export type Tier1Output = { commit?: CommitResult; /** * Present ONLY when a model summary was produced this turn — the user-visible - * "compaction happened" signal (§K/11c). Deliberately undefined for + * "compaction happened" signal (ADR-0012 §Compaction trace in the timeline). Deliberately undefined for * prune-only and force-dirty-within-target no-op turns: those drop 0 messages * and have no excerpt, so a trace would render an empty/confusing timeline * entry. @@ -1112,7 +1160,7 @@ function viewAfterWatermark( const idx = messages.findIndex((m) => m.id === state.summaryWatermark); if (idx === -1) { // Watermark message is gone (edited/deleted before invalidation landed): - // distrust the summary and fall back to the full history (defensive C4). + // distrust the summary and fall back to the full history (defensive ADR-0012 §Summary invalidation). return { afterWatermark: messages, priorSummary: null }; } return { @@ -1131,13 +1179,13 @@ export async function applyTier1Compaction( const { afterWatermark, priorSummary } = viewAfterWatermark(messages, state); const priorSummaryTokens = priorSummary ? 
textTokens(priorSummary) : 0; - // Stage 0 — context editing (Chunk 14 Task 2): elide OLD bulky tool results to + // Stage 0 — context editing (ADR-0012 §Stage 0 — context editing): elide OLD bulky tool results to // placeholders BEFORE the trigger projection, so a leaned view can drop under // the trigger and skip summarization entirely. Pure/deterministic, no durable - // state (P1). Gated by the COMPACTION_ENABLED kill switch (recovery stays the - // net, P4) AND the per-feature `contextEditingEnabled`. Returns the same array + // state (ADR-0012 §View, not delete). Gated by the COMPACTION_ENABLED kill switch (recovery stays the + // net, ADR-0012 §Recovery is the net) AND the per-feature `contextEditingEnabled`. Returns the same array // reference when nothing qualified, so the no-op case re-estimates nothing. - // NB (plan decision 7): the elided placeholders also flow into the prefix that + // NB (ADR-0012 §Stage 0 — context editing): the elided placeholders also flow into the prefix that // Stage 2 would summarize, so a summarized result keeps only its placeholder — // an accepted fidelity trade-off (a 40k dump's head+tail is poor summary fodder // and the raw stays in the DB). @@ -1167,7 +1215,7 @@ export async function applyTier1Compaction( // The view that would be sent if we did nothing more this turn. const baseView = inject(priorSummary, editedView); const overheadTokens = input.overheadTokens ?? 0; - // RV9: compute the char/4 pass over the unsummarized view once and reuse it + // Compute the char/4 pass over the unsummarized view once and reuse it // for both the trigger projection and compactUIMessages' no-op gate. const messageTokens = estimate(editedView); const projected = projectTier1Tokens({ @@ -1206,7 +1254,7 @@ export async function applyTier1Compaction( } // Compaction can only shrink the messages, never the per-turn overhead, so - // the target the messages must fit in is reduced by it (C1/C2). 
When the + // the target the messages must fit in is reduced by it (ADR-0012 §Tier 1 (hysteresis)). When the // overhead alone exhausts the target, hysteresis is impossible — warn loudly // (compaction will re-fire every turn) but still compact: recovery is the // only other net. @@ -1218,7 +1266,7 @@ export async function applyTier1Compaction( ); } - // The hard wall the kept view must fit under (option D), net of the per-turn + // The hard wall the kept view must fit under (ADR-0012 §Hard window wall), net of the per-turn // overhead compaction cannot shrink — mirrors how effectiveTarget adjusts the // soft target. Recent tool results are trimmed only when this is breached. const effectiveInputBudget = Math.max(0, budget.inputBudget - overheadTokens); @@ -1233,27 +1281,27 @@ export async function applyTier1Compaction( priorSummary, summarize: input.summarize, summarizerWindow: input.summarizerWindow, - // When dirty-forced the estimator already proved wrong (RV3): bypass the + // When dirty-forced the estimator already proved wrong (ADR-0012 §Recovery): bypass the // no-op gate so recovery's dirty flag actually shrinks the history. force: forceCompact, - // RV9: the no-op gate estimates this exact set; reuse the value above. + // The no-op gate estimates this exact set; reuse the value above. knownEstimate: messageTokens, }); const view = inject(result.summaryText ?? priorSummary, result.keptMessages); - // Persist through the single CAS writer (P3). The decision is gated on the + // Persist through the single CAS writer (ADR-0012 §One durable writer). The decision is gated on the // version we read; if a concurrent writer advanced it, we skip rather than - // recompute (R4 — the wasted summarize is bounded, never corrupting). The + // recompute (the wasted summarize is bounded, never corrupting). The // version-pinning gate is shared so both write paths decide identically. 
const capturedVersion = state.version; - // On a version mismatch we skip as "covered" WITHOUT clearing dirty. Plan T10 - // says "winner advanced → SKIP + clear-dirty", but that is only safe when the - // winner actually compacted. A concurrent invalidateCompaction also advances - // the version yet leaves dirty set on purpose (it resets the summary, it does - // not shrink history) — clearing dirty here would then drop the forced - // compaction the overflow demanded. Leaving dirty set is strictly safe: worst - // case is one extra compaction next turn. (Intentional deviation from T10.) + // On a version mismatch we skip as "covered" WITHOUT clearing dirty (ADR-0012 + // §One durable writer). Clearing on skip is only safe when the winner actually + // compacted; a concurrent invalidateCompaction also advances the version yet + // leaves dirty set on purpose (it resets the summary, it does not shrink + // history) — clearing dirty here would then drop the forced compaction the + // overflow demanded. Leaving dirty set is strictly safe: worst case is one + // extra compaction next turn. const pinnedWrite = (patch: WatermarkPatch) => commitWatermark(input.store, input.chatId, (latest) => latest.version === capturedVersion @@ -1263,7 +1311,7 @@ export async function applyTier1Compaction( let commit: CommitResult | undefined; if (result.usedModelCall) { - // Same-basis before/after for the user-visible reduction (B-F7): both are + // Same-basis before/after for the user-visible reduction: both are // char/4 message estimates plus the per-turn overhead. The trigger // `projected` mixes in the provider's `lastInputTokens` floor and is NOT // comparable to the message-only post estimate, so reporting it as "before" @@ -1303,7 +1351,7 @@ export async function applyTier1Compaction( // Only surface a trace when an actual model summary was produced. 
Prune-only // and force-dirty-within-target runs drop 0 messages with no excerpt — a - // trace there would be an empty, confusing timeline entry (§K/11c). + // trace there would be an empty, confusing timeline entry (ADR-0012 §Compaction trace in the timeline). const compactionTrace: CompactionTrace | undefined = result.usedModelCall && result.summaryText ? { @@ -1322,7 +1370,7 @@ export async function applyTier1Compaction( /** * Detects which summarized messages (at/below the watermark) the freshly - * submitted history changed or dropped — the C4 trigger. Because the client + * submitted history changed or dropped — the ADR-0012 §Summary invalidation trigger. Because the client * resubmits the full message array each turn (there is no separate edit/delete * endpoint), divergence is found by comparing the persisted canonical history * against the incoming one up to the watermark. Returns the ids that an @@ -1350,10 +1398,10 @@ export function affectedBelowWatermark( } /** - * Persists `compactionDirty = true` after a context-overflow recovery (§E, - * drift T3). Recovery never writes summary/watermark — it only flags; the next + * Persists `compactionDirty = true` after a context-overflow recovery (ADR-0012 §Recovery). + * Recovery never writes summary/watermark — it only flags; the next * `prepareChatTurn` sees the flag, forces Tier 1, and clears it inside the same - * CAS write that advances the watermark. Goes through the single writer (P3); + * CAS write that advances the watermark. Goes through the single writer (ADR-0012 §One durable writer); * already-dirty is a no-op. */ export async function setCompactionDirty( @@ -1390,12 +1438,12 @@ export async function invalidateCompaction( }); } -// --- Tier 2 in-turn compaction (§D, ADR-0009) --- +// --- Tier 2 in-turn compaction (ADR-0012 §Tier 2) --- /** - * Per-turn Tier 2 compaction context (§D). Null when the §G kill switch or + * Per-turn Tier 2 compaction context (ADR-0012 §Tier 2). 
Null when the kill switch (ADR-0012 §Config & kill switch) or * agent config disables proactive compaction. Sub-agents also receive Tier 2 - * (drift M3 — they have no durable history for Tier 1, but their tool loop + * (ADR-0012 §Sub-agents / §Tier 2 — they have no durable history for Tier 1, but their tool loop * can bloat intra-turn). */ export type Tier2Context = { @@ -1409,11 +1457,11 @@ export type Tier2Context = { }; /** - * Builds the Tier 2 in-turn compaction `prepareStep` callback (§D). Fires + * Builds the Tier 2 in-turn compaction `prepareStep` callback (ADR-0012 §Tier 2). Fires * before each step of a tool loop when the accumulated model messages exceed * `triggerTokens` — compacts via `compactModelMessages` and returns the * trimmed messages. Returns `undefined` when below the threshold so the SDK - * proceeds unchanged (drift m3: no per-step overhead when the loop is small). + * proceeds unchanged (ADR-0012 §Sub-agents / §Tier 2: no per-step overhead when the loop is small). */ export function buildTier2PrepareStep(ctx: Tier2Context): PrepareStepFunction { return async ({ messages }) => { @@ -1429,7 +1477,7 @@ export function buildTier2PrepareStep(ctx: Tier2Context): PrepareStepFunction { imageProvider: ctx.imageProvider, summarize: ctx.summarize, summarizerWindow: ctx.summarizerWindow, - // Reuse the trigger-check estimate; skips a redundant full pass (RV9). + // Reuse the trigger-check estimate; skips a redundant full pass.
knownEstimate: estimate, }); diff --git a/apps/backend/src/runs/context-window.test.ts b/apps/backend/src/runs/context-window.test.ts index cac0228d..99b5bbba 100644 --- a/apps/backend/src/runs/context-window.test.ts +++ b/apps/backend/src/runs/context-window.test.ts @@ -38,7 +38,7 @@ const openai: ProviderWindowInput = { apiKey: "sk-x", }; -describe("lookupRegistry — key normalization (drift T4)", () => { +describe("lookupRegistry — key normalization (ADR-0012 §Window resolution (key normalization))", () => { it("exact match", () => { expect(lookupRegistry(REGISTRY, "gpt-4o")?.max_input_tokens).toBe(128000); }); @@ -110,7 +110,7 @@ describe("resolveContextWindow — resolution order", () => { }); }); - it("ignores litellm max_tokens (output cap, not window) → default (drift F1)", async () => { + it("ignores litellm max_tokens (output cap, not window) → default (ADR-0012 §Window resolution)", async () => { // "legacy-model" has only max_tokens; that is the OUTPUT cap, so it must NOT // be read as the context window. Falls through to the conservative default. const r = resolver(); @@ -119,7 +119,7 @@ describe("resolveContextWindow — resolution order", () => { expect(out.source).toBe("default"); }); - it("merges a maxOutputTokens-only override onto a registry window (drift F5)", async () => { + it("merges a maxOutputTokens-only override onto a registry window (ADR-0012 §Window resolution)", async () => { const r = resolver(); const out = await r.resolve( { ...openai, modelMeta: { "gpt-4o": { maxOutputTokens: 999 } } }, @@ -133,7 +133,7 @@ describe("resolveContextWindow — resolution order", () => { }); }); - it("4. conservative default + source=default on a MISS (drift T6)", async () => { + it("4. 
conservative default + source=default on a MISS (ADR-0012 §Context-usage ring)", async () => { const r = resolver(); const out = await r.resolve({ ...openai }, "unknown-model-zzz"); expect(out).toEqual({ @@ -257,7 +257,7 @@ describe("API auto-detect parsers", () => { }); }); -describe("registry load failure (drift F3)", () => { +describe("registry load failure (ADR-0012 §Window resolution)", () => { it("a throwing loader degrades to empty registry → default, no reject", async () => { const r = new ContextWindowResolver({ loadRegistry: async () => { @@ -270,7 +270,7 @@ describe("registry load failure (drift F3)", () => { }); }); -describe("cache + evict (drift T5)", () => { +describe("cache + evict (ADR-0012 §Window resolution (caching & eviction))", () => { it("caches within the TTL (one probe), evict forces a re-probe", async () => { const httpGetJson = vi .fn() @@ -320,7 +320,7 @@ describe("cache + evict (drift T5)", () => { expect(httpGetJson).toHaveBeenCalledTimes(2); }); - it("RV7d: a default-source result is cached briefly, not for the full TTL", async () => { + it("a default-source result is cached briefly, not for the full TTL", async () => { let now = 0; // API probe never yields a window and the model is not in the registry → // every resolve falls to source:"default". diff --git a/apps/backend/src/runs/context-window.ts b/apps/backend/src/runs/context-window.ts index 430fbec1..2e698a0d 100644 --- a/apps/backend/src/runs/context-window.ts +++ b/apps/backend/src/runs/context-window.ts @@ -1,5 +1,5 @@ /** - * Context-window resolution (context-compaction-plan §A). + * Context-window resolution (ADR-0012 §Window resolution). * * Resolves the usable context window (and max output tokens) for a * provider+model, in this order: @@ -11,14 +11,17 @@ * 4. Conservative default — {@link DEFAULT_CONTEXT_WINDOW} (8192). 
* * A fall-through to the default, and every registry key MISS, is `log.warn`'d: - * the window is then unknown and the ring must render neutral (drift T6). + * the window is then unknown and the ring must render neutral + * (ADR-0012 §Context-usage ring). * * Results are cached per `providerId:modelId` with a TTL. Editing a `modelMeta` * override must call {@link ContextWindowResolver.evict} immediately so the - * override takes effect without waiting for the TTL (drift T5). + * override takes effect without waiting for the TTL + * (ADR-0012 §Window resolution (caching & eviction)). * * The registry lookup and HTTP probe are injected so this module is unit - * testable without network or a vendored multi-MB JSON file (drift T4 cases are + * testable without network or a vendored multi-MB JSON file + * (ADR-0012 §Window resolution (key normalization) cases are * exercised against small fixture registries). */ @@ -31,14 +34,14 @@ export const DEFAULT_CONTEXT_WINDOW = 8192; export const DEFAULT_CACHE_TTL_MS = 60 * 60 * 1000; // 1 hour /** - * Short TTL for `source: "default"` resolutions (defect 6 / RV7d). A registry + * Short TTL for `source: "default"` resolutions (ADR-0012 §Window resolution (caching & eviction)). A registry * MISS or a transient API failure falls to 8192; caching that for the full hour * pins a wrong window long after the blip clears. A 60 s TTL lets the next turn * re-probe while still collapsing a burst of same-turn lookups. */ export const DEFAULT_SOURCE_CACHE_TTL_MS = 60 * 1000; // 1 minute -/** Where a resolved window came from — drives ring neutrality (T6). */ +/** Where a resolved window came from — drives ring neutrality (ADR-0012 §Context-usage ring). 
*/ export type WindowSource = "override" | "api" | "registry" | "default"; export type ResolvedWindow = { @@ -85,7 +88,7 @@ export type ResolverDeps = { }; // --------------------------------------------------------------------------- -// litellm registry key normalization (drift T4) +// litellm registry key normalization (ADR-0012 §Window resolution (key normalization)) // --------------------------------------------------------------------------- /** Strips a Bedrock ARN down to its `vendor.model` id, if it is one. */ @@ -138,7 +141,7 @@ export function lookupRegistry( // 6. family heuristic — longest registry key that is a proper prefix of the // id, separated by "-", ".", ":", or "/" so "gpt-4" does NOT match "gpt-4.5" - // (RV7b: raw startsWith caused gpt-4.5-preview to silently resolve via a + // (ADR-0012 §Window resolution (key normalization): raw startsWith caused gpt-4.5-preview to silently resolve via a // stale gpt-4 entry with a wrong 8192 window). // Case-insensitive so mixed-case registry keys ("Qwen/…", "meta-llama/…") // still match lowercase ids from providers that normalize model names. @@ -168,7 +171,7 @@ function windowFromRegistryEntry(entry: RegistryEntry): { } { // Only trust the explicit input limit. litellm's `max_tokens` is the OUTPUT // cap (not the context window); using it would silently under-size the window - // and cause constant over-compaction (drift F1). When `max_input_tokens` is + // and cause constant over-compaction (ADR-0012 §Window resolution). When `max_input_tokens` is // absent we return no window so the caller falls to the conservative default, // which at least surfaces a warn + neutral ring rather than a wrong number. return { @@ -296,7 +299,7 @@ async function detectViaApi( // Resolver (cache + evict) // --------------------------------------------------------------------------- -/** RV7d: 5 s hard cap so a hung provider endpoint never blocks turns for ~300 s. 
*/ +/** ADR-0012 §Window resolution (caching & eviction): 5 s hard cap so a hung provider endpoint never blocks turns for ~300 s. */ const API_DETECT_TIMEOUT_MS = 5000; const defaultHttpGetJson: HttpGetJson = async (url, headers) => { @@ -312,7 +315,7 @@ type CacheEntry = { value: ResolvedWindow; expiresAt: number }; export class ContextWindowResolver { #cache = new Map(); - /** RV7d: single-flight — concurrent callers for the same key share one fetch. */ + /** ADR-0012 §Window resolution (caching & eviction): single-flight — concurrent callers for the same key share one fetch. */ #inflight = new Map>(); #loadRegistry: () => Promise; #registry: Registry | undefined; @@ -329,7 +332,7 @@ export class ContextWindowResolver { this.#now = deps.now ?? (() => Date.now()); } - /** Drops all cached windows for a provider — call on `modelMeta` edit (T5). */ + /** Drops all cached windows for a provider — call on `modelMeta` edit (ADR-0012 §Window resolution (caching & eviction)). */ evict(providerId: string): void { for (const key of this.#cache.keys()) { if (key.startsWith(`${providerId}:`)) this.#cache.delete(key); @@ -344,7 +347,7 @@ export class ContextWindowResolver { async #registryEntry(modelId: string): Promise { if (this.#registry === undefined) { // A failing loader (bad vendored JSON, fs error) must not reject the whole - // resolution — degrade to an empty registry + warn (drift F3). + // resolution — degrade to an empty registry + warn (ADR-0012 §Window resolution). try { this.#registry = await this.#loadRegistry(); } catch (error) { @@ -366,7 +369,7 @@ export class ContextWindowResolver { const cached = this.#cache.get(cacheKey); if (cached && cached.expiresAt > this.#now()) return cached.value; - // RV7d: single-flight — reuse an in-flight promise rather than spawning a + // ADR-0012 §Window resolution (caching & eviction): single-flight — reuse an in-flight promise rather than spawning a // second fetch for the same key (cold-cache stampede protection). 
const existing = this.#inflight.get(cacheKey); if (existing) return existing; @@ -375,9 +378,9 @@ export class ContextWindowResolver { // Only write the cache if this promise is still the live in-flight one. // An evict() during the fetch deletes the inflight entry; without this // guard the resolving promise would repopulate the cache with the stale - // pre-update value and defeat the eviction for a full TTL (RV7c race). + // pre-update value and defeat the eviction for a full TTL (ADR-0012 §Window resolution (caching & eviction) race). if (this.#inflight.get(cacheKey) === promise) { - // RV7d / defect 6: a default-source result (MISS or transient API + // ADR-0012 §Window resolution (caching & eviction): a default-source result (MISS or transient API // failure) gets a short TTL so a blip doesn't pin 8192 for an hour. const ttl = value.source === "default" diff --git a/apps/backend/src/runs/litellm-registry.ts b/apps/backend/src/runs/litellm-registry.ts index bca74532..b260ffb2 100644 --- a/apps/backend/src/runs/litellm-registry.ts +++ b/apps/backend/src/runs/litellm-registry.ts @@ -337,7 +337,6 @@ const REGISTRY: Registry = { "Qwen/Qwen3-8B": { max_input_tokens: 131072, max_output_tokens: 8192 }, "Qwen/Qwen3-14B": { max_input_tokens: 131072, max_output_tokens: 8192 }, "Qwen/Qwen3-32B": { max_input_tokens: 131072, max_output_tokens: 8192 }, - "Qwen/Qwen3-72B": { max_input_tokens: 131072, max_output_tokens: 8192 }, }; /** Returns the built-in minimal registry. 
Async so the signature matches the diff --git a/apps/backend/src/runs/recovery.test.ts b/apps/backend/src/runs/recovery.test.ts index 372474ca..87d658a6 100644 --- a/apps/backend/src/runs/recovery.test.ts +++ b/apps/backend/src/runs/recovery.test.ts @@ -26,9 +26,9 @@ const apiError = (args: { responseBody: args.responseBody, }); -// --- isContextOverflowError — per-provider body matrix (drift T9) --------- +// --- isContextOverflowError — per-provider body matrix (ADR-0012 §Recovery) --------- -describe("isContextOverflowError (drift T9)", () => { +describe("isContextOverflowError (ADR-0012 §Recovery)", () => { it("matches the OpenAI phrasing + code", () => { const err = apiError({ statusCode: 400, @@ -111,7 +111,7 @@ describe("isContextOverflowError (drift T9)", () => { }); }); -// --- middleware: trim + retry-once (§E, drift T3) -------------------------- +// --- middleware: trim + retry-once (ADR-0012 §Recovery) -------------------------- type PromptMsg = { role: string; content: unknown }; @@ -121,7 +121,7 @@ const text = (role: "user" | "assistant", t: string): PromptMsg => ({ }); /** system + 2 big + 2 small messages: prune can't help (no tool results), so - * the trim must go through the shared summarize stage (drift T3). */ + * the trim must go through the shared summarize stage (ADR-0012 §Recovery). 
*/ const overflowPrompt = (): PromptMsg[] => [ { role: "system", content: "SYS" }, text("user", "X".repeat(4000)), @@ -172,7 +172,7 @@ const runWrapGenerate = ( ...args, }); -describe("contextOverflowRecoveryMiddleware (§E)", () => { +describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { it("trims via the shared compactor and retries exactly once on overflow", async () => { const markDirty = vi.fn(async () => undefined); const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); @@ -192,10 +192,10 @@ describe("contextOverflowRecoveryMiddleware (§E)", () => { expect(calls).toHaveLength(1); const retried = calls[0].prompt; - // System head pinned verbatim at the front (§C). + // System head pinned verbatim at the front (ADR-0012 §Tier 1). expect(retried[0]).toEqual({ role: "system", content: "SYS" }); - // The big prefix was replaced by the shared summary message (drift T3 — - // compactModelMessages' shape, not a bespoke trim). + // The big prefix was replaced by the shared summary message (ADR-0012 + // §Recovery — compactModelMessages' shape, not a bespoke trim). const summary = retried[1] as { content: Array<{ text: string }> }; expect(summary.content[0].text).toContain( "[Summary of earlier conversation]", @@ -222,7 +222,7 @@ describe("contextOverflowRecoveryMiddleware (§E)", () => { model, }), ).rejects.toBe(second); - // Flag persisted anyway: the NEXT turn must compact durably (drift T3). + // Flag persisted anyway: the NEXT turn must compact durably (ADR-0012 §Recovery). expect(markDirty).toHaveBeenCalledTimes(1); }); diff --git a/apps/backend/src/runs/recovery.ts b/apps/backend/src/runs/recovery.ts index b75669e6..ad94ad34 100644 --- a/apps/backend/src/runs/recovery.ts +++ b/apps/backend/src/runs/recovery.ts @@ -1,20 +1,20 @@ /** - * Context-overflow recovery (context-compaction-plan §E, principle P4). + * Context-overflow recovery (ADR-0012 §Recovery). 
* * Recovery is the NET, proactive compaction is the plan: even when Tier 1/2 are - * disabled (kill switch §G) or their estimates were wrong, a provider 400/413 + * disabled (kill switch — ADR-0012 §Config & kill switch) or their estimates were wrong, a provider 400/413 * "context too long" must not hard-fail the turn. The middleware here wraps the * language model so EVERY individual model call — the first call of a turn and * every later step of a tool loop, in both the stream and generate paths — gets * one trim-and-retry: * * 1. Detect the overflow ({@link isContextOverflowError}, per-provider body - * matrix — drift T9). + * matrix — ADR-0012 §Recovery). * 2. Persist `compactionDirty = true` through the single CAS writer so the - * NEXT `prepareChatTurn` forces a durable Tier 1 compaction (drift T3 — - * recovery never writes summary/watermark itself; it only flags). + * NEXT `prepareChatTurn` forces a durable Tier 1 compaction (ADR-0012 + * §Recovery — recovery never writes summary/watermark itself; it only flags). * 3. Trim in-memory via {@link compactModelMessages} — the shared Tier 2 - * adapter, NOT a bespoke trim (drift T3) — and retry the call once. + * adapter, NOT a bespoke trim (ADR-0012 §Recovery) — and retry the call once. * 4. A second failure propagates; {@link formatStreamError} in agent-runner * surfaces the "conversation too large" message. No infinite retry. * @@ -22,9 +22,10 @@ * a structural subset of `ModelMessage` for everything compaction touches * (roles, text / tool-call / tool-result / file parts, output wrappers), so the * prompt is passed to `compactModelMessages` directly rather than through a - * lossy converter — one estimator, one trimmer (P2/T3). The leading system - * message(s) are split off first and re-attached verbatim (§C: pin the system - * prompt; the summary must never swallow it). + * lossy converter — one estimator, one trimmer (ADR-0012 §One estimator / + * §Recovery). 
The leading system message(s) are split off first and re-attached + * verbatim (ADR-0012 §Tier 1: pin the system prompt; the summary must never + * swallow it). */ import { @@ -47,7 +48,7 @@ export type RecoveryContext = { imageProvider: ImageProvider; /** Trim down to this many tokens (the Tier 1 hysteresis target). */ targetTokens: number; - /** The configured keep-recent; recovery halves it (aggressive trim, §E). */ + /** The configured keep-recent; recovery halves it (aggressive trim, ADR-0012 §Recovery). */ keepRecentMessages: number; minPrunableChars: number; summarize: Summarize; @@ -62,7 +63,7 @@ export type RecoveryContext = { }; /** - * Per-provider context-overflow phrasings (drift T9). Matched against the + * Per-provider context-overflow phrasings (ADR-0012 §Recovery). Matched against the * error message AND raw response body, case-insensitive: * - OpenAI / vLLM / OpenAI-compatible: "This model's maximum context length is * N tokens…" + code "context_length_exceeded" @@ -110,7 +111,7 @@ export async function trimOverflowingPrompt( const rest = prompt.slice(systemEnd) as unknown as ModelMessage[]; const result = await compactModelMessages(rest, { - // Aggressive: halve the configured keep-recent (§E), floor of 2 so a + // Aggressive: halve the configured keep-recent (ADR-0012 §Recovery), floor of 2 so a // user/assistant pair survives. keepRecentMessages: Math.max(2, Math.ceil(ctx.keepRecentMessages / 2)), targetTokens: ctx.targetTokens, @@ -119,7 +120,7 @@ export async function trimOverflowingPrompt( summarize: ctx.summarize, summarizerWindow: ctx.summarizerWindow, // The provider already rejected this prompt, so the estimator is wrong; - // bypass the no-op gate or the retry will be byte-identical (RV3). + // bypass the no-op gate or the retry will be byte-identical (ADR-0012 §Recovery). 
force: true, }); @@ -155,7 +156,7 @@ export function contextOverflowRecoveryMiddleware( "context overflow detected; trimming and retrying once", ); - // Flag durable compaction for the NEXT turn first (drift T3) — even if the + // Flag durable compaction for the NEXT turn first (ADR-0012 §Recovery) — even if the // retry below fails, the next prepareChatTurn must force Tier 1. if (ctx.markDirty) { try { diff --git a/apps/backend/src/runs/token-estimate.test.ts b/apps/backend/src/runs/token-estimate.test.ts index d30c6551..710f4af5 100644 --- a/apps/backend/src/runs/token-estimate.test.ts +++ b/apps/backend/src/runs/token-estimate.test.ts @@ -39,7 +39,7 @@ function dataUrl(bytes: Uint8Array, mediaType = "image/png"): string { return `data:${mediaType};base64,${Buffer.from(bytes).toString("base64")}`; } -describe("estimateTokens (the single estimator, P2)", () => { +describe("estimateTokens (the single estimator, ADR-0012 §One estimator)", () => { it("applies char/4 to text only, rounding up", () => { const units: CountUnit[] = [ { role: "user", text: "abcdefgh", nonText: [] }, @@ -60,7 +60,7 @@ describe("estimateTokens (the single estimator, P2)", () => { }); }); -describe("modality table (drift T2 — never char/4 an image)", () => { +describe("modality table (ADR-0012 §Token estimation — never char/4 an image)", () => { it("anthropic: ceil(w*h/750)", () => { const units: CountUnit[] = [ { @@ -105,7 +105,7 @@ describe("modality table (drift T2 — never char/4 an image)", () => { expect(estimateTokens(noDims)).toBe(85); }); - it("missing dimensions use a pessimistic per-provider ceiling (A2)", () => { + it("missing dimensions use a pessimistic per-provider ceiling (ADR-0012 §Token estimation)", () => { // Providers with a real per-image cost would be UNDER-counted by the flat // 1200 default when bytes/dims are unavailable (hosted URL), so they fall to // a pessimistic ceiling near each provider's post-resize max instead. 
@@ -174,7 +174,7 @@ describe("parseImageDimensions (cheap header parse)", () => { }); }); -describe("MODEL_BOUND filter (drift T1 — UI-only parts excluded)", () => { +describe("MODEL_BOUND filter (ADR-0012 §One estimator — UI-only parts excluded)", () => { it("counts text but ignores reasoning / source / step-start / data parts", () => { const ui: PlatypusUIMessage[] = [ { @@ -195,7 +195,7 @@ describe("MODEL_BOUND filter (drift T1 — UI-only parts excluded)", () => { expect(units[0].nonText).toHaveLength(0); }); - it("only text/file UI part types are model-bound (RV10 — the documented set)", () => { + it("only text/file UI part types are model-bound (the documented set)", () => { expect([...MODEL_BOUND_UI_PART_TYPES]).toEqual(["text", "file"]); // The UI-only types the adapter must drop are NOT in the model-bound set. for (const uiOnly of [ @@ -210,7 +210,7 @@ describe("MODEL_BOUND filter (drift T1 — UI-only parts excluded)", () => { }); }); -describe("tool-result output variants (RV10 — model adapter)", () => { +describe("tool-result output variants (model adapter)", () => { const unit = (output: unknown): CountUnit => { const msg = { role: "tool", @@ -238,7 +238,7 @@ describe("tool-result output variants (RV10 — model adapter)", () => { }); }); -describe("adapter equality (drift T1 — one estimate across both shapes)", () => { +describe("adapter equality (ADR-0012 §One estimator — one estimate across both shapes)", () => { it("estimate(UI) === estimate(convertToModelMessages(UI)) exactly", async () => { const png = fakePng(128, 128); const ui: UIMessage[] = [ @@ -300,13 +300,13 @@ describe("imageProviderFor", () => { }); }); -// --- estimateOverheadTokens (drift C1) ------------------------------------- +// --- estimateOverheadTokens (ADR-0012 §Tier 1 (trigger projection)) -------- import { z } from "zod"; import { tool } from "ai"; import { estimateOverheadTokens } from "./token-estimate.ts"; -describe("estimateOverheadTokens (drift C1)", () => { 
+describe("estimateOverheadTokens (ADR-0012 §Tier 1 (trigger projection))", () => { it("counts the system prompt at char/4", () => { const sys = "S".repeat(400); expect(estimateOverheadTokens(sys, {})).toBe(100); @@ -365,11 +365,12 @@ describe("estimateOverheadTokens (drift C1)", () => { }), ]), ); - // The point of C1: this payload is large even with a short history. + // The point of ADR-0012 §Tier 1 (trigger projection): this payload is large + // even with a short history. expect(estimateOverheadTokens(sys, tools)).toBeGreaterThan(500); }); - it("is stable across repeated calls (RV9 schema-cache must not change counts)", () => { + it("is stable across repeated calls (schema-cache must not change counts)", () => { const sys = "system prompt"; const tools = { lookup: tool({ diff --git a/apps/backend/src/runs/token-estimate.ts b/apps/backend/src/runs/token-estimate.ts index 74eed36b..62ea28d3 100644 --- a/apps/backend/src/runs/token-estimate.ts +++ b/apps/backend/src/runs/token-estimate.ts @@ -1,19 +1,20 @@ /** - * The single token estimator (context-compaction-plan §B, principle P2). + * The single token estimator (ADR-0012 §One estimator). * * Token counting lives in **exactly one** function — {@link estimateTokens} — * over **one** neutral structure ({@link CountUnit}). Tier 1 operates on * UIMessages and Tier 2 on ModelMessages; both normalize into `CountUnit[]` via - * the adapters here, so the two tiers can never diverge on a count (drift T1). + * the adapters here, so the two tiers can never diverge on a count + * (ADR-0012 §One estimator). * * Hard rules baked in: * - **char/4 applies to text only.** Tool-call inputs and tool-result outputs * are text-like to the model, so they fold into a unit's `text`. Image / * binary bytes are NEVER char/4'd — they go through the modality table - * ({@link nonTextTokens}, drift T2). + * ({@link nonTextTokens}, ADR-0012 §Token estimation). 
* - **UI-only parts are excluded on both sides.** `reasoning`, `source-url`, * `source-document`, `step-start`, and `data-*` never reach the model, so - * they are dropped by both adapters (drift T1). + * they are dropped by both adapters (ADR-0012 §One estimator). * - The estimate is content-only — **no per-message role framing overhead** — * so the total is invariant to how messages are grouped. That is what lets * the UIMessage and ModelMessage adapters agree exactly even though @@ -21,8 +22,9 @@ * * The char/4 estimate runs every turn. The provider-reported * `usage.inputTokens` from the prior turn acts as a corrective baseline when - * available (`Tier1Input.lastInputTokens` — threaded by the §H usage-metadata - * chunk); until then the cold-start margin (M2) compensates for under-counts. + * available (`Tier1Input.lastInputTokens` — threaded per ADR-0012 + * §Context-usage ring); until then the cold-start margin + * (ADR-0012 §Token estimation (cold-start margin)) compensates for under-counts. */ import { @@ -40,7 +42,7 @@ export const CHARS_PER_TOKEN = 4; /** * Conservative flat cost for a non-text part whose true cost we cannot compute * (unknown provider, missing image dimensions, non-image binary file). Over- - * counting beats overflow (drift T2). + * counting beats overflow (ADR-0012 §Token estimation). */ export const DEFAULT_NONTEXT_TOKENS = 1200; @@ -48,7 +50,7 @@ export const DEFAULT_NONTEXT_TOKENS = 1200; const OPENAI_LOW_DETAIL_TOKENS = 85; /** - * No-dimension fallbacks for providers with a real per-image cost (A2). When the + * No-dimension fallbacks for providers with a real per-image cost (ADR-0012 §Token estimation). When the * bytes are absent (hosted http(s) URL — and note `inlineFileUrls` turns every * stored attachment into one) or the header can't be parsed, we have no pixels * to plug into the formula.
The flat {@link DEFAULT_NONTEXT_TOKENS} (1200) @@ -97,13 +99,14 @@ export type CountUnit = { /** * UIMessage part `type`s that reach the model and are therefore counted. Kept - * as data so the test can assert the UI-only parts are excluded (drift T1). + * as data so the test can assert the UI-only parts are excluded + * (ADR-0012 §One estimator). * Tool parts are matched separately by the `tool-`/`dynamic-tool` prefix. */ export const MODEL_BOUND_UI_PART_TYPES = ["text", "file"] as const; // --------------------------------------------------------------------------- -// The estimator (the one function — P2) +// The estimator (the one function — ADR-0012 §One estimator) // --------------------------------------------------------------------------- function nonTextTokens(part: NonTextPart): number { @@ -111,7 +114,7 @@ function nonTextTokens(part: NonTextPart): number { if (width == null || height == null) { // Dimensions unknown. OpenAI low-detail has a flat cost even without dims; - // providers with a real per-image cost get a pessimistic ceiling (A2); + // providers with a real per-image cost get a pessimistic ceiling (ADR-0012 §Token estimation); // everything else falls to the conservative default. if (provider === "openai" && detail === "low") return OPENAI_LOW_DETAIL_TOKENS; @@ -176,7 +179,8 @@ export const estimateTokens = (units: CountUnit[]): number => { /** * Deterministic JSON with sorted keys, so the same value serializes to the same * string from either adapter (the UIMessage and ModelMessage shapes must agree - * exactly — drift T1). Cheaper than guarding key order at every call site. + * exactly — ADR-0012 §One estimator). Cheaper than guarding key order at every + * call site. 
*/ export function stableStringify(value: unknown): string { if (value === null || typeof value !== "object") @@ -195,7 +199,8 @@ function isImageMediaType(mediaType: string | undefined): boolean { /** * Builds a {@link NonTextPart} for an image, parsing pixel dimensions from the - * bytes when available (drift T2: a cheap header read, no full decode). + * bytes when available (ADR-0012 §Token estimation: a cheap header read, no full + * decode). */ function imagePart( provider: ImageProvider, @@ -218,7 +223,7 @@ function binaryPart(): NonTextPart { /** * Reads pixel dimensions from PNG / JPEG headers without decoding the image. * Returns undefined for unrecognized formats or truncated data — the caller - * then falls to the conservative constant (drift T2). + * then falls to the conservative constant (ADR-0012 §Token estimation). */ export function parseImageDimensions( bytes: Uint8Array, @@ -245,7 +250,7 @@ export function parseImageDimensions( } const marker = bytes[offset + 1]; // 0xFF fill bytes pad before a real marker; consume one and re-read so a - // run of fill bytes doesn't get mistaken for a segment (RV10). + // run of fill bytes doesn't get mistaken for a segment. if (marker === 0xff) { offset++; continue; @@ -273,7 +278,7 @@ export function parseImageDimensions( return { width, height }; } // Standalone markers with no length payload: SOI(D8), EOI(D9), - // RSTn(D0-D7), TEM(01) (RV10). Skip the 2-byte marker. + // RSTn(D0-D7), TEM(01). Skip the 2-byte marker. if ( marker === 0xd8 || marker === 0xd9 || @@ -293,7 +298,7 @@ export function parseImageDimensions( } /** - * Upper bound on bytes decoded from a data URL for header parsing (RV9). PNG + * Upper bound on bytes decoded from a data URL for header parsing. PNG * dimensions live in the first 24 bytes; a JPEG SOF marker is almost always * within the first few KB. 
Decoding only a 64 KB prefix avoids materializing a * multi-MB image on every estimation pass — we never need the pixel data, only @@ -306,7 +311,7 @@ const HEADER_DECODE_MAX_B64_CHARS = Math.ceil(HEADER_DECODE_MAX_BYTES / 3) * 4; * Decodes the bytes behind a UIMessage file URL when it is a base64 data URL. * Hosted (http/https) URLs return undefined — we have no bytes in hand, so the * caller falls to the conservative constant. Only a bounded prefix is decoded - * (RV9) since the caller only reads image headers. + * since the caller only reads image headers. */ function bytesFromUrl(url: string): Uint8Array | undefined { const match = /^data:[^;,]*;base64,(.*)$/s.exec(url); @@ -364,14 +369,28 @@ function uiMessageToCountUnit( // Tool invocations (`tool-` and `dynamic-tool`) are model-bound and // text-like: fold their input + output into the char/4 blob. if (type === "dynamic-tool" || type.startsWith("tool-")) { - const tool = part as { input?: unknown; output?: unknown }; + const tool = part as { + input?: unknown; + output?: unknown; + errorText?: string; + }; if (tool.input !== undefined) text += stableStringify(tool.input); - if (tool.output !== undefined) text += stableStringify(tool.output); + // Count the output OR the error text — `convertToModelMessages` maps an + // `output-error` UI part to a `tool-result` with `output: {type:"error-text", + // value: errorText}`, which the model adapter counts via `toolResultOutputText`. + // Skipping errorText here would make the UI side count 0 for a failed tool call + // while the model side counts the error string — breaking the §One estimator + // equality (a tier could fire on a number the other never sees). 
+ if (tool.output !== undefined) { + text += stableStringify(tool.output); + } else if (tool.errorText !== undefined) { + text += stableStringify(tool.errorText); + } continue; } // Everything else (reasoning, source-url, source-document, step-start, - // data-*) is UI-only and excluded on both sides (drift T1). + // data-*) is UI-only and excluded on both sides (ADR-0012 §One estimator). } return { role: message.role, text, nonText }; @@ -394,7 +413,7 @@ export function uiMessagesToCountUnits( * behaviours exist: `execution-denied` carries a `reason`; every other variant * (`text` / `error-text` / `json` / `error-json` / `content`) carries a `value` * that is char/4'd via `stableStringify` — mirroring the UI adapter, which folds - * the raw output the same way (RV10: the old per-label switch collapsed to these + * the raw output the same way (the old per-label switch collapsed to these * two and carried an unreachable `default`). */ function toolResultOutputText(output: ToolResultPart["output"]): string { @@ -459,7 +478,8 @@ export function modelMessagesToCountUnits( } // --------------------------------------------------------------------------- -// Per-turn overhead — system prompt + tool schemas (drift C1) +// Per-turn overhead — system prompt + tool schemas +// (ADR-0012 §Tier 1 (trigger projection)) // --------------------------------------------------------------------------- /** @@ -470,7 +490,7 @@ export function modelMessagesToCountUnits( export const TOOL_SCHEMA_FALLBACK_TOKENS = 200; /** - * Serialized-schema char length cached per input-schema object (RV9). The + * Serialized-schema char length cached per input-schema object. The * `asSchema(...) → stableStringify` conversion is the expensive part of overhead * estimation and a tool's schema object is stable across turns, so memoize it. * A WeakMap keyed by the schema object never pins a tool that goes out of scope. 
@@ -481,7 +501,8 @@ const schemaLenCache = new WeakMap(); * Estimates the tokens of the per-turn payload that is NOT in the message * history: the rendered system prompt plus every tool's name, description, and * JSON input schema — all sent to the model on every turn, and the dominant - * cause of the C1 trigger under-count on tool-bearing agents (observed 8888 + * cause of the trigger under-count on tool-bearing agents + * (ADR-0012 §Tier 1 (trigger projection)) (observed 8888 * provider-reported vs ~986 message-only). Same char/4 rule as the single * estimator; the result feeds `Tier1Input.overheadTokens`. */ diff --git a/apps/backend/src/services/chat-execution.test.ts b/apps/backend/src/services/chat-execution.test.ts index 12db8b71..b2844a92 100644 --- a/apps/backend/src/services/chat-execution.test.ts +++ b/apps/backend/src/services/chat-execution.test.ts @@ -664,7 +664,7 @@ describe("chat-execution", () => { }); }); - describe("buildCompactionRuntime summarize (Chunk 13 / review Fix B)", () => { + describe("buildCompactionRuntime summarize (ADR-0012 §Summarizer hardening / review Fix B)", () => { const buildRuntime = (signal?: AbortSignal, onActivity?: () => void) => buildCompactionRuntime({ chatId: "chat-1", diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 8eb387d1..4feae0c0 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -130,7 +130,7 @@ export type ChatTurnRequest = { * Chat id. Present for interactive chat turns (the chatSubmit payload); * absent for headless callers (triggers, sub-agents) whose `request` carries * no chat. Tier 1 compaction keys on it — see the skip guard in - * `prepareChatTurn` (plan M3: headless runs are Tier 2 only). + * `prepareChatTurn` (ADR-0012 §Sub-agents: headless runs are Tier 2 only). 
*/ id?: string; agentId?: string; @@ -161,7 +161,7 @@ export type ChatTurn = { seed?: number; }; /** - * Set when Tier 1 compaction fired this turn (§K / 11c). agent-runner emits + * Set when Tier 1 compaction fired this turn (ADR-0012 §Compaction trace in the timeline). agent-runner emits * a synthetic compact_context tool-call + tool-result pair into the stream so * the compaction is visible in the chat timeline. */ @@ -177,20 +177,20 @@ export type ChatTurn = { frequencyPenalty?: number; presencePenalty?: number; seed?: number; - /** Resolved context window for the main model (§H ring, §I stats). */ + /** Resolved context window for the main model (ADR-0012 §Context-usage ring, ADR-0012 §Per-message stats). */ contextWindow: number; - /** True when contextWindow fell to the conservative default (T6: ring → neutral). */ + /** True when contextWindow fell to the conservative default (ADR-0012 §Context-usage ring: ring → neutral). */ contextWindowIsDefault: boolean; }; /** - * Context-overflow recovery wiring (§E, P4). Always present — recovery is + * Context-overflow recovery wiring (ADR-0012 §Recovery, ADR-0012 §Recovery is the net). Always present — recovery is * the safety net and stays on even when proactive compaction is disabled. * agent-runner wraps the model with the recovery middleware using this. */ recovery: RecoveryContext; /** - * Tier 2 in-turn compaction config (§D). Null when proactive compaction is - * disabled (§G kill switch or agent override). agent-runner builds the + * Tier 2 in-turn compaction config (ADR-0012 §Tier 2). Null when proactive compaction is + * disabled (ADR-0012 §Config & kill switch or agent override). agent-runner builds the * prepareStep callback from this and wires it into streamText/generateText. 
*/ tier2: Tier2Context | null; @@ -227,8 +227,8 @@ export type PrepareChatTurnInput = { onActivity?: (event?: ToolActivityEvent) => void; /** * Messages as they were in the DB BEFORE this submission's `ChatSink.onStart` - * overwrote them — the C4 baseline for detecting edits below the watermark - * (RV1). Loaded by agent-runner before calling onStart. When absent the C4 + * overwrote them — the ADR-0012 §Summary invalidation baseline for detecting edits below the watermark + * (ADR-0012 §Summary invalidation). Loaded by agent-runner before calling onStart. When absent the ADR-0012 §Summary invalidation * check falls back to a DB read that now returns the post-overwrite state. */ priorMessages?: PlatypusUIMessage[]; @@ -496,7 +496,7 @@ export const drizzleChatTurnQueries: ChatTurnQueries = { }, }; -// --- Tier 1 context compaction (ADR-0009) --- +// --- Tier 1 context compaction (ADR-0012) --- const EMPTY_COMPACTION_STATE: CompactionState = { version: 0, @@ -506,105 +506,24 @@ const EMPTY_COMPACTION_STATE: CompactionState = { }; /** - * Loads the canonical (raw) persisted history for a chat. Exported so - * agent-runner can snapshot it BEFORE `ChatSink.onStart` overwrites the row — - * that snapshot is the C4 baseline (RV1: onStart runs before prepareChatTurn, - * so a read inside applyTier1IfNeeded would see the just-submitted messages). - */ -export async function loadChatMessages( - chatId: string, -): Promise { - const rows = await db - .select({ messages: chatTable.messages }) - .from(chatTable) - .where(eq(chatTable.id, chatId)) - .limit(1); - return (rows[0]?.messages as PlatypusUIMessage[] | null) ?? []; -} - -/** - * Newest-first scan for the last assistant message carrying a POSITIVE - * provider-reported `contextTokens` (the §H stat). 
Skips two messages that would - * otherwise shadow the real baseline: - * - the §J standalone trace message (assistant role, no `metadata.stats`) — C2; - * - a turn from a usage-less provider stamped `contextTokens = 0` — A1. - * Either would make the Tier 1 projection drop the corrective baseline (and, for - * the 0 case, the cold-start margin too). - */ -function findLastInputTokens( - messages: PlatypusUIMessage[], -): number | undefined { - for (let i = messages.length - 1; i >= 0; i--) { - if (messages[i].role !== "assistant") continue; - const ct = ( - messages[i].metadata as { stats?: { contextTokens?: number } } | undefined - )?.stats?.contextTokens; - if (typeof ct === "number" && ct > 0) return ct; - } - return undefined; -} - -/** - * Everything the compaction machinery needs that is resolved once per turn: - * the budget (from the resolved context window), the effective config, the - * summarizer, and the summarizer's own window (drift M1). Shared by Tier 1 - * and the recovery middleware (§E) so the two never disagree. + * Resolves the effective global compaction config from DEFAULT_COMPACTION_CONFIG + * + env overrides (ADR-0012 §Config & kill switch). Extracted so both + * buildCompactionRuntime and the context-window endpoint (which surfaces + * keepRecentMessages to the force-compact confirm gate) share one source of + * truth. Pure — depends only on process.env. */ -type CompactionRuntime = { - budget: Budget; - config: CompactionConfig; - imageProvider: ImageProvider; - summarize: Summarize; - summarizerWindow?: number; - /** Resolved context window for the main model (§H ring). */ - contextWindow: number; - /** True when the window fell to the conservative default (T6: ring → neutral). */ - contextWindowIsDefault: boolean; -}; - -/** - * Builds the per-turn compaction runtime. Never throws: a failed window - * resolution falls back to the conservative default so recovery (P4) always - * has a working configuration. 
- */ -/** Safety ceiling on summarizer output (Chunk 13, Fix 2). Prevents a runaway - * model from producing a summary longer than its input. The system prompt - * hard-limits to 1500 tokens; this 4000 backstop catches models that ignore - * the instruction (e.g. qwen36 on large tool-heavy inputs). */ -const SUMMARIZE_MAX_OUTPUT_TOKENS = 4000; - -/** Heartbeat interval while the summarizer runs (Chunk 13, Fix 1). Resets the - * per-step stall watchdog so a slow summarize call is not misidentified as a - * frozen run and killed before it returns. */ -const SUMMARIZE_HEARTBEAT_INTERVAL_MS = 10_000; - -export async function buildCompactionRuntime(args: { - chatId?: string; - provider: Provider; - resolvedModelId: string; - opened: ReturnType; - /** When present, called every ~10 s during `summarize` to keep the per-step - * stall watchdog alive (Chunk 13, Fix 1). */ - onActivity?: () => void; - /** Run abort signal, threaded into the summarizer `generateText` so a - * cancelled / per-run-timed-out run aborts the call instead of leaking it - * past the heartbeat-suppressed per-step watchdog (review Fix B). */ - signal?: AbortSignal; -}): Promise { - const { chatId, provider, resolvedModelId, opened, onActivity, signal } = - args; - +export function resolveCompactionConfig(): CompactionConfig { const config = { ...DEFAULT_COMPACTION_CONFIG }; - // Global kill switch (§G) gates proactive compaction; recovery is unaffected. + // Global kill switch (ADR-0012 §Config & kill switch) gates proactive compaction; recovery is unaffected. if (process.env.COMPACTION_ENABLED === "false") { config.compactionEnabled = false; } - // Optional env overrides for the global ceiling (§G). Unset/blank/invalid → + // Optional env overrides for the global ceiling (ADR-0012 §Config & kill switch). Unset/blank/invalid → // the DEFAULT_COMPACTION_CONFIG value stands, so production behavior is // unchanged. Intended for tuning the trigger on test deployments without a // code change. 
Keep targetRatio < triggerRatio or compaction re-fires every // turn (the thrash trap). - // Reads + RANGE-VALIDATES a numeric env override (A3/B-F1). An out-of-range or + // Reads + RANGE-VALIDATES a numeric env override (ADR-0012 §Config & kill switch). An out-of-range or // non-finite value is rejected (warn + fall back to the default) rather than // silently applied: the old `Number.isFinite`-only check let `0` and negatives // through, so `COMPACTION_KEEP_RECENT=0` summarized the current message away @@ -667,7 +586,7 @@ export async function buildCompactionRuntime(args: { process.env.COMPACTION_MIN_RECENT_PRUNABLE_CHARS, { min: 1, integer: true }, ) ?? config.minRecentPrunableChars; - // Stage 0 context editing (Chunk 14 Task 2). Disabled via + // ADR-0012 §Stage 0 — context editing. Disabled via // COMPACTION_CONTEXT_EDITING_ENABLED=false; recency/size gates tunable. if (process.env.COMPACTION_CONTEXT_EDITING_ENABLED === "false") { config.contextEditingEnabled = false; @@ -685,9 +604,9 @@ export async function buildCompactionRuntime(args: { { min: 1, integer: true }, ) ?? config.minEditableToolChars; - // Hysteresis backstop (C2 / B-F1): target must stay below trigger or - // compaction re-fires every turn. The chunk-9 runtime clamp was lost when - // chunk-12 deleted resolveCompactionConfig; restore it here so an operator who + // Hysteresis backstop (ADR-0012 §Tier 1 (hysteresis)): target must stay below trigger or + // compaction re-fires every turn (ADR-0012 §Tier 1 hysteresis). The earlier runtime clamp was + // dropped when per-agent config was removed (ADR-0012 §Config & kill switch); restore it here so an operator who // sets COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO still runs safely. 
if (config.targetRatio >= config.triggerRatio) { const clamped = config.triggerRatio * 0.9; @@ -697,12 +616,105 @@ export async function buildCompactionRuntime(args: { triggerRatio: config.triggerRatio, clamped, }, - "COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO; clamping target to triggerRatio*0.9 (C2 hysteresis)", + "COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO; clamping target to triggerRatio*0.9 (hysteresis)", ); config.targetRatio = clamped; } + return config; +} + +/** + * Loads the canonical (raw) persisted history for a chat. Exported so + * agent-runner can snapshot it BEFORE `ChatSink.onStart` overwrites the row — + * that snapshot is the ADR-0012 §Summary invalidation baseline (ADR-0012 §Summary invalidation: onStart runs before prepareChatTurn, + * so a read inside applyTier1IfNeeded would see the just-submitted messages). + */ +export async function loadChatMessages( + chatId: string, +): Promise { + const rows = await db + .select({ messages: chatTable.messages }) + .from(chatTable) + .where(eq(chatTable.id, chatId)) + .limit(1); + return (rows[0]?.messages as PlatypusUIMessage[] | null) ?? []; +} + +/** + * Newest-first scan for the last assistant message carrying a POSITIVE + * provider-reported `contextTokens` (the ADR-0012 §Context-usage ring stat). Skips two messages that would + * otherwise shadow the real baseline: + * - the ADR-0012 §Force-compact on demand standalone trace message (assistant role, no `metadata.stats`) — ADR-0012 §Tier 1 (hysteresis); + * - a turn from a usage-less provider stamped `contextTokens = 0` — ADR-0012 §Tier 1 (trigger projection). + * Either would make the Tier 1 projection drop the corrective baseline (and, for + * the 0 case, the cold-start margin too). 
+ */ +function findLastInputTokens( + messages: PlatypusUIMessage[], +): number | undefined { + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i].role !== "assistant") continue; + const ct = ( + messages[i].metadata as { stats?: { contextTokens?: number } } | undefined + )?.stats?.contextTokens; + if (typeof ct === "number" && ct > 0) return ct; + } + return undefined; +} - // RV7d: resolve both windows concurrently (they are independent). +/** + * Everything the compaction machinery needs that is resolved once per turn: + * the budget (from the resolved context window), the effective config, the + * summarizer, and the summarizer's own window (ADR-0012 §Tier 1 (summarizer model & map-reduce)). Shared by Tier 1 + * and the recovery middleware (ADR-0012 §Recovery) so the two never disagree. + */ +type CompactionRuntime = { + budget: Budget; + config: CompactionConfig; + imageProvider: ImageProvider; + summarize: Summarize; + summarizerWindow?: number; + /** Resolved context window for the main model (ADR-0012 §Context-usage ring). */ + contextWindow: number; + /** True when the window fell to the conservative default (ADR-0012 §Context-usage ring: ring → neutral). */ + contextWindowIsDefault: boolean; +}; + +/** + * Builds the per-turn compaction runtime. Never throws: a failed window + * resolution falls back to the conservative default so recovery (ADR-0012 §Recovery is the net) always + * has a working configuration. + */ +/** Safety ceiling on summarizer output (ADR-0012 §Summarizer hardening). Prevents a runaway + * model from producing a summary longer than its input. The system prompt + * hard-limits to 1500 tokens; this 4000 backstop catches models that ignore + * the instruction (e.g. qwen36 on large tool-heavy inputs). */ +const SUMMARIZE_MAX_OUTPUT_TOKENS = 4000; + +/** Heartbeat interval while the summarizer runs (ADR-0012 §Summarizer hardening). 
Resets the + * per-step stall watchdog so a slow summarize call is not misidentified as a + * frozen run and killed before it returns. */ +const SUMMARIZE_HEARTBEAT_INTERVAL_MS = 10_000; + +export async function buildCompactionRuntime(args: { + chatId?: string; + provider: Provider; + resolvedModelId: string; + opened: ReturnType; + /** When present, called every ~10 s during `summarize` to keep the per-step + * stall watchdog alive (ADR-0012 §Summarizer hardening). */ + onActivity?: () => void; + /** Run abort signal, threaded into the summarizer `generateText` so a + * cancelled / per-run-timed-out run aborts the call instead of leaking it + * past the heartbeat-suppressed per-step watchdog (review Fix B). */ + signal?: AbortSignal; +}): Promise { + const { chatId, provider, resolvedModelId, opened, onActivity, signal } = + args; + + const config = resolveCompactionConfig(); + + // ADR-0012 §Window resolution: resolve both windows concurrently (they are independent). const taskModelId = provider.taskModelId || resolvedModelId; const [mainWindow, summarizerWindowResult] = await Promise.all([ contextWindowResolver.resolve(provider, resolvedModelId).catch((error) => { @@ -728,10 +740,10 @@ export async function buildCompactionRuntime(args: { : undefined; // Summarizer uses the provider's task model, falling back to the main model - // when unset (drift T7). generateText is one-shot, no tools. + // when unset (ADR-0012 §Tier 1 (summarizer model)). generateText is one-shot, no tools. const summarize = async (text: string): Promise => { const startedAt = Date.now(); - // Fix 1 (Chunk 13): keep the per-step stall watchdog alive while the + // ADR-0012 §Summarizer hardening: keep the per-step stall watchdog alive while the // summarizer runs. Tier-1 compaction is legitimate long work, not a stall; // without this ping the 120 s watchdog fires and kills the run. 
const heartbeat = onActivity @@ -740,7 +752,7 @@ export async function buildCompactionRuntime(args: { try { const result = await generateText({ model: opened.languageModel(taskModelId), - // Fix 2 (Chunk 13): structured handoff prompt — sections reduce loss + // ADR-0012 §Summarizer hardening: structured handoff prompt — sections reduce loss // across repeated re-compactions (Codex CLI pattern); explicit concise // instruction + "aim under ~1500 tokens" pairs with the output ceiling. // Sections are ordered most-critical-first: if the output is truncated @@ -755,7 +767,7 @@ export async function buildCompactionRuntime(args: { If a prior summary appears in the history, integrate it — don't drop facts it captured. Be concise: hard limit 1500 tokens maximum. Output only the summary.`, prompt: text, - // Fix 2 (Chunk 13): hard ceiling prevents a runaway model from + // ADR-0012 §Summarizer hardening: hard ceiling prevents a runaway model from // producing a summary longer than its input. Prompt hard-limits to // 1500 tokens; 4000 backstop catches models that ignore the instruction. maxOutputTokens: SUMMARIZE_MAX_OUTPUT_TOKENS, @@ -804,23 +816,23 @@ If a prior summary appears in the history, integrate it — don't drop facts it type ApplyTier1Args = { chatId: string; runtime: CompactionRuntime; - /** Post-inlineFileUrls messages — used for the compaction itself (T2). */ + /** Post-inlineFileUrls messages — used for the compaction itself (ADR-0012 §Token estimation). */ messages: PlatypusUIMessage[]; /** * Pre-inlineFileUrls messages from this submission — used as the incoming - * side of the C4 divergence check (RV1). Must NOT be inlined: the persisted + * side of the ADR-0012 §Summary invalidation divergence check (ADR-0012 §Summary invalidation). Must NOT be inlined: the persisted * side also uses storage:// / http:// URLs, so both sides are comparable. 
*/ rawMessages: PlatypusUIMessage[]; /** * Messages as they were in the DB BEFORE this submission's onStart overwrote - * them (RV1). When absent, the C4 check falls back to a fresh DB read, which + * them (ADR-0012 §Summary invalidation). When absent, the ADR-0012 §Summary invalidation check falls back to a fresh DB read, which * returns the post-overwrite state and therefore never detects edits. */ priorMessages?: PlatypusUIMessage[]; - /** Estimated system-prompt + tool-schema payload for this turn (drift C1). */ + /** Estimated system-prompt + tool-schema payload for this turn (ADR-0012 §Tier 1 (trigger projection)). */ overheadTokens: number; - /** Provider-reported `usage.inputTokens` from the prior turn (C1, §H). */ + /** Provider-reported `usage.inputTokens` from the prior turn (ADR-0012 §Tier 1 (trigger projection), ADR-0012 §Context-usage ring). */ lastInputTokens?: number; }; @@ -831,9 +843,9 @@ type Tier1IfNeededResult = { /** * Reconstructs/advances the compacted view and persists any new summary — all - * best-effort. Any throw degrades to the uncompacted messages (recovery §E + * best-effort. Any throw degrades to the uncompacted messages (recovery ADR-0012 §Recovery * remains the safety net). Returns the messages to send to the model plus an - * optional compactionTrace for the stream trace (§K / 11c). + * optional compactionTrace for the stream trace (ADR-0012 §Compaction trace in the timeline). */ async function applyTier1IfNeeded( args: ApplyTier1Args, @@ -843,11 +855,11 @@ async function applyTier1IfNeeded( const store = drizzleCompactionStore; let state = (await store.readState(chatId)) ?? EMPTY_COMPACTION_STATE; - // C4 invalidation: if the submitted history changed at/below the watermark + // ADR-0012 §Summary invalidation: if the submitted history changed at/below the watermark // (edit/delete/regenerate), reset the stale summary before compacting. The // single submit endpoint is the only "edit handler" in this architecture. 
// - // RV1 fix: the baseline must be the DB state BEFORE this submission's + // ADR-0012 §Summary invalidation fix: the baseline must be the DB state BEFORE this submission's // onStart overwrote the row. agent-runner reads it before calling onStart // and threads it here as `priorMessages`. We also compare the pre-inline // (`rawMessages`) side so file-URL inlining doesn't trigger false positives. @@ -1036,7 +1048,7 @@ export const prepareChatTurn = async ( const systemPrompt = generation.systemPrompt!; - // --- Context compaction & recovery (ADR-0009) --- + // --- Context compaction & recovery (ADR-0012) --- // The runtime (window budget, config, summarizer) is resolved once and shared // by Tier 1 and the recovery middleware so they never disagree. Never throws. const compactionRuntime = await buildCompactionRuntime({ @@ -1045,7 +1057,7 @@ export const prepareChatTurn = async ( resolvedModelId, opened, // Thread the activity callback so the summarizer heartbeat can bump the - // per-step stall watchdog (Chunk 13, Fix 1). `onActivity` accepts an + // per-step stall watchdog (ADR-0012 §Summarizer hardening). `onActivity` accepts an // optional event, so it satisfies the `() => void` heartbeat signature // directly — the interval invokes it with no event (timer-only bump). onActivity, @@ -1055,27 +1067,27 @@ export const prepareChatTurn = async ( }); // Per-turn overhead: system prompt + tool schemas, sent on every turn but - // invisible to a message-only estimate (drift C1). + // invisible to a message-only estimate (ADR-0012 §Tier 1 (trigger projection)). const overheadTokens = estimateOverheadTokens(systemPrompt, wrappedTools); // Tier 1 is best-effort: a failure here must never break the turn — recovery - // (§E) is the net. Runs AFTER inlineFileUrls so the estimate sees the real - // payload (T2). Cross-turn durable compaction is keyed by chat id; headless + // (ADR-0012 §Recovery) is the net. 
Runs AFTER inlineFileUrls so the estimate sees the real + // payload (ADR-0012 §Token estimation). Cross-turn durable compaction is keyed by chat id; headless // runs (triggers, sub-agents) carry no chat id and have no durable history to - // compact (plan M3 — they are Tier 2 only), so send messages uncompacted. + // compact (ADR-0012 §Sub-agents — they are Tier 2 only), so send messages uncompacted. const chatId = request.id; const tier1Result = chatId ? await applyTier1IfNeeded({ chatId, runtime: compactionRuntime, messages: inlinedMessages, - // Pre-inline messages for C4 comparison (RV1): both sides must use the + // Pre-inline messages for ADR-0012 §Summary invalidation comparison (ADR-0012 §Summary invalidation): both sides must use the // same URL format (storage:// / http://) to avoid false positives. rawMessages: messages, - // Pre-overwrite baseline threaded from agent-runner (RV1). + // Pre-overwrite baseline threaded from agent-runner (ADR-0012 §Summary invalidation). priorMessages: input.priorMessages, overheadTokens, - // Prior turn's provider-reported input token count (C1 / §H): the + // Prior turn's provider-reported input token count (ADR-0012 §Tier 1 (trigger projection) / ADR-0012 §Context-usage ring): the // corrective baseline for the Tier 1 trigger projection on turns ≥ 2. // Absent on turn 1 → cold-start margin applies. lastInputTokens: findLastInputTokens(messages), @@ -1083,12 +1095,12 @@ export const prepareChatTurn = async ( : { messages: inlinedMessages }; const compactedMessages = tier1Result.messages; - // Recovery (§E, P4): always wired, even when proactive compaction is off. + // Recovery (ADR-0012 §Recovery, ADR-0012 §Recovery is the net): always wired, even when proactive compaction is off. // Headless runs get trim+retry but no dirty flag (no durable chat row). 
const recovery: RecoveryContext = { chatId, imageProvider: compactionRuntime.imageProvider, - // RV6: subtract the per-turn overhead so recovery uses the same effective + // ADR-0012 §Tier 1 (budget math): subtract the per-turn overhead so recovery uses the same effective // target as Tier 1. Without this, a large overhead (e.g. 65%+ of the window) // means the recovery retry still overflows even after trimming. targetTokens: Math.max( @@ -1138,7 +1150,7 @@ export const prepareChatTurn = async ( recovery, tier2: compactionRuntime.config.compactionEnabled ? { - // RV6 (Tier 2): the prepareStep estimate counts ModelMessages only — + // ADR-0012 §Tier 1 (budget math) (Tier 2): the prepareStep estimate counts ModelMessages only — // system prompt + tool schemas go as separate streamText params and // are invisible to it, yet they consume the same window. Subtract the // per-turn overhead so the trigger/target reflect the real wire @@ -1506,15 +1518,15 @@ const loadSubAgents = async ( return providerCache.get(providerId) ?? null; }; - // Tier 2 only for sub-agents (drift M3: no durable history for Tier 1). + // Tier 2 only for sub-agents (ADR-0012 §Sub-agents: no durable history for Tier 1). // Resolve per-sub-agent compaction runtime so each sub-agent's tool loop // gets a prepareStep calibrated to its own model's context window. const subAgentPrepareSteps = new Map< string, import("ai").PrepareStepFunction >(); - // Per-sub-agent overflow recovery (C1/B-F3). Built ALWAYS — recovery (P4) is - // the net even when the §G kill switch disables proactive compaction, exactly + // Per-sub-agent overflow recovery (ADR-0012 §Sub-agents). Built ALWAYS — recovery (ADR-0012 §Recovery is the net) is + // the net even when the ADR-0012 §Config & kill switch disables proactive compaction, exactly // as on the main path. Tier 2 (below) is the only part gated by the switch. 
const subAgentRecoveries = new Map(); await Promise.all( @@ -1529,12 +1541,25 @@ const loadSubAgents = async ( resolvedModelId: sa.modelId, opened: resolved.opened, }); + // ADR-0012 §Tier 1 (budget math): subtract the sub-agent's per-turn + // overhead so its recovery/Tier 2 targets match the main path. The + // sub-agent's tool schemas resolve lazily at invocation and aren't + // available here, so the system prompt — the dominant, predictable + // component — is the floor; under-counting overhead only trims slightly + // less aggressively, and recovery's force-halving still backstops it. + const subOverheadTokens = estimateOverheadTokens( + sa.systemPrompt ?? undefined, + undefined, + ); // Recovery net first (not gated by compactionEnabled). No markDirty — // sub-agents have no durable chat row to flag. subAgentRecoveries.set(sa.id, { chatId: sa.id, imageProvider: runtime.imageProvider, - targetTokens: Math.max(0, runtime.budget.targetTokens), + targetTokens: Math.max( + 0, + runtime.budget.targetTokens - subOverheadTokens, + ), keepRecentMessages: runtime.config.keepRecentMessages, minPrunableChars: runtime.config.minPrunableChars, summarize: runtime.summarize, @@ -1542,8 +1567,14 @@ const loadSubAgents = async ( }); if (!runtime.config.compactionEnabled) return; const tier2: Tier2Context = { - triggerTokens: Math.max(0, runtime.budget.triggerTokens), - targetTokens: Math.max(0, runtime.budget.targetTokens), + triggerTokens: Math.max( + 0, + runtime.budget.triggerTokens - subOverheadTokens, + ), + targetTokens: Math.max( + 0, + runtime.budget.targetTokens - subOverheadTokens, + ), keepRecentMessages: runtime.config.keepRecentMessages, minPrunableChars: runtime.config.minPrunableChars, imageProvider: runtime.imageProvider, @@ -1591,12 +1622,12 @@ const loadSubAgents = async ( return { subAgents, subAgentTools, subAgentMcpClients }; }; -// --- Force-compact endpoint (§J) --- +// --- Force-compact endpoint (ADR-0012 §Force-compact on demand) --- /** - * Runs Tier 
1 compaction unconditionally for a chat (§J: clickable ring). + * Runs Tier 1 compaction unconditionally for a chat (ADR-0012 §Force-compact on demand: clickable ring). * Forces the compaction regardless of the token threshold by injecting - * compactionDirty=true so the RV3 force path bypasses the estimate gate. + * compactionDirty=true so the ADR-0012 §Recovery force path bypasses the estimate gate. * Called from `POST /chats/:id/compact`; the route guards against concurrent * runs before calling here. */ @@ -1606,9 +1637,15 @@ export async function forceCompactChat( orgId: string, ): Promise<{ estimatedTokens: number; + /** Message-only estimate of the history BEFORE compaction (same basis as estimatedTokens). */ + tokensBefore: number; + /** Number of prefix messages folded into the summary this run (0 if no summary). */ + messagesDropped: number; + /** The config keep-recent count — the client compares messagesDropped against it. */ + keepRecentMessages: number; contextWindow: number; contextWindowIsDefault: boolean; - /** §J/11c — the persisted synthetic trace message, when a summary was produced. */ + /** ADR-0012 §Compaction trace in the timeline — the persisted synthetic trace message, when a summary was produced. */ traceMessage?: PlatypusUIMessage; }> { // Load the chat record (workspace-scoped). @@ -1673,7 +1710,7 @@ export async function forceCompactChat( const rawState = (await drizzleCompactionStore.readState(chatId)) ?? EMPTY_COMPACTION_STATE; - // Force-trigger by marking dirty in the in-memory copy (RV3: bypass the + // Force-trigger by marking dirty in the in-memory copy (ADR-0012 §Recovery: bypass the // estimate gate so the compaction actually shrinks the history). 
const forcedState: CompactionState = { ...rawState, compactionDirty: true }; @@ -1696,8 +1733,13 @@ export async function forceCompactChat( const estimatedTokens = estimateTokens( uiMessagesToCountUnits(result.messages, runtime.imageProvider), ); + // Pre-compaction estimate (same basis) so the client can decide whether the + // drop is significant enough to confirm — ADR-0012 §Force-compact on demand. + const tokensBefore = estimateTokens( + uiMessagesToCountUnits(messages, runtime.imageProvider), + ); - // §J/11c: a forced compaction has no live stream to inject the trace into, so + // ADR-0012 §Compaction trace in the timeline: a forced compaction has no live stream to inject the trace into, so // persist it as a standalone synthetic assistant message. Appended after the // last real message — above the watermark (which already advanced inside // applyTier1Compaction), so it is never itself summarized. The strip filter @@ -1710,7 +1752,7 @@ export async function forceCompactChat( result.compactionTrace, createIdGenerator({ prefix: "msg", size: 16 })(), ); - // Atomic jsonb append (A4/B-F8): concatenate at the DB rather than overwrite + // Atomic jsonb append: concatenate at the DB rather than overwrite // the whole column from the in-memory `messages` snapshot loaded earlier. // The route guards with runRegistry.has(chatId), but a run that registers in // the has()→write window — or a second concurrent POST /compact — would @@ -1728,6 +1770,9 @@ export async function forceCompactChat( return { estimatedTokens, + tokensBefore, + messagesDropped: result.compactionTrace?.messagesDropped ?? 
0, + keepRecentMessages: runtime.config.keepRecentMessages, contextWindow: runtime.contextWindow, contextWindowIsDefault: runtime.contextWindowIsDefault, traceMessage, diff --git a/apps/backend/src/tools/sub-agent.test.ts b/apps/backend/src/tools/sub-agent.test.ts index 42e57677..26be1c6c 100644 --- a/apps/backend/src/tools/sub-agent.test.ts +++ b/apps/backend/src/tools/sub-agent.test.ts @@ -81,7 +81,7 @@ describe("createSubAgentTool", () => { capturedSettings.length = 0; }); - describe("Tier 2 prepareStep (drift M3)", () => { + describe("Tier 2 prepareStep (ADR-0012 §Sub-agents)", () => { it("passes prepareStep to ToolLoopAgent when provided", () => { const mockPrepareStep = vi.fn(); createSubAgentTool({ ...baseOptions, prepareStep: mockPrepareStep }); @@ -421,7 +421,7 @@ describe("createSubAgentTools", () => { expect(Object.keys(result)).toHaveLength(1); }); - it("threads prepareStepFn to ToolLoopAgent for each sub-agent (drift M3)", async () => { + it("threads prepareStepFn to ToolLoopAgent for each sub-agent (ADR-0012 §Sub-agents)", async () => { capturedSettings.length = 0; const subAgents = [ { id: "sa-1", name: "Alpha", providerId: "p1", modelId: "m1" }, diff --git a/apps/backend/src/tools/sub-agent.ts b/apps/backend/src/tools/sub-agent.ts index 2e1a5a21..52f13f55 100644 --- a/apps/backend/src/tools/sub-agent.ts +++ b/apps/backend/src/tools/sub-agent.ts @@ -55,15 +55,15 @@ interface SubAgentToolOptions { maxSteps?: number; /** Called on each activity update from the sub-agent. Used to reset the parent run's per-step timeout. */ onProgress?: () => void; - /** Tier 2 in-turn compaction callback (§D, drift M3). Null when compaction disabled. */ + /** Tier 2 in-turn compaction callback (ADR-0012 §Tier 2 / §Sub-agents). Null when compaction disabled. */ prepareStep?: PrepareStepFunction; /** - * Context-overflow recovery (§E, P4) for the sub-agent's own model calls. + * Context-overflow recovery (ADR-0012 §Recovery) for the sub-agent's own model calls. 
* Sub-agents run a ToolLoopAgent OUTSIDE the parent run's recovery-wrapped * model, so without this their only overflow protection is Tier 2 — which * fires late (its trigger omits the sub-agent's tool/prompt overhead) and has * no net behind it. Wrapping here gives every sub-agent step one trim+retry, - * matching the main path (C1/B-F3). `markDirty` is omitted (no chat row). + * matching the main path (ADR-0012 §Sub-agents). `markDirty` is omitted (no chat row). */ recovery?: RecoveryContext; } @@ -91,7 +91,7 @@ export const createSubAgentTool = (options: SubAgentToolOptions) => { const toolName = subAgentToolName({ name }); - // Wrap the sub-agent model with the overflow-recovery middleware (C1/B-F3) so + // Wrap the sub-agent model with the overflow-recovery middleware (ADR-0012 §Sub-agents) so // a step that overflows gets one trim+retry instead of hard-failing the task. // Guard on `typeof model !== "string"`: `wrapLanguageModel` needs a model // INSTANCE, and `LanguageModel` permits a bare string id. The factory returns diff --git a/apps/frontend/components/chat.tsx b/apps/frontend/components/chat.tsx index a29eac89..d9fe1a19 100644 --- a/apps/frontend/components/chat.tsx +++ b/apps/frontend/components/chat.tsx @@ -380,7 +380,7 @@ export const Chat = ({ [messages, setMessages], ); - // Resolve the effective provider+model for the ring (drift U1: use selected + // Resolve the effective provider+model for the ring (ADR-0012 §Context-usage ring: use selected // model's window, not last message's window). When an agent is selected we // look up its provider/model; otherwise use the directly selected values. const effectiveRingProviderId = agentId @@ -392,10 +392,11 @@ export const Chat = ({ // Fetch resolved context window for the currently-selected model (cached on // the backend). Returns null contextWindow when source = "default" so the ring - // renders neutral (drift T6). Re-fetches automatically on model/agent change. 
+ // renders neutral (ADR-0012 §Context-usage ring). Re-fetches automatically on model/agent change. const { data: contextWindowData } = useSWR<{ contextWindow: number | null; source: string; + keepRecentMessages?: number; }>( backendUrl && user && effectiveRingProviderId && effectiveRingModelId ? joinUrl( @@ -406,8 +407,8 @@ export const Chat = ({ fetcher, ); - // Stats from the last completed assistant message for the ring (§H) and - // per-message stats popover (§I). + // Stats from the last completed assistant message for the ring (ADR-0012 §Context-usage ring) and + // per-message stats popover (ADR-0012 §Per-message stats). const lastAssistantStats = useMemo(() => { for (let i = messages.length - 1; i >= 0; i--) { const msg = messages[i]; @@ -444,13 +445,13 @@ export const Chat = ({ chatData?.status === "running" && status === "ready"; const effectiveStatus = isReconnectedToRunningRun ? "streaming" : status; - // §J: compact on demand — state for pending (deferred while streaming), + // ADR-0012 §Force-compact on demand — state for pending (deferred while streaming), // in-flight compaction spinner, and the post-compact token estimate that // refreshes the ring immediately (before the next completed message). const [compactPending, setCompactPending] = useState(false); const [isCompacting, setIsCompacting] = useState(false); // Stable count of assistant messages — unaffected by optimistic user-message - // pushes (11a / U5). Used to tag post-compact estimates so the ring doesn't + // pushes (ADR-0012 §Context-usage ring). Used to tag post-compact estimates so the ring doesn't // snap back to the old value when the user hits Send. const assistantMessageCount = useMemo( () => messages.filter((m) => m.role === "assistant").length, @@ -460,7 +461,7 @@ export const Chat = ({ // Post-compact estimate, tagged with the assistant message count at // compaction time so it auto-expires once a new assistant message arrives // (the next provider count is authoritative). 
Using assistantMessageCount - // instead of messages.length fixes the U5 ring jump: an optimistic user + // instead of messages.length fixes the ring-jump bug (ADR-0012 §Context-usage ring): an optimistic user // message increments messages.length but not assistantMessageCount, so the // compacted estimate stays valid until the real response lands. const [compacted, setCompacted] = useState<{ @@ -484,7 +485,7 @@ export const Chat = ({ toast.error((body as { error?: string }).error ?? "Compact failed"); return; } - // Refresh the ring immediately from the post-compact estimate (§J). This + // Refresh the ring immediately from the post-compact estimate (ADR-0012 §Force-compact on demand). This // is a message-only char/4 estimate (no per-turn system/tool overhead), // so it reads slightly low until the next real response replaces it with // the provider's authoritative count. @@ -498,7 +499,7 @@ export const Chat = ({ tokens: body.inputTokens, }); } - // §J/11c: append the persisted compaction-trace message so it shows in the + // ADR-0012 §Compaction trace in the timeline: append the persisted compaction-trace message so it shows in the // timeline immediately. It carries the id the backend persisted, so a // later SWR revalidation reconciles rather than duplicating it. if (body.traceMessage) { @@ -525,23 +526,33 @@ export const Chat = ({ ]); const handleCompact = useCallback(() => { - // Confirm at click time (not after the deferred run fires) so the prompt - // never surprises the user mid-stream. Per P1 this is a view change, not - // data loss — the full history is preserved. + // ADR-0012 §Force-compact on demand: confirm ONLY when the drop is significant; + // below that, run immediately. The summarized prefix is everything before the + // keep-recent boundary, so messagesDropped ≈ messages.length − keepRecent, and + // the ADR's "messagesDropped > keepRecentMessages" criterion reduces to the + // pre-run-computable "messages.length > 2 × keepRecent". 
(The >30%-reduction + // criterion needs the post-run summary size; we don't gate on it here — the op + // is non-destructive either way per ADR-0012 §View, not delete.) + // Confirm at click time (not after the deferred run fires) so the prompt never + // surprises the user mid-stream. + const keepRecent = contextWindowData?.keepRecentMessages ?? 10; + const significant = messages.length > keepRecent * 2; if ( + significant && !window.confirm( "This will summarize older messages to reduce context usage. The full conversation history is preserved. Continue?", ) - ) + ) { return; + } if (effectiveStatus === "streaming" || effectiveStatus === "submitted") { setCompactPending(true); } else { void runCompact(); } - }, [effectiveStatus, runCompact]); + }, [contextWindowData, messages.length, effectiveStatus, runCompact]); - // Fire deferred compact once streaming finishes (drift U4). Already confirmed + // Fire deferred compact once streaming finishes (ADR-0012 §Force-compact on demand). Already confirmed // at click time, so this just runs. useEffect(() => { if ( @@ -558,7 +569,7 @@ export const Chat = ({ }, [compactPending, effectiveStatus, runCompact]); // Early returns live below ALL hooks so hook order stays unconditional - // (react-hooks/rules-of-hooks). The §H/§J ring hooks above must always run. + // (react-hooks/rules-of-hooks). The ADR-0012 §Context-usage ring / §Force-compact ring hooks above must always run. // TODO: Ideally show a loading indicator here if (isLoading || !providersData) return null; diff --git a/apps/frontend/components/context-usage-ring.tsx b/apps/frontend/components/context-usage-ring.tsx index 137378c4..15994829 100644 --- a/apps/frontend/components/context-usage-ring.tsx +++ b/apps/frontend/components/context-usage-ring.tsx @@ -8,16 +8,16 @@ import { import { Loader2 } from "lucide-react"; /** - * Context-usage ring (context-compaction-plan §H + §J). + * Context-usage ring (ADR-0012 §Context-usage ring + §Force-compact on demand). 
* * Renders a small SVG donut ring showing `usedTokens / contextWindow` fill. * Colours: green < 0.7, amber >= 0.7, red >= 0.9. * Shows neutral grey with no percentage when contextWindow is unknown/default - * (drift T6) or when no run has completed yet. + * (ADR-0012 §Context-usage ring) or when no run has completed yet. * - * When `onClick` is provided the ring is clickable (§J: compact on demand). + * When `onClick` is provided the ring is clickable (ADR-0012 §Force-compact on demand). * - While `isPending` (click queued, waiting for streaming to finish): shows - * a pending badge and is disabled (drift U4). + * a pending badge and is disabled (ADR-0012 §Force-compact on demand). * - While `isCompacting`: shows a spinner. * - `isStreaming` disables clicks entirely (frontend defers via pending flag). */ @@ -38,6 +38,9 @@ export function ContextUsageRing({ }) { const r = 7; const circumference = 2 * Math.PI * r; + // Amber has no semantic token (unlike primary/destructive); use Tailwind v4's + // default-palette CSS var so the threshold colour isn't a bare hex literal. + const amber = "var(--color-amber-500, #f59e0b)"; const isNeutral = !contextWindow || usedTokens === undefined; const fill = isNeutral @@ -49,7 +52,7 @@ export function ContextUsageRing({ : fill >= 0.9 ? "var(--color-destructive)" : fill >= 0.7 - ? "#f59e0b" + ? 
amber : "var(--color-primary)"; const isDisabled = isPending || isCompacting || isStreaming || !onClick; @@ -123,7 +126,7 @@ export function ContextUsageRing({ style={{ transition: "stroke-dashoffset 0.3s ease" }} /> {/* Pending dot */} - {isPending && } + {isPending && } )} diff --git a/context-compaction-plan.md b/context-compaction-plan.md deleted file mode 100644 index 218acfed..00000000 --- a/context-compaction-plan.md +++ /dev/null @@ -1,1920 +0,0 @@ -# Plan: Chat Context Compaction & Usage Indicator - -Status: **chunks 1-11 implemented (ALL DONE)** (1-2 reviewed 2026-06-09; chunks 3-5 landed 2026-06-10; chunks 6-8 landed 2026-06-11; chunk 11 landed 2026-06-12; see §Code review 2026-06-10) · Branch target: `feature/context-compaction` - -> This doc is the spec to implement against, not a proposal. Sections A–J are the -> design. The **Drift log & code-review checklist** at the bottom records every -> flaw found during review and the trap to re-check once code exists — read it -> before coding and again at PR time. Do not re-derive the happy path and skip -> the failure modes; they are written down precisely so we do not drift into them -> twice. - -## Implementation status & code review — chunks 1-2 (reviewed 2026-06-09) - -Chunks **1** (window resolution + single estimator + schema) and **2** (compaction -module + `writeWatermark` CAS + Tier 1) are landed on `feature/context-compaction` -(post main v1.95.0 merge). Backend tests: 1037 pass; chunk 1-2 unit tests: -context-window 20, token-estimate 14, compaction (CAS/budget/pairing/invalidation) -all green. Source `tsc --noEmit` clean for these files. This section is the -**start point for chunk 3** — read it before coding. - -### Solid / verified - -- **CAS durable writer (P3 / R1 / T10)** — the hardest part is correct and - well-tested. 
Single versioned writer (`commitWatermark`→`casWrite`, - `compaction.ts:84-160`); all three mutations (advance / dirty-clear / C4 reset) - route through it; loser decides by **version** not watermark value; one-retry- - then-skip, no livelock. No field write bypasses it. -- **C2 hysteresis, C3 budget, C4 invalidation, M1 map-reduce primitive, T7 - summarizer fallback** — VERIFIED in `compaction.ts`. -- **C4 wiring** — ~~VERIFIED~~ **OVERTURNED by the 2026-06-10 review (RV1).** - The mechanism is wired but the comparison baseline is destroyed before it runs - (ChatSink.onStart overwrite) and the two sides are canonicalized differently - (inlined URLs, jsonb key order). See §Code review 2026-06-10, RV1. -- **Schema / migration / zod / lazy-rollout** — VERIFIED. Columns additive + - nullable/defaulted; migration `0047_context_compaction.sql` matches schema; - `modelMeta` optional in all variants; `contextSummary`/`summaryWatermark` kept - out of chatSubmit/chatUpdate (server-managed); no eager backfill job. -- **Tier 1 gating (plan M3)** — VERIFIED. `request.id ? applyTier1IfNeeded : skip`; - triggers (`{agentId,search}`, no id) and sub-agents (bypass `prepareChatTurn`) - skip Tier 1; best-effort try/catch never breaks a turn (P4). -- **`compactModelMessages` Tier 2 adapter** — fully implemented + tested (NOT a - stub). Recovery (chunk 3) and Tier 2 (chunk 4) can call it directly. - -### Chunk 3 (Recovery + C1/M2) — landed 2026-06-10 - -- **§E recovery** — new `runs/recovery.ts`. `isContextOverflowError` (400/413 + - per-provider body regex: OpenAI/vLLM, Anthropic, Google, Bedrock — drift T9, - fixture-tested). 
`contextOverflowRecoveryMiddleware` wraps the model via - `wrapLanguageModel` in BOTH `streamText` and `generateText` (agent-runner), so - every step of a tool loop gets detect → `setCompactionDirty` (flag persisted on - DETECTION, before retry outcome) → trim via **`compactModelMessages`** (T3, no - bespoke trim; system head pinned; keep-recent halved, floor 2) → retry once. - Second failure surfaces "Conversation too large… start a new chat" via - `formatStreamError`. The V3 prompt is passed to `compactModelMessages` - directly (structurally compatible shape) — no converter, no second trimmer. - `setCompactionDirty` goes through `commitWatermark` (P3); no-op when already - dirty. Headless runs get trim+retry but no dirty flag (no chat row). -- **C1 fix (partial — overhead path)** — `estimateOverheadTokens(systemPrompt, -tools)` in token-estimate.ts (char/4 of system prompt + each tool's name, - description, `asSchema(...).jsonSchema`; flat 200/tool fallback). Threaded as - `Tier1Input.overheadTokens`; the trigger projection - (`projectTier1Tokens`) now counts it, and the compaction target is reduced by - it (`targetTokens − overhead`) so hysteresis (C2) still holds. `log.warn` when - overhead alone ≥ target (compaction would re-fire each turn). - **C1 second half — DONE 2026-06-11.** `prepareChatTurn` now threads - `lastInputTokens` from the last assistant message's - `metadata.stats.contextTokens` (stamped by `applyMessageStats`, §H) into - `applyTier1IfNeeded`. `projectTier1Tokens` takes - `max(charBased, lastInputTokens)` (not additive — `charBased` is the whole - unsummarized view, so adding would double-count history); cold-start margin - applies only when it is absent (turn 1). C1 fully closed. -- **M2 fixed** — `COLD_START_MARGIN = 1.15` applied to the whole char-based - projection whenever no provider baseline exists; dropped when - `lastInputTokens` is present. 
-- **Defect 4 fixed** — `summarizerWindow` now resolved (task-model window → - `computeBudget(...).inputBudget`) in `buildCompactionRuntime` and threaded to - Tier 1 and recovery; M1 map-reduce is live in the wired flow. -- **Defect 9 fixed** — token-estimate header no longer claims per-turn provider - counts. -- **Refactor** — `buildCompactionRuntime` (chat-execution) resolves window / - config / budget / summarizer once per turn, never throws (falls back to the - 8192 default), and is shared by Tier 1 and the recovery middleware; `ChatTurn` - gained a required `recovery: RecoveryContext` field consumed by agent-runner. -- Tests: backend suite 1068 pass (was 1037) — recovery matrix + middleware - retry/dirty/failure paths, trim boundary safety, projection C1/M2 cases, - `setCompactionDirty`, overhead estimator. Source tsc clean; eslint 0 errors. - -## Chunk 3a — RV1-RV4 fixes (landed 2026-06-10) - -All 4 critical defects resolved. Tests: 1068 pass (unchanged count). tsc clean on -source files. Key changes: - -- **RV1** — `stableStringify` exported from `token-estimate.ts`; `affectedBelowWatermark` - now uses it instead of `JSON.stringify` (jsonb key-order stability). C4 baseline - fixed: `agent-runner.stream()` reads `loadChatMessages(id)` BEFORE `sink.onStart` - overwrites the row, threads as `priorMessages` through `prepare()` → - `prepareChatTurn()` → `applyTier1IfNeeded()`. C4 comparison now uses - `rawMessages` (pre-`inlineFileUrls`) so file URLs match on both sides. -- **RV2** — Submit handler in `routes/chat.ts` verifies `data.id` belongs to - `scope.workspaceId` (SELECT + 404 if workspace mismatch) before any run starts. -- **RV3** — `force?: boolean` added to both `UICompactOptions` and `ModelCompactOptions`; - no-op estimate gate skipped when `force:true`. `applyTier1Compaction` passes - `force: forceCompact` to `compactUIMessages` (dirty-forced path). Recovery's - `trimOverflowingPrompt` passes `force: true` to `compactModelMessages`. 
-- **RV4** — Empty-prefix guard added before Stage 2 in `compactUIMessages`: when - `prefix.length === 0` (history ≤ keepRecentMessages), return the pruned-recent - without calling `summarize` and without committing a `watermark:null` + non-null - summary (which would orphan the summary every turn). - -## Chunk 3b — RV5-RV7 fixes (landed 2026-06-10) - -All 3 HIGH non-blocking defects resolved. Tests: 1068 pass (unchanged count). tsc clean. Key changes: - -- **RV5** — `content`-type tool results: `pruneModelMessage` now soft-trims text items and replaces - media with `[N media item(s)]` placeholders; `renderModelMessages` extracts text from `content` - items so the summarizer sees their content. Both paths covered by `// RV5:` inline markers. -- **RV6** — Recovery target overhead: `RecoveryContext.targetTokens` is now set to - `Math.max(0, budget.targetTokens − overheadTokens)` in `chat-execution.ts` - (mirrors the overhead-adjusted target Tier 1 already used). -- **RV7** — Context-window resolution family (all four sub-items): - - (a) `litellm-registry.ts` populated with full registry covering OpenAI, Anthropic, Bedrock - (Anthropic + Meta Llama + Amazon Titan/Nova + Mistral), Mistral direct, Meta Llama direct, - and Qwen. Wired as `loadBuiltinRegistry` on the process-wide `contextWindowResolver`. - - (b) Family heuristic uses boundary-safe `startsWith(key + "-"|"."|":"|"/")` — `gpt-4.5-preview` - no longer silently resolves via the stale `gpt-4` entry. - - (c) `contextWindowResolver.evict(providerId)` called in both `routes/provider.ts` PUT handlers - on `modelMeta` change. - - (d) `defaultHttpGetJson` uses `AbortSignal.timeout(5000)`; `#inflight` map prevents cold-cache - stampede; the two `resolve` calls in `buildCompactionRuntime` are run in parallel - (`Promise.all`). Note: default-source results are still cached for the full TTL (old defect 6 / - MED priority — open, tracked separately). 
- -## Code review 2026-06-10 — full-branch review of chunks 1-3 (RV1-RV7 ALL FIXED) - -Multi-angle adversarial review of all compaction code (7 finder angles, every -candidate independently verified against the source). Every finding below is -CONFIRMED unless marked otherwise. **RV1-RV4 were blocking** and are now fixed. -**RV5-RV7 HIGH non-blocking fixes landed 2026-06-10 (chunk 3b).** - -### Critical - -- **RV1 — C4 invalidation broken in BOTH directions; durable summary likely never - survives a turn in prod.** (`chat-execution.ts` `applyTier1IfNeeded` / - `chat-sink.ts` `onStart` / `compaction.ts` `affectedBelowWatermark`) - - _Missed edits:_ `AgentRunner.stream` awaits `sink.onStart` **before** - `prepareChatTurn`, and `ChatSink.onStart` overwrites `chat.messages` with the - just-submitted history — so `loadPersistedMessages` reads back the edited - submission and the C4 check compares the edit against itself. An **in-place - edit below the watermark is never detected**: the model gets the stale - summary and the edited message is dropped from the view. (Truncate-and- - regenerate accidentally still invalidates via the watermark-gone fallback.) - - _Spurious invalidation:_ the incoming side is post-`inlineFileUrls` - (`data:` URLs) while the persisted side holds `http /files/…` — any chat - with a file at/below the watermark **invalidates + fully re-summarizes every - turn** (one wasted summarize model call per turn, forever). Additionally - `chat.messages` is **jsonb** (Postgres re-orders object keys), so the - `JSON.stringify` byte-equality very likely diverges for ALL chats after one - write→read round trip — the incremental summary never survives. 
- - _Fix direction:_ capture the pre-overwrite history (read the row BEFORE - onStart overwrites, or have onStart return the previous messages), compare - the **un-inlined** submission, and use semantic equality (id + extracted - text/tool content, or the SDK's `isDeepEqualData`) instead of - `JSON.stringify` byte-equality. Also consider a content digest persisted at - compaction time to avoid the per-turn full-history read (see RV9). -- **RV2 — cross-tenant compaction writes via unvalidated `request.id` - (security).** (`routes/chat.ts` submit handler; `compaction.ts` - `drizzleCompactionStore`) The submit route never verifies the body `id` - belongs to the caller's workspace (every other chat route filters - `id AND workspaceId`; submit does no chat-row lookup at all). The compaction - store, `loadPersistedMessages`, `invalidateCompaction`, `setCompactionDirty` - are keyed by `chat.id` only. A workspace-A owner submitting `id` = a - workspace-B chat id can clear B's summary/watermark, set B's dirty flag, and - CAS-write a summary derived from A's messages onto B's row (integrity, not - read-exfiltration; requires knowing B's chat id). _Fix:_ verify `request.id` - belongs to `scope.workspaceId` before the run starts (mirror the other chat - routes), and/or scope the store's queries by workspaceId. -- **RV3 — recovery + dirty-forced compaction trust the estimator that already - failed → permanent fail loop.** (`compaction.ts` no-op branches in both - compactors; `recovery.ts` `trimOverflowingPrompt`) Both compactors return the - messages **unchanged** when the char/4 estimate is ≤ target — but recovery - only runs after the provider has REJECTED the prompt. 
With a >2× under-count - (CJK ≈1 token/char; assistant `reasoning` parts excluded from counting AND - from pruning/summarizing — they ARE wire payload in the V3 prompt), the retry - resends a byte-identical prompt and deterministically fails; next turn the - dirty flag forces Tier 1, which no-ops and **clears the flag without - shrinking** → overflow → dirty again, every turn. _Fix:_ recovery (and - dirty-forced Tier 1) must force-trim past the estimate gate — e.g. a `force` - option on the compactors that skips the no-op branch and/or scales the - target down when invoked post-rejection; count reasoning parts in the - ModelMessage adapter (the "reasoning is UI-only" assumption in §B is wrong - for the V3 prompt path). -- **RV4 — small-history compaction clobbers the watermark and orphans the - summary.** (`compaction.ts` `compactUIMessages` boundary=0 path + - `applyTier1Compaction` commit) When the over-target history has ≤ - `keepRecentMessages` messages (one huge paste; or `effectiveTarget≈0` from - overhead), prefix=[] → a **wasted summarize call over an empty transcript** → - commit of `{summary, watermark: null}`. A pre-existing watermark is - overwritten with null; `viewAfterWatermark` ignores `contextSummary` when the - watermark is null, so the summary is orphaned, the previously-summarized - prefix reappears in the view, and the cycle repeats each turn. _Fix:_ skip - Stage 2 when the prefix is empty (return the no-op shape; optionally prune - inside `recent` for oversized tool outputs), and never commit - `watermark: null` together with a non-null summary. 
- -### High (all FIXED 2026-06-10 chunk 3b) - -- **RV5 — `content`-type tool results (standard MCP output) never pruned and - invisible to the summarizer.** ~~(`compaction.ts` `pruneModelMessage` handles - only text/json variants; `renderModelMessages` renders `content` as `""`)~~ - **FIXED:** `pruneModelMessage` soft-trims text items + media placeholder; - `renderModelMessages` extracts text items from `content` outputs. -- **RV6 — recovery target ignores per-turn overhead.** ~~(`chat-execution.ts` - RecoveryContext gets raw `budget.targetTokens`; Tier 1 uses `target − overhead`)~~ - **FIXED:** `RecoveryContext.targetTokens = Math.max(0, budget.targetTokens − overheadTokens)`. -- **RV7 — context-window resolution family.** ~~(`context-window.ts`)~~ - (a) ~~prod registry still empty~~ **FIXED:** `litellm-registry.ts` vendored with - full OpenAI/Anthropic/Bedrock/Mistral/Llama/Qwen coverage; - (b) ~~raw `startsWith` heuristic~~ **FIXED:** boundary-safe separators - (`"-"`, `"."`, `":"`, `"/"`) prevent `gpt-4.5-preview` → `gpt-4` resolution; - (c) ~~`evict` called by zero routes~~ **FIXED:** `contextWindowResolver.evict(providerId)` - wired in `routes/provider.ts` PUT handler; - (d) ~~no timeout / no single-flight~~ **FIXED:** `AbortSignal.timeout(5000)` + - `#inflight` Map + `Promise.all` the two resolve calls. - (e) ~~default-source full-TTL cache (old defect 6, MED)~~ **FIXED 2026-06-11:** - `source:"default"` results get `DEFAULT_SOURCE_CACHE_TTL_MS` (60 s) instead of - the hour, so a registry MISS / transient API blip no longer pins 8192. 
- -### Medium / low (RV8-RV10 — FIXED 2026-06-11, chunk 10) - -- **RV8 — `finalize` in the snapshot-consumer's `finally` could mark a broken - run "succeeded".** ~~(`agent-runner.ts`)~~ **FIXED:** the snapshot loop now - captures the stream error (both the `readUIMessageStream` `onError` callback - and the surrounding `catch` set `streamError`); the `finally` finalizes - `"failed"` with that error unless the run was aborted/cancelled. (Origin note - retained for the upstream-PR exclusion list: introduced by the tool-timestamps - commits `3851da6`/`b97312f`, not the compaction chunks.) -- **RV9 — hot-path waste.** ~~3-4 full-history estimation passes; full base64 - decode per image; tool schemas re-serialized every turn~~ **FIXED (partial):** - Tier 1 computes the unsummarized-view estimate **once** and threads it as - `knownEstimate` into `compactUIMessages` (mirrors the Tier 2 `knownEstimate` - from chunk 4); `bytesFromUrl` decodes only a 64 KB prefix for header parsing; - `estimateOverheadTokens` memoizes each tool's serialized-schema length in a - `WeakMap` keyed by the schema object. **Deliberately deferred:** the digest-based - C4 check — the full-prefix compare is already correct (RV1 landed), so this is - pure optimization of a correct path; revisit only if the per-turn JSONB - read+stringify shows up in profiling. -- **RV10 — cleanup minors. FIXED:** `MODEL_BOUND_UI_PART_TYPES` now has the - promised test (membership assertion); `toolResultOutputText` collapsed to the - two real behaviours (`execution-denied` reason vs `value`), removing the dead - `default`; the two `commitWatermark` closures in `applyTier1Compaction` share - one `pinnedWrite` helper; the orphaned `invalidateCompaction` jsdoc above - `affectedBelowWatermark` removed; JPEG walker skips `0xFF` fill bytes + `0xFF00` - stuffing and treats TEM (`0x01`) as standalone. 
**Not done (cosmetic):** - `bytesFromUrl` still duplicates storage/utils' private `parseDataUrl` — left as - is; merging them couples the estimator to the storage layer for no behaviour - change. - -### Re-affirmed solid by this review - -CAS writer/loser logic (P3/R1/T10), budget math (C3), tool-pairing boundaries, -the synthetic `context-summary` message (server-side only — never leaks to -persistence/frontend; cannot become the watermark), recovery middleware -single-retry semantics and summarizer non-recursion (fresh unwrapped task -model), Tier-1 skip for headless runs (M3), kill-switch wiring (§G) with the -documented dirty-forces-compaction exception (intent, has a test — though §G's -wording "disables ALL proactive compaction" should gain a sentence noting the -recovery hand-off still summarizes). - -### Defects to fix (ordered by impact) - -> 2026-06-10 note: still-open items below are subsumed by the §Code review -> 2026-06-10 list — defect 2 → RV7(a), 5 → RV7(c), 6 → RV7(d), 7 → RV5, -> 11's heuristic item → RV7(b). Track them there. - -1. **C1 — trigger under-counts (HIGH). _FIXED: overhead half 2026-06-10; `lastInputTokens` half 2026-06-11 — threaded from last assistant message `metadata.stats.contextTokens` in `chat-execution.ts`._** `compaction.ts:719` - `projected = estimate(afterWatermark) + priorSummaryTokens` — omits the prior - turn's provider `usage.inputTokens` AND the system prompt / tool schemas / skill - payload sent every turn. `Tier1Input` has no `lastInputTokens` field; the call - site (`chat-execution.ts:537`) passes none. This is the live-test under-count - (8888 real vs ~986 estimated) — trigger can silently never fire on tool-bearing - agents; only recovery (chunk 3) catches the overflow. Same root issue as the - **§Open trigger-estimator scope** note below. -2. 
**Empty litellm registry + alias map in prod (HIGH, compounding).** - `context-window.ts:285,389` — the production singleton injects an empty registry - loader and empty alias map, so every non-API provider (OpenAI/Anthropic/Bedrock) - resolves to `DEFAULT_CONTEXT_WINDOW = 8192`. The budget math is therefore - wrong-defaulted for those providers today. Must vendor litellm - `model_prices_and_context_window.json` + build the alias map and wire them in. -3. **M2 — first-turn ×1.15 margin absent (MED). _FIXED 2026-06-10._** - `compaction.ts:719` applies no - cold-start inflation; a char/4 under-count can keep turn-1 from triggering. -4. **`summarizerWindow` not threaded (MED). _FIXED 2026-06-10._** - `chat-execution.ts:537` calls - `applyTier1Compaction` without `summarizerWindow`, so the M1 map-reduce path is - dead in the wired flow — a large cold-start/imported history can overflow the - summarizer call itself. -5. **T5 evict not wired (MED).** `routes/provider.ts:126` updates a provider - (incl. `modelMeta`) without `contextWindowResolver.evict(providerId)`; window - cache serves stale values until TTL. -6. **Window cache pins transient failures (MED). _FIXED 2026-06-11 (RV7e)._** - default/MISS results now get a 60 s `DEFAULT_SOURCE_CACHE_TTL_MS`, not the hour. -7. **Latent T2 violation (LOW).** `token-estimate.ts` `case "content"` - `stableStringify`s tool-output base64 image bytes into char/4 text. No current - tool emits this shape; fix before any tool returns `content`-type media. - _Still open — see note below._ -8. **Latent T1 divergence (LOW). _Test added 2026-06-11._** The model adapter now - has explicit per-variant tool-result-output coverage (text/json/content/ - execution-denied), which is the shape a custom `toModelOutput` emits. The - exact UI-vs-Model equality (T1) holds for SDK-converted messages; a tool whose - `toModelOutput` reshapes the payload remains a bounded, documented divergence. -9. **Doc bug. 
_FIXED 2026-06-10._** `token-estimate.ts` header claims "every later turn uses the real - provider count" — false; char/4 is used every turn (ties to C1). Fix the comment - when C1 is plumbed. -10. **Observability metrics absent. _FIXED 2026-06-11._** No metrics infra exists - (pino only), so emitted as structured `metric:`-tagged log lines, greppable / - dashboardable: `cas.conflict` (commitWatermark), `context_window.fell_to_default` + - `litellm.key_miss` (context-window), `compaction.fired` (Tier 1), - `summarize.latency_ms` (summarize wrapper), `recovery.overflow_detected` / - `recovery.retry` / `recovery.failed` (recovery middleware). -11. **Low. _FIXED 2026-06-11 (partial):_** key-boundary heuristic done (RV7b); - Bedrock-ARN path now also tries lowercased candidates (`context-window.ts`); - dead `default: return ""` in the output switch removed (switch collapsed). - -### Drift-checklist deltas (vs the table at the bottom) - -`C1` → **VERIFIED** (overhead + margin 2026-06-10; `lastInputTokens` threaded 2026-06-11). `M2` → **VERIFIED**. `T3` → **VERIFIED** (producer landed -in chunk 3). `T9` → **VERIFIED**. `R4` → **PARTIAL** (window present & correctly -unfixed; the gating `cas.conflict` metric was missing at review time — it has since landed, 2026-06-11, see defect 10 above). `C4` → **BROKEN** -(2026-06-10 review, RV1 — baseline overwritten + byte-equality false -positives/negatives). `T1` → **PARTIAL** (reasoning parts ARE wire payload in -the V3 prompt path but excluded — RV3). Everything else listed -above → VERIFIED. `T5` → module hook present, **PUT-handler call missing** at review time (RV7c — since wired, see RV7(c) FIXED). - -### Chunk 3 (Recovery) — hand-off is clean - -`applyTier1Compaction` already honors `state.compactionDirty` as a force-trigger and -clears it inside the same CAS write. Chunk 3 only needs the **producer** in -`agent-runner.ts`: `isContextOverflowError` (per-provider 400/413 body matrix, drift -T9), retry-once via `compactModelMessages` (NOT a bespoke trim, drift T3), and set -`compactionDirty=true` through `commitWatermark`.
Recommend folding the **C1 fix** -(thread prior-turn `usage.inputTokens` + system/tool payload into the projection) -into the same chunk or the §H usage-metadata chunk, since recovery makes provider -`usage` available — without C1, recovery is the only thing standing between a -tool-bearing agent and a hard overflow. - -### Branch & upstream-PR hygiene - -**Decided 2026-06-09. Three roles, two live branches:** - -- **`feature/context-compaction` = compaction DEV branch.** Sits on the fork/deploy - lineage (off v1.90.0, later merged with main v1.95.0), so it carries - non-compaction commits. It is **NOT** a clean upstream PR base. -- **`deploy/fresh` = TEST/deploy target.** The test server tracks the `deploy/fresh` - **name** (`/srv/platypus`, compose project `platypus`; rebuilds rename back to it). - It also carries **deploy-runtime fixes that must NOT live in feature or the PR** - (MCP OAuth quirks, host routing). Do **not** deploy `feature` directly — it lacks - those fixes and deploying it would regress MCP OAuth + host-based URL routing. -- **Upstream PR branch = one-time throwaway.** Cherry-pick compaction-only commits - onto current upstream `main` at PR time. Never PR `feature` directly. - -**Test cycle:** `git checkout deploy/fresh` → `git merge feature/context-compaction` -→ deploy. Cheap (shared lineage). The old compaction on `deploy/fresh` is -**superseded automatically** by the merge (chat-execution resolves to feature's -version) — nothing to remove by hand. A small `chat-execution.ts` conflict is -expected (deploy/fresh's `request.id` Tier-1 call vs feature's gated -`ChatTurnRequest`); resolve to feature's version. - -**Why not collapse to one branch:** `deploy/fresh` holds deploy-runtime commits -feature deliberately omits (see below); folding them into feature would re-pollute -it and force a server reconfig. Keeping two branches is the lower-friction choice. 
- -**Deploy/fresh-only commits feature must NOT absorb** (deploy-runtime; keep on -deploy/fresh, exclude from PR): `9ad424b`, `5c2fd38`, `b9e0172`, `455a390`, -`b24f623`, `852a54a` (6 MCP OAuth runtime fixes — client_secret_post, host -rewrites, skip resource-origin check, sync auth binding); `e26d95b` -(backendUrl-from-Host); `4daed7a`, `43171b1` (deploy/fresh's own main merges). - -**EXCLUDE from the upstream PR** (fork/deploy-only or unrelated features — they -predate the compaction work and must not leak into the diff): - -- `e3ccf25` — `compose.yaml` deploy local-build edit (pure deploy) -- `d4cd6f2` — backendUrl-from-Host (fork deploy hack) -- `cdef399` — MCP auto-refresh on 401 + scoped quirks (separate feature) -- `7320000`, `5cfb882` — configurable agent-run timeouts (separate feature) -- `b1daa88` — deploy/fresh main(v1.90) merge commit -- `759aae1` — fork docs (PROJECT.md / CLAUDE.md fork refs) -- `b97312f`, `c18c18d`, `d194edc`, `51f69af`, `0737a6a`, `3851da6` — tool-call - duration / timestamps. **Borderline:** plan §I reuses this. Include ONLY if §I - (per-message stats) ships in the same PR; otherwise it is a separate feature. - -**INCLUDE** (compaction): `68cf725` (foundation+Tier 1), `d1d699e` (migration), -`e19029c` + the chunk-1-2 fix/plan commits from this session, plus chunks 3-9. -Note: the `0047_context_compaction` migration will need **renumbering** to match -upstream `main`'s migration sequence at cherry-pick time. - -## Goal - -Stop chats from hard-failing when message history exceeds a model's context -window. Three capabilities: - -1. **Proactive compaction** — summarize old history before the window fills. -2. **Recovery** — catch context-overflow errors from providers and recover. -3. **Visibility** — a context-usage indicator (ring) next to the model selector. - -Applies to top-level chats **and** sub-agents (both run through the shared -`agent-runner` / `ToolLoopAgent`, so implementing once covers both). 
- ---- - -## Design principles (read first — these are load-bearing) - -- **P1 — Compaction is a VIEW, not a DELETE.** The watermark + summary change - _what is sent to the model_, never _what is stored_. Raw messages stay in the - DB untouched. Consequences: forced compaction (§J) is **not** data loss — the - user can still read full history; only the model payload is compacted. A future - "expand summary" UI is free because originals persist. Never hard-delete a - summarized message. -- **P2 — One estimator.** Token counting lives in exactly one function over one - neutral structure (`CountUnit[]`). Tier 1 (UIMessages) and Tier 2 - (ModelMessages) both normalize into it. Divergence is impossible by - construction, not monitored. (See drift T1.) -- **P3 — One durable writer.** All mutations of compaction state - (`summaryWatermark`, `contextSummary`, `compactionDirty`) go through a single - versioned CAS function `writeWatermark`. No other code path writes these - fields. (See drift R1.) -- **P4 — Recovery is the net, proactive compaction is the plan.** The overflow - catch (§E) must stay on even if proactive compaction is globally disabled. It - is the last line, not a risk surface. - ---- - -## Key facts established during research - -- AI SDK (`ai@6.0.191`) reports real token usage **after** each call: - `usage.inputTokens` / `outputTokens` / `totalTokens` - (`apps/backend/src/runs/agent-runner.ts:148-194`). This is the primary, - provider-accurate signal driving compaction — no pre-call counting needed - except on the very first turn. -- AI SDK exposes **no** context-window metadata on the model interface, and - there is **no** built-in pre-call tokenizer. -- AI SDK `prepareStep` hook (`ai/dist/index.d.ts:960-1023`) runs before each - step of an in-flight response and can rewrite the `messages` sent to the - model — this is how we compact _within_ a single response. Receives - **ModelMessages** (post-`convertToModelMessages`). 
-- `prepareChatTurn` (`chat-execution.ts:430`) holds **UIMessages** - (`turn.stream.messages`); conversion to ModelMessages happens later at - `agent-runner.ts:360` (`await convertToModelMessages(...)`). **Tier 1 and - Tier 2 therefore operate on different message shapes — see drift T1.** -- Provider context-window availability: - - Google (`inputTokenLimit`), OpenRouter (`context_length`), - vLLM/OpenAI-compatible (`max_model_len`) — **available via API**. - - OpenAI, Anthropic, Bedrock — **not** via API; need lookup table / manual. -- Sub-agents run as tools (`apps/backend/src/tools/sub-agent.ts:56-159`) with - fresh history (only a `task` string), each its own `ToolLoopAgent`. -- Model call sites: `streamText` `agent-runner.ts:358-397`, - `generateText` `agent-runner.ts:543-584`. -- Error handling today only covers auth/rate-limit/5xx - (`agent-runner.ts:636-657`) — no context-overflow handling. -- Frontend selector + `(i)` icon: `apps/frontend/components/chat.tsx:561-593` - inside `PromptInputTools`. No progress/ring component exists yet. Token usage - is **not** currently streamed to the client per message. -- `inlineFileUrls` (`chat-execution.ts:524`) fetches file/image bytes and inlines - them into messages. It does **not** decode image dimensions today (see drift T2). -- `messageMetadata` callback (`agent-runner.ts:408`) fires at message **start**, - before timing/usage exist — cannot carry stats. Stamp at the - `applyToolCompletions` point (`:443`) instead (see §I). - ---- - -## Design - -### A. Context-window resolution - -New module `apps/backend/src/runs/context-window.ts`. - -`resolveContextWindow(provider, modelId): Promise` resolution order: - -1. **Manual override** — per-model entry in provider config (see schema below). -2. **API auto-detect** by provider type (cached per provider+model): - - Google: `GET {baseUrl}/v1beta/models/{modelId}` → `inputTokenLimit`. - - OpenRouter: `GET {baseUrl}/api/v1/models` → match id → `context_length`. 
- - OpenAI-type: `GET {baseUrl}/v1/models` → if entry has `max_model_len` - (vLLM and most OpenAI-compatible servers expose it) use it; official - OpenAI omits it → fall through. -3. **litellm model registry** (replaces a homegrown table) — vendor/fetch - litellm's `model_prices_and_context_window.json` (MIT, community-maintained). - Each entry has `max_input_tokens` / `max_output_tokens`. Covers OpenAI / - Anthropic / Bedrock families that don't expose the window via API. - - **Key normalization (drift T4):** registry keys don't match our - `resolvedModelId` 1:1. Lookup order: - `exact(modelId) → strip provider prefix ("openai/") → lowercase → alias map → family heuristic → MISS`. - Maintain a small alias map for Bedrock ARNs, Azure deployment names, vLLM - custom names. `log.warn` on every MISS (it falls to default — must be visible). -4. **Conservative default** — `DEFAULT_CONTEXT_WINDOW = 8192`. `log.warn` on every - fall-to-default. When the window is default/unknown the **ring renders neutral** - (§H), never a guessed green→red ramp. - -Detection results cached in-memory (per provider id + model id) with a TTL. -**Cache invalidation (drift T5):** editing a `modelMeta` override must -`cache.evict(providerId)` **immediately** in the provider PUT handler — do not -wait for TTL. TTL is only a backstop for API-detected drift. Also resolve -`maxOutputTokens` the same way (registry `max_output_tokens` / API) — needed for -the budget math in §C. - -#### Schema change (per-model, not per-provider) - -A single per-provider number is wrong: one provider serves many models with -different windows. Store a per-model map. - -- DB: add `modelMeta` JSONB column to the `provider` table - (`apps/backend/src/db/schema.ts`), shape: - `{ "<modelId>": { contextWindow?: number, maxOutputTokens?: number } }`. -- Zod: extend provider schema in `packages/schemas/index.ts` (full / create / - update variants) with optional `modelMeta`.
-- Apply via `pnpm drizzle-kit-push` (DDL only — additive nullable column, safe). -- UI (later): provider edit form shows resolved window per enabled model with an - editable override field. - -### B. Token estimation (cold start only) — the single estimator (P2) - -`apps/backend/src/runs/token-estimate.ts`. - -One function over one neutral structure — **no per-tier estimator**: - -```ts -const MODEL_BOUND: PartType[] = ["text", "tool-call", "tool-result", "file", "image"]; -// reasoning / source / step-start / data-* are UI-only — they never reach the -// model and MUST be excluded on both sides (drift T1). - -type CountUnit = { role: Role; text: string; nonText: NonTextPart[] }; - -function toCountUnits(m: UIMessage): CountUnit[] // Tier 1 adapter -function toCountUnits(m: ModelMessage): CountUnit[] // Tier 2 adapter -const estimateTokens = (units: CountUnit[]): number // char/4 text + modality table -``` - -- char/4 applies to **text parts only**. Never char/4 a base64 image. -- **Modality table (drift T2)** for non-text parts: - - `anthropic: (w,h) => ceil(w*h/750)` - - `openai: (w,h,detail) => detail==="low" ? 85 : tile85(w,h)` — **detail is - usually unset → assume `high`** (over-count beats overflow). - - `default: () => 1200` (conservative). - - Dimensions via **cheap header parse** (PNG IHDR / JPEG SOF marker, ~32 bytes — - no full decode) when bytes are in hand; bare URL or parse failure → `default` - constant. Not "free": one buffer read per image, cold-start only. -- Used **only** on the first turn before any provider `usage` exists; every later - turn uses the real `usage.inputTokens`. -- **Tier 1 estimate runs AFTER `inlineFileUrls` (drift T2)** so the payload is - real, not a pre-inline underestimate. -- **Divergence feedback loop (drift T2):** on turn 2, compare the cold-start - estimate vs real `usage.inputTokens`; `log.warn` when `|est−real|/real > 0.5` - with model + part breakdown. That signal tunes the image constants over time. 
-- (Optional future: Anthropic `/v1/messages/count_tokens` for exact Claude counts.) - -### C. Tier 1 — cross-turn compaction (durable) - -Runs in `prepareChatTurn` (`chat-execution.ts:524-549`) before a response starts. -Operates on durable chat history (**UIMessages**). Remember **P1: this is a view -over history; raw messages are never deleted.** - -**Budget math** (not a raw window ratio — fixes drift C3): - -``` -inputBudget = contextWindow − maxOutputReserve − safetyReserve - (safetyReserve = reserveRatio × contextWindow, default 0.05, per LibreChat; - maxOutputReserve from resolved maxOutputTokens) -triggerTokens = triggerRatio × inputBudget (triggerRatio default 0.8) -targetTokens = targetRatio × inputBudget (targetRatio default 0.5) -``` - -**Trigger** (drift C1 — must count what _this_ turn adds, not just the last -response): `projected = lastInputTokens + estimateTokens(messagesSinceWatermarkOrLastTurn)`. -First turn: `projected = estimateTokens(allMessages) × 1.15` (char/4 safety -margin, drift M2). Compact when `projected >= triggerTokens`. - -**Hysteresis** (drift C2 — the Cline #5616 thrash failure): compaction must reduce -the conversation to `<= targetTokens`, well below the trigger, so it does NOT -re-fire next turn. Trigger ratio (0.8) and target ratio (0.5) are deliberately -distinct. - -Compaction (`apps/backend/src/runs/compaction.ts`) — staged, cheap-first -(LibreChat pattern). **Two adapters, shared leaf primitives** (P2): -`compactUIMessages` (Tier 1) and `compactModelMessages` (Tier 2 + recovery) both -call `estimateTokens` / `summarizePrefix` / `pickKeepBoundary`. Pairing rule -differs by shape: - -- Tier 1 / UIMessage: an assistant message carrying tool-invocation parts is - **atomic** — never split, never drop its paired result. -- Tier 2 / ModelMessage: keep assistant + following `role:"tool"` messages - together. - -Stages: - -- Pin the system prompt. 
-- Keep the last `keepRecentMessages` (default ~10) verbatim; never split a - tool-call / tool-result pair across the boundary. -- **Stage 1 — prune (no model call):** in the older prefix, degrade bulky tool / - RAG results — soft-trim to head+tail, then replace with a placeholder - (`[tool result elided]`) for results over `minPrunableChars`. Often enough to - reach `targetTokens` without a summarization call. -- **Stage 2 — summarize:** if still above target, summarize the older prefix - with the **task model** into one synthetic summary message. - - **Model fallback (drift T7):** `provider.taskModelId → resolvedModelId (main)`. - `log` which model summarized + token cost. - - **Chunked / map-reduce** when the prefix exceeds the summarizer's own window - (drift M1 — cold-start on a large imported history). -- Output: `[system, summaryMessage, ...pruned/keptRecent]`. -- **Fail loud:** emit a visible transcript event (`context-compacted`, - "Summarized N earlier messages") rather than silently mutating. - -**Persistence + watermark** — all writes through `writeWatermark` (P3, drift R1): - -- Add to chat/run record: `contextSummary: text`, `summaryWatermark: int` - (id/index of last summarized message), `compactionDirty: boolean default false` - (drift T3), `version: int default 0` (drift R1 — CAS token). -- Each turn, summarize only messages _after_ the watermark and fold into the - existing summary, then advance the watermark — incremental. -- **The single versioned CAS writer (P3, drift R1):** - - ```ts - // EVERY mutation — advance | C4 reset | dirty-clear — goes through this. 
- async function writeWatermark( - chatId, - expectVersion, - patch /* {watermark?, summary?, dirty?} */, - ) { - const res = await db - .update(chat) - .set({ ...patch, version: expectVersion + 1 }) - .where(and(eq(chat.id, chatId), eq(chat.version, expectVersion))); - return res.rowCount === 1; // false = conflict → re-read, decide by VERSION not watermark value - } - ``` - -- **Loser behavior on CAS conflict (drift T10):** re-read the row. If - `version` moved and the watermark now covers my prefix → **SKIP** (winner - already compacted; safe no-op) **and clear dirty**. Else retry **once**; second - conflict → SKIP + `log.warn(contended)`. No recompute-loop, no livelock. -- **Invalidation (drift C4 + R1):** if any message at/below `summaryWatermark` is - edited/deleted/regenerated, the summary is stale. The edit/delete/regenerate - handler calls `writeWatermark` to **bump version + clear `contextSummary` + - reset watermark** to the last unaffected message — all in one CAS write. Because - the loser compares **version** (not watermark value), a compaction racing an - invalidation sees a conflict and re-reads the reset state — it can never write a - stale summary over mutated history. Branch/regenerate that forks below the - watermark resets it on the new branch. - -### D. Tier 2 — intra-turn compaction (in-memory) - -For a single response with many tool/sub-agent calls that bloats the window -mid-loop. Uses `prepareStep` on both `streamText` and `generateText`. -Operates on **ModelMessages** via `compactModelMessages`. - -`prepareStep({ messages, stepNumber, steps })`: - -- Estimate current step tokens (same `estimateTokens`, P2); if `>= threshold`: - - Summarize **old completed** tool results (steps several back), keep recent - steps verbatim, preserve call/result pairing. - - Return `{ messages: compacted }`. -- **Only fire when genuinely near limit** (drift m3 — mid-step summary adds - latency; don't run it every step). 
- -**Not persisted.** `prepareStep` edits are throwaway per-call — the SDK keeps its -own canonical message list and returns the _full_ messages in -`result.response.messages`, which commit to history as normal. Next turn, Tier 1 -folds that finished (still-bloated) turn into the durable summary. Tier 2 only -keeps a heavy response executable; Tier 1 owns durable state. - -### E. Recovery — context-overflow error handling (P4) - -In `agent-runner.ts` (around `formatStreamError`, `:636-657`): - -- `isContextOverflowError(err)` — `APICallError.isInstance(err)` AND - (`statusCode` in {400, 413}) AND body matches - `/context length|context_length_exceeded|prompt is too long|too many tokens|maximum context/i`. -- On detect mid-run: - 1. In-memory aggressive trim via **`compactModelMessages`** with a smaller - `keepRecentMessages` (drift T3 — reuse the Tier 2 adapter, **no bespoke - trim**, or T1 divergence returns). - 2. Retry the call **once**. - 3. Persist `compactionDirty = true` via `writeWatermark` (a small standalone - UPDATE, independent of the stream's finalize — drift T3). Recovery **never** - writes summary/watermark directly; it only flags. - 4. If retry still fails, surface: "Conversation too large even after trimming — - start a new chat or reduce attachments." (No infinite retry.) -- **Durable compaction happens on the NEXT `prepareChatTurn`** (drift T3 — - chosen path, not "finalize-or-next"): it sees `compactionDirty`, forces Tier 1 - before building messages, and clears the flag inside the same CAS write that - advances the watermark. `compactionDirty` is **persisted** so a crashed/swapped - worker still resumes correctly. - -### F. Wiring for sub-agents - -`ToolLoopAgent` constructor takes `contextWindow` + `maxOutputTokens` + -compaction config. Sub-agent tool creation (`sub-agent.ts:56-159`) already builds -a `ToolLoopAgent` per sub-agent — resolve each sub-agent model's window and pass -it through. 
- -**Tier 2 only (drift M3):** sub-agents start fresh each invocation (only a `task` -string, no cross-turn history), so there is no durable history for Tier 1 to -compact. Just pass the window so `prepareStep` (Tier 2) fires if a sub-agent's own -tool loop bloats. Recovery (§E) covers them too since `agent-runner` is shared. - -### G. Config surface + kill switch - -**SUPERSEDED 2026-06-12 — going global+per-model (see Chunk 12).** The per-agent -fields below shipped in chunk 10 but are being removed: no surveyed tool -(Hermes/Codex/Claude/Cline) exposes per-agent compaction tuning, and the ratios -self-normalize to the model window so per-agent variance buys nothing measurable. - -~~Per-agent optional fields, with sane defaults:~~ - -- ~~`compactionEnabled` (default true)~~ -- ~~`triggerRatio` (default 0.8), `targetRatio` (default 0.5), - `reserveRatio` (default 0.05), `keepRecentMessages` (default 10), - `minPrunableChars` (default ~2000)~~ - -The runtime now uses `DEFAULT_COMPACTION_CONFIG` for all agents; window/output -size stays per-model via the §A resolver (`provider.modelMeta` override). - -**Global kill switch:** env `COMPACTION_ENABLED` (default true) disables all -proactive compaction (Tier 1 + Tier 2) in prod without a deploy. **Recovery (§E) -ignores this flag** — it is the safety net (P4). After Chunk 12 this env flag is -the ONLY compaction toggle (the per-agent `compactionEnabled` is gone). - -### H. Frontend context-usage indicator (the ring) - -1. **Backend emits usage to client.** On run finish, include - `{ inputTokens, contextWindow }` in the streamed message metadata. Today usage - is run-level only (`types.ts:8-13`); surface last input-token count + resolved - window per assistant message. -2. **New component** `apps/frontend/components/context-usage-ring.tsx` — small - SVG/conic-gradient ring, fill = `inputTokens / contextWindow`. Color ramps - (green → amber ≥0.7 → red ≥0.9). 
**Neutral grey, no fill %, when the window is - unknown/default** (drift T6). Wrapped in `Tooltip`. -3. **Placement** — in `PromptInputTools` between the model selector and the `(i)` - info icon (`chat.tsx:574-575`). -4. **Data source (drift U1):** resolve the window from the **currently selected - model** (frontend already holds it in `PromptInputTools`; expose via the - provider/model metadata API / `modelMeta` map), NOT from the last assistant - message's metadata — else the ring shows the previous model's window after a - model switch. Fill = `lastInputTokens / selectedModelWindow`. -5. **Tooltip label is REQUIRED, not optional (drift U2/m2):** - `Last response: N / W (NN%) · current input not yet counted`. The ring reflects - the last response, not the unsent composer input — say so. (Projected-input arc - is deferred, see Open.) - -### I. Per-message stats popover (next to Regenerate) - -An `(i)` action under each assistant response showing input tokens, output -tokens, TTFT, and total generation time. Hover = tooltip, click = popover. - -**Reuse the existing tool-call duration mechanism** (commits c18c18d / b97312f) — -`withToolTimestamps` + `applyToolCompletions` (`agent-runner.ts:63-120`), -`useToolDuration` + `formatDurationMs` (`hooks/use-tool-completed-at.ts`, -`lib/utils.ts:29-49`). - -Backend (`agent-runner.ts`): - -- Capture `startedAt` at run start, `firstTokenAt` at the first text-delta chunk, - `finishedAt` when the stream finalizes (the `applyToolCompletions` point, `:443`). -- Capture token usage from `onFinish` / `totalUsage` (`:148-194`). -- Stamp onto `message.metadata` **at the `applyToolCompletions` point, NOT the - `messageMetadata` callback** (which fires at message start before timing/usage - exist). Shape: - `metadata.stats = { inputTokens, outputTokens, startedAt, firstTokenAt, finishedAt }`. - -Frontend: - -- New `MessageAction` info icon in `chat-message.tsx:335-378`, beside Regenerate, - rendered when `metadata.stats` exists. 
-- Content: `Input: N · Output: N`, `TTFT: formatDurationMs(firstTokenAt − startedAt)`, - `Total: formatDurationMs(finishedAt − startedAt)`. -- TTFT/total are **server-measured**. Optional client-observed "Round-trip" line - from the `useChat` send timestamp. Reuse `formatDurationMs`; mirror the - client-observed fallback in `useToolDuration` for in-flight messages. - -### J. Clickable ring — compact on demand - -Make the §H ring actionable. **Remember P1: this compacts the model view, not the -stored history — it is not destructive in the data sense.** - -- **Hover** — tooltip with percentage filled (already in H). -- **Click** — request compaction. If a response is generating, defer until the - current message finishes, then run; if idle, run immediately. - -Backend endpoint `POST /chats/:id/compact` (`routes/chat.ts`): - -- Runs Tier 1 compaction once **regardless of threshold** (force), persists via - `writeWatermark`, returns the new resolved usage (`inputTokens` estimate after - compaction + `contextWindow`) so the ring refreshes immediately. -- Reuses the Tier 1 `compaction.ts` module. - -Frontend: - -- Ring `onClick` → if `status === "streaming"`, set a pending flag and fire on the - chat's finish callback; else call now. -- **Pending-while-streaming visual (drift U4):** ring shows a pending badge + - tooltip "will compact when response finishes", and is **disabled** (no - re-click). On finish → spinner → updated fill from the response. -- **Confirm default-ON when the drop is significant (drift U3):** - `messagesDropped > keepRecentMessages` OR estimated reduction `> 30%` of history - → confirm. Below that → immediate, no prompt. (Confirm is UX courtesy; per P1 no - data is destroyed regardless.) - ---- - -## File-by-file change list - -Backend: - -- `apps/backend/src/runs/context-window.ts` — **new**: window resolution + API - auto-detect + litellm registry w/ key normalization + cache + evict hook. 
-- `apps/backend/src/runs/token-estimate.ts` — **new**: single `estimateTokens` + - `toCountUnits` adapters + image modality table + header-parse dims. -- `apps/backend/src/runs/compaction.ts` — **new**: `compactUIMessages` (Tier 1) + - `compactModelMessages` (Tier 2 + recovery) + shared leaf primitives + - `writeWatermark` CAS. -- `apps/backend/src/runs/agent-runner.ts` — `prepareStep` (Tier 2) on - `streamText`/`generateText`; `isContextOverflowError` + retry-once recovery - (reuses `compactModelMessages`, sets `compactionDirty`); pass `contextWindow` - through; emit usage metadata; capture `startedAt`/`firstTokenAt`/`finishedAt` + - usage and stamp `metadata.stats` at the `applyToolCompletions` point (§I). -- `apps/backend/src/services/chat-execution.ts` — Tier 1 in `prepareChatTurn` - (after `inlineFileUrls`); resolve window; check/clear `compactionDirty`; all - state writes via `writeWatermark`. -- `apps/backend/src/routes/chat.ts` — `POST /chats/:id/compact` (§J). -- `apps/backend/src/routes/provider.ts` — `cache.evict(providerId)` on modelMeta - update (drift T5). -- `apps/backend/src/tools/sub-agent.ts` — pass per-sub-agent window/config. -- `apps/backend/src/db/schema.ts` — `provider.modelMeta` JSONB; chat/run - `contextSummary` + `summaryWatermark` + `compactionDirty` + `version`; agent - compaction fields. -- message edit/delete/regenerate handlers — call `writeWatermark` to invalidate - (version bump + clear summary + reset watermark) (drift C4/R1). - -Schemas: - -- `packages/schemas/index.ts` — provider `modelMeta`; agent compaction fields; - message-metadata usage + stats shape for the frontend. - -Frontend: - -- `apps/frontend/components/context-usage-ring.tsx` — **new**: ring (window from - selected model), hover tooltip, clickable force-compact, pending/disabled state. -- `apps/frontend/components/chat.tsx` — render ring; resolve selected-model window; - wire `onClick` → compact endpoint (defer while streaming). 
-- `apps/frontend/components/chat-message.tsx` — `(i)` stats `MessageAction` (§I). -- `apps/frontend/hooks/use-message-stats.ts` — **new** (optional): client-observed - timing fallback for in-flight messages. - -Migration: - -- Additive nullable columns via `pnpm drizzle-kit-push` (dev). Prod via - `scripts/migrate.ts`. -- **Lazy rollout (item 3):** existing chats get `version=0`, null summary, - `compactionDirty=false`. They do **NOT** eagerly compact on deploy — Tier 1 only - fires on each chat's next turn. **Do not add a backfill job** "to be safe" — it - would create a thundering herd of summarize calls that lazy rollout avoids. - ---- - -## Observability (item 4 — the design is only as good as the prod signal) - -**Landed 2026-06-11.** No metrics infra exists in the backend (pino logging -only), so each signal is emitted as a structured `metric:`-tagged log line — -greppable today, trivially shipped to a counter later. Emitted: - -- `compaction.fired` (Tier 1) with `tier` / `tokensBefore` / `tokensAfter` / - `messagesDropped`. ✅ -- `summarize.latency_ms` with `latencyMs` + `taskModelId` + `usage` (the model - is in the same line). ✅ -- `recovery.overflow_detected`, `recovery.retry`, `recovery.failed`. ✅ -- `context_window.fell_to_default` + `litellm.key_miss` (drift T4/T6). ✅ -- `cas.conflict` (per lost CAS + on contended-skip) — **decides whether the R4 - efficiency note ever needs fixing**. ✅ - -Still log-only (no dedicated metric): `estimate_vs_real.divergence` (drift T2 -feedback loop) — deferred with the T2 image-constant tuning work. - ---- - -## Tests - -- `context-window`: resolution order; API parse for Google / OpenRouter / vLLM; - litellm hits **incl. key normalization + Bedrock ARN / Azure / MISS→default** - (drift T4); default fallback; cache evict-on-override (drift T5). 
-- `token-estimate`: char/4 text-only bounds; **`MODEL_BOUND` filter — UI-only - parts excluded**; **`estimate(toCountUnits(ui)) === estimate(toCountUnits(convert(ui)))` - exact on the filtered set** (drift T1); image modality table (constant, not - char/4) + header-parse dims + missing-dims fallback (drift T2); estimate runs - after inline. -- `compaction`: preserves tool-call/result pairing **both UIMessage and - ModelMessage shapes**; respects `keepRecentMessages`; incremental watermark - folding; prune Stage 1 reaches target without a model call when possible; - hysteresis — output `<= targetTokens`, does not re-fire next turn; chunked - summarize over an oversized prefix; **summarizer model fallback** when - `taskModelId` unset (drift T7). -- `budget`: window − output reserve − safety reserve; trigger counts new - (unsummarized) messages, not just last response (drift C1). -- `writeWatermark` / CAS (drift R1/T10): concurrent writers — one wins - (`rowCount===1`), loser re-reads, decides by **version** not watermark value; - loser SKIPs + clears dirty when winner advanced; one-retry-then-skip, no - livelock; **invalidation reset bumps version and a racing compaction sees a - conflict** (never writes stale summary over mutated history). -- `watermark-invalidation`: editing/deleting a message ≤ watermark clears summary, - resets watermark, bumps version (drift C4). -- `agent-runner`: `prepareStep` trims old tool results only, fires only near limit - (drift m3); overflow-error detection true/false matrix **across per-provider - error bodies** (OpenAI / Anthropic / Google-vLLM fixtures, drift T9); retry-once - **reuses `compactModelMessages`** (drift T3); sets `compactionDirty`; then clean - failure. -- `recovery-persistence` (drift T3): after recovery, next `prepareChatTurn` sees - `compactionDirty`, forces Tier 1, clears flag — and the next turn does **not** - re-overflow; recovery never writes summary directly. 
-- Integration: synthetic long history → Tier 1 compacts + persists; injected 400 - overflow → recovery retries + succeeds; sub-agent inherits window (Tier 2 only). -- `message-stats`: `startedAt`/`firstTokenAt`/`finishedAt` captured in order; - metadata stamped at `applyToolCompletions`, not message-start; TTFT/total format. -- `compact-endpoint`: force-compaction advances watermark (via `writeWatermark`) - and returns refreshed usage; defers/queues while a run is mid-stream. -- Frontend: ring window comes from selected model not last-message metadata - (drift U1); neutral state on unknown window (drift T6); pending/disabled while - streaming (drift U4); confirm fires above the §J threshold (drift U3). - ---- - -## Sequencing - -1. Window resolution + single estimator + schema (`modelMeta`, `version`, - `compactionDirty`, summary/watermark) — foundation. **✅ DONE** (open defects: - empty prod registry, T5 evict, cache-pins-default — see Review §). -2. Compaction module + `writeWatermark` CAS + Tier 1 (cross-turn, persist). - **✅ DONE** (open defects: C1 trigger under-count, M2 margin, summarizerWindow - not threaded — see Review §). -3. Recovery (overflow detect + retry-once + dirty flag). **✅ DONE 2026-06-10** - (C1 overhead fix + M2 margin + summarizerWindow threading folded in; the - `lastInputTokens` half of C1 moves to step 6 — see Chunk 3 §). - 3a. **Review-fix chunk (RV1-RV4). ✅ DONE 2026-06-10.** - 3b. **Review-fix chunk (RV5-RV7). ✅ DONE 2026-06-10.** All HIGH non-blocking - defects resolved — content-type pruner/renderer, recovery overhead target, - litellm registry populated, heuristic boundary-safe, evict wired, timeout + - single-flight in context-window resolver. -4. Tier 2 (`prepareStep`, in-memory). 
**✅ DONE 2026-06-10** `buildTier2PrepareStep` - wired into both `streamText` and `generateText`; fires when accumulated - ModelMessages exceed `triggerTokens` (drift m3); uses shared - `compactModelMessages` adapter (drift T3); null when kill switch off. - `Tier2Context` on `ChatTurn` threads config from `buildCompactionRuntime`. - Tests: 1074 pass; tsc source clean. - - **Chunk 4 review fix (2026-06-10):** Tier 2 trigger/target now subtract - `overheadTokens` (RV6 extended to Tier 2 — the prepareStep estimate sees - ModelMessages only, but system prompt + tool schemas consume the same - window; without this, a large overhead lets the payload exceed the budget - before Tier 2 fires). `compactModelMessages` gained `knownEstimate` so the - prepareStep trigger estimate is reused instead of recomputed (RV9). Tests - strengthened: summarize-on-fire + pairing-safety asserted, empty-prefix - no-op asserts `undefined`. -5. Sub-agent wiring (Tier 2 only). **✅ DONE 2026-06-10** `Tier2Context` + `buildTier2PrepareStep` moved to `compaction.ts` (no-cycle); `createSubAgentTool` gains `prepareStep?`; `createSubAgentTools` gains `prepareStepFn?`; `loadSubAgents` resolves per-sub-agent compaction runtime + builds prepareStep map. Tests: 1077 pass; tsc source clean. -6. Frontend usage metadata + ring (§H). **✅ DONE 2026-06-10** `CompactionRuntime` + `ChatTurn.resolved` carry `contextWindow` + `contextWindowIsDefault` (from resolved window source). `applyMessageStats` stamps `metadata.stats = { inputTokens, outputTokens, contextTokens, startedAt, firstTokenAt, finishedAt, contextWindow, contextWindowIsDefault }` on last assistant message at `applyToolCompletions` point. New `GET /:providerId/context-window?modelId=X` endpoint returns resolved window (null when source = "default", drift T6). `MessageStats` schema in `@platypus/schemas`. New `ContextUsageRing` component (SVG donut, green/amber/red ramp, neutral when unknown, required tooltip, drift T6/U2). 
Ring placed in `PromptInputTools` between search and model selector; `contextWindowData` SWR-fetched per selected model (drift U1). Tests: 1077 pass; source tsc clean. - - **✅ Code review for chunk 6 (2026-06-11).** One critical bug fixed: `inputTokens`/`outputTokens` from `accumulateStepStats` are the run-wide SUM across steps; feeding the summed input into the ring over-counts on multi-step tool loops (5-step loop → reported ≈ sum-of-all-prompts, pegging the ring red >100% when real fill ~37%). **Fix:** added `contextTokens` = last step's `usage.inputTokens` (peak context fullness, tracked in `streamText.onStepFinish`); the ring uses `contextTokens`, the §I cost popover keeps the summed `inputTokens`/`outputTokens`. Also: `ContextUsageRing` prop `inputTokens`→`usedTokens`; frontend `as any` casts replaced with typed `MessageStats`; removed dead `Tier2Context` import in `agent-runner.ts`. Documented trade-offs left as-is: numerator (last response's model) vs denominator (selected model) mismatch after a model switch is intentional (drift U1); `generate()` headless path stamps no stats (no UI); TTFT = first text part, excludes leading reasoning (matches §I wording). -7. Per-message stats popover (§I). **✅ DONE 2026-06-11** `MessageStatsPopover` in `chat-message.tsx`: info icon (lucide `InfoIcon`) in `MessageActions` for all assistant messages with `metadata.stats`; popover shows In/Out token counts (run-wide sums), TTFT (when `firstTokenAt` present), Total elapsed. Uses `formatDurationMs` from `lib/utils`. tsc clean; 1077 tests pass. -8. Clickable ring → compact endpoint (§J). **✅ DONE 2026-06-11** `POST /chats/:id/compact` runs force-Tier-1 via new `forceCompactChat` helper in `chat-execution.ts`; returns token estimate + context window so ring refreshes immediately. Frontend: `onClick` with defer-while-streaming + pending badge (drift U4); confirm dialog above threshold (drift U3); ring keyboard-accessible; hooks hoisted above early returns (rules-of-hooks fix). 
Tests: 1081 pass; tsc clean. -9. Per-agent config surface + `COMPACTION_ENABLED` kill switch. **✅ DONE 2026-06-11** DB + Zod schemas already had per-agent fields (`compactionEnabled`, `triggerRatio`, `targetRatio`, `reserveRatio`, `keepRecentMessages`, `minPrunableChars`); `resolveCompactionConfig` + `buildCompactionRuntime` already wired them; global `COMPACTION_ENABLED=false` kill switch in `chat-execution.ts`. Added compaction fields to `agentCreateSchema` / `agentUpdateSchema` (so routes pass through) and "Context compaction" section in `agent-form.tsx` Advanced settings. Backend 1081 pass; tsc clean. - - **✅ Code review for chunk 9 (2026-06-11).** One MEDIUM + minors fixed. The - editable surface newly exposed the C2 thrash hole (a user/API could set - `targetRatio >= triggerRatio` → compaction re-fires every turn). **Fix:** - (1) `agentCreateSchema`/`agentUpdateSchema` gained a zod `.refine` rejecting - an inverted pair (checked only when both supplied; error on `targetRatio`); - (2) `resolveCompactionConfig` clamps `targetRatio → triggerRatio * 0.9` as a - runtime backstop for legacy/direct-write rows. Minors: `keepRecentMessages` - base schema tightened `.nonnegative()`→`.min(1)` (0 keep-recent breaks - pairing; form already enforced min=1); form description now states the - target < trigger […text lost in extraction; the snippet below resumes part-way through chunk 11a…] - -```tsx -const assistantMessageCount = useMemo( - () => messages.filter((m) => m.role === "assistant").length, - [messages], -); - -const [compacted, setCompacted] = useState<{ - atAssistantMessageCount: number; - tokens: number; -} | null>(null); - -// at compact time: -setCompacted({ - atAssistantMessageCount: assistantMessageCount, - tokens: body.inputTokens, -}); - -// ring usedTokens expression: -compacted?.atAssistantMessageCount === assistantMessageCount - ? compacted.tokens - : lastAssistantStats?.contextTokens; -``` - -A new user message does NOT change `assistantMessageCount` → compacted stays -valid.
When the next assistant response lands, `assistantMessageCount` increments -→ compacted expires → ring reads the fresh `lastAssistantStats.contextTokens`. - -**Files:** `apps/frontend/components/chat.tsx` only (3 small edits). - -#### 11b. (i) icon: add mouseover tooltip - -**Current state.** The `Info` button at `chat.tsx:741-743` is a bare -`DialogTrigger` — no tooltip, click-only. The user has to click to discover what -it does. - -**Fix.** Wrap the existing `DialogTrigger`/`PromptInputButton` in a `Tooltip` -so hover shows a label without opening the dialog. Click still opens the dialog -(unchanged). Use the same `delayDuration={500}` as the ring. Tooltip text: -`"Agent info"` (or a one-line agent description if `selectedAgent.description` -is non-empty). Pattern: - -```tsx -<Dialog> - <Tooltip delayDuration={500}> - <TooltipTrigger asChild> - <DialogTrigger asChild> - <PromptInputButton> - <Info /> - </PromptInputButton> - </DialogTrigger> - </TooltipTrigger> - <TooltipContent> - {selectedAgent.description?.trim() || "Agent info"} - </TooltipContent> - </Tooltip> - {/* existing DialogContent unchanged */} -</Dialog> -``` - -Note: `TooltipTrigger asChild` wrapping `DialogTrigger asChild` is a safe -Radix composition — Radix merges event handlers via slot; the `onClick` from -`DialogTrigger` and the hover callbacks from `TooltipTrigger` coexist on the -same underlying `PromptInputButton` element. - -**Files:** `apps/frontend/components/chat.tsx` only (reshape the existing JSX -block, no new imports needed — `Tooltip`/`TooltipTrigger`/`TooltipContent` -already imported). - -#### 11c. Compaction chat trace (new §K) - -**Goal.** Make compaction visible inside the chat timeline — not just via the -ring. Two states: - -1. **Active / in-flight** — "compaction is happening right now" (between the - user hitting Send and the first response token arriving). -2. **Historical** — "compaction happened here" (visible in the scrollback, - including the LLM-generated summary since that IS a model call). - -**Mental model.** Compaction is a model call forced by the system on the -user's behalf — structurally equivalent to a tool call initiated by the -assistant.
It therefore maps naturally to the **existing tool-call UI**: -emit it as a synthetic `compact_context` tool-call + tool-result pair in the -stream before the actual response. No new rendering component needed — the -existing tool-call expander handles both active ("in flight") and historical -states for free. No fake user message injected; no custom banner. - -**Why the backend needs to emit an event.** Tier 1 runs inside -`prepareChatTurn` (server-side, before streaming). The frontend has no other -channel to distinguish "compaction ran before this response" from normal -response latency. §C already prescribes `context-compacted` as a fail-loud -stream event; this chunk wires that emission as a tool-call pair. - -**Backend change.** After a successful Tier 1 compaction in -`applyTier1Compaction`, emit a synthetic tool-call + tool-result into the -AI-SDK `dataStream` before the first assistant text part. Use the SDK's -`writeData` / `writeTool*` primitives (exact API depends on AI SDK version — -check `dataStream` surface in `chat-execution.ts`). Logical shape: - -```ts -// tool-call part -{ toolCallId: "<id>", toolName: "compact_context", args: { messagesSummarized: N } } - -// tool-result part -{ - toolCallId: "<id>", - toolName: "compact_context", - result: { - messagesDropped: N, - summaryExcerpt: string | undefined, // first ~120 chars of LLM summary; absent for Stage-1-only (prune, no model call) - } -} -``` - -`summaryExcerpt` carries the first ~120 chars of the LLM-generated summary. -This IS the model's own words — not a risk, and it gives users transparency -into what was retained. Omit when compaction ran Stage 1 only (prune, no -model call). - -**Frontend — no new component.** The existing tool-call renderer in -`chat-message.tsx` already handles `tool-call` + `tool-result` parts. The -`compact_context` call will render like any other tool invocation: - -- While streaming: shows "compact_context" with a spinner (active indicator).
-- After complete: collapses to the tool-call expander showing args + - result (including `summaryExcerpt` when present). - -The result persists in `UIMessage.parts` (AI SDK durable storage) → appears -in scrollback automatically. - -**What the user sees:** - -``` -▶ compact_context [expandable] - ↳ messagesDropped: 34 - summary: "The user has been working on the Platypus - monorepo, specifically the context-compaction - feature…" -────────────────────────────────────────────── -[actual assistant response to user's question] -``` - -**Forced compaction (§J, ring click).** The `POST /chats/:id/compact` -endpoint runs outside of a normal streaming turn — no `dataStream` available. -Instead, after compaction succeeds the backend **persists a synthetic -assistant message** directly into the chat's message list (same DB write path -as real messages). Shape: role `assistant`, parts = `[tool-call, tool-result]` -for `compact_context`, no text content. The frontend refreshes the message -list after the POST resolves (SWR revalidation or optimistic append from the -response body) — the new message appears in the scrollback exactly like any -other tool-call exchange. The existing ring spinner + toast remain; the -synthetic message is the persistent trace. - -**C4 / watermark safety — two paths:** - -- **Tier 1 trace** — emitted parts live in `UIMessage.parts` of the following - assistant message (stream data only, not a separate DB row). Do NOT affect - message IDs, watermark comparisons, or C4 logic. -- **§J trace** — IS a real DB message row. Must be written with a message ID - that is **above** the current `summaryWatermark` so it is never itself - summarized. The existing `writeWatermark` CAS is not involved (the - watermark already advanced during the compaction); just insert with a - timestamp after the last real message. C4 invalidation only triggers on - edits/deletes at/below the watermark — this new row is always above it, so - no risk. 
- -**Files:** - -- `apps/backend/src/runs/compaction.ts` — return compaction result metadata - (`messagesDropped`, `summaryExcerpt`) from `applyTier1Compaction` (or add - an optional `dataStream` param to emit directly). -- `apps/backend/src/services/chat-execution.ts` — after - `applyTier1IfNeeded`, if compaction ran, emit the tool-call + tool-result - pair into `dataStream`. -- `packages/schemas/index.ts` — no change needed (tool-call parts already - in the union). -- `apps/frontend/components/chat-message.tsx` — no change needed (existing - tool-call renderer handles `compact_context` automatically). Optionally: - add a display-name entry so it shows "Context compaction" instead of the - raw function name. - -**Tests:** - -- Backend: `applyTier1Compaction` returns `{ messagesDropped, summaryExcerpt? }`; - no emission when compaction does not fire (below trigger). -- Integration: stream from a compacted turn contains a `tool-call` part with - `toolName: "compact_context"` before any `text` part. - -**Sequencing.** 11a and 11b are trivial — do them first (one commit each). -11c has a backend+frontend surface; implement backend emission first (easy -to verify via the stream in DevTools), then verify existing tool-call UI -renders it without frontend changes. - -#### Chunk 11 — code review fixes (landed 2026-06-12) - -Review of the first 11c cut surfaced one correctness defect + three gaps; all fixed. - -- **RV11 (HIGH) — synthetic trace was replayed to the provider.** The - `compact_context` tool part is persisted into the assistant message (for - scrollback) and was therefore re-converted by `convertToModelMessages` and - sent to the model on every later turn — a phantom tool call for a tool not in - `tools` (provider-rejection / model-confusion risk). 
**Fix:** - `stripCompactionTraceParts` removes the part at both `convertToModelMessages` - call sites (`agent-runner.ts` stream + generate); a trace-only message (§J) is - dropped entirely so no empty assistant message is sent. The part still - persists for the timeline; it just never reaches the model. -- **RV12 (MED) — trace emitted for no-op turns.** `compactionTrace` was built - whenever `triggered`, but `messagesDropped` is 0 with no excerpt on prune-only - and force-dirty-within-target runs → empty timeline entry. **Fix:** - `compactionTrace` is now `undefined` unless an actual model summary ran - (`usedModelCall && summaryText`). -- **RV13 (MED) — §J forced-compaction trace was unimplemented.** Ring-click - compaction produced no timeline trace (only the auto path did). **Fix:** - `forceCompactChat` persists a standalone synthetic assistant message via - `buildCompactionTraceMessage` (above the watermark; stripped from the model - payload like the Tier-1 trace), returns it from `POST /chats/:id/compact`, and - the frontend appends it (id-dedup so SWR revalidation reconciles, no - duplicate). -- **RV14 (LOW) — tests + display name.** Added `prependCompactionChunks`, - `stripCompactionTraceParts`, `buildCompactionTraceMessage`, and trace-gating - tests (backend suite 1096 pass). `humanizeToolType` maps `compact_context` → - "Context compaction". - ---- - -## Chunk 12 — remove per-agent compaction config, go global+per-model (planned, decided 2026-06-12) - -**Decision.** Drop ALL per-agent compaction tuning shipped in chunk 10. Compaction -behavior becomes global (`DEFAULT_COMPACTION_CONFIG` + the `COMPACTION_ENABLED` -env kill switch); only window/output **size** stays per-model via the §A resolver. - -**Why.** The 2026-06-12 field re-survey: no surveyed agent (Hermes, Codex CLI, -Claude Code, Cline) exposes per-agent compaction knobs — all use global config + -per-model window. 
Trigger/target are fractions of an already model-normalized -`inputBudget`, so per-agent variance is speculative generality. The agent-edit -form clutter is real cost for a feature ~100% of agents leave at default (every -agent on the test server has all six columns NULL). - -**Trade-off (accepted).** Removing per-agent `compactionEnabled` loses the ability -to disable compaction for a single agent (e.g. an exact-recall code/legal agent -where lossy summarization corrupts output). Mitigation: the global -`COMPACTION_ENABLED` env still exists, and recovery (§E, P4) keeps such an agent -from hard-failing on overflow regardless. If a real need for single-agent opt-out -appears, revisit as a **per-model or per-workspace** flag — NOT per-agent. - -**Change list.** - -- `packages/schemas/index.ts` — remove `compactionEnabled`, `triggerRatio`, - `targetRatio`, `reserveRatio`, `keepRecentMessages`, `minPrunableChars` from - `agentSchema` + the `agentCreate`/`agentUpdate` picks; delete the - `compactionRatioOrder` refinement (+ its `index.test.ts` cases). -- `apps/backend/src/db/schema.ts` — drop the six `agent` columns. -- New migration — `ALTER TABLE "agent" DROP COLUMN IF EXISTS ...` ×6. `IF EXISTS` - because divergent-lineage server DBs (see deploy notes) may not have all six; - destructive but safe — the columns hold only tuning overrides, NULL in practice. -- `apps/backend/src/runs/compaction.ts` — `resolveCompactionConfig` returns - `DEFAULT_COMPACTION_CONFIG` unconditionally; delete `CompactionConfigOverrides` - and the per-agent merge. Keep `DEFAULT_COMPACTION_CONFIG` + `computeBudget`. -- `apps/backend/src/services/chat-execution.ts` — drop the `agent` argument to - `resolveCompactionConfig`; keep the `COMPACTION_ENABLED` env override. -- `apps/frontend` — remove the six compaction fields from the agent-edit form. 
- -**Verify.** Agent create/update no longer accepts the six fields; chat still -compacts using defaults; `COMPACTION_ENABLED=false` still disables proactively; -recovery still fires when proactive is off; migrate is idempotent on a DB missing -some columns. - ---- - -## Chunk 13 — compaction reliability + prompt overhaul (planned, 2026-06-12) - -Chunk 12 shipped (`625ff96` + dead-`agent`-binding cleanup + env-override knobs -`da2c159`). Live test-server run on a single-vLLM provider (`qwen36`, lowered -ceiling via `.env`: trigger 0.2 / target 0.1 / keepRecent 4 / minPrunable 500) -surfaced a turn-killing bug + several prompt/UX gaps. All findings + fixes below. - -### Observed bug — per-step timeout kills pre-stream compaction - -Live log evidence (chat `hur61ZR79koiHQysBDS2o`): - -- Trigger fired (`projected` 63,511 > `triggerTokens` 48,988). -- `summarize` ran **149,955 ms** on `qwen36`, **input 6,178 → output 8,631 tokens** - (the summary was LONGER than its input — degenerate expansion, not compression). -- The run's **per-step stall timeout (120,000 ms) fired at ~120 s** → `level:50 -"Run timed out" kind:"step"` → run aborted ~30 s **before** summarize returned. -- `summarize` ignored the abort, finished at 150 s, committed the watermark → - `context-compacted` logged (dropped 9, 63,511 → 14,785). But the turn was already - dead → **no model answer streamed, and the turn's assistant message was lost.** - -Root cause: Tier-1 `summarize()` is a long blocking call that runs **inside -`prepareChatTurn`, before the response stream opens**, and does **not bump the -per-step stall timer**. The 120 s watchdog treats it as a stalled step and kills -the run. - -Why the turn vanished from the chat: two separate writes. The durable -summary/watermark is a CAS write on the **chat row** (survived — later turns have -the summary, persisted value is a clean **770-char / ~193-token** structured -summary). 
The turn's **assistant message** (answer + the synthetic `compact_context` -trace part) only persists via the **response stream**, which never opened. So the -chat-row state advanced but the visible turn was lost. - -Note: the 8,631-token runaway was the timed-out turn and was **discarded** — the -persisted summary is the good 193-token one. So `qwen36` _can_ summarize tightly; -the 8,631 was a pathological one-off. Confirms a `maxOutputTokens` ceiling loses -no context in normal operation (healthy output is ~10× below a 2k ceiling). - -### Fixes (in priority order) - -> **Status (B-F2):** Fix 1 (heartbeat), Fix 2 (`maxOutputTokens` ceiling + prompt), -> and Fix 4 (`abortSignal`) are **SHIPPED** in `buildCompactionRuntime`. Fix 3 -> (open the response stream before compaction) is **NOT done** — deferred as the -> bigger refactor. The trace therefore still renders "Completed" (post-hoc -> `prependCompactionChunks`), not live Pending→Running. The HTTP/tunnel-drop -> vector during an in-`prepareChatTurn` summarize is largely neutralized by -> run/connection decoupling (issue #113: runs persist server-side regardless of -> the socket), so this is a UX/liveness gap, not data loss. - -1. **Heartbeat during summarize (CRITICAL).** Compaction is legitimate long work, - not a stall. Ping `onActivity` / bump the per-step timer on an interval while - `summarize` runs so the 120 s watchdog keeps resetting. Directly stops the - spurious kill. (`buildCompactionRuntime` already has `onActivity` in scope via - the turn; thread it into the summarize wrapper, tick ~every 10 s.) - -2. **`maxOutputTokens` ceiling (~2,000) + "be concise" prompt instruction.** Pure - safety backstop against the runaway — NOT a blind truncation. The prompt asks - the model to compress _to fit_ (length target); the ceiling only catches a - degenerate run. Proven safe: real summaries are ~193 tokens. Also log - `finishReason === "length"` so we know if the cap ever bit. - -3. 
**Open the response stream BEFORE compaction (bigger refactor).** Today the - synthetic `compact_context` chunks are injected post-hoc by `prependCompactionChunks` - as a paired `tool-input-available` + `tool-output-available` — i.e. already - "Completed", emitted only after the (already-open-too-late) model stream's - `start`. To show live **Pending → Running → Completed** AND keep the HTTP / - playit-tunnel connection alive during the wait (a second timeout vector): - - Split the cheap trigger decision (`projectTier1Tokens` vs `triggerTokens`, no - LLM) from the expensive `summarize`, so we know to emit the pending chunk - before paying for summarize. - - Build a prelude stream: `start` + `tool-input-available(compact_context)` → - await summarize → `tool-output-available` → concat the model stream (suppress/ - merge the model's own `start` so the synthetic part + answer share one message - id). Replaces the post-hoc `prependCompactionChunks` injection. - - Move the compacted-messages await out of `prepareChatTurn` into the stream step. - - Frontend is **already done** — `tool.tsx` renders `input-available` = "Running" - (pulsing clock), `output-available` = "Completed". Zero frontend change. - - Error path: if `summarize` throws after the pending chunk shows, resolve the - tool part to `output-error` — do not leave it stuck "Running". Tier-1 stays - best-effort (fail → proceed uncompacted) but must close the tool part. - - Preserve invariants: `stripCompactionTraceParts` + snapshot persistence expect - a well-formed input+output pair; tee/snapshot drain must see prelude chunks. - -4. **Pass `abortSignal` into `summarize` (minor correctness).** Today summarize - burned 30 s + a full LLM call _after_ the turn was dead. Make it cancellable so - a real abort stops it. Fold into #3. - -### Prompt overhaul - -Current prompt (`chat-execution.ts` ~L578) is an unstructured one-liner with **no -length instruction** (the runaway's root). 
Prior-art survey of real summarization -prompts (2026-06-12): - -- **Claude Code** — heaviest: chronological analysis + **9 sections** (Primary - Request & Intent, Key Technical Concepts, Files & Code, Errors & fixes, Problem - Solving, All user messages, Pending Tasks, Current Work, Optional Next Step); - security-relevant instructions preserved **verbatim**. -- **Codex CLI** — handoff-oriented: _"You are performing a CONTEXT CHECKPOINT - COMPACTION. Create a handoff summary for another LLM that will resume the task."_ - - 4 sections (progress & decisions · context/constraints/prefs · what remains · - critical data/refs). Prepends a **resume prefix** next turn (_"Another language - model started to solve this problem and produced a summary…"_) so the resumer - builds on prior work instead of restarting. Issue #14347: sections **reduce loss - over repeated compactions**. -- **OpenCode** — 6 sections (done · WIP · files · next steps · user requests/ - constraints · decisions & rationale). -- **Hermes Agent** — weakest: just _"Summarize these conversation turns concisely"_ - → `[CONTEXT SUMMARY]: <summary>`, positional keep (first 3 + last 4 turns), Gemini - Flash aux model. Open issue #499 proposes **copying Codex's structured handoff** - — Hermes is behind us, not ahead. - -Gaps vs prior art: (a) no length instruction → the runaway; (b) no section -structure → erodes across repeated re-compactions (we feed the prior summary back -in via `priorSummaryTokens`); (c) no "build on prior work" framing (our -`summaryUIMessage` prefix `[Summary of earlier conversation]` is just a label). - -**Proposed replacement system prompt** (handoff + sections + concise + integrate- -prior; pairs with the #2 ceiling): - -``` -You are performing a context checkpoint compaction. Another instance of this -assistant will resume using ONLY your summary plus the most recent messages — -earlier history will be gone.
Write a dense markdown handoff under these -headings (omit one only if truly empty): - -- **Intent & open requests** — what the user wants, the latest explicit request, pending tasks. -- **Decisions & facts** — conclusions, confirmed values/IDs/paths, constraints and user - preferences (preserve any security-relevant instruction verbatim). -- **Files & tools touched** — what was read/changed and why. -- **Current state & next step** — where things stand and the immediate next action. - -If a prior summary appears in the history, integrate it — don't drop facts it -captured. Be concise: aim under ~1500 tokens. Output only the summary. -``` - -### Deferred / decided-against here - -- **Selectable compaction model in provider UI** — DECIDED NO. The compaction - model already = `provider.taskModelId` (same-provider only: summarize runs through - the chat provider's own client `opened.languageModel(taskModelId)`). On a single - vLLM (`modelIds` has one entry) there is no other model to pick, so a dropdown is - a no-op. `workspace.taskModelProviderId` routes _other_ task work (tag/title) to a - different provider but compaction does NOT use it. A separate fast compaction - endpoint would need new wiring (route summarize through a task provider's client) + - multi-model infra (2nd provider or a LiteLLM gateway) — not worth it now. - -### Verify (Chunk 13) - -Compaction no longer trips the per-step timeout (heartbeat); a slow summarize -streams a Running tool part instead of a blank turn; summary output is bounded -(`finishReason` logged if capped); the compacted turn's assistant message + trace -persist even when summarize is slow; the new prompt yields structured, concise, -multi-compaction-stable summaries. - ---- - -## Chunk 14 — kept-message tool-result handling (planned, 2026-06-14) - -Origin: live event on the test server — compaction fired but missed target badly -(`tokensAfter=75226` vs `targetTokens≈24000`). 
Cause: 4 recent messages were -massive mempalace (MCP) tool-result JSON dumps. Step 1 (drop prefix) already -collapses prefix tool results via `softTrim(out, 200)` in `renderUIMessages`, so -the summarizer only saw ~2.5k input tokens — but `keptMessages: recent` was -returned **verbatim**, so the bulky kept results dominated `tokensAfter`. - -### The two-tier reality (established this session) - -- **Tier 2 (`compactModelMessages`, `prepareStep`)** runs _between tool steps in - one stream_. It prunes **prefix only**; `recent` is passed **verbatim** - ([compaction.ts](apps/backend/src/runs/compaction.ts) `[summaryModelMessage(...), ...recent]`). - So current-stream tool results are never trimmed mid-stream. **Caveat:** the - keep-window is the last `keepRecentMessages` _messages_ counted flat, and in - ModelMessage form a tool call + its result are two messages — so in a >5-tool - stream the _early_ results of the same stream scroll into the prefix and DO get - summarized mid-stream. Tier 2 protects the tail, not the whole stream. -- **Tier 1 (`compactUIMessages`)** runs _between turns_. The just-finished - stream's tool results are the newest messages → land in `recent`. This is the - only place a result the user is actively asking about can be trimmed. - -### Mapping to Anthropic's shipped mechanisms (claude-api skill, 2026-06-14) - -The Anthropic API converged on **three** complementary layers; Platypus today has -only the summarize leg: - -1. **Compaction** — summarize earlier context near the window limit (beta - `compact-2026-01-12`). ≈ Platypus Tier 1/Tier 2. -2. **Context editing (`clear_tool_uses`)** — _prune_ stale tool results + thinking - blocks at configurable thresholds, keeping conversation structure; "keeps the - transcript lean **without summarizing**." Platypus does NOT have this. → Task 2. -3. **Ingestion offload** — Managed Agents auto-offloads any MCP tool result - over 100K tokens to a sandbox file, returning a truncated preview + path.
An - ingestion cap, not compaction — the only real fix for "one result too big to - fit the window at all." → Task 3. - -### Task 1 — make Tier 1 recent-trim safe (READY; partially shipped) - -**Shipped 2026-06-14 (`64232d8`, on `test/compaction-clean-deploy`, deployed):** -recent tool results pruned via `pruneUIMessage(m, minRecentPrunableChars)` (default -`minPrunableChars * 5` = 10000 chars) across every over-target return path -including the empty-prefix / null-watermark bail; warns when post-compaction -estimate > `targetTokens * 2`; env override `COMPACTION_MIN_RECENT_PRUNABLE_CHARS`; -summarizer output ceiling 2000 → 4000. - -**DONE 2026-06-14 — "option D" (overflow-gate + exempt newest) shipped** so we stop -gutting active data for a _soft_-target miss: - -- Missing `targetTokens` (0.5) is cheap — it's a hysteresis goal; the call still - succeeds as long as `recent < inputBudget`. The hard wall is `inputBudget` - (window − output reserve − safety, from `computeBudget`). -- Recent-trim is now gated on `estimate(recent)+summary > inputBudget` (call would - actually overflow), NOT on the soft target. Below the wall, `recent` is left - intact — full fidelity, just a missed hysteresis target that re-compacts next - turn (cheap; empty-prefix path makes no summarizer call). -- The single newest message is always exempt regardless (`pruneRecentExemptNewest`). -- Implementation: `UICompactOptions.inputBudget` added; `keepRecentWithinWall` - helper in `compactUIMessages` applied on both over-target return paths (Stage 2 + - empty-prefix bail); `applyTier1Compaction` threads - `effectiveInputBudget = max(0, budget.inputBudget − overheadTokens)` (mirrors - `effectiveTarget`). When `inputBudget` is omitted (recovery/tests) it falls - back to always-trim (pre-option-D guard). -- **Caveat (not a full close): newest-exempt narrows the prior code's accidental - coverage of the single-oversized-result case.** Old code trimmed ALL recent incl.
- the newest; option D keeps the newest verbatim. So if the NEWEST single message - alone exceeds the wall (e.g. a 40k mempalace dump as the last message), no tier - trims it — recovery also prunes prefix-only — and the turn hard-errors ("start a - new chat"). That is the Task 3 ingestion-cap gap, unsolved by any tier today; - option D does not introduce it but no longer accidentally masks it. The origin - event (4 big results) is fixed: option D trims 3, the newest stays, and the - result fits **as long as the newest < wall**. -- **Review fixes (2026-06-14, post-implementation review):** - - The over-target warning was re-gated from `afterEstimate > targetTokens * 2` - to `afterEstimate > inputBudget` (`warnIfOverWall`). Post-option-D a soft - target miss is by design (recent kept verbatim below the wall), so the old - `target*2` warn fired on every healthy compaction under a low target ratio - (test box `0.1`). It now fires only when recent genuinely can't fit the window - (the Task 3 case). Falls back to `target*2` when no wall is supplied. - - `keepRecentWithinWall` now returns the recent token estimate so `afterEstimate` - never re-estimates the recent set (avoids the double char/4 pass). - - Tests: 57 pass (option-D Stage-2 verbatim + trim-except-newest; empty-prefix - verbatim; soft-miss-no-warn; the no-wall warn case renamed). -- This is strictly better than the alternatives we weighed (B: exempt newest N — - still over-trims under the test box's 0.1 target; A: raise threshold to 100k — - reduces frequency only; C: revert — loses the pathological-dump guard). The test - server's `COMPACTION_TARGET_RATIO=0.1` manufactures the worst case; prod 0.5 - rarely trips it. 
- -### Task 2 — context editing: recency-based tool-result pruning, no summary (DONE 2026-06-14) - -**Shipped.** `editToolResults` + `elidedToolPlaceholder` in `compaction.ts`, wired -as Stage 0 in `applyTier1Compaction` (after `viewAfterWatermark`, before the -trigger projection); 3 new `CompactionConfig` fields + defaults -(`contextEditingEnabled=true`, `keepRecentToolResults=4`, -`minEditableToolChars=50000`); 3 env overrides in `buildCompactionRuntime` -(`COMPACTION_CONTEXT_EDITING_ENABLED`, `COMPACTION_KEEP_RECENT_TOOL_RESULTS`, -`COMPACTION_MIN_EDITABLE_TOOL_CHARS`). No schema / agent-runner / frontend change. -Gated by the `COMPACTION_ENABLED` kill switch AND `contextEditingEnabled`; metric -`context_edited` logged when `resultsElided > 0`. Implementation notes: - -- **`minEditableToolChars` defaulted to 50000, not the initial 10000** — 10k (≈2.5k - tokens) was well below every shipped tool; 50k matches LibreChat and still - catches the ~160k-char mempalace dump while sparing medium results (less churn). -- **Idempotency guard added** — `editToolResults` skips a result whose output - already starts with the placeholder prefix, so a misconfigured tiny gate cannot - re-elide its own placeholder (monotonic for any gate, not just the 50k default). -- **Grow-guard added (post-review 2026-06-14)** — skip when `placeholder.length >= - serialized.length`: a tiny gate could pick a result shorter than the -~140-char placeholder, where eliding would INFLATE the prompt (negative -`charsReclaimed`). Restructured so the full elision policy (recency + size + -idempotency + grow) is decided up front into a `Map<"mi:pi", placeholder>`; the - rewrite map runs only when real work exists and never allocates a copy on a pure - no-op. New test: grow-guard (placeholder longer than output ⇒ no-op identity). -- **Placeholder is explicit/self-describing** (LLM-agnostic — small background - models): `[Tool result for "<toolName>" omitted to save context (<N> chars).
The -full result is still available — call the tool again with the same input if you -need it.]`. Not a terse copied marker. -- **Plan decision 7 (accepted fidelity loss)**: elided placeholders also flow into - any prefix Stage 2 later summarizes — accepted (a huge dump's head+tail is poor - summary fodder; raw stays in DB). -- Tests: +10 Task-2 cases (elide-old / keep-within-window / newest-exempt / - size-gate / pairing-survives / determinism+monotonic / grow-guard / - no-op-identity / Stage-0-avoids-summarization / off-control); compaction suite - 67 pass. tsc: 0 new errors (260 pre-existing in unrelated test mocks); lint: 0 - new errors. - -#### Original spec (for reference) - -The user's ideal model (2026-06-14): in-stream, compact only near the ceiling so -the stream doesn't stop (Tier 2 ✅); at stream end strip bulky tool results so the -next turn doesn't start at 40k (NEW — this task); then threshold-compact as the -chat continues (Tier 1 ✅). This is Anthropic's **context editing -(`clear_tool_uses`)** — prune tool-result bodies, do NOT summarize. 
- -#### Field research that fixed the design (2026-06-14, cited) - -Every shipped "prune-not-summarize" implementation converged on the **same -mechanics** — adopt them, don't reinvent: - -| Product | Trigger | What it prunes | Keep / protect | Placeholder | Pairing | -| ---------------------------------------- | --------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | ----------------------------------------- | -| **Anthropic `clear_tool_uses_20250919`** | `trigger`, default **100k input tokens** | tool **results** only (calls kept unless `clear_tool_inputs`) | `keep` = **3** most-recent tool uses; `exclude_tools`; `clear_at_least` (cache-break guard) | inserts a placeholder (exact string undocumented) | yes — keeps `tool_use`, swaps result body | -| **Hermes** | **0.50** agent + **0.85**/400-msg hygiene | tool results only; dedups, strips images | `protect_last_n=20`, `protect_first_n=3`; never-prune list | `"[Old tool output cleared to save context space]"` | yes | -| **Codex (remote)** | model window, oldest-first, stop when it fits | `FunctionCallOutput` / tool outputs — rewrites body | stops as soon as history fits | `"Output exceeded the available model context and was truncated"` | yes — keeps `call_id` | -| **Claude Code microcompact** | "when context grows long" | old tool results (prune) before auto-compact (summarize) | recent context + current file/task state | `[Old tool result content cleared]` (community-reported) | yes | -| **LibreChat `contextPruning`** | gated, off by default | tool-result content; soft (head+tail) → hard (clear) | `keepLastAssistants=3`; `minPrunableToolChars` **50000** | hard: `"[Old tool result content cleared]"` | content-only | - -Takeaways baked into the spec below: **prune-then-summarize staging** (we already 
-have it); **never delete the tool-call block — swap only the result body** (keeps -`call_id`/pairing valid, no orphan rejection); **keep a recent tail by COUNT of -tool results** (not by turn — we have no turn id); **size-gate** the edit (cache -guard ≈ `clear_at_least`); **visible placeholder** so the model knows to re-call. -NB: Anthropic's literal default placeholder string is **not documented** — do not -copy `"[cleared to save context]"` from the cookbook (it is demo code). - -#### Architectural decisions (converged) - -1. **A pure view transform, no new durable state.** Unlike summary/watermark - (which need CAS + a DB column because they are LLM-derived and must persist), - context editing is **deterministic and recomputed from raw messages each turn** - by recency. **No schema change, no CAS, no `version` bump.** Sibling in spirit - to `stripCompactionTraceParts` ([agent-runner.ts:165](apps/backend/src/runs/agent-runner.ts#L165)), - the existing "stored-but-altered-for-the-wire" transform. P1 holds trivially: - raw `chat.messages` is untouched; the full result stays for UI/audit. -2. **Runs as Stage 0 inside `applyTier1Compaction`, before the summarize trigger - decision** — NOT in agent-runner. Rationale: it is chat-only (durable history, - recency across the whole chat), and running it _before_ the trigger lets a lean - view AVOID summarization entirely (cheaper, the whole point). agent-runner's - strip stays for the synthetic trace (a different concern, all paths). -3. **Recency by COUNT of tool results, not turn-grouping.** Matches Anthropic - `keep` / Hermes `protect_last_n`. We have no clean turn id; do not invent one. -4. **Monotonic + deterministic ⇒ cache-friendly.** A result is elided the turn it - ages past the keep-window and stays elided. One cache break per result as it - scrolls out — far cheaper than re-summarizing. The size gate is our - `clear_at_least`: trivial results are never touched, so no churn. -5. 
**Newest message always exempt** — same invariant as option D (Task 1). -6. **Scope line vs Task 3 (explicit).** Task 2 handles **accumulation of older - bulky results**. A single result too large to afford _even as the newest_ is - **Task 3** (ingestion cap at storage time). They overlap by design: Task 2's - keep-window is the one-turn grace for "based on those results, do X"; Task 3 - caps the pathological single dump. Task 2 does **not** fully deliver "next turn - carries no 40k" for a 40k _newest_ result — that is Task 3. -7. **Accepted fidelity loss on the summarize path (2026-06-14).** Stage 0 runs on - `afterWatermark` _before_ the trigger; when the trigger still fires anyway, - `compactUIMessages` summarizes the **prefix** — which Stage 0 has already - reduced to `[… result omitted …]` placeholders for old bulky results. So the - summarizer never sees that result's content (vs Stage 1's head+tail soft-trim, - which leaves ~1000 chars of gist). **This is strictly more lossy than today on - the summarize path, and accepted:** a 40k MCP/JSON dump's head+tail is not - useful summary fodder either, the raw stays in the DB (P1) for UI/audit, and the - common case is Stage 0 dropping the view under the trigger so no summary fires - at all. If this ever bites a real case, the mitigation is to run Stage 0 only on - the kept/recent region and let the about-to-be-summarized prefix keep its - soft-trim — not done now. - -#### Implementation spec - -**New in `compaction.ts`:** - -```ts -// Placeholder body for an elided tool result. Names the tool + elided size so the -// model can decide to re-run it. Short enough that Stage 1 / option-D never re-trim it. -// LLM-AGNOSTIC: Platypus may run small/weak background models, so the string must be -// EXPLICIT and self-describing — do NOT copy a terse marker (LibreChat's -// "[Old tool result content cleared]") that assumes the model infers it can re-call. 
-// Canonical form: -// `[Tool result for "<toolName>" omitted to save context (<N> chars). The full -// result is still available — call the tool again with the same input if you -// need it.]` -function elidedToolPlaceholder(toolName: string, chars: number): string; - -// Stage 0: replace the `output` of OLD bulky tool-result parts with a placeholder. -// Pure, deterministic, no model call. Keeps the tool part (pairing); text parts -// untouched. Returns { messages, resultsElided, charsReclaimed } (unchanged array -// identity when nothing qualified, so callers can skip a re-estimate). -export function editToolResults( - messages: PlatypusUIMessage[], - opts: { - keepRecentToolResults: number; // exempt the last N tool results (default 4) - minEditableToolChars: number; // only elide results larger than this (default 50000) - }, -): { - messages: PlatypusUIMessage[]; - resultsElided: number; - charsReclaimed: number; -}; -``` - -Policy inside `editToolResults`: - -- Walk messages; collect indices of tool-result-bearing parts (`dynamic-tool` / - `tool-*` with `output !== undefined`), in order. -- The last `keepRecentToolResults` of them are exempt (verbatim). The message at - the end of the array (newest) is exempt regardless (decision 5). -- For each remaining tool result whose serialized `output` length - `> minEditableToolChars`, replace `output` with `elidedToolPlaceholder(name, len)`. - Tool calls/inputs and all text parts are left intact. -- Operate on shallow copies (P1). Reuse the `pruneUIMessage` shape-walking style. - -**Wiring in `applyTier1Compaction`** (after `viewAfterWatermark`, before the -trigger projection): - -- `const edited = config.contextEditingEnabled ? editToolResults(afterWatermark, …) : { messages: afterWatermark, … }` -- Use `edited.messages` everywhere `afterWatermark` is used downstream (projection, - `compactUIMessages`, the injected view).
-- Log `metric: "context_edited"` with `{ resultsElided, charsReclaimed }` when - `resultsElided > 0` (no per-turn noise when it's a no-op). -- This shrinks `messageTokens` before the trigger check ⇒ summarize fires less. - No behavior change when nothing qualifies (array identity preserved). -- Gated by the existing `COMPACTION_ENABLED` kill switch (P4: recovery still the - net) AND a per-feature `contextEditingEnabled`. - -**Config (`CompactionConfig` + `DEFAULT_COMPACTION_CONFIG` + env in -`buildCompactionRuntime`):** - -- `contextEditingEnabled: boolean` (default `true`) — env - `COMPACTION_CONTEXT_EDITING_ENABLED=false` to disable. -- `keepRecentToolResults: number` (default `4`) — env - `COMPACTION_KEEP_RECENT_TOOL_RESULTS`. -- `minEditableToolChars: number` (default `50000`) — env - `COMPACTION_MIN_EDITABLE_TOOL_CHARS`. Matches LibreChat's `minPrunableToolChars` - (50k chars ≈ 12.5k tokens), the only direct per-result char-gate analog in the - field survey. Deliberately higher than option-D's `minRecentPrunableChars` - (10k) — Stage 0 only elides genuinely huge dumps (the 40k-token ≈ 160k-char - mempalace case), sparing medium results to minimize cache churn. 10k was too - aggressive vs every shipped tool. - -**Touches to current implementation (yes — confirmed):** - -- `compaction.ts`: new `editToolResults` + placeholder helper; Stage 0 call in - `applyTier1Compaction`; 3 new `CompactionConfig` fields + defaults. -- `chat-execution.ts` `buildCompactionRuntime`: 3 env overrides (mirror the - existing `numEnv` / kill-switch pattern at the COMPACTION\_\* block). -- No schema change. No agent-runner change (the leaned view flows through the - existing reconstruction → `stripCompactionTraceParts` → `convertToModelMessages` - path unchanged). No frontend change (full result still stored/displayed). 
- -**Out of scope (deferred, noted so we don't drift):** - -- Tier 2 / sub-agent context editing — Tier 2 already prunes its prefix - intra-turn; could adopt the same recency placeholder in `compactModelMessages` - later. Not now. -- Placeholder-WITH-summary (re-fetch hint that includes an LLM mini-summary of the - elided result) — needs a model call, belongs with Stage 2; the name+size - placeholder is enough for the model to re-call. Defer. -- A UI "tool result elided" timeline marker — log/metric only for now (a per-turn - UI entry would be noisy). - -**Tests (`compaction.test.ts`):** - -- elides an OLD bulky result beyond the keep-window; keeps recent + all text. -- result within `keepRecentToolResults` kept verbatim. -- newest message exempt even when bulky. -- result ≤ `minEditableToolChars` untouched (size gate). -- pairing: the tool-call part survives; only `output` changes. -- determinism/monotonicity: feeding the edited view back elides nothing new - (stable prefix ⇒ cache-friendly). -- integration: a chat that was over the summarize trigger drops under it after - Stage 0 ⇒ `usedModelCall === false` (summarization avoided). -- no-op identity: nothing qualifies ⇒ returns the same array reference. - -#### Open question for the user before coding - -- **RESOLVED 2026-06-14.** `keepRecentToolResults` = **4**, `minEditableToolChars` - = **50000** (raised from the initial 10k proposal — see config note above; 10k - was well below every shipped tool, 50k matches LibreChat and still catches the - mempalace dump). Both env-overridable so prod can tune without a deploy. - -### Task 3 — ingestion cap for oversized MCP / sub-agent results (UPSTREAM ISSUE — file so we don't forget) - -No tier can fix a _single_ result larger than the window by trimming other -messages. 
Today: sandbox tools self-cap (ADR-0002, `truncated` flag), but **MCP -tools (mempalace) and sub-agent returns have no cap** — one oversized result -overflows all tiers (Tier 2/recovery never prune `recent`) → provider reject → -error. The fix is an **ingestion cap at tool-result storage time** (mirror the -sandbox/ADR-0002 pattern and Anthropic's 100K MCP offload): truncate/offload the -oversized result, set a `truncated`-style marker, tell the model to narrow / -re-fetch. Lives in tool wrapping + agent-runner, not compaction. - -**Action: file as an upstream issue on `willdady/platypus`** (per the issue-tracker -skill). Marked here so it isn't lost; not yet filed. - ---- - -## Open / deferred decisions - -- **OpenAI-compatible as a separate provider type** — not required (auto-detect - probes `max_model_len` regardless of label). Deferred. -- **Persisting Tier 2** — deferred; revisit only if storing tool outputs verbatim - is itself a problem. -- **Anthropic exact token counting** via `/v1/messages/count_tokens` — optional - accuracy upgrade; deferred. -- **Projected-input arc on the ring (drift U2)** — char/4 of composer text added - as a faint arc. Deferred; the honest tooltip label ships instead. -- **CAS contention optimization (drift R4)** — under a contended chat, the - version is read → summarize (seconds) → CAS write, so the version can be stale - by write time → wasted summarize (not corruption; loser skips safely). Bounded - by one-retry-then-skip. **Do NOT fix now.** Gated on the `cas.conflict` metric - (now emitted, chunk 10); if it shows repeated waste, move the version read to - just-before-write or take a short advisory lock for the summarize window. 
- -### Deliberately NOT done in chunk 10 (2026-06-11) — with reasons - -The RV7e-RV10 + observability sweep closed the review backlog; these four were -left undone **on purpose**, not missed: - -- **RV9 digest-based C4 check** — once a watermark exists, C4 reads the full - `messages` JSONB row and `stableStringify`-compares the whole prefix every - turn. The compare is already **correct** (RV1 landed); a content digest would - only make it cheaper. Pure optimization of a correct path → revisit only if the - per-turn read+stringify shows up in profiling, and fold it into any future - C4 rework rather than touching the correctness path now. -- **defect 7 — `content`-type tool output base64 → char/4** (`token-estimate.ts`). - The `content` tool-result variant `stableStringify`s media bytes into the - char/4 blob. Fixing it **symmetrically** (so estimate(UI) === estimate(Model) - still holds — the load-bearing P2/T1 invariant) requires extracting media into - `nonText` on BOTH adapters, where the UI side stores `output` as untyped - `unknown`. The risk to the tested invariant outweighs the benefit: **no current - tool emits `content`-type media**. Fix before the first tool that does. -- **`bytesFromUrl` vs storage/utils `parseDataUrl` duplication** — merging them - couples the estimator to the storage layer for zero behaviour change. Left as - two small private regexes. -- **`estimate_vs_real.divergence` metric** (drift T2 feedback loop) — deferred - with the image-constant tuning work it feeds; still log-only. -- **Trigger estimator scope — FIXED (drift C1).** Originally flagged from live - test 2026-06-03; confirmed unfixed in chunk 2 by the 2026-06-09 review. **Both - prescribed paths now landed:** - 1. **DONE 2026-06-10** — `estimateOverheadTokens` adds the system prompt + tool - schemas to the projection (`projectTier1Tokens`) and subtracts them from the - compaction target (the ~986-vs-8888 gap was dominated by tool schemas). - 2. 
**DONE 2026-06-11** — the ADR-prescribed prior-turn provider baseline is - wired: `prepareChatTurn` threads `lastInputTokens` from the last assistant - message's `metadata.stats.contextTokens`; `projectTier1Tokens` returns - `max(charBased, lastInputTokens)` so turns ≥ 2 are floored by the real - provider count instead of trusting char/4. - - The Qwen3.6 / vLLM under-count (provider 8888 vs estimate ~986) is closed: the - projection now sees both the tool-schema overhead and the prior-turn provider - count, so it no longer blows past the trigger silently. - ---- - -## Drift log & code-review checklist - -Every issue found across 4 review rounds, the resolution, and **the exact thing -to re-verify once the code exists.** Round trajectory: R1 design holes → R2 -second-order effects → R3 a third-order race → R4 zero correctness findings (one -telemetry-gated note). This is the anti-regression list — check it at PR time. - -| ID | Issue | Resolution | ✅ Verify in code | -| ------- | ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **C1** | Trigger only counted last response, not what this turn adds | `projected = lastInputTokens + estimate(newMsgs)` | Trigger sums unsummarized new messages, not just last `usage` | -| **C2** | Compacting to the trigger ratio re-fires next turn (Cline #5616 thrash) | Hysteresis: target 0.5 ≠ trigger 0.8. 
Chunk-12 removed the schema fields + `resolveCompactionConfig` clamp; the runtime `targetRatio→triggerRatio*0.9` clamp now lives in `buildCompactionRuntime` after the env overrides (B-F1). | Post-compaction output `<= targetTokens`; a follow-up turn does not re-compact; an inverted `COMPACTION_TARGET_RATIO`/`COMPACTION_TRIGGER_RATIO` env pair is clamped (warn) not honored | -| **C3** | Raw window ratio ignored output + safety headroom | `inputBudget = window − maxOutputReserve − safetyReserve` | Budget subtracts both reserves before ratios | -| **C4** | Edit/delete/regenerate below watermark → stale summary | Invalidate via `writeWatermark`: version bump + clear summary + reset watermark | Every edit/delete/regenerate handler calls `writeWatermark`; forking below watermark resets on new branch | -| **M1** | Cold-start on huge imported history exceeds summarizer's own window | Chunked / map-reduce summarize | Prefix larger than summarizer window is chunked, not sent whole | -| **M2** | First-turn char/4 underestimate | `× 1.15` margin + recovery net | First-turn projection applies the margin | -| **M3** | "Both tiers apply to sub-agents" was wrong | Sub-agents = Tier 2 only (no durable history) | Sub-agent path wires Tier 2 + window only, no Tier 1 | -| **T1** | Tier 1 (UIMessage) and Tier 2 (ModelMessage) measured by different estimators → divergence | **One** `estimateTokens` over `CountUnit[]`, two adapters; `MODEL_BOUND` filter excludes UI-only parts both sides | No second estimator exists; equality test passes exactly on filtered set; UI-only parts (reasoning/source/step/data) never counted | -| **T2** | char/4 on base64 images is meaningless; ordering vs inline unclear | Modality table (anthropic/openai/default), header-parse dims w/ constant fallback, detail→high; estimate AFTER `inlineFileUrls`; divergence `log.warn` | No char/4 on image bytes; Tier 1 runs post-inline; missing dims → 1200; turn-2 divergence logged | -| **T3** | Recovery compaction vs "single 
durable writer"; finalize-mid-error ambiguous | Recovery does in-memory trim via `compactModelMessages` + sets persisted `compactionDirty`; durable write on NEXT `prepareChatTurn` only | Recovery never writes summary/watermark directly; `compactionDirty` is a DB column; recovery trim calls the Tier 2 adapter, not a bespoke trim | -| **T4** | litellm registry keys don't match our model IDs | Normalization chain + alias map + `log.warn` on MISS | Lookup tries exact→strip-prefix→lower→alias→family; Bedrock ARN / Azure resolve or log a miss | -| **T5** | Window cache stale after override edit | `cache.evict(providerId)` in provider PATCH, immediate | Editing modelMeta busts cache without waiting TTL | -| **T6** | 8192 default silently over-compacts | `log.warn` on default; ring renders **neutral**, no false ramp | Fall-to-default is logged; ring is grey/no-% when window unknown | -| **T7** | `taskModelId` may be unset | Fallback `taskModelId → main`; log model + cost | Summarizer falls back to main model; no crash on unset | -| **T8** | char/4 underestimates CJK/JSON | Accepted; margin + real-usage handoff + recovery net | (No code; documented as text-only heuristic) | -| **T9** | One synthetic 400 doesn't cover per-provider error bodies | Fixture set: OpenAI / Anthropic / Google-vLLM | `isContextOverflowError` matrix tests real per-provider phrasings | -| **T10** | CAS rejects stale write but loser behavior undefined → livelock risk | Re-read; if winner advanced → SKIP; else retry once then SKIP. NOTE: the covered-skip deliberately does NOT clear dirty (B-F6) — a concurrent `invalidateCompaction` also advances the version but leaves dirty set on purpose, so clearing it on a covered skip could drop a forced compaction. Leaving dirty is strictly safe (≤1 extra compaction next turn). 
| Loser never recompute-loops; terminal state is skip; decides by version; covered-skip leaves dirty for the next turn | -| **R1** | Loser-skip assumed monotonic watermark; C4 reset moves it backward → stale write back door | All writes (advance/reset/dirty) through one versioned CAS; loser compares **version** not watermark value | Single `writeWatermark`; invalidation bumps version; no path mutates these fields outside it | -| **U1** | Ring showed previous model's window after a model switch | Resolve window from **selected** model, not last-message metadata | Ring reads selected-model window from `modelMeta`, refreshes on switch | -| **U2** | Ring lags pending composer input | Required tooltip label "current input not yet counted"; arc deferred | Tooltip text present and unmistakable | -| **U3** | Forced-compact confirm too soft | Confirm default-ON when drop significant (`>keepRecent` or `>30%`) | Threshold confirm wired; (P1: not destructive anyway) | -| **U4** | No feedback for defer-while-streaming click | Pending badge + disabled ring + "will compact on finish" tooltip | Ring disables + shows pending state between click and finish | -| **R4** | CAS read→summarize→write window wastes summarize under contention | Accepted, **not fixed**; gated on `cas.conflict` metric | `cas.conflict` metric emitted; no premature lock added | -| **P1** | (principle) compaction misread as data loss | View-not-delete: raw messages persist | No code path hard-deletes a summarized message | - ---- - -## Appendix: prior art & review - -### Prior art (open-source tools surveyed) - -| Tool | Strategy | Window source | Threshold | Pitfall | -| ------------ | ---------------------------------------- | --------------------------------------------- | ------------------------------------------------------- | ------------------------------------------------ | -| Open WebUI | none (BYO filter), errors out | `num_ctx` (Ollama only) | n/a | silent overspend on API providers | -| LibreChat | 
**both**: prune tool results → summarize | `maxContextTokens` (yaml) | trigger on prune; `reserveRatio` 0.05 | ignored on some endpoints | -| LangGraph | `trim_messages` vs `SummarizationNode` | you supply | you set | trim breaks tool pairs | -| llama.cpp | context-shift or HTTP 400 | `--ctx-size`, `--keep N` | off by default | infinite shift loop / hard 400 | -| Ollama | silent clip | `num_ctx` (default 2048) | clips | silent token loss | -| Cline | **summarize at %** | reads window − model buffer | `autoCondenseThreshold` (0-1) | **thrash (cline #5616)** | -| Claude Code | **summarize at %** | reads window, live meter | ~83.5%, `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE` | ~33k buffer reserved for compact | -| Codex CLI | prune-from-memory → summarize | `ctx − min(maxOut,20k)` | `effective − 13k`; **hard 90% cap** | freezes near threshold (#19116) | -| Hermes Agent | prune tool results → summarize | provider metadata + `context_length` override | **0.50** primary + **0.85** hygiene net + 400-msg valve | token **floor** breaks sub-floor models (#14690) | -| OpenRouter | middle-out truncate (gateway) | `/models.context_length` | on overflow | drops middle silently | -| litellm | `trim_messages` (trim_ratio 0.75) | **model registry JSON** | ratio | orphaned tool-call msgs | - -Borrowed: litellm registry (§A), `reserveRatio` headroom (§C), prune-then-summarize -staging (§C), hysteresis vs thrash (§C), fail-loud event (§C), live usage meter (§H). 
- -Sources: Open WebUI context-window docs + discussions #4983/#6402; LibreChat -summarization/model_specs/token_usage docs; LangGraph add-memory docs; llama.cpp -server README + issues #17284/#3969; Cline auto-compact docs + issue #5616; -litellm token_usage/message_trimming docs + `model_prices_and_context_window.json`; -OpenRouter models + message-transforms docs; vLLM engine args; Codex CLI compaction -docs + issues #11805/#19116; Claude Code auto-compact env override + issue #41818; -Hermes Agent context-compression docs + issues #12626/#14690. - -### Field re-survey (2026-06-12) — verified complete vs Hermes / Codex / Claude Code / Cline - -Re-checked the shipped implementation against the four agents above. **Every -"gap" the survey suggested is already implemented** — the design is at or ahead -of the field: - -- **Real provider-token feeding** — already done (C1). `projectTier1Tokens` - returns `max(charBased, lastInputTokens)`; the estimate is the cold-start - fallback (turn 1, no prior `usage`) and the stale-tail top-up, not a - replacement. Strictly safer than Hermes (which only falls back to estimate). -- **Input-tokens-only window** — already done (F1). `windowFromRegistryEntry` - trusts only `max_input_tokens`, never litellm's `max_tokens` (the output cap). -- **Reserve carve / "90% cap" equivalent** — already done (C3). The trigger is a - fraction of `inputBudget = window − maxOutputReserve − safetyReserve`, so even - `triggerRatio = 1.0` (the schema max) fires below the raw window. Codex's hard - 90% clamp is structural here, not a separate clamp. -- **Two-layer (proactive + always-on recovery)** — P4; matches Hermes - primary+hygiene and Claude's auto-compact+buffer. - -**The token-FLOOR anti-pattern (Hermes #14690) — explicitly DO NOT copy.** -Hermes clamps the _trigger_ up with `max(ctx·pct, 64000)`, which exceeds the real -window on any sub-64k model → compaction never fires → silent overflow. 
Our -fraction-of-`inputBudget` trigger is inherently safe at any window size; a floor -belongs only on the _window fallback_ (`detected ?? DEFAULT`, never -`max(detected, FLOOR)`), which §A already does. - -**Genuinely-absent, deferred (optional — not bugs):** - -- **Message-count force-compact valve** (Hermes 400-msg `hygiene_hard_message_limit`). - A count-based backstop independent of the token estimate — catches a blown - estimator that the recovery net would otherwise have to absorb. Cheap; consider - if the `estimate_vs_real.divergence` signal ever shows the estimator drifting. -- **`maxOutputReserve` floor when max-output is unknown.** `computeBudget` reserves - `maxOutputTokens ?? min(4096, 0.25·ctx)`; a reasoning model with a large real - output but no resolved `max_output_tokens` could under-reserve. Recovery covers - it today; revisit only if overflow-on-output shows up for such a model. -- **Model-aware aggressiveness** (Cline trims 75% on small windows vs 50%). Marginal; - our fixed `targetRatio` is adequate. Deferred. - -### Review change log (applied to this doc) - -- **C1–C4, M1–M3** — see drift table. -- **A** litellm registry replaces homegrown lookup table. -- **T1–T10, R1, R4, U1–U4** — round 2-4 findings, see drift table. -- **P1–P4** — design principles extracted from the review consensus. -- Added: prune-before-summarize Stage 1, fail-loud `context-compacted` event, - Observability section, global kill switch, lazy-rollout note, ADR (queued: - `docs/adr/NNNN-context-compaction.md` capturing the _why_ — two tiers, - view-not-delete, CAS-on-version, char/4-not-tokenizer). diff --git a/docs/adr/0009-context-compaction.md b/docs/adr/0009-context-compaction.md deleted file mode 100644 index 07a26081..00000000 --- a/docs/adr/0009-context-compaction.md +++ /dev/null @@ -1,136 +0,0 @@ ---- -status: proposed ---- - -# Chat Context Compaction - -Chats hard-fail when message history exceeds a model's context window. 
This ADR -records **how** we decided to keep them alive and **why** the obvious simpler -options were rejected. The implementation spec (the _how_) lives in -`context-compaction-plan.md`; this ADR is the _why_ and the gate that -implementation answers to. - -Status is **proposed** until step 1 (window resolution + estimator + schema) -confirms the foundation holds; promote to **accepted** then. If implementation -forces a different choice, supersede with a new ADR rather than editing this one. - -## Decision - -A **two-tier, view-not-delete** compaction model, fed by a **single token -estimator**, with all durable state mutated through a **single versioned CAS -writer**, and an always-on **recovery net** for overflow errors the proactive -path misses. - -### Two tiers, not one - -- **Tier 1 — cross-turn, durable.** Runs in `prepareChatTurn` before a response. - Summarizes/prunes old history, persists a summary + watermark. Owns durable - state. -- **Tier 2 — intra-turn, throwaway.** Runs in the AI SDK `prepareStep` hook to - keep a single heavy response (many tool/sub-agent calls) executable mid-loop. - Not persisted — the SDK's canonical message list commits to history as normal, - and next turn Tier 1 folds it into the durable summary. - -One tier cannot cover both cases: a single response can blow the window without -any cross-turn history growth (Tier 2's job), and durable history must be -compacted before a turn even starts (Tier 1's job). Sub-agents — which start -fresh each invocation with no cross-turn history — therefore use **Tier 2 only**. - -### Compaction is a view, not a delete - -The watermark + summary change _what is sent to the model_, never _what is -stored_. Raw messages persist in the DB untouched. 
This makes forced/automatic -compaction non-destructive in the data sense (a user can still read full -history; a future "expand summary" UI is free), and reduces "irreversible data -loss" objections to a UX-courtesy confirmation rather than a correctness -concern. - -### One estimator - -Token counting lives in exactly one function over one neutral structure -(`CountUnit[]`). Tier 1 (UIMessages) and Tier 2 (ModelMessages) both normalize -into it, counting only model-bound parts. Divergence between the two tiers is -impossible by construction rather than monitored — a tier cannot fire on a -number the other never sees. - -### One durable writer (versioned CAS) - -All mutations of compaction state (`summaryWatermark`, `contextSummary`, -`compactionDirty`) go through a single compare-and-swap function keyed on a -`version` column. Concurrent runs on one chat (e.g. a trigger run and a user -run), and the interaction between compaction and history-edit invalidation, are -resolved by **version**, not by comparing watermark values — so a watermark that -moves _backward_ on an edit cannot be misread as "not yet advanced" and produce a -stale summary over mutated history. - -### Recovery is the net, not the plan - -A `400/413` context-overflow error is caught, the messages aggressively trimmed -in-memory (via the same Tier 2 adapter), and the call retried **once**. Recovery -never writes durable state directly — it flags `compactionDirty`, and the next -turn's Tier 1 does the durable compaction. Recovery stays on even when proactive -compaction is globally disabled; it is the last line of defense, not a risk -surface. - -### Char/4 estimate, not a real tokenizer - -Pre-call token counting uses a `char/4` heuristic (text parts only; a modality -table for images) on the **first turn only**. Every later turn uses the -provider-reported real `usage.inputTokens`. 
We accept first-turn imprecision -(guarded by a 1.15 margin and the recovery net) rather than ship a per-provider -tokenizer dependency. - -### Window source: API → litellm registry → default - -Resolve the context window per-model: manual override → provider API auto-detect -(Google/OpenRouter/vLLM expose it) → the community-maintained litellm registry -JSON (covers OpenAI/Anthropic/Bedrock, which don't) → a conservative 8192 -default. We do **not** maintain our own context-window table. - -## Considered Options - -- **Single-tier compaction (cross-turn only)** — rejected. Cannot rescue a - single response whose own tool loop overflows the window; would force the whole - response to fail even though durable history was fine. -- **One estimator per tier** — rejected. Two estimators over two message shapes - drift; one tier ends up firing on a count the other never computes, making - contention and threshold bugs undebuggable. Collapsed to one estimator + two - adapters. -- **Hard-delete / truncate old messages** — rejected. Irreversible, and the - "drops the middle silently" failure mode seen in gateway truncation (OpenRouter) - and silent clipping (Ollama). View-not-delete keeps the data and makes the - action auditable (a visible `context-compacted` event). -- **Homegrown context-window lookup table** — rejected. Unmaintainable across - providers and model churn; the litellm registry is the industry "don't maintain - your own table" answer (AnythingLLM dropped its hardcoded table for it). -- **A real pre-call tokenizer** — rejected for v1. Per-provider tokenizers are a - heavy dependency for a number that the provider returns accurately after the - first call anyway. -- **Optimistic-without-version concurrency (compare watermark values)** — - rejected. Breaks when history-edit invalidation moves the watermark backward; - the versioned CAS removes the monotonicity assumption entirely. 
-- **Compacting to the trigger threshold** — rejected; it re-fires every turn - (the Cline #5616 thrash). Trigger and target ratios are deliberately distinct - (0.8 vs 0.5 of the input budget) for hysteresis. - -## Consequences - -- **Schema additions.** `provider.modelMeta` (JSONB, per-model window/output - overrides); chat/run gain `contextSummary`, `summaryWatermark`, - `compactionDirty`, and `version`. All additive nullable columns. -- **Lazy rollout, no backfill.** Existing chats compact only on their next turn; - no eager backfill job (it would create a thundering herd of summarize calls). -- **A summarize call costs money and latency.** Stage 1 prunes without a model - call first; Stage 2 summarizes only when pruning is insufficient, using the task - model (falling back to the main model). -- **First-turn token estimates are imprecise**, especially for - image-heavy/CJK/JSON content; the recovery net absorbs the misses and a - divergence metric tunes the image constants over time. -- **A global `COMPACTION_ENABLED` kill switch** disables proactive compaction in - prod without a deploy; recovery is unaffected. -- **Observability is part of the contract** — compaction/recovery/CAS-conflict - metrics gate the two deferred optimizations (CAS contention, projected-input - ring arc); without the metrics those decisions are guesses. -- **Frontend gains a context-usage ring** (window resolved from the _selected_ - model, neutral when unknown) and a per-message stats popover, reusing the - existing tool-call timing mechanism. diff --git a/docs/adr/0012-context-compaction.md b/docs/adr/0012-context-compaction.md new file mode 100644 index 00000000..d06cb26e --- /dev/null +++ b/docs/adr/0012-context-compaction.md @@ -0,0 +1,385 @@ +--- +status: accepted +--- + +# Chat Context Compaction + +Chats hard-fail when message history exceeds a model's context window. 
This ADR +records the design we shipped to keep them alive, **why** the obvious simpler +options were rejected, and the named parts the implementation refers back to. + +It is self-contained: every decision, mechanism, and trade-off the code cites +lives in a section below. Code comments reference this ADR by section name (e.g. +_"ADR-0012 §Tier 1"_, _"ADR-0012 §Summary invalidation"_) rather than by any +external plan or chunk number. + +If a future change forces a different choice, supersede with a new ADR rather +than editing this one. + +## Context + +The AI SDK (`ai@6`) reports real token usage **after** each call +(`usage.inputTokens`/`outputTokens`/`totalTokens`) but exposes **no** +context-window metadata on the model interface and **no** pre-call tokenizer. +Providers diverge on whether the window is discoverable: Google +(`inputTokenLimit`), OpenRouter (`context_length`), and vLLM/OpenAI-compatible +(`max_model_len`) expose it via API; OpenAI, Anthropic, and Bedrock do not. +Error handling previously covered only auth/rate-limit/5xx — a context-overflow +rejection killed the turn. Top-level chats and sub-agents both run through the +shared `agent-runner`/`ToolLoopAgent`, so one implementation covers both. + +## Decision + +A **two-tier, view-not-delete** compaction model, fed by a **single token +estimator**, with all durable state mutated through a **single versioned CAS +writer**, an always-on **recovery net** for overflow errors the proactive path +misses, and a deterministic **context-editing** pass that prunes stale bulky +tool results without a model call. + +## Principles (load-bearing) + +### View, not delete + +The watermark + summary change _what is sent to the model_, never _what is +stored_. Raw messages persist in the DB untouched. 
Forced/automatic compaction +is therefore non-destructive in the data sense — a user can still read full +history; a future "expand summary" UI is free — which reduces "irreversible data +loss" objections to a UX-courtesy confirmation rather than a correctness concern. +Never hard-delete a summarized message. + +### One estimator + +Token counting lives in exactly one function over one neutral structure +(`CountUnit[]`). Tier 1 (UIMessages) and Tier 2 (ModelMessages) both normalize +into it, counting only **model-bound** parts (`text`, `tool-call`, +`tool-result`, `file`, `image`). UI-only parts (`reasoning`, `source`, +`step-start`, `data-*`) are excluded on both sides. Divergence between the tiers +is impossible by construction rather than monitored — a tier cannot fire on a +number the other never sees. + +### One durable writer + +All mutations of compaction state (`summaryWatermark`, `contextSummary`, +`compactionDirty`) go through a single compare-and-swap function keyed on a +`version` column. Concurrent runs on one chat (e.g. a trigger run and a user +run), and the interaction between compaction and history-edit invalidation, are +resolved by **version**, not by comparing watermark values — so a watermark that +moves _backward_ on an edit cannot be misread as "not yet advanced" and produce a +stale summary over mutated history. On a CAS conflict the loser re-reads the row: +if the winner already covered its prefix it **skips** (safe no-op); otherwise it +retries **once**, then skips with a contended warning. No recompute-loop, no +livelock. A covered-skip deliberately does **not** clear `compactionDirty`: a +concurrent invalidation also bumps the version but intentionally leaves dirty set +(it resets the summary without shrinking history), so clearing dirty on a skip +could drop a forced compaction the overflow demanded. Leaving it set is strictly +safe — worst case is one extra compaction next turn. 
+ +### Recovery is the net + +A `400/413` context-overflow error is caught, the messages aggressively trimmed +in-memory (via the same Tier 2 adapter — no bespoke trimmer), and the call +retried **once**. Recovery never writes durable summary/watermark state directly — +it flags `compactionDirty` on detection (before the retry outcome), and the next +turn's Tier 1 does the durable compaction. Recovery stays on even when proactive +compaction is globally disabled; it is the last line of defense, not a risk +surface. + +## Mechanisms + +### Window resolution + +`resolveContextWindow(provider, modelId)` resolves per-model in order: manual +override (`provider.modelMeta`) → provider API auto-detect (Google / OpenRouter / +vLLM) → the community-maintained **litellm registry** JSON (covers +OpenAI/Anthropic/Bedrock, which don't expose it) → a conservative `8192` default. +We do **not** maintain our own context-window table. + +- **Key normalization.** Registry keys don't match `resolvedModelId` 1:1. Lookup + order: `exact → strip provider prefix → lowercase → alias map → family +heuristic → MISS`. The family heuristic uses boundary-safe separators + (`"-"`, `"."`, `":"`, `"/"`) so `gpt-4.5-preview` never resolves via a stale + `gpt-4` entry. Every MISS warns (it falls to default — must be visible). +- **Caching & eviction.** Results cache in-memory per provider+model with a TTL. + Editing a `modelMeta` override **immediately** evicts (`evict(providerId)`) in + the provider PUT handler — TTL is only a backstop. `source:"default"` results + use a short TTL (60 s) so a registry miss or transient API blip doesn't pin + 8192 for the full hour. API fetches use a 5 s timeout and single-flight + (`#inflight`) to avoid a cold-cache stampede. +- When the window is default/unknown the ring renders **neutral**, never a + guessed ramp. `maxOutputTokens` is resolved the same way (needed for the budget + math). 
+ +### Token estimation + +Char/4 over **text parts only** (never char/4 a base64 image); a modality table +sizes non-text parts (`anthropic`/`openai`/`default` constants, dimensions from a +cheap PNG/JPEG header parse when bytes are in hand). It is the **sole** signal only on the +first turn, before any provider `usage` exists; every later turn takes the larger of +it and the provider-reported real `usage.inputTokens` (see §Tier 1, trigger projection). We accept first-turn imprecision +(guarded by a 1.15 cold-start margin and the recovery net) rather than ship a +per-provider tokenizer. The Tier 1 estimate runs **after** file inlining so the +payload counted is the real one. Where image `detail` is unset (the common case) +we assume `high` — **over-counting beats overflow**. A turn-2 divergence check is +a designed-in feedback hook: compare the cold-start estimate against the real +`usage.inputTokens` and warn when they diverge by >50%, to tune the image +constants over time (currently log-only). + +### Tier 1 — cross-turn compaction (durable) + +Runs in `prepareChatTurn` before a response, over durable history (UIMessages). + +- **Budget math.** Trigger and target are fractions of the **input** budget, not + the raw window: `inputBudget = window − maxOutputReserve − safetyReserve` + (safety = `reserveRatio × window`, default 0.05). `triggerTokens = 0.8 × +inputBudget`, `targetTokens = 0.5 × inputBudget`. Per-turn **overhead** (system + prompt + tool schemas + skill list) is counted toward the trigger and + subtracted from the effective target, since it consumes the same window but is + invisible to a message-only estimate. When `maxOutputTokens` is unknown the + output reserve falls back to `min(4096, 0.25 × window)`. +- **Trigger projection.** `projected = max(charBasedEstimate, lastInputTokens)` + where `lastInputTokens` is the prior turn's provider-reported + `usage.inputTokens` (threaded from the last assistant message's + `metadata.stats.contextTokens`).
The cold-start ×1.15 margin applies only on + turn 1 when no provider baseline exists. Compact when `projected ≥ trigger`. +- **Hysteresis.** Compaction must reduce the conversation to `≤ targetTokens`, + well below the trigger, so it does **not** re-fire next turn. The trigger (0.8) + and target (0.5) ratios are deliberately distinct. Config is global/env-only + (no submitted schema to validate), so the runtime clamps `target → trigger × +0.9` when an operator sets `COMPACTION_TARGET_RATIO ≥ COMPACTION_TRIGGER_RATIO`. +- **Staged, cheap-first.** Stage 1 **prunes** the older prefix without a model + call (soft-trim bulky tool/RAG results to head+tail, then placeholder over + `minPrunableChars`); only if still above target does Stage 2 **summarize** the + prefix into one synthetic summary message. Tool-call/result pairs are atomic and + never split across the keep boundary. Output: `[system, summary, …kept recent]`. + A visible `context-compacted` event makes it fail-loud. +- **Summarizer model & map-reduce.** Summarize uses the task model + (`taskModelId`), falling back to the main model; same-provider only. When the + prefix exceeds the summarizer's own window it is chunked and map-reduced (a + large cold-start/imported history can't be sent whole). Summarization is + **incremental**: each turn only the messages _after_ the watermark are + summarized and folded into the existing summary, then the watermark advances. +- **Summary invalidation.** If a message at/below `summaryWatermark` is + edited/deleted/regenerated the summary is stale. The handler bumps version + + clears `contextSummary` + resets the watermark in one CAS write. Because the CAS + loser compares **version** (not watermark value), a compaction racing an + invalidation sees a conflict and re-reads the reset state — it can never write a + stale summary over mutated history. 
The invalidation compares the **un-inlined** + submission (file URLs match on both sides) with stable key ordering (jsonb is + re-ordered by Postgres), against the pre-overwrite DB snapshot loaded before the + sink overwrites the row. + +### Tier 2 — intra-turn compaction (in-memory) + +For a single heavy response (many tool/sub-agent calls) that bloats the window +mid-loop. Runs in the AI SDK `prepareStep` hook on both `streamText` and +`generateText`, over ModelMessages, summarizing old completed tool results while +keeping recent steps verbatim and preserving call/result pairing. Fires **only +when genuinely near the limit** (no per-step overhead on a small loop). **Not +persisted** — the SDK's canonical message list commits to history as normal, and +next turn Tier 1 folds it into the durable summary. One tier cannot cover both +cases: a single response can blow the window without any cross-turn growth (Tier +2's job), and durable history must be compacted before a turn starts (Tier 1's +job). + +### Recovery + +`isContextOverflowError` matches `APICallError` with status `400/413` and a +per-provider body regex (OpenAI/vLLM, Anthropic, Google, Bedrock — fixture-tested +matrix). The recovery middleware wraps the model in **both** `streamText` and +`generateText`, so every step of a tool loop gets detect → flag `compactionDirty` +(persisted on detection, via the durable writer) → trim via the **same Tier 2 +adapter** (system head pinned, keep-recent halved with a floor of 2, forced past +the estimate gate since the provider already rejected the prompt) → retry once. A +second failure surfaces "Conversation too large — start a new chat". Durable +compaction happens on the **next** `prepareChatTurn`, which sees the dirty flag. +Headless runs (no chat row) still get the in-memory trim + retry, but cannot flag +`compactionDirty` — there is nothing to persist to. 
+ +### Sub-agents + +Sub-agents start fresh each invocation (only a `task` string, no cross-turn +history), so they have nothing for Tier 1 to compact — they use **Tier 2 only**. +Each resolves its own model's window/output and passes it through; recovery covers +them too because `agent-runner` is shared. + +### Config & kill switch + +Compaction behavior is **global** (`DEFAULT_COMPACTION_CONFIG`); only window/output +**size** is per-model (via §Window resolution / `provider.modelMeta`). Per-agent +tuning was shipped and then removed — no surveyed tool (Hermes/Codex/Claude +Code/Cline) exposes per-agent knobs, and the ratios self-normalize to the model +window, so per-agent variance bought nothing measurable. The env +`COMPACTION_ENABLED` (default true) disables **all proactive** compaction (Tier 1 and +Tier 2) in prod without a deploy; **recovery ignores it**. A single-agent +opt-out, if ever needed, would be per-model or per-workspace — not per-agent. + +### Context-usage ring + +The frontend shows a small SVG ring next to the model selector, fill = +`usedTokens / contextWindow`, ramping green → amber (≥0.7) → red (≥0.9), and +**neutral grey when the window is unknown/default**. The window comes from the +**currently selected model** (not the last assistant message's metadata, else it +shows the previous model's window after a switch); the numerator is the last +response's `contextTokens` (the **last step's** `usage.inputTokens` — peak context +fullness, not the run-wide sum, which would over-count multi-step loops). A +required tooltip states the ring reflects the last response, not the unsent +composer input. + +### Per-message stats + +An `(i)` action under each assistant response shows input/output tokens, TTFT, and +total generation time, reusing the existing tool-call timing mechanism. Stats are +stamped on `message.metadata.stats` at the `applyToolCompletions` point (the +`messageMetadata` callback fires at message start, before timing/usage exist).
+TTFT/total are server-measured; cost figures use the run-wide token sums. + +### Force-compact on demand + +The ring is clickable: `POST /chats/:id/compact` runs Tier 1 once **regardless of +threshold** (force), persists via the durable writer, and returns the post-compact +usage so the ring refreshes immediately. If a response is streaming the click is +**deferred** (pending badge, disabled, fires on finish); a confirm dialog appears +only when the drop is significant — `messagesDropped > keepRecentMessages` **or** +an estimated reduction `> 30%` of history — below that it runs immediately. Per +§View-not-delete this is not destructive regardless. + +### Compaction trace in the timeline + +Compaction is surfaced as a synthetic `compact_context` tool-call + tool-result +pair, reusing the existing tool-call UI (active spinner → collapsed expander with +a summary excerpt). The Tier 1 path injects the pair into the response stream; the +force-compact path (no live stream) persists a standalone synthetic assistant +message **above** the watermark. The trace part is **stripped before +`convertToModelMessages`** at both call sites so it never replays to the provider +as a phantom tool call; a trace-only message is dropped entirely. The trace is +emitted only when an actual model summary ran (not for prune-only or +dirty-within-target no-ops). + +### Stage 0 — context editing (prune, don't summarize) + +A deterministic, no-model-call pass that runs as **Stage 0 inside +`applyTier1Compaction`, before the trigger decision**, replacing the `output` of +**old bulky** tool results with a self-describing placeholder (names the tool + +elided size; tells the model to re-call). This mirrors Anthropic's +`clear_tool_uses` context editing. 
It keeps the tool-call block (pairing stays +valid), prunes by **recency count** of tool results (`keepRecentToolResults`, +default 4) above a **size gate** (`minEditableToolChars`, default 50 000), exempts +the newest message, and is idempotent + grow-guarded (never re-elides a +placeholder, never inflates a result smaller than the placeholder). Running it +before the trigger lets a lean view **avoid** summarization entirely (cheaper). +It needs **no durable state, no CAS, no version bump** — it is recomputed from raw +messages each turn, a sibling of the trace-stripping transform. Accepted fidelity +loss: an elided placeholder also flows into any prefix Stage 2 later summarizes +(a huge dump's head+tail is poor summary fodder anyway; raw stays in the DB). + +### Hard window wall (recent-trim gate) + +Missing the soft `targetTokens` is cheap (a hysteresis goal). The hard wall is +`inputBudget` (window − output reserve − safety). Recent (kept) messages are +trimmed **only** when `estimate(recent) + summary > inputBudget` (the call would +actually overflow), not on the soft-target miss — below the wall `recent` stays +full-fidelity and simply re-compacts next turn. The single newest message is +always exempt. A single result too large to fit _even as the newest_ is the +unsolved ingestion-cap case (an over-large dump as the last message will +hard-error) — out of scope here, would need an ingestion cap at storage time. + +### Summarizer hardening + +Tier-1 `summarize()` is a long blocking call inside `prepareChatTurn`, before the +response stream opens, and does not bump the per-step stall timer — the 120 s +watchdog once killed a slow summarize mid-call. 
Hardening: a **heartbeat** pings +`onActivity` (~every 10 s) so the watchdog keeps resetting; a `maxOutputTokens` +**ceiling** (≈4 000) backstops a degenerate runaway expansion (`finishReason === +"length"` is logged); the summarize call is **cancellable** (`abortSignal`); and a +**structured handoff prompt** (intent · decisions/facts · files/tools · current +state) with an explicit length target reduces loss across repeated +re-compactions. + +## Considered Options + +- **Single-tier compaction (cross-turn only)** — rejected. Cannot rescue a single + response whose own tool loop overflows the window. +- **One estimator per tier** — rejected. Two estimators over two message shapes + drift; collapsed to one estimator + two adapters. +- **Hard-delete / truncate old messages** — rejected. Irreversible, and the + "drops the middle silently" failure mode seen in gateway truncation. + View-not-delete keeps the data and makes the action auditable. +- **Homegrown context-window lookup table** — rejected. Unmaintainable across + providers; the litellm registry is the industry "don't maintain your own table" + answer. +- **A real pre-call tokenizer** — rejected for v1. A heavy per-provider dependency + for a number the provider returns accurately after the first call. +- **Optimistic concurrency by comparing watermark values** — rejected. Breaks when + invalidation moves the watermark backward; versioned CAS removes the + monotonicity assumption. +- **Compacting to the trigger threshold** — rejected; it re-fires every turn (the + thrash failure). Trigger and target ratios are distinct for hysteresis. +- **Per-agent compaction tuning** — shipped then removed; no real tool exposes it + and the ratios self-normalize to the model window. +- **Summarize-only (no context editing)** — insufficient. Bulky kept tool results + dominate `tokensAfter`; deterministic prune-not-summarize is cheaper and lossless + to the DB.
+- **A token FLOOR on the trigger** (`max(window × pct, 64000)`) — rejected as an + anti-pattern. It overflows sub-64k models (the trigger never fires). A floor + belongs only on the _window fallback_ (`detected ?? DEFAULT`), never as + `max(detected, FLOOR)`. +- **Sizing the window from litellm `max_tokens`** — rejected. Only + `max_input_tokens` is trusted; `max_tokens` is the output cap, not the context + window, and conflating them mis-sizes the budget. +- **A selectable compaction model in the provider UI** — rejected. Compaction + already runs through `taskModelId` on the chat provider's own client + (same-provider only); on a single-model provider a dropdown is a no-op, and + `workspace.taskModelProviderId` (which routes _other_ task work) deliberately + does not apply to compaction. Not worth the multi-provider wiring now. + +## Open / deferred decisions + +Consciously deferred, with rationale — recorded so the _why-not-yet_ isn't lost: + +- **CAS-contention optimization (the per-turn full-history read + stringify)** is + left unoptimized; the full-prefix compare is already correct, so this is gated on + the `cas.conflict` metric actually showing waste before it's touched. +- **The estimate-vs-real divergence metric** (see §Token estimation) stays log-only + until the image-constant tuning work is picked up. +- **Live `Pending → Running` compaction trace** is deferred; the trace renders + post-hoc "Completed" only. Run/connection decoupling already neutralizes the + data-loss vector, so this is a liveness/UX gap, not correctness. +- **Also deferred:** a message-count force-compact valve (a count-based backstop + independent of the token estimate), a projected-input arc on the ring, persisting + Tier 2 output, model-aware trim aggressiveness, and Anthropic's `count_tokens` for + exact Claude counts — none are needed for the "no more hard fails" goal. 
+- **Latent invariant — content-type tool-result media.** The `content`-variant + tool result currently serializes media bytes into the char/4 text blob. Fixing it + must be **symmetric** across both adapters (extract media into `nonText` on the UI + _and_ Model side) or it breaks the §One estimator equality — the load-bearing + invariant. No current tool emits content-type media; fix it **before** the first + one does, not after. + +## Consequences + +- **Schema additions.** `provider.modelMeta` (JSONB, per-model window/output + overrides); chat/run gain `contextSummary`, `summaryWatermark`, + `compactionDirty`, `version`. All additive, nullable/defaulted. +- **Lazy rollout, no backfill.** Existing chats compact only on their next turn; no + eager backfill job (it would create a thundering herd of summarize calls). +- **A summarize call costs money and latency.** Stage 0 (context editing) and Stage + 1 (prune) run first without a model call; Stage 2 summarizes only when needed. +- **First-turn token estimates are imprecise** (image-heavy/CJK/JSON); the recovery + net absorbs the misses. +- **Cross-tenant safety.** The submit route verifies the body `id` belongs to the + caller's workspace before a run starts — the compaction store is keyed by chat id + only, so an unvalidated id would otherwise let one workspace mutate another's + summary/watermark. +- **A global `COMPACTION_ENABLED` kill switch** disables proactive compaction in + prod without a deploy; recovery is unaffected. +- **Observability is part of the contract** — emitted as structured `metric:`-tagged + log lines: `compaction.fired`, `summarize.latency_ms`, `recovery.*`, + `context_window.fell_to_default`, `litellm.key_miss`, `cas.conflict`, + `context_edited`. +- **Frontend gains a context-usage ring** (window from the selected model, neutral + when unknown), a per-message stats popover, and a `compact_context` timeline + trace, all reusing the existing tool-call timing/rendering mechanism. 
+- **Unsolved: the single oversized newest result.** A tool result too large to fit + even as the newest message hard-errors; the fix is an ingestion cap at storage + time, out of scope here. diff --git a/packages/schemas/index.ts b/packages/schemas/index.ts index 7dbdcd5a..6ae33b3c 100644 --- a/packages/schemas/index.ts +++ b/packages/schemas/index.ts @@ -101,7 +101,7 @@ export const chatSchema = z.object({ seed: z.number().optional(), presencePenalty: z.number().optional(), frequencyPenalty: z.number().optional(), - // Context-compaction state (docs/adr/0009). Server-managed; intentionally NOT + // Context-compaction state (docs/adr/0012). Server-managed; intentionally NOT // part of chatSubmit/chatUpdate. summaryWatermark is the message id of the // last summarized message (P1: a view over history, never a delete). contextSummary: z.string().nullable().optional(), @@ -556,7 +556,7 @@ export const providerApiModeSchema = z.enum(["chat", "responses"]); export type ProviderApiMode = z.infer; -// Per-model context-window / output overrides (context-compaction-plan §A). +// Per-model context-window / output overrides (ADR-0012 §Window resolution). // Keyed by model id; both fields optional so an override can set just one. export const modelMetaEntrySchema = z.object({ contextWindow: z.number().int().positive().optional(), @@ -1547,16 +1547,16 @@ export const dashboardUpdateSchema = z.object({ mobileLayout: z.array(rglLayoutItemSchema).optional(), }); -// Message stats (context-compaction-plan §H/§I) +// Message stats (ADR-0012 §Context-usage ring / §Per-message stats) // Stamped on the last assistant message's metadata.stats after each stream run. -// Used by the frontend context-usage ring (§H) and per-message stats popover (§I). +// Used by the frontend context-usage ring and per-message stats popover. export const messageStatsSchema = z.object({ - // Run-wide totals across every step (sum) — for the §I cost popover. 
+ // Run-wide totals across every step (sum) — for the cost popover (§Per-message stats). inputTokens: z.number().nonnegative(), outputTokens: z.number().nonnegative(), - // Input tokens of the LAST model call = peak context fullness — for the §H - // ring. NOT the run-wide sum (which over-counts on multi-step tool loops). + // Input tokens of the LAST model call = peak context fullness — for the + // §Context-usage ring. NOT the run-wide sum (which over-counts on multi-step tool loops). contextTokens: z.number().nonnegative(), startedAt: z.string(), firstTokenAt: z.string().optional(), From ecff2f53e669d7baf32c55161353de344716575b Mon Sep 17 00:00:00 2001 From: "frantisek.spacek@morosystems.cz" Date: Mon, 15 Jun 2026 14:34:00 +0200 Subject: [PATCH 21/21] fix(backend): satisfy strict eslint type-aware rules in compaction code CI `pnpm lint` failed on the compaction branch under type-aware rules: - require-await: de-async test mocks + two prod helpers (loadBuiltinRegistry, the default loadRegistry slot) that never await; return Promise.resolve/reject to preserve behavior. - no-unnecessary-type-assertion: drop redundant `as` casts (autofix). - no-unsafe-* / no-explicit-any: type the captured mock-call args in chat-execution/sub-agent tests instead of leaking `any`. - no-useless-assignment: drop the dead `agent` local in the force-compact path. No behavior change. Lint, typecheck, and 203 affected tests all pass. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/backend/src/runs/agent-runner.test.ts | 2 +- apps/backend/src/runs/compaction.test.ts | 20 +++---- apps/backend/src/runs/compaction.ts | 2 +- apps/backend/src/runs/context-window.test.ts | 6 +-- apps/backend/src/runs/context-window.ts | 3 +- apps/backend/src/runs/litellm-registry.ts | 8 +-- apps/backend/src/runs/recovery.test.ts | 54 ++++++------------- apps/backend/src/runs/token-estimate.test.ts | 4 +- .../src/services/chat-execution.test.ts | 9 +++- apps/backend/src/services/chat-execution.ts | 6 +-- apps/backend/src/tools/sub-agent.test.ts | 4 +- 11 files changed, 50 insertions(+), 68 deletions(-) diff --git a/apps/backend/src/runs/agent-runner.test.ts b/apps/backend/src/runs/agent-runner.test.ts index 1fc2dd45..c7e73e00 100644 --- a/apps/backend/src/runs/agent-runner.test.ts +++ b/apps/backend/src/runs/agent-runner.test.ts @@ -174,7 +174,7 @@ const fakeTurn = (overrides?: { dispose?: () => Promise }) => { targetTokens: 1000, keepRecentMessages: 10, minPrunableChars: 2000, - summarize: async (t: string) => t, + summarize: (t: string) => Promise.resolve(t), }, tier2: null, dispose, diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts index 2321fdbc..49ff6e5d 100644 --- a/apps/backend/src/runs/compaction.test.ts +++ b/apps/backend/src/runs/compaction.test.ts @@ -41,23 +41,23 @@ class FakeStore implements CompactionStore { }; } - async readState() { - return { ...this.state }; + readState() { + return Promise.resolve({ ...this.state }); } - async casWrite( + casWrite( _chatId: string, expectVersion: number, patch: WatermarkPatch, - ) { + ): Promise { this.casCalls++; - if (this.state.version !== expectVersion) return false; + if (this.state.version !== expectVersion) return Promise.resolve(false); if ("watermark" in patch) this.state.summaryWatermark = patch.watermark ?? null; if ("summary" in patch) this.state.contextSummary = patch.summary ?? 
null; if ("dirty" in patch) this.state.compactionDirty = patch.dirty ?? false; this.state.version = expectVersion + 1; - return true; + return Promise.resolve(true); } } @@ -171,7 +171,7 @@ function uiText( role: "user" | "assistant", text: string, ): PlatypusUIMessage { - return { id, role, parts: [{ type: "text", text }] } as PlatypusUIMessage; + return { id, role, parts: [{ type: "text", text }] }; } function uiTool(id: string, output: unknown): PlatypusUIMessage { @@ -190,7 +190,7 @@ function uiTool(id: string, output: unknown): PlatypusUIMessage { } as unknown as PlatypusUIMessage; } -const noopSummarize = async () => "SUMMARY"; +const noopSummarize = () => Promise.resolve("SUMMARY"); describe("softTrim", () => { it("keeps short text untouched", () => { @@ -1294,7 +1294,7 @@ describe("applyTier1Compaction — Stage 0 avoids summarization (ADR-0012 §Stag ]; it("elides the old dump, drops under trigger, skips the model call", async () => { - const summarize = vi.fn(async () => "SUMMARY"); + const summarize = vi.fn(() => Promise.resolve("SUMMARY")); const out = await applyTier1Compaction({ chatId: "c", messages: messages(), @@ -1314,7 +1314,7 @@ describe("applyTier1Compaction — Stage 0 avoids summarization (ADR-0012 §Stag }); it("without context editing the same chat triggers summarization", async () => { - const summarize = vi.fn(async () => "SUMMARY"); + const summarize = vi.fn(() => Promise.resolve("SUMMARY")); const out = await applyTier1Compaction({ chatId: "c", messages: messages(), diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts index 74d543ea..bc13df15 100644 --- a/apps/backend/src/runs/compaction.ts +++ b/apps/backend/src/runs/compaction.ts @@ -1055,7 +1055,7 @@ export function summaryUIMessage(text: string): PlatypusUIMessage { parts: [ { type: "text", text: `[Summary of earlier conversation]\n${text}` }, ], - } as PlatypusUIMessage; + }; } /** Fail-loud event so the transcript shows compaction happened (ADR-0012 
§Tier 1). */ diff --git a/apps/backend/src/runs/context-window.test.ts b/apps/backend/src/runs/context-window.test.ts index 99b5bbba..27c63a9a 100644 --- a/apps/backend/src/runs/context-window.test.ts +++ b/apps/backend/src/runs/context-window.test.ts @@ -25,7 +25,7 @@ const REGISTRY: Registry = { "legacy-model": { max_tokens: 4096 }, }; -const loadRegistry = async () => REGISTRY; +const loadRegistry = () => Promise.resolve(REGISTRY); function resolver() { return new ContextWindowResolver({ loadRegistry }); @@ -260,9 +260,7 @@ describe("API auto-detect parsers", () => { describe("registry load failure (ADR-0012 §Window resolution)", () => { it("a throwing loader degrades to empty registry → default, no reject", async () => { const r = new ContextWindowResolver({ - loadRegistry: async () => { - throw new Error("bad vendored json"); - }, + loadRegistry: () => Promise.reject(new Error("bad vendored json")), }); const out = await r.resolve({ ...openai }, "gpt-4o"); expect(out.source).toBe("default"); diff --git a/apps/backend/src/runs/context-window.ts b/apps/backend/src/runs/context-window.ts index 2e698a0d..4e42f621 100644 --- a/apps/backend/src/runs/context-window.ts +++ b/apps/backend/src/runs/context-window.ts @@ -325,7 +325,8 @@ export class ContextWindowResolver { #now: () => number; constructor(deps: ResolverDeps = {}) { - this.#loadRegistry = deps.loadRegistry ?? (async () => ({})); + this.#loadRegistry = + deps.loadRegistry ?? ((): Promise => Promise.resolve({})); this.#aliasMap = deps.aliasMap ?? {}; this.#httpGetJson = deps.httpGetJson ?? defaultHttpGetJson; this.#ttlMs = deps.ttlMs ?? 
DEFAULT_CACHE_TTL_MS; diff --git a/apps/backend/src/runs/litellm-registry.ts b/apps/backend/src/runs/litellm-registry.ts index b260ffb2..8d217a75 100644 --- a/apps/backend/src/runs/litellm-registry.ts +++ b/apps/backend/src/runs/litellm-registry.ts @@ -339,8 +339,8 @@ const REGISTRY: Registry = { "Qwen/Qwen3-32B": { max_input_tokens: 131072, max_output_tokens: 8192 }, }; -/** Returns the built-in minimal registry. Async so the signature matches the - * injected `loadRegistry` slot and allows a future async fetch path. */ -export async function loadBuiltinRegistry(): Promise { - return REGISTRY; +/** Returns the built-in minimal registry. Returns a Promise so the signature + * matches the injected `loadRegistry` slot and allows a future async fetch path. */ +export function loadBuiltinRegistry(): Promise { + return Promise.resolve(REGISTRY); } diff --git a/apps/backend/src/runs/recovery.test.ts b/apps/backend/src/runs/recovery.test.ts index 87d658a6..6959c787 100644 --- a/apps/backend/src/runs/recovery.test.ts +++ b/apps/backend/src/runs/recovery.test.ts @@ -136,7 +136,7 @@ const ctx = (over: Partial = {}): RecoveryContext => ({ targetTokens: 100, keepRecentMessages: 4, // recovery halves this → keep 2 minPrunableChars: 2000, - summarize: async () => "RSUM", + summarize: () => Promise.resolve("RSUM"), ...over, }); @@ -149,10 +149,10 @@ const overflow = () => /** Fake V3 model capturing retry params. 
*/ const fakeModel = (result: unknown = "RETRIED", fail?: unknown) => { const calls: Array<{ prompt: PromptMsg[] }> = []; - const impl = async (params: { prompt: PromptMsg[] }) => { + const impl = (params: { prompt: PromptMsg[] }) => { calls.push(params); - if (fail) throw fail; - return result; + if (fail) return Promise.reject(fail); + return Promise.resolve(result); }; return { calls, model: { doGenerate: impl, doStream: impl } }; }; @@ -166,20 +166,16 @@ const runWrapGenerate = ( }, ) => (mw.wrapGenerate as (o: unknown) => Promise)({ - doStream: async () => { - throw new Error("unused"); - }, + doStream: () => Promise.reject(new Error("unused")), ...args, }); describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { it("trims via the shared compactor and retries exactly once on overflow", async () => { - const markDirty = vi.fn(async () => undefined); + const markDirty = vi.fn(() => Promise.resolve(undefined)); const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); const { calls, model } = fakeModel(); - const doGenerate = vi.fn(async () => { - throw overflow(); - }); + const doGenerate = vi.fn(() => Promise.reject(overflow())); const result = await runWrapGenerate(mw, { doGenerate, @@ -208,16 +204,14 @@ describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { }); it("propagates the second overflow — no infinite retry", async () => { - const markDirty = vi.fn(async () => undefined); + const markDirty = vi.fn(() => Promise.resolve(undefined)); const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); const second = overflow(); const { model } = fakeModel(undefined, second); await expect( runWrapGenerate(mw, { - doGenerate: async () => { - throw overflow(); - }, + doGenerate: () => Promise.reject(overflow()), params: { prompt: overflowPrompt() }, model, }), @@ -227,16 +221,14 @@ describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { }); it("rethrows non-overflow errors without retrying or 
flagging", async () => { - const markDirty = vi.fn(async () => undefined); + const markDirty = vi.fn(() => Promise.resolve(undefined)); const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); const { calls, model } = fakeModel(); const authError = apiError({ statusCode: 401, message: "bad key" }); await expect( runWrapGenerate(mw, { - doGenerate: async () => { - throw authError; - }, + doGenerate: () => Promise.reject(authError), params: { prompt: overflowPrompt() }, model, }), @@ -246,16 +238,12 @@ describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { }); it("still retries when persisting the dirty flag fails (best-effort)", async () => { - const markDirty = vi.fn(async () => { - throw new Error("db down"); - }); + const markDirty = vi.fn(() => Promise.reject(new Error("db down"))); const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); const { calls, model } = fakeModel(); const result = await runWrapGenerate(mw, { - doGenerate: async () => { - throw overflow(); - }, + doGenerate: () => Promise.reject(overflow()), params: { prompt: overflowPrompt() }, model, }); @@ -267,18 +255,14 @@ describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { const first = overflow(); const mw = contextOverflowRecoveryMiddleware( ctx({ - summarize: async () => { - throw new Error("summarizer down"); - }, + summarize: () => Promise.reject(new Error("summarizer down")), }), ); const { calls, model } = fakeModel(); await expect( runWrapGenerate(mw, { - doGenerate: async () => { - throw first; - }, + doGenerate: () => Promise.reject(first), params: { prompt: overflowPrompt() }, model, }), @@ -291,12 +275,8 @@ describe("contextOverflowRecoveryMiddleware (ADR-0012 §Recovery)", () => { const { calls, model } = fakeModel("STREAMED"); const result = await (mw.wrapStream as (o: unknown) => Promise)({ - doGenerate: async () => { - throw new Error("unused"); - }, - doStream: async () => { - throw overflow(); - }, + doGenerate: () => 
Promise.reject(new Error("unused")), + doStream: () => Promise.reject(overflow()), params: { prompt: overflowPrompt() }, model, }); diff --git a/apps/backend/src/runs/token-estimate.test.ts b/apps/backend/src/runs/token-estimate.test.ts index 710f4af5..73784bd4 100644 --- a/apps/backend/src/runs/token-estimate.test.ts +++ b/apps/backend/src/runs/token-estimate.test.ts @@ -144,7 +144,7 @@ describe("modality table (ADR-0012 §Token estimation — never char/4 an image) id: "m1", role: "user", parts: [{ type: "file", mediaType: "image/png", url: dataUrl(png) }], - } as PlatypusUIMessage, + }, ]; const tokens = estimateTokens(uiMessagesToCountUnits(ui, "anthropic")); // char/4 of the base64 data URL would be far larger than the table cost. @@ -279,7 +279,7 @@ describe("adapter equality (ADR-0012 §One estimator — one estimate across bot const model = await convertToModelMessages(ui); const uiTokens = estimateTokens( - uiMessagesToCountUnits(ui as PlatypusUIMessage[], "openai"), + uiMessagesToCountUnits(ui, "openai"), ); const modelTokens = estimateTokens( modelMessagesToCountUnits(model, "openai"), diff --git a/apps/backend/src/services/chat-execution.test.ts b/apps/backend/src/services/chat-execution.test.ts index b2844a92..7291ff93 100644 --- a/apps/backend/src/services/chat-execution.test.ts +++ b/apps/backend/src/services/chat-execution.test.ts @@ -668,7 +668,7 @@ describe("chat-execution", () => { const buildRuntime = (signal?: AbortSignal, onActivity?: () => void) => buildCompactionRuntime({ chatId: "chat-1", - provider: baseProvider as never, + provider: baseProvider, resolvedModelId: "gpt-4", opened: { languageModel: vi.fn(() => ({ modelId: "task-model" })), @@ -704,7 +704,12 @@ describe("chat-execution", () => { expect(out).toBe("SUMMARY"); expect(mockGenerateText).toHaveBeenCalledTimes(1); - const arg = mockGenerateText.mock.calls[0][0]; + const arg = mockGenerateText.mock.calls[0][0] as { + maxOutputTokens?: number; + abortSignal?: AbortSignal; + prompt?: 
string; + system: string; + }; expect(arg.maxOutputTokens).toBe(4000); expect(arg.abortSignal).toBe(controller.signal); expect(arg.prompt).toBe("history text"); diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 4feae0c0..20dea580 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -1667,7 +1667,6 @@ export async function forceCompactChat( // Shared resources and the ADR-0007 attachment gate). let provider: Provider; let resolvedModelId: string; - let agent: AgentRow | null = null; if (chatRow.agentId) { const agentRow = await drizzleChatTurnQueries.getAgent( @@ -1676,10 +1675,9 @@ export async function forceCompactChat( workspaceId, ); if (!agentRow) throw new NotFoundError("Agent not found"); - agent = agentRow; - resolvedModelId = agent.modelId; + resolvedModelId = agentRow.modelId; const providerRow = await drizzleChatTurnQueries.getProvider( - agent.providerId, + agentRow.providerId, orgId, workspaceId, ); diff --git a/apps/backend/src/tools/sub-agent.test.ts b/apps/backend/src/tools/sub-agent.test.ts index 26be1c6c..3ab227c9 100644 --- a/apps/backend/src/tools/sub-agent.test.ts +++ b/apps/backend/src/tools/sub-agent.test.ts @@ -45,9 +45,9 @@ function createMockFullStream( const { mockStream, MockToolLoopAgent, capturedSettings } = vi.hoisted(() => { const mockStream = vi.fn(); - const capturedSettings: any[] = []; + const capturedSettings: Record[] = []; class MockToolLoopAgent { - constructor(settings: any) { + constructor(settings: Record) { capturedSettings.push(settings); } stream = mockStream;