diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 0c728bab..b8197606 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -49,4 +49,21 @@ PLATYPUS_SANDBOX_DOCKER_ENABLED=false # PLATYPUS_SANDBOX_DOCKER_ALLOWED_NETWORKS=shared-services,public-tools # Frontend URL for generating resource links in tool responses -FRONTEND_URL=http://localhost:3001 \ No newline at end of file +FRONTEND_URL=http://localhost:3001 + +# Context compaction (ADR-0012 §Config & kill switch). +# Compaction behavior is global; window/output size stays per-model. +# COMPACTION_ENABLED=false disables proactive compaction (recovery still runs). +# COMPACTION_ENABLED=true +# +# Optional overrides for the global ceiling. Unset = built-in defaults +# (trigger 0.8, target 0.5, reserve 0.05, keepRecent 10, minPrunable 2000, +# minRecentPrunable 10000). +# Lower the trigger to exercise auto-compaction on test deployments. +# Keep target < trigger or compaction re-fires every turn. +# COMPACTION_TRIGGER_RATIO=0.8 +# COMPACTION_TARGET_RATIO=0.5 +# COMPACTION_RESERVE_RATIO=0.05 +# COMPACTION_KEEP_RECENT=10 +# COMPACTION_MIN_PRUNABLE_CHARS=2000 +# COMPACTION_MIN_RECENT_PRUNABLE_CHARS=10000 \ No newline at end of file diff --git a/apps/backend/drizzle/0046_context_compaction.sql b/apps/backend/drizzle/0046_context_compaction.sql new file mode 100644 index 00000000..573845c6 --- /dev/null +++ b/apps/backend/drizzle/0046_context_compaction.sql @@ -0,0 +1,5 @@ +ALTER TABLE "chat" ADD COLUMN "context_summary" text;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "summary_watermark" text;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "compaction_dirty" boolean DEFAULT false NOT NULL;--> statement-breakpoint +ALTER TABLE "chat" ADD COLUMN "version" integer DEFAULT 0 NOT NULL;--> statement-breakpoint +ALTER TABLE "provider" ADD COLUMN "model_meta" jsonb; diff --git a/apps/backend/drizzle/meta/0046_snapshot.json 
b/apps/backend/drizzle/meta/0046_snapshot.json new file mode 100644 index 00000000..cda086e7 --- /dev/null +++ b/apps/backend/drizzle/meta/0046_snapshot.json @@ -0,0 +1,4323 @@ +{ + "id": "c302529b-3427-4f45-a87d-6615109ab2eb", + "prevId": "668db7b6-9bad-46e6-b6bc-2533fce5ce32", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.agent": { + "name": "agent", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "system_prompt": { + "name": "system_prompt", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "model_id": { + "name": "model_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "max_steps": { + "name": "max_steps", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "temperature": { + "name": "temperature", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_p": { + "name": "top_p", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_k": { + "name": "top_k", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "seed": { + "name": "seed", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "presence_penalty": { + "name": "presence_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "frequency_penalty": { + "name": "frequency_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + 
"tool_set_ids": { + "name": "tool_set_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "skill_ids": { + "name": "skill_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "sub_agent_ids": { + "name": "sub_agent_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "input_placeholder": { + "name": "input_placeholder", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "avatar_key": { + "name": "avatar_key", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_agent_workspace_id": { + "name": "idx_agent_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_agent_organization_id": { + "name": "idx_agent_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_agent_provider_id": { + "name": "idx_agent_provider_id", + "columns": [ + { + "expression": "provider_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "agent_organization_id_organization_id_fk": { + "name": "agent_organization_id_organization_id_fk", + "tableFrom": "agent", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + 
"onDelete": "cascade", + "onUpdate": "no action" + }, + "agent_workspace_id_workspace_id_fk": { + "name": "agent_workspace_id_workspace_id_fk", + "tableFrom": "agent", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "agent_provider_id_provider_id_fk": { + "name": "agent_provider_id_provider_id_fk", + "tableFrom": "agent", + "tableTo": "provider", + "columnsFrom": [ + "provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_agent_name_org": { + "name": "unique_agent_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.attachment": { + "name": "attachment", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_type": { + "name": "resource_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_id": { + "name": "resource_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_attachment_workspace": { + "name": "idx_attachment_workspace", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "resource_type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_attachment_resource": { + "name": "idx_attachment_resource", + "columns": [ + { + "expression": 
"resource_type", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "resource_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "attachment_workspace_id_workspace_id_fk": { + "name": "attachment_workspace_id_workspace_id_fk", + "tableFrom": "attachment", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_attachment": { + "name": "unique_attachment", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "resource_type", + "resource_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.blueprint": { + "name": "blueprint", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "task_model_provider_id": { + "name": "task_model_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_extraction_provider_id": { + "name": "memory_extraction_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_embedding_provider_id": { + "name": "memory_embedding_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "context": { + "name": "context", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": 
"now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_blueprint_organization_id": { + "name": "idx_blueprint_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "blueprint_organization_id_organization_id_fk": { + "name": "blueprint_organization_id_organization_id_fk", + "tableFrom": "blueprint", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "blueprint_task_model_provider_id_provider_id_fk": { + "name": "blueprint_task_model_provider_id_provider_id_fk", + "tableFrom": "blueprint", + "tableTo": "provider", + "columnsFrom": [ + "task_model_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "blueprint_memory_extraction_provider_id_provider_id_fk": { + "name": "blueprint_memory_extraction_provider_id_provider_id_fk", + "tableFrom": "blueprint", + "tableTo": "provider", + "columnsFrom": [ + "memory_extraction_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "blueprint_memory_embedding_provider_id_provider_id_fk": { + "name": "blueprint_memory_embedding_provider_id_provider_id_fk", + "tableFrom": "blueprint", + "tableTo": "provider", + "columnsFrom": [ + "memory_embedding_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_blueprint_name_org": { + "name": "unique_blueprint_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + 
"isRLSEnabled": false + }, + "public.blueprint_item": { + "name": "blueprint_item", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "blueprint_id": { + "name": "blueprint_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_type": { + "name": "resource_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "resource_id": { + "name": "resource_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_blueprint_item_blueprint": { + "name": "idx_blueprint_item_blueprint", + "columns": [ + { + "expression": "blueprint_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_blueprint_item_resource": { + "name": "idx_blueprint_item_resource", + "columns": [ + { + "expression": "resource_type", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "resource_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "blueprint_item_blueprint_id_blueprint_id_fk": { + "name": "blueprint_item_blueprint_id_blueprint_id_fk", + "tableFrom": "blueprint_item", + "tableTo": "blueprint", + "columnsFrom": [ + "blueprint_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_blueprint_item": { + "name": "unique_blueprint_item", + "nullsNotDistinct": false, + "columns": [ + "blueprint_id", + "resource_type", + "resource_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.chat": { + "name": 
"chat", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "messages": { + "name": "messages", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'succeeded'" + }, + "is_pinned": { + "name": "is_pinned", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "tags": { + "name": "tags", + "type": "jsonb", + "primaryKey": false, + "notNull": false, + "default": "'[]'::jsonb" + }, + "agent_id": { + "name": "agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "model_id": { + "name": "model_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "system_prompt": { + "name": "system_prompt", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "temperature": { + "name": "temperature", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_p": { + "name": "top_p", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "top_k": { + "name": "top_k", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "seed": { + "name": "seed", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "presence_penalty": { + "name": "presence_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "frequency_penalty": { + "name": "frequency_penalty", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "context_summary": { + "name": "context_summary", + "type": "text", + "primaryKey": false, + "notNull": false + }, + 
"summary_watermark": { + "name": "summary_watermark", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "compaction_dirty": { + "name": "compaction_dirty", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "version": { + "name": "version", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "last_memory_processed_at": { + "name": "last_memory_processed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "memory_extraction_status": { + "name": "memory_extraction_status", + "type": "text", + "primaryKey": false, + "notNull": false, + "default": "'pending'" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_chat_workspace_id": { + "name": "idx_chat_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_chat_tags": { + "name": "idx_chat_tags", + "columns": [ + { + "expression": "tags", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_chat_memory_processing": { + "name": "idx_chat_memory_processing", + "columns": [ + { + "expression": "memory_extraction_status", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "last_memory_processed_at", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "updated_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + 
"chat_workspace_id_workspace_id_fk": { + "name": "chat_workspace_id_workspace_id_fk", + "tableFrom": "chat", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.context": { + "name": "context", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "content": { + "name": "content", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_context_user_id": { + "name": "idx_context_user_id", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_context_workspace_id": { + "name": "idx_context_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "context_user_id_user_id_fk": { + "name": "context_user_id_user_id_fk", + "tableFrom": "context", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "context_workspace_id_workspace_id_fk": { + 
"name": "context_workspace_id_workspace_id_fk", + "tableFrom": "context", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_context_user_workspace": { + "name": "unique_context_user_workspace", + "nullsNotDistinct": false, + "columns": [ + "user_id", + "workspace_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.dashboard": { + "name": "dashboard", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "desktop_layout": { + "name": "desktop_layout", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "mobile_layout": { + "name": "mobile_layout", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_dashboard_workspace_id": { + "name": "idx_dashboard_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "uq_dashboard_workspace_name": { + "name": "uq_dashboard_workspace_name", + "columns": [ + { + "expression": "workspace_id", + "isExpression": 
false, + "asc": true, + "nulls": "last" + }, + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "dashboard_workspace_id_workspace_id_fk": { + "name": "dashboard_workspace_id_workspace_id_fk", + "tableFrom": "dashboard", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.invitation": { + "name": "invitation", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "invited_by": { + "name": "invited_by", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "workspace_name": { + "name": "workspace_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_invitation_email": { + "name": "idx_invitation_email", + "columns": [ + { + "expression": "email", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_invitation_org_id": { + "name": "idx_invitation_org_id", + "columns": [ + { + 
"expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "invitation_organization_id_organization_id_fk": { + "name": "invitation_organization_id_organization_id_fk", + "tableFrom": "invitation", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "invitation_invited_by_user_id_fk": { + "name": "invitation_invited_by_user_id_fk", + "tableFrom": "invitation", + "tableTo": "user", + "columnsFrom": [ + "invited_by" + ], + "columnsTo": [ + "id" + ], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_invitation_org_email": { + "name": "unique_invitation_org_email", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.invitation_blueprint": { + "name": "invitation_blueprint", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "invitation_id": { + "name": "invitation_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "blueprint_id": { + "name": "blueprint_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "position": { + "name": "position", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_invitation_blueprint_invitation": { + "name": "idx_invitation_blueprint_invitation", + "columns": [ + { + "expression": "invitation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + 
"method": "btree", + "with": {} + }, + "idx_invitation_blueprint_blueprint": { + "name": "idx_invitation_blueprint_blueprint", + "columns": [ + { + "expression": "blueprint_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "invitation_blueprint_invitation_id_invitation_id_fk": { + "name": "invitation_blueprint_invitation_id_invitation_id_fk", + "tableFrom": "invitation_blueprint", + "tableTo": "invitation", + "columnsFrom": [ + "invitation_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "invitation_blueprint_blueprint_id_blueprint_id_fk": { + "name": "invitation_blueprint_blueprint_id_blueprint_id_fk", + "tableFrom": "invitation_blueprint", + "tableTo": "blueprint", + "columnsFrom": [ + "blueprint_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_invitation_blueprint": { + "name": "unique_invitation_blueprint", + "nullsNotDistinct": false, + "columns": [ + "invitation_id", + "blueprint_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_board": { + "name": "kanban_board", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "labels": { + "name": "labels", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": 
true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_board_workspace_id": { + "name": "idx_kanban_board_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_board_workspace_id_workspace_id_fk": { + "name": "kanban_board_workspace_id_workspace_id_fk", + "tableFrom": "kanban_board", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_kanban_board_name_workspace": { + "name": "unique_kanban_board_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_card": { + "name": "kanban_card", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "column_id": { + "name": "column_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "label_ids": { + "name": "label_ids", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "assignees": { + "name": "assignees", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'[]'::jsonb" + }, + "due_date": { + "name": "due_date", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "priority": { + "name": "priority", + "type": "text", + "primaryKey": 
false, + "notNull": true, + "default": "'none'" + }, + "position": { + "name": "position", + "type": "real", + "primaryKey": false, + "notNull": true + }, + "created_by_user_id": { + "name": "created_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_by_agent_id": { + "name": "created_by_agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "last_edited_by_user_id": { + "name": "last_edited_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "last_edited_by_agent_id": { + "name": "last_edited_by_agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_card_column_id": { + "name": "idx_kanban_card_column_id", + "columns": [ + { + "expression": "column_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_kanban_card_label_ids": { + "name": "idx_kanban_card_label_ids", + "columns": [ + { + "expression": "label_ids", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_kanban_card_assignees": { + "name": "idx_kanban_card_assignees", + "columns": [ + { + "expression": "assignees", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "idx_kanban_card_due_date": { + "name": "idx_kanban_card_due_date", + "columns": [ + { + "expression": "due_date", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + 
"method": "btree", + "with": {} + }, + "idx_kanban_card_priority": { + "name": "idx_kanban_card_priority", + "columns": [ + { + "expression": "priority", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_kanban_card_column_position": { + "name": "idx_kanban_card_column_position", + "columns": [ + { + "expression": "column_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "position", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_card_column_id_kanban_column_id_fk": { + "name": "kanban_card_column_id_kanban_column_id_fk", + "tableFrom": "kanban_card", + "tableTo": "kanban_column", + "columnsFrom": [ + "column_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "kanban_card_created_by_user_id_user_id_fk": { + "name": "kanban_card_created_by_user_id_user_id_fk", + "tableFrom": "kanban_card", + "tableTo": "user", + "columnsFrom": [ + "created_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_created_by_agent_id_agent_id_fk": { + "name": "kanban_card_created_by_agent_id_agent_id_fk", + "tableFrom": "kanban_card", + "tableTo": "agent", + "columnsFrom": [ + "created_by_agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_last_edited_by_user_id_user_id_fk": { + "name": "kanban_card_last_edited_by_user_id_user_id_fk", + "tableFrom": "kanban_card", + "tableTo": "user", + "columnsFrom": [ + "last_edited_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_last_edited_by_agent_id_agent_id_fk": { + "name": "kanban_card_last_edited_by_agent_id_agent_id_fk", + 
"tableFrom": "kanban_card", + "tableTo": "agent", + "columnsFrom": [ + "last_edited_by_agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_card_comment": { + "name": "kanban_card_comment", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "card_id": { + "name": "card_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_by_user_id": { + "name": "created_by_user_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_by_agent_id": { + "name": "created_by_agent_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_card_comment_card_id": { + "name": "idx_kanban_card_comment_card_id", + "columns": [ + { + "expression": "card_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_card_comment_card_id_kanban_card_id_fk": { + "name": "kanban_card_comment_card_id_kanban_card_id_fk", + "tableFrom": "kanban_card_comment", + "tableTo": "kanban_card", + "columnsFrom": [ + "card_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "kanban_card_comment_created_by_user_id_user_id_fk": { + "name": "kanban_card_comment_created_by_user_id_user_id_fk", + "tableFrom": "kanban_card_comment", + 
"tableTo": "user", + "columnsFrom": [ + "created_by_user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "kanban_card_comment_created_by_agent_id_agent_id_fk": { + "name": "kanban_card_comment_created_by_agent_id_agent_id_fk", + "tableFrom": "kanban_card_comment", + "tableTo": "agent", + "columnsFrom": [ + "created_by_agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.kanban_column": { + "name": "kanban_column", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "board_id": { + "name": "board_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "position": { + "name": "position", + "type": "real", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_kanban_column_board_id": { + "name": "idx_kanban_column_board_id", + "columns": [ + { + "expression": "board_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "kanban_column_board_id_kanban_board_id_fk": { + "name": "kanban_column_board_id_kanban_board_id_fk", + "tableFrom": "kanban_column", + "tableTo": "kanban_board", + "columnsFrom": [ + "board_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + 
"uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.mcp": { + "name": "mcp", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "headers": { + "name": "headers", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "auth_type": { + "name": "auth_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "bearer_token": { + "name": "bearer_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_access_token": { + "name": "oauth_access_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_refresh_token": { + "name": "oauth_refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_token_expires_at": { + "name": "oauth_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "oauth_scope": { + "name": "oauth_scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_requested_scope": { + "name": "oauth_requested_scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_client_id": { + "name": "oauth_client_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "oauth_client_secret": { + "name": "oauth_client_secret", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { 
+ "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_mcp_workspace_id": { + "name": "idx_mcp_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_mcp_organization_id": { + "name": "idx_mcp_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "mcp_organization_id_organization_id_fk": { + "name": "mcp_organization_id_organization_id_fk", + "tableFrom": "mcp", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "mcp_workspace_id_workspace_id_fk": { + "name": "mcp_workspace_id_workspace_id_fk", + "tableFrom": "mcp", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_mcp_name_org": { + "name": "unique_mcp_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + }, + "unique_mcp_name_workspace": { + "name": "unique_mcp_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.mcp_oauth_state": { + "name": "mcp_oauth_state", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "mcp_id": { + "name": "mcp_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code_verifier": { + "name": "code_verifier", + "type": 
"text", + "primaryKey": false, + "notNull": true + }, + "redirect_uri": { + "name": "redirect_uri", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "idx_mcp_oauth_state_mcp_id": { + "name": "idx_mcp_oauth_state_mcp_id", + "columns": [ + { + "expression": "mcp_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "mcp_oauth_state_mcp_id_mcp_id_fk": { + "name": "mcp_oauth_state_mcp_id_mcp_id_fk", + "tableFrom": "mcp_oauth_state", + "tableTo": "mcp", + "columnsFrom": [ + "mcp_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.memory_daily_summary": { + "name": "memory_daily_summary", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "summary_date": { + "name": "summary_date", + "type": "date", + "primaryKey": false, + "notNull": true + }, + "summary": { + "name": "summary", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "embedding": { + "name": "embedding", + "type": "vector", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": 
"updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_daily_summary_user_workspace": { + "name": "idx_daily_summary_user_workspace", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_daily_summary_date": { + "name": "idx_daily_summary_date", + "columns": [ + { + "expression": "summary_date", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "memory_daily_summary_user_id_user_id_fk": { + "name": "memory_daily_summary_user_id_user_id_fk", + "tableFrom": "memory_daily_summary", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "memory_daily_summary_workspace_id_workspace_id_fk": { + "name": "memory_daily_summary_workspace_id_workspace_id_fk", + "tableFrom": "memory_daily_summary", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_daily_summary_user_workspace_date": { + "name": "unique_daily_summary_user_workspace_date", + "nullsNotDistinct": false, + "columns": [ + "user_id", + "workspace_id", + "summary_date" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.notification": { + "name": "notification", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + 
"notNull": true + }, + "agent_id": { + "name": "agent_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_notification_workspace_id": { + "name": "idx_notification_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_notification_agent_id": { + "name": "idx_notification_agent_id", + "columns": [ + { + "expression": "agent_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_notification_created_at": { + "name": "idx_notification_created_at", + "columns": [ + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "notification_workspace_id_workspace_id_fk": { + "name": "notification_workspace_id_workspace_id_fk", + "tableFrom": "notification", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "notification_agent_id_agent_id_fk": { + "name": "notification_agent_id_agent_id_fk", + "tableFrom": "notification", + "tableTo": "agent", + "columnsFrom": [ + "agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + 
} + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.notification_read": { + "name": "notification_read", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "notification_id": { + "name": "notification_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "read_at": { + "name": "read_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_notification_read_user_id": { + "name": "idx_notification_read_user_id", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_notification_read_notification_id": { + "name": "idx_notification_read_notification_id", + "columns": [ + { + "expression": "notification_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "notification_read_notification_id_notification_id_fk": { + "name": "notification_read_notification_id_notification_id_fk", + "tableFrom": "notification_read", + "tableTo": "notification", + "columnsFrom": [ + "notification_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "notification_read_user_id_user_id_fk": { + "name": "notification_read_user_id_user_id_fk", + "tableFrom": "notification_read", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_notification_read": { + "name": 
"unique_notification_read", + "nullsNotDistinct": false, + "columns": [ + "notification_id", + "user_id" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.organization": { + "name": "organization", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.organization_member": { + "name": "organization_member", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "role": { + "name": "role", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'member'" + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_org_member_org_id": { + "name": "idx_org_member_org_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + 
"idx_org_member_user_id": { + "name": "idx_org_member_user_id", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "organization_member_organization_id_organization_id_fk": { + "name": "organization_member_organization_id_organization_id_fk", + "tableFrom": "organization_member", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "organization_member_user_id_user_id_fk": { + "name": "organization_member_user_id_user_id_fk", + "tableFrom": "organization_member", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.provider": { + "name": "provider", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider_type": { + "name": "provider_type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "api_key": { + "name": "api_key", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "region": { + "name": "region", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "base_url": { + "name": "base_url", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "headers": { + "name": "headers", + "type": "jsonb", + "primaryKey": 
false, + "notNull": false + }, + "extraBody": { + "name": "extraBody", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "organization": { + "name": "organization", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "project": { + "name": "project", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "api_mode": { + "name": "api_mode", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'responses'" + }, + "native_search_enabled": { + "name": "native_search_enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "modelIds": { + "name": "modelIds", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "task_model_id": { + "name": "task_model_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "memory_extraction_model_id": { + "name": "memory_extraction_model_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "embedding_model_id": { + "name": "embedding_model_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "embedding_dimensions": { + "name": "embedding_dimensions", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "model_meta": { + "name": "model_meta", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_provider_workspace_id": { + "name": "idx_provider_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_provider_organization_id": { + "name": "idx_provider_organization_id", + "columns": [ + { 
+ "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "provider_organization_id_organization_id_fk": { + "name": "provider_organization_id_organization_id_fk", + "tableFrom": "provider", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_provider_name_org": { + "name": "unique_provider_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + }, + "unique_provider_name_workspace": { + "name": "unique_provider_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sandbox": { + "name": "sandbox", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "backend": { + "name": "backend", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "credentials": { + "name": "credentials", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "admin_env": { + "name": "admin_env", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "user_env": { + "name": "user_env", + "type": "jsonb", + "primaryKey": false, + "notNull": true, + "default": "'{}'::jsonb" + }, + "created_at": { + "name": "created_at", + 
"type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "unique_sandbox_workspace_id": { + "name": "unique_sandbox_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "sandbox_workspace_id_workspace_id_fk": { + "name": "sandbox_workspace_id_workspace_id_fk", + "tableFrom": "sandbox", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.sandbox_teardown_failure": { + "name": "sandbox_teardown_failure", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "backend": { + "name": "backend", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "attempted_at": { + "name": "attempted_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_sandbox_teardown_failure_workspace_id": { + "name": "idx_sandbox_teardown_failure_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + 
"with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.skill": { + "name": "skill", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "body": { + "name": "body", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_skill_workspace_id": { + "name": "idx_skill_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_skill_organization_id": { + "name": "idx_skill_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "skill_organization_id_organization_id_fk": { + "name": "skill_organization_id_organization_id_fk", + "tableFrom": "skill", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + 
"skill_workspace_id_workspace_id_fk": { + "name": "skill_workspace_id_workspace_id_fk", + "tableFrom": "skill", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "unique_skill_name_workspace": { + "name": "unique_skill_name_workspace", + "nullsNotDistinct": false, + "columns": [ + "workspace_id", + "name" + ] + }, + "unique_skill_name_org": { + "name": "unique_skill_name_org", + "nullsNotDistinct": false, + "columns": [ + "organization_id", + "name" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.trigger": { + "name": "trigger", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "agent_id": { + "name": "agent_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "instruction": { + "name": "instruction", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "enabled": { + "name": "enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "max_runs_to_keep": { + "name": "max_runs_to_keep", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 10 + }, + "search": { + "name": "search", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "config": { + "name": "config", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "last_run_at": { + "name": 
"last_run_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "next_run_at": { + "name": "next_run_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_trigger_workspace_id": { + "name": "idx_trigger_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_trigger_next_run_at": { + "name": "idx_trigger_next_run_at", + "columns": [ + { + "expression": "next_run_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_trigger_type": { + "name": "idx_trigger_type", + "columns": [ + { + "expression": "type", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "trigger_workspace_id_workspace_id_fk": { + "name": "trigger_workspace_id_workspace_id_fk", + "tableFrom": "trigger", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "trigger_agent_id_agent_id_fk": { + "name": "trigger_agent_id_agent_id_fk", + "tableFrom": "trigger", + "tableTo": "agent", + "columnsFrom": [ + "agent_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "restrict", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.trigger_run": { + "name": 
"trigger_run", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "trigger_id": { + "name": "trigger_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'pending'" + }, + "event_type": { + "name": "event_type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "event_data": { + "name": "event_data", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "error_message": { + "name": "error_message", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "stats": { + "name": "stats", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_trigger_run_trigger_id": { + "name": "idx_trigger_run_trigger_id", + "columns": [ + { + "expression": "trigger_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "idx_trigger_run_started_at": { + "name": "idx_trigger_run_started_at", + "columns": [ + { + "expression": "started_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "trigger_run_trigger_id_trigger_id_fk": { + "name": "trigger_run_trigger_id_trigger_id_fk", + "tableFrom": "trigger_run", + "tableTo": "trigger", + "columnsFrom": [ + "trigger_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": 
"cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.webhook": { + "name": "webhook", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "workspace_id": { + "name": "workspace_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'Webhook'" + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "signing_secret": { + "name": "signing_secret", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "headers": { + "name": "headers", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "enabled": { + "name": "enabled", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + }, + "events": { + "name": "events", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_webhook_workspace_id": { + "name": "idx_webhook_workspace_id", + "columns": [ + { + "expression": "workspace_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "webhook_workspace_id_workspace_id_fk": { + "name": "webhook_workspace_id_workspace_id_fk", + "tableFrom": "webhook", + "tableTo": "workspace", + "columnsFrom": [ + "workspace_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + 
"uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.widget": { + "name": "widget", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "dashboard_id": { + "name": "dashboard_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "data": { + "name": "data", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_widget_dashboard_id": { + "name": "idx_widget_dashboard_id", + "columns": [ + { + "expression": "dashboard_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "uq_widget_dashboard_title": { + "name": "uq_widget_dashboard_title", + "columns": [ + { + "expression": "dashboard_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "widget_dashboard_id_dashboard_id_fk": { + "name": "widget_dashboard_id_dashboard_id_fk", + "tableFrom": "widget", + "tableTo": "dashboard", + "columnsFrom": [ + "dashboard_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + 
}, + "public.workspace": { + "name": "workspace", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "owner_id": { + "name": "owner_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "context": { + "name": "context", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "task_model_provider_id": { + "name": "task_model_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_extraction_provider_id": { + "name": "memory_extraction_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "memory_embedding_provider_id": { + "name": "memory_embedding_provider_id", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "max_daily_summaries": { + "name": "max_daily_summaries", + "type": "integer", + "primaryKey": false, + "notNull": false, + "default": 90 + }, + "provider_self_management": { + "name": "provider_self_management", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "mcp_self_management": { + "name": "mcp_self_management", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "idx_workspace_organization_id": { + "name": "idx_workspace_organization_id", + "columns": [ + { + "expression": "organization_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": 
"btree", + "with": {} + }, + "idx_workspace_owner_id": { + "name": "idx_workspace_owner_id", + "columns": [ + { + "expression": "owner_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "workspace_organization_id_organization_id_fk": { + "name": "workspace_organization_id_organization_id_fk", + "tableFrom": "workspace", + "tableTo": "organization", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "workspace_owner_id_user_id_fk": { + "name": "workspace_owner_id_user_id_fk", + "tableFrom": "workspace", + "tableTo": "user", + "columnsFrom": [ + "owner_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "workspace_task_model_provider_id_provider_id_fk": { + "name": "workspace_task_model_provider_id_provider_id_fk", + "tableFrom": "workspace", + "tableTo": "provider", + "columnsFrom": [ + "task_model_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "workspace_memory_extraction_provider_id_provider_id_fk": { + "name": "workspace_memory_extraction_provider_id_provider_id_fk", + "tableFrom": "workspace", + "tableTo": "provider", + "columnsFrom": [ + "memory_extraction_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "workspace_memory_embedding_provider_id_provider_id_fk": { + "name": "workspace_memory_embedding_provider_id_provider_id_fk", + "tableFrom": "workspace", + "tableTo": "provider", + "columnsFrom": [ + "memory_embedding_provider_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.account": { + "name": "account", + 
"schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "account_id": { + "name": "account_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "refresh_token": { + "name": "refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "id_token": { + "name": "id_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "access_token_expires_at": { + "name": "access_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "refresh_token_expires_at": { + "name": "refresh_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "scope": { + "name": "scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "password": { + "name": "password", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "account_userId_idx": { + "name": "account_userId_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "account_user_id_user_id_fk": { + "name": "account_user_id_user_id_fk", + "tableFrom": "account", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + 
"onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.session": { + "name": "session", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "token": { + "name": "token", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "ip_address": { + "name": "ip_address", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_agent": { + "name": "user_agent", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "session_userId_idx": { + "name": "session_userId_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "session_user_id_user_id_fk": { + "name": "session_user_id_user_id_fk", + "tableFrom": "session", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "session_token_unique": { + "name": "session_token_unique", + "nullsNotDistinct": false, + "columns": [ + "token" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user": { + "name": "user", + "schema": "", + 
"columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email_verified": { + "name": "email_verified", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "image": { + "name": "image", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "role": { + "name": "role", + "type": "text", + "primaryKey": false, + "notNull": true, + "default": "'user'" + }, + "banned": { + "name": "banned", + "type": "boolean", + "primaryKey": false, + "notNull": false, + "default": false + }, + "ban_reason": { + "name": "ban_reason", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "ban_expires": { + "name": "ban_expires", + "type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "user_email_unique": { + "name": "user_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.verification": { + "name": "verification", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "identifier": { + "name": "identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "value": { + "name": "value", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + 
"primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "verification_identifier_idx": { + "name": "verification_identifier_idx", + "columns": [ + { + "expression": "identifier", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/apps/backend/drizzle/meta/_journal.json b/apps/backend/drizzle/meta/_journal.json index 44a49c37..847f0e0f 100644 --- a/apps/backend/drizzle/meta/_journal.json +++ b/apps/backend/drizzle/meta/_journal.json @@ -323,6 +323,13 @@ "when": 1780816408681, "tag": "0045_married_roxanne_simpson", "breakpoints": true + }, + { + "idx": 46, + "version": "7", + "when": 1781201728242, + "tag": "0046_context_compaction", + "breakpoints": true } ] } \ No newline at end of file diff --git a/apps/backend/src/db/schema.ts b/apps/backend/src/db/schema.ts index 21a1d6e7..52015213 100644 --- a/apps/backend/src/db/schema.ts +++ b/apps/backend/src/db/schema.ts @@ -67,6 +67,13 @@ export const provider = pgTable( memoryExtractionModelId: t.text("memory_extraction_model_id").notNull(), embeddingModelId: t.text("embedding_model_id"), embeddingDimensions: t.integer("embedding_dimensions"), + // Per-model context-window / output overrides (ADR-0012 §Window resolution). 
+ // Keyed by model id; resolveContextWindow consults this before API/registry. + modelMeta: t + .jsonb("model_meta") + .$type< + Record + >(), createdAt: t.timestamp("created_at").notNull().defaultNow(), updatedAt: t.timestamp("updated_at").notNull().defaultNow(), }), @@ -162,6 +169,16 @@ export const chat = pgTable( presencePenalty: t.real("presence_penalty"), frequencyPenalty: t.real("frequency_penalty"), + // Context-compaction state (docs/adr/0012). All additive nullable/defaulted. + // View-not-delete (ADR-0012 §View, not delete): these change what is sent to the model, never the + // stored `messages`. `summaryWatermark` = message id of the last summarized + // message. All mutations go through the single versioned CAS writer (ADR-0012 §One durable writer); + // `version` is its compare-and-swap token. + contextSummary: t.text("context_summary"), + summaryWatermark: t.text("summary_watermark"), + compactionDirty: t.boolean("compaction_dirty").notNull().default(false), + version: t.integer("version").notNull().default(0), + // Memory processing tracking lastMemoryProcessedAt: t.timestamp("last_memory_processed_at"), memoryExtractionStatus: t diff --git a/apps/backend/src/routes/chat.test.ts b/apps/backend/src/routes/chat.test.ts index f69e879f..43b3161e 100644 --- a/apps/backend/src/routes/chat.test.ts +++ b/apps/backend/src/routes/chat.test.ts @@ -6,8 +6,9 @@ import { resetMockDb, } from "../test-utils.ts"; -const { mockPrepareChatTurn } = vi.hoisted(() => ({ +const { mockPrepareChatTurn, mockForceCompactChat } = vi.hoisted(() => ({ mockPrepareChatTurn: vi.fn(), + mockForceCompactChat: vi.fn(), })); vi.mock("../services/chat-execution.ts", () => { @@ -25,12 +26,20 @@ vi.mock("../services/chat-execution.ts", () => { } return { prepareChatTurn: mockPrepareChatTurn, + forceCompactChat: mockForceCompactChat, + // loadChatMessages is called by agent-runner before onStart (ADR-0012 §Summary invalidation baseline). 
+ loadChatMessages: vi.fn().mockResolvedValue([]), ValidationError, NotFoundError, drizzleChatTurnQueries: {}, }; }); +import { runRegistry } from "../runs/run-registry.ts"; +// Mocked above — resolves to the mock's NotFoundError class, the same one the +// route checks with `instanceof`. +import { NotFoundError } from "../services/chat-execution.ts"; + import app from "../server.ts"; // Mock AI SDK @@ -88,6 +97,10 @@ describe("Chat Routes", () => { beforeEach(() => { resetMockDb(); vi.clearAllMocks(); + // The `POST /` test starts a (mocked) run that registers chat-1 and never + // finalizes, leaving it in the process-wide registry. Clear it so the + // compact route's in-progress guard sees a clean slate. + runRegistry.unregister("chat-1"); mockDb.where.mockReturnValue(mockDb); mockDb.orderBy.mockReturnValue(mockDb); mockDb.limit.mockReturnValue(mockDb); @@ -239,6 +252,7 @@ describe("Chat Routes", () => { mockDb.limit.mockResolvedValueOnce([ { ownerId: "user-1", organizationId: "org-1" }, ]); // requireWorkspaceAccess + mockDb.limit.mockResolvedValueOnce([{ workspaceId: "ws-1" }]); // ADR-0012 §Consequences (cross-tenant safety) chat workspace check // ChatSink.onStart upserts the chat row with status=running before // prepareChatTurn runs. Returning a non-empty array skips the insert @@ -279,6 +293,36 @@ describe("Chat Routes", () => { expect(res.status).toBe(200); expect(await res.text()).toBe("stream"); }); + + it("returns 404 when the submitted chat id belongs to another workspace (ADR-0012 §Consequences cross-tenant safety)", async () => { + mockSession({ + id: "user-1", + name: "Test User", + email: "test@example.com", + }); + mockDb.limit.mockResolvedValueOnce([{ role: "member" }]); // requireOrgAccess + mockDb.limit.mockResolvedValueOnce([ + { ownerId: "user-1", organizationId: "org-1" }, + ]); // requireWorkspaceAccess + // Cross-tenant check: the chat exists but in a DIFFERENT workspace. 
+ mockDb.limit.mockResolvedValueOnce([{ workspaceId: "ws-other" }]); + + const res = await app.request(baseUrl, { + method: "POST", + body: JSON.stringify({ + id: "chat-1", + workspaceId, + providerId: "p1", + modelId: "m1", + messages: [{ role: "user", content: "hello" }], + }), + headers: { "Content-Type": "application/json" }, + }); + + expect(res.status).toBe(404); + // The run must never start — no compaction-store mutation on another tenant's chat. + expect(mockPrepareChatTurn).not.toHaveBeenCalled(); + }); }); describe("DELETE /:chatId", () => { @@ -469,4 +513,75 @@ describe("Chat Routes", () => { expect(await res.json()).toEqual(mockUpdatedChat); }); }); + + describe("POST /:chatId/compact", () => { + const ownerAccess = () => { + mockSession(); + mockDb.limit.mockResolvedValueOnce([{ role: "member" }]); // requireOrgAccess + mockDb.limit.mockResolvedValueOnce([ + { ownerId: "user-1", organizationId: "org-1" }, + ]); // requireWorkspaceAccess + owner + }; + + it("force-compacts and returns the refreshed usage", async () => { + ownerAccess(); + mockForceCompactChat.mockResolvedValueOnce({ + estimatedTokens: 1234, + contextWindow: 8192, + contextWindowIsDefault: false, + }); + + const res = await app.request(`${baseUrl}/chat-1/compact`, { + method: "POST", + }); + + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ + inputTokens: 1234, + contextWindow: 8192, + contextWindowIsDefault: false, + }); + expect(mockForceCompactChat).toHaveBeenCalledWith( + "chat-1", + workspaceId, + orgId, + ); + }); + + it("returns 409 when a run is in progress (does not compact)", async () => { + ownerAccess(); + // runRegistry is keyed by runId, which equals the chatId for top-level + // chat runs — so an in-flight run on this chat blocks the compact. 
+ runRegistry.register("chat-1"); + try { + const res = await app.request(`${baseUrl}/chat-1/compact`, { + method: "POST", + }); + expect(res.status).toBe(409); + expect(mockForceCompactChat).not.toHaveBeenCalled(); + } finally { + runRegistry.unregister("chat-1"); + } + }); + + it("returns 404 when the chat is not found / not in the workspace", async () => { + ownerAccess(); + mockForceCompactChat.mockRejectedValueOnce( + new NotFoundError("Chat not found"), + ); + + const res = await app.request(`${baseUrl}/chat-other/compact`, { + method: "POST", + }); + expect(res.status).toBe(404); + }); + + it("returns 401 without a session", async () => { + mockNoSession(); + const res = await app.request(`${baseUrl}/chat-1/compact`, { + method: "POST", + }); + expect(res.status).toBe(401); + }); + }); }); diff --git a/apps/backend/src/routes/chat.ts b/apps/backend/src/routes/chat.ts index 8acaa199..b0129ad4 100644 --- a/apps/backend/src/routes/chat.ts +++ b/apps/backend/src/routes/chat.ts @@ -9,7 +9,11 @@ import { provider as providerTable, workspace as workspaceTable, } from "../db/schema.ts"; -import { NotFoundError, ValidationError } from "../services/chat-execution.ts"; +import { + forceCompactChat, + NotFoundError, + ValidationError, +} from "../services/chat-execution.ts"; import { openProvider } from "../services/provider.ts"; import { chatGenerateMetadataSchema, @@ -29,6 +33,7 @@ import { type PlatypusUIMessage } from "../types.ts"; import { rewriteStorageUrls, deleteFiles } from "../storage/utils.ts"; import { getOrigin } from "../utils/get-origin.ts"; import { agentRunner } from "../runs/agent-runner.ts"; +import { runRegistry } from "../runs/run-registry.ts"; import { ChatSink } from "../runs/sinks/chat-sink.ts"; import type { RunInput } from "../runs/types.ts"; @@ -143,6 +148,23 @@ chat.post( const scope = c.get("workspaceScope")!; const data = c.req.valid("json"); + // ADR-0012 §Consequences (cross-tenant safety): verify the submitted chat id (if any) belongs 
to this workspace. + // Without this check a workspace-A user could supply a workspace-B chat id + // and corrupt B's compaction state via the unscoped store writes. + if (data.id) { + const existing = await db + .select({ workspaceId: chatTable.workspaceId }) + .from(chatTable) + .where(eq(chatTable.id, data.id)) + .limit(1); + if ( + existing.length > 0 && + existing[0].workspaceId !== scope.workspaceId + ) { + return c.json({ message: "Chat not found" }, 404); + } + } + const input: RunInput = { runId: data.id, request: data, @@ -417,4 +439,52 @@ chat.post( }, ); +chat.post( + "/:chatId/compact", + requireAuth, + requireOrgAccess(), + requireWorkspaceAccess, + requireWorkspaceOwner, + async (c) => { + const orgId = c.req.param("orgId")!; + const chatId = c.req.param("chatId"); + const workspaceId = c.req.param("workspaceId")!; + + // Reject if a run is currently in flight — the frontend defers the click + // until streaming finishes (ADR-0012 §Force-compact on demand), but guard here as a belt-and-suspenders + // check to avoid CAS races with an in-progress writer. + if (runRegistry.has(chatId)) { + return c.json( + { error: "Run in progress; retry after the response finishes" }, + 409, + ); + } + + try { + const result = await forceCompactChat(chatId, workspaceId, orgId); + return c.json({ + inputTokens: result.estimatedTokens, + // ADR-0012 §Force-compact on demand: the client confirms only when the drop + // is significant (messagesDropped > keepRecentMessages OR reduction > 30%). + tokensBefore: result.tokensBefore, + messagesDropped: result.messagesDropped, + keepRecentMessages: result.keepRecentMessages, + contextWindow: result.contextWindow, + contextWindowIsDefault: result.contextWindowIsDefault, + // ADR-0012 §Compaction trace in the timeline: the persisted synthetic trace message (when a summary ran), so + // the frontend can append it to the timeline without a full refetch. 
+ traceMessage: result.traceMessage, + }); + } catch (error) { + if (error instanceof NotFoundError) { + return c.json({ error: error.message }, 404); + } + if (error instanceof ValidationError) { + return c.json({ error: error.message }, 400); + } + throw error; + } + }, +); + export { chat }; diff --git a/apps/backend/src/routes/org-provider.ts b/apps/backend/src/routes/org-provider.ts index 9777fe75..d037152e 100644 --- a/apps/backend/src/routes/org-provider.ts +++ b/apps/backend/src/routes/org-provider.ts @@ -7,6 +7,7 @@ import { providerCreateSchema, providerUpdateSchema } from "@platypus/schemas"; import { eq, and } from "drizzle-orm"; import { handleEmbeddingConfigChange } from "../services/embedding-invalidation.ts"; import { dedupeArray } from "../utils.ts"; +import { contextWindowResolver } from "../runs/context-window.ts"; import { requireAuth } from "../middleware/authentication.ts"; import { requireOrgAccess } from "../middleware/authorization.ts"; import { requireSharedDeletable } from "../services/scoped-resource.ts"; @@ -117,6 +118,10 @@ orgProvider.put( throw new NotFoundError("Provider not found"); } + // RV7c: bust the cached context window so a modelMeta override takes effect + // immediately rather than waiting out the 1-hour TTL (drift T5). 
+ contextWindowResolver.evict(providerId); + return c.json(record[0], 200); }, ); diff --git a/apps/backend/src/routes/provider.ts b/apps/backend/src/routes/provider.ts index b437c335..74170291 100644 --- a/apps/backend/src/routes/provider.ts +++ b/apps/backend/src/routes/provider.ts @@ -7,6 +7,8 @@ import { providerCreateSchema, providerUpdateSchema } from "@platypus/schemas"; import { eq, and } from "drizzle-orm"; import { handleEmbeddingConfigChange } from "../services/embedding-invalidation.ts"; import { dedupeArray } from "../utils.ts"; +import { contextWindowResolver } from "../runs/context-window.ts"; +import { resolveCompactionConfig } from "../services/chat-execution.ts"; import { requireAuth } from "../middleware/authentication.ts"; import { requireOrgAccess, @@ -135,6 +137,10 @@ provider.put( ) .returning(); + // ADR-0012 §Window resolution (caching & eviction): bust the cached context window so a modelMeta override takes effect + // immediately rather than waiting out the 1-hour TTL. + contextWindowResolver.evict(providerId); + return c.json(record[0], 200); }, ); @@ -171,4 +177,49 @@ provider.delete( }, ); +/** + * Returns the resolved context window for a specific model on this provider + * (ADR-0012 §Context-usage ring). Uses the cached resolver — fast for repeated calls. + * Returns `{ contextWindow: null }` when the window fell to the conservative + * default so the frontend can render the ring neutral (ADR-0012 §Context-usage ring).
+ */ +provider.get( + "/:providerId/context-window", + requireAuth, + requireOrgAccess(), + requireWorkspaceAccess, + async (c) => { + const orgId = c.req.param("orgId")!; + const workspaceId = c.req.param("workspaceId")!; + const providerId = c.req.param("providerId"); + const modelId = c.req.query("modelId"); + + if (!modelId) { + return c.json({ error: "modelId query parameter required" }, 400); + } + + const found = await requireScoped(db, "provider", providerId, { + orgId, + wsId: workspaceId, + }); + + const resolved = await contextWindowResolver + .resolve(found.row, modelId) + .catch(() => null); + + return c.json({ + contextWindow: + resolved && resolved.source !== "default" + ? resolved.contextWindow + : null, + source: resolved?.source ?? "default", + // ADR-0012 §Force-compact on demand: the client gates the confirm dialog on + // the drop being significant. messagesDropped ≈ total − keepRecent, so + // "messagesDropped > keepRecent" ⟺ "total > 2 × keepRecent" — a pre-run + // proxy computable client-side from the message count. 
+ keepRecentMessages: resolveCompactionConfig().keepRecentMessages, + }); + }, +); + export { provider }; diff --git a/apps/backend/src/runs/agent-runner.test.ts b/apps/backend/src/runs/agent-runner.test.ts index 45660386..c7e73e00 100644 --- a/apps/backend/src/runs/agent-runner.test.ts +++ b/apps/backend/src/runs/agent-runner.test.ts @@ -80,9 +80,17 @@ vi.mock("../logger.ts", () => ({ }, })); -import { AgentRunner } from "./agent-runner.ts"; +import { + AgentRunner, + prependCompactionChunks, + stripCompactionTraceParts, + withToolTimestamps, +} from "./agent-runner.ts"; +import { buildTier2PrepareStep } from "./compaction.ts"; +import type { UIMessageChunk } from "ai"; import { runRegistry, TimeoutError } from "./run-registry.ts"; import type { ResolvedRunPlan, RunInput, RunSink } from "./types.ts"; +import type { PlatypusUIMessage } from "../types.ts"; import type { WorkspaceScope } from "../scope.ts"; type LifecycleEvent = @@ -161,6 +169,14 @@ const fakeTurn = (overrides?: { dispose?: () => Promise }) => { providerId: "p1", modelId: "m1", }, + recovery: { + imageProvider: "default" as const, + targetTokens: 1000, + keepRecentMessages: 10, + minPrunableChars: 2000, + summarize: (t: string) => Promise.resolve(t), + }, + tier2: null, dispose, }; }; @@ -406,7 +422,15 @@ describe("AgentRunner.stream — success & interruption", () => { onFinish: (ctx: { messages: unknown[] }) => Promise | void; }) => { streamHarness.onFinish = uiOpts.onFinish; - return { tee: () => [{}, {}] }; + // The runner pipes this through withToolTimestamps (pipeThrough) and + // tees it, so it must be a real ReadableStream. Its contents are + // irrelevant — the snapshot branch is driven via the mocked + // readUIMessageStream (streamHarness.queue), not this stream. 
+ return new ReadableStream({ + start(controller) { + controller.close(); + }, + }); }, }; }, @@ -434,16 +458,19 @@ describe("AgentRunner.stream — success & interruption", () => { usage: { inputTokens: 3, outputTokens: 4 }, toolCalls: [], }); - // A partial snapshot streams in over the server-side branch. - queue.push({ id: "m1", role: "assistant", parts: [] }); + // The server-side snapshot branch delivers the final assistant message, + // updating state.messages; ending the queue drains the consumer, which + // finalises the run (the runner does not use toUIMessageStream's onFinish). + const finalMessage = { + id: "m1", + role: "assistant", + parts: [{ type: "text", text: "hi" }], + }; + queue.push(finalMessage); await tick(); - // Natural completion delivers the final assistant message. - const finalMessages = [ - { id: "m1", role: "assistant", parts: [{ type: "text", text: "hi" }] }, - ]; - await streamHarness.onFinish!({ messages: finalMessages }); queue.end(); await tick(); + const finalMessages = [finalMessage]; expect(sink.names()).toEqual([ "onStart", @@ -485,8 +512,7 @@ describe("AgentRunner.stream — success & interruption", () => { await tick(); expect(runner.cancel("s-cancel")).toBe(true); - // The SDK observes the abort and finishes the UI stream. - await streamHarness.onFinish!({ messages: [partial] }); + // The abort ends the UI stream; the snapshot consumer drains and finalises. 
queue.end(); await tick(); @@ -547,3 +573,420 @@ describe("AgentRunner timeout types", () => { expect(e.kind).toBe("run"); }); }); + +describe("withToolTimestamps", () => { + const FIXED_NOW = "2026-05-30T12:00:00.000Z"; + + const collect = async (stream: ReadableStream): Promise => { + const out: T[] = []; + const reader = stream.getReader(); + for (;;) { + const { done, value } = await reader.read(); + if (done) break; + out.push(value); + } + return out; + }; + + const sourceOf = (chunks: UIMessageChunk[]): ReadableStream => + new ReadableStream({ + start(controller) { + for (const chunk of chunks) controller.enqueue(chunk); + controller.close(); + }, + }); + + const toolInputAvailable = ( + overrides: Partial< + Extract + > = {}, + ): UIMessageChunk => ({ + type: "tool-input-available", + toolCallId: "t1", + toolName: "foo", + input: { x: 1 }, + ...overrides, + }); + + it("injects startedAt on tool-input-available chunks", async () => { + const { stream } = withToolTimestamps( + sourceOf([toolInputAvailable()]), + () => FIXED_NOW, + ); + const result = await collect(stream); + + expect(result).toHaveLength(1); + expect( + (result[0] as { toolMetadata?: Record }).toolMetadata, + ).toEqual({ startedAt: FIXED_NOW }); + }); + + it("preserves existing toolMetadata fields", async () => { + const { stream } = withToolTimestamps( + sourceOf([toolInputAvailable({ toolMetadata: { custom: "value" } })]), + () => FIXED_NOW, + ); + const result = await collect(stream); + + expect( + (result[0] as { toolMetadata?: Record }).toolMetadata, + ).toEqual({ + custom: "value", + startedAt: FIXED_NOW, + }); + }); + + it("passes other chunks through unchanged", async () => { + const chunks: UIMessageChunk[] = [ + { type: "text-delta", id: "a", delta: "hello" }, + { + type: "tool-output-available", + toolCallId: "t1", + output: { ok: true }, + }, + { type: "finish", finishReason: "stop" }, + ]; + + const { stream } = withToolTimestamps(sourceOf(chunks), () => FIXED_NOW); + const 
result = await collect(stream); + + expect(result).toEqual(chunks); + }); + + it("records completedAt for tool-output-available chunks", async () => { + const { stream, completions } = withToolTimestamps( + sourceOf([ + toolInputAvailable(), + { + type: "tool-output-available", + toolCallId: "t1", + output: { ok: true }, + }, + ]), + () => FIXED_NOW, + ); + // Completions are populated as the stream drains, so consume it first. + await collect(stream); + + expect(completions.get("t1")).toBe(FIXED_NOW); + }); + + it("records completedAt for tool-output-error chunks", async () => { + const { stream, completions } = withToolTimestamps( + sourceOf([ + toolInputAvailable(), + { + type: "tool-output-error", + toolCallId: "t1", + errorText: "boom", + }, + ]), + () => FIXED_NOW, + ); + await collect(stream); + + expect(completions.get("t1")).toBe(FIXED_NOW); + }); + + // Mirrors AgentRunner.stream's pipeline: transform -> tee -> readUIMessageStream + // drains the snapshot branch. Verifies completions populate AND the built + // message's tool part carries the same toolCallId, so applyToolCompletions + // (matches on toolCallId) can stamp completedAt. 
+ it("integration: completions + built tool part share toolCallId after tee+read", async () => { + const { readUIMessageStream } = + await vi.importActual("ai"); + + const chunks: UIMessageChunk[] = [ + { type: "start", messageId: "m1" }, + { type: "start-step" }, + { + type: "tool-input-available", + toolCallId: "call_xyz", + toolName: "foo", + input: { a: 1 }, + }, + { + type: "tool-output-available", + toolCallId: "call_xyz", + output: { ok: true }, + }, + { type: "finish-step" }, + { type: "finish" }, + ]; + + const { stream, completions } = withToolTimestamps( + sourceOf(chunks), + () => FIXED_NOW, + ); + const [forResponse, forSnapshot] = stream.tee(); + + let lastMessage: { parts?: Array> } | undefined; + for await (const message of readUIMessageStream({ stream: forSnapshot })) { + lastMessage = message; + } + await collect(forResponse); + + expect(completions.get("call_xyz")).toBe(FIXED_NOW); + + const toolPart = lastMessage?.parts?.find( + (p) => (p as { toolCallId?: string }).toolCallId === "call_xyz", + ) as { toolMetadata?: Record; toolCallId?: string }; + expect(toolPart).toBeDefined(); + expect(toolPart.toolCallId).toBe("call_xyz"); + expect(toolPart.toolMetadata).toMatchObject({ startedAt: FIXED_NOW }); + }); +}); + +describe("prependCompactionChunks", () => { + const collect = async ( + stream: ReadableStream, + ): Promise => { + const out: UIMessageChunk[] = []; + const reader = stream.getReader(); + for (;;) { + const { done, value } = await reader.read(); + if (done) break; + out.push(value); + } + return out; + }; + + const sourceOf = (chunks: UIMessageChunk[]): ReadableStream => + new ReadableStream({ + start(controller) { + for (const chunk of chunks) controller.enqueue(chunk); + controller.close(); + }, + }); + + it("injects a compact_context tool-call/result pair right after start, before any text", async () => { + const out = await collect( + prependCompactionChunks( + sourceOf([ + { type: "start" }, + { type: "text-start", id: "t" }, + { 
type: "text-delta", id: "t", delta: "hi" }, + ]), + { messagesDropped: 12, summaryExcerpt: "the user did X" }, + () => "cc1", + ), + ); + + expect(out.map((c) => c.type)).toEqual([ + "start", + "tool-input-available", + "tool-output-available", + "text-start", + "text-delta", + ]); + const input = out[1] as Extract< + UIMessageChunk, + { type: "tool-input-available" } + >; + expect(input.toolName).toBe("compact_context"); + expect(input.toolCallId).toBe("cc1"); + const output = out[2] as Extract< + UIMessageChunk, + { type: "tool-output-available" } + >; + expect(output.toolCallId).toBe("cc1"); + expect(output.output).toEqual({ + messagesDropped: 12, + summaryExcerpt: "the user did X", + }); + }); + + it("omits summaryExcerpt when absent", async () => { + const out = await collect( + prependCompactionChunks( + sourceOf([{ type: "start" }]), + { messagesDropped: 3 }, + () => "cc2", + ), + ); + const output = out[2] as Extract< + UIMessageChunk, + { type: "tool-output-available" } + >; + expect(output.output).toEqual({ messagesDropped: 3 }); + }); + + it("injects only once even if multiple start events appear", async () => { + const out = await collect( + prependCompactionChunks( + sourceOf([{ type: "start" }, { type: "start" }]), + { messagesDropped: 1 }, + () => "cc3", + ), + ); + expect(out.filter((c) => c.type === "tool-input-available")).toHaveLength( + 1, + ); + }); +}); + +describe("stripCompactionTraceParts", () => { + const traceMessage = (id: string): PlatypusUIMessage => + ({ + id, + role: "assistant", + parts: [ + { + type: "tool-compact_context", + toolCallId: `${id}-call`, + state: "output-available", + input: { messagesDropped: 2 }, + output: { messagesDropped: 2 }, + }, + ], + }) as unknown as PlatypusUIMessage; + + it("drops a trace-only assistant message entirely (never replayed to the model)", () => { + const messages = [ + { id: "u1", role: "user", parts: [{ type: "text", text: "hi" }] }, + traceMessage("t1"), + ] as unknown as 
PlatypusUIMessage[]; + + const out = stripCompactionTraceParts(messages); + expect(out.map((m) => m.id)).toEqual(["u1"]); + }); + + it("strips only the trace part from an assistant message with real content", () => { + const messages = [ + { + id: "a1", + role: "assistant", + parts: [ + { + type: "tool-compact_context", + toolCallId: "a1-call", + state: "output-available", + input: {}, + output: {}, + }, + { type: "text", text: "answer" }, + ], + }, + ] as unknown as PlatypusUIMessage[]; + + const out = stripCompactionTraceParts(messages); + expect(out).toHaveLength(1); + expect(out[0].parts.map((p) => p.type)).toEqual(["text"]); + }); + + it("returns the same array reference when nothing to strip", () => { + const messages = [ + { id: "u1", role: "user", parts: [{ type: "text", text: "hi" }] }, + ] as unknown as PlatypusUIMessage[]; + expect(stripCompactionTraceParts(messages)).toBe(messages); + }); +}); + +describe("buildTier2PrepareStep", () => { + const makeCtx = (triggerTokens = 100) => ({ + triggerTokens, + targetTokens: 50, + keepRecentMessages: 4, + minPrunableChars: 100, + imageProvider: "default" as const, + summarize: vi.fn().mockResolvedValue("summary"), + summarizerWindow: undefined, + }); + + // Invoke a PrepareStepFunction supplying only the field under test; the + // callback ignores steps/stepNumber/model/experimental_context. + const callStep = ( + fn: ReturnType, + messages: import("ai").ModelMessage[], + ) => + fn({ + messages, + steps: [], + stepNumber: 0, + model: {} as never, + experimental_context: undefined, + }); + + const shortMessages: import("ai").ModelMessage[] = [ + { role: "user", content: [{ type: "text", text: "hi" }] }, + { + role: "assistant", + content: [{ type: "text", text: "hello" }], + }, + ]; + + // 6 assistant/tool pairs where each tool result carries 1200 chars of text + // (≈ 300 tokens each via char/4). Total ≈ 1800+ tokens > any reasonable + // triggerTokens threshold used in these tests. 
+ const longMessages = (): import("ai").ModelMessage[] => { + const msgs: import("ai").ModelMessage[] = [ + { role: "user", content: [{ type: "text", text: "start" }] }, + ]; + for (let i = 0; i < 6; i++) { + msgs.push({ + role: "assistant", + content: [ + { + type: "tool-call", + toolCallId: `tc${i}`, + toolName: "tool", + input: {}, + }, + ], + }); + msgs.push({ + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: `tc${i}`, + toolName: "tool", + // Must use typed output shape so tokenEstimator counts the value. + output: { type: "text" as const, value: "x".repeat(1200) }, + }, + ], + }); + } + return msgs; + }; + + it("returns undefined when messages are below triggerTokens (ADR-0012 §Sub-agents)", async () => { + const fn = buildTier2PrepareStep(makeCtx(10_000)); + const result = await callStep(fn, shortMessages); + expect(result).toBeUndefined(); + }); + + it("compacts when messages exceed triggerTokens", async () => { + const msgs = longMessages(); + const ctx = makeCtx(1); + const fn = buildTier2PrepareStep(ctx); + const result = await callStep(fn, msgs); + expect(result?.messages).toBeDefined(); + const out = result!.messages!; + expect(out.length).toBeLessThan(msgs.length); + // Stage 2 summarizes the dropped prefix. + expect(ctx.summarize).toHaveBeenCalled(); + // First surviving message is the synthetic summary (role "user"); the one + // after it starts the kept tail and must not be an orphaned tool result + // (its assistant tool-call would have been dropped into the prefix). + expect(out[1]?.role).not.toBe("tool"); + }); + + it("returns undefined when prefix is empty (no-op, ADR-0012 §Sub-agents)", async () => { + // Two messages, keepRecentMessages 4 → no prefix to summarize → + // compactModelMessages drops nothing → prepareStep returns undefined so the + // SDK proceeds unchanged, and the summarizer is never called. 
+ const ctx = makeCtx(1); + const fn = buildTier2PrepareStep(ctx); + const result = await callStep(fn, shortMessages); + expect(result).toBeUndefined(); + expect(ctx.summarize).not.toHaveBeenCalled(); + }); + + it("does not call summarize when estimate is below triggerTokens", async () => { + const ctx = makeCtx(10_000); + const fn = buildTier2PrepareStep(ctx); + await callStep(fn, shortMessages); + expect(ctx.summarize).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/backend/src/runs/agent-runner.ts b/apps/backend/src/runs/agent-runner.ts index 0959bf9f..51efe020 100644 --- a/apps/backend/src/runs/agent-runner.ts +++ b/apps/backend/src/runs/agent-runner.ts @@ -8,8 +8,21 @@ import { readUIMessageStream, stepCountIs, streamText, + wrapLanguageModel, + type LanguageModel, + type UIMessageChunk, } from "ai"; import { + contextOverflowRecoveryMiddleware, + isContextOverflowError, +} from "./recovery.ts"; +import { + buildTier2PrepareStep, + COMPACT_CONTEXT_TOOL_NAME, + type CompactionTrace, +} from "./compaction.ts"; +import { + loadChatMessages, prepareChatTurn, type ChatTurn, type ToolActivityEvent, @@ -32,6 +45,212 @@ import type { RunStatus, } from "./types.ts"; +/** + * Result of {@link withToolTimestamps}: the transformed stream plus a map of + * `toolCallId` → completion ISO timestamp, populated as tool-output chunks + * pass through. + */ +export type ToolTimestampStream = { + stream: ReadableStream; + /** toolCallId → completedAt ISO timestamp, filled in as the stream drains. */ + completions: Map; +}; + +/** + * Stamps tool-call timing onto the stream so the UI can show each tool's run + * duration: + * + * - `startedAt` is injected into `tool-input-available` chunks via + * `toolMetadata`. It must go here (not on the output chunk) because the AI + * SDK's tool-output handlers ignore `chunk.toolMetadata` and reuse the + * invocation's existing `toolMetadata` from the input-available phase. 
+ * - `completedAt` cannot ride the output chunk for the same reason, so it is + * recorded in the returned `completions` map keyed by `toolCallId`. The run + * loop applies it to the built message via {@link applyToolCompletions} + * before the sink persists it. + * + * Exported for unit testing. + */ +export function withToolTimestamps( + stream: ReadableStream, + now: () => string = () => new Date().toISOString(), +): ToolTimestampStream { + const completions = new Map(); + const out = stream.pipeThrough( + new TransformStream({ + transform(chunk, controller) { + if (chunk.type === "tool-input-available") { + controller.enqueue({ + ...chunk, + toolMetadata: { + ...chunk.toolMetadata, + startedAt: now(), + }, + }); + return; + } + if ( + chunk.type === "tool-output-available" || + chunk.type === "tool-output-error" + ) { + completions.set(chunk.toolCallId, now()); + } + controller.enqueue(chunk); + }, + }), + ); + return { stream: out, completions }; +} + +/** + * Injects synthetic `compact_context` tool-call + tool-result chunks into a + * UIMessage stream immediately after the `start` event (ADR-0012 §Compaction trace in the timeline). Makes Tier + * 1 compaction visible in the chat timeline without a custom renderer — the + * existing tool-call expander handles it automatically. + * + * Exported for unit testing. + */ +export function prependCompactionChunks( + stream: ReadableStream, + trace: CompactionTrace, + generateId: () => string = createIdGenerator({ prefix: "cc", size: 12 }), +): ReadableStream { + const toolCallId = generateId(); + const syntheticChunks: UIMessageChunk[] = [ + { + type: "tool-input-available", + toolCallId, + toolName: COMPACT_CONTEXT_TOOL_NAME, + title: "Context compaction", + input: { messagesDropped: trace.messagesDropped }, + }, + { + type: "tool-output-available", + toolCallId, + output: { + messagesDropped: trace.messagesDropped, + ...(trace.summaryExcerpt + ? 
{ summaryExcerpt: trace.summaryExcerpt } + : {}), + }, + }, + ]; + let injected = false; + return stream.pipeThrough( + new TransformStream({ + transform(chunk, controller) { + controller.enqueue(chunk); + if (!injected && chunk.type === "start") { + injected = true; + for (const c of syntheticChunks) controller.enqueue(c); + } + }, + }), + ); +} + +const COMPACT_CONTEXT_PART_TYPE = `tool-${COMPACT_CONTEXT_TOOL_NAME}`; + +/** + * Removes the synthetic `compact_context` trace parts (ADR-0012 §Compaction trace in the timeline) from a message + * list before it is converted to ModelMessages. The trace is a UI-only marker + * persisted in the assistant message for the chat timeline; it must NEVER be + * replayed to the provider, which would otherwise see a phantom tool call for a + * tool it was never given (provider rejection / model confusion). An assistant + * message left with no parts after stripping (the ADR-0012 §Force-compact on demand standalone trace message) + * is dropped entirely rather than sent empty. + * + * Exported for unit testing. + */ +export function stripCompactionTraceParts( + messages: PlatypusUIMessage[], +): PlatypusUIMessage[] { + let changed = false; + const out: PlatypusUIMessage[] = []; + for (const message of messages) { + if ( + message.role !== "assistant" || + !message.parts.some((p) => p.type === COMPACT_CONTEXT_PART_TYPE) + ) { + out.push(message); + continue; + } + changed = true; + const parts = message.parts.filter( + (p) => p.type !== COMPACT_CONTEXT_PART_TYPE, + ); + if (parts.length > 0) out.push({ ...message, parts }); + // else: trace-only message (ADR-0012 §Force-compact on demand) — drop it from the model payload. + } + return changed ? out : messages; +} + +/** Stats stamped on the last assistant message's metadata after each stream (ADR-0012 §Context-usage ring / §Per-message stats). */ +export type MessageStats = { + /** Run-wide totals across every step (sum) — ADR-0012 §Per-message stats cost popover. 
*/ + inputTokens: number; + outputTokens: number; + /** + * Input tokens of the LAST model call = peak context fullness — ADR-0012 §Context-usage ring. + * NOT the run-wide sum (which over-counts on multi-step tool loops). + */ + contextTokens: number; + startedAt: string; + firstTokenAt?: string; + finishedAt: string; + contextWindow: number; + contextWindowIsDefault: boolean; +}; + +/** + * Stamps per-run stats (token counts, timing, resolved context window) onto + * the last assistant message's `metadata.stats` in place. Applied at the same + * point as {@link applyToolCompletions} so both mutations happen before the + * sink persists the final state (ADR-0012 §Context-usage ring / §Per-message stats). + */ +function applyMessageStats( + messages: PlatypusUIMessage[], + stats: MessageStats, +): void { + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i].role === "assistant") { + const msg = messages[i] as PlatypusUIMessage & { + metadata?: Record; + }; + msg.metadata = { ...msg.metadata, stats }; + return; + } + } +} + +/** + * Stamps `completedAt` onto assistant tool parts in place, reading from the + * `completions` map produced by {@link withToolTimestamps}. Applied to the + * built message just before it is persisted, since the AI SDK strips + * `toolMetadata` from tool-output chunks and the end time can't be injected + * inline. Paired with the injected `startedAt`, this lets the UI compute each + * tool's run duration. + */ +function applyToolCompletions( + messages: PlatypusUIMessage[], + completions: Map, +): void { + if (completions.size === 0) return; + for (const message of messages) { + for (const part of message.parts ?? []) { + const anyPart = part as { + toolCallId?: string; + toolMetadata?: Record; + }; + const completedAt = anyPart.toolCallId + ? 
completions.get(anyPart.toolCallId) + : undefined; + if (!completedAt) continue; + anyPart.toolMetadata = { ...anyPart.toolMetadata, completedAt }; + } + } +} + export type StreamOptions = { origin: string; frontendUrl?: string; @@ -153,6 +372,12 @@ type RunState = { stats: RunStats; messages: PlatypusUIMessage[]; terminated: boolean; + /** + * Input tokens reported by the most recent model step = peak context + * fullness for the ADR-0012 §Context-usage ring. Tracked separately from `stats.inputTokens`, + * which is the run-wide SUM and over-counts multi-step tool loops. + */ + lastStepInputTokens: number; }; /** @@ -177,6 +402,8 @@ export class AgentRunner { origin: string | undefined, frontendUrl?: string, onActivity?: (event?: ToolActivityEvent) => void, + priorMessages?: PlatypusUIMessage[], + signal?: AbortSignal, ): Promise { return prepareChatTurn({ orgId: scope.orgId, @@ -188,6 +415,8 @@ export class AgentRunner { frontendUrl, runMode: scope.principal.kind === "user" ? "interactive" : "headless", onActivity, + priorMessages, + signal, }); } @@ -218,12 +447,29 @@ export class AgentRunner { timeouts?: Pick; }) { const { scope, input, sink } = params; + + // ADR-0012 §Summary invalidation: snapshot the DB state BEFORE onStart overwrites it so + // applyTier1IfNeeded has the correct ADR-0012 §Summary invalidation baseline. Only interactive chats + // carry a `request.id`; headless runs (triggers, sub-agents) have none. + const priorMessages = input.request.id + ? await loadChatMessages(input.request.id).catch((err) => { + // Falls back to the post-overwrite DB read inside applyTier1IfNeeded, + // which cannot detect edits below the watermark — log the degradation. 
+ logger.warn( + { err, chatId: input.request.id }, + "ADR-0012 §Summary invalidation: failed to snapshot prior messages; ADR-0012 §Summary invalidation edit-detection degraded this turn", + ); + return undefined; + }) + : undefined; + await sink.onStart({ runId: input.runId, messages: input.messages }); const state: RunState = { stats: {}, messages: input.messages, terminated: false, + lastStepInputTokens: 0, }; const finalize = async ( @@ -276,6 +522,8 @@ export class AgentRunner { params.origin, params.frontendUrl, onActivity, + priorMessages, + handle.signal, ); } catch (error) { const err = error instanceof Error ? error : new Error(String(error)); @@ -296,6 +544,8 @@ export class AgentRunner { }): void => { handle.bumpStep(); accumulateStepStats(state.stats, step); + state.lastStepInputTokens = + step.usage?.inputTokens ?? state.lastStepInputTokens; logger.info( { runId: input.runId, @@ -322,12 +572,26 @@ export class AgentRunner { // an `undefined` value identically, and the streaming path has always // passed them this way in production. const modelArgs = { - model: state.turn.stream.model, - messages: await convertToModelMessages(state.turn.stream.messages), + // Recovery middleware (ADR-0012 §Recovery): every model call — first call and every + // tool-loop step, stream and generate alike — gets one trim-and-retry on + // a provider "context too long" rejection. Always on; not gated by ADR-0012 §Config & kill switch. + model: withOverflowRecovery(state.turn), + // Strip the UI-only synthetic compact_context trace parts (ADR-0012 §Compaction trace in the timeline) before + // sending history to the provider — replaying them surfaces a phantom tool + // call for a tool the model was never given. Applied here so both the + // streaming and generate paths (which share modelArgs) are covered. 
+ messages: await convertToModelMessages( + stripCompactionTraceParts(state.turn.stream.messages), + ), system: state.turn.stream.system, tools: state.turn.stream.tools, stopWhen: [stepCountIs(state.turn.stream.maxSteps)], abortSignal: handle.signal, + // Tier 2 (ADR-0012 §Tier 2): in-turn compaction before each step when the live window + // nears the limit. Undefined when the turn has no Tier 2 runtime. + prepareStep: state.turn.tier2 + ? buildTier2PrepareStep(state.turn.tier2) + : undefined, temperature: state.turn.stream.temperature, topP: state.turn.stream.topP, topK: state.turn.stream.topK, @@ -357,66 +621,154 @@ export class AgentRunner { logger.debug({ systemPrompt: modelArgs.system }, "System prompt for chat"); + const startedAt = new Date().toISOString(); + let firstTokenAt: string | undefined; + // Set when the ADR-0012 §Context-usage ring / §Per-message stats are first emitted (messageMetadata `finish`), so + // the post-stream persist stamp reuses the same value rather than a slightly + // later one — streamed and reloaded stats then match. + let finishedAt: string | undefined; + + // Single source of truth for the per-message stats, so the live-streamed + // copy (messageMetadata, below) and the persisted copy (applyMessageStats in + // the finally) are identical. Reads the mutable state at call time. + const buildMessageStats = ( + finishedAtValue: string, + ): MessageStats | undefined => { + if (!state.turn) return undefined; + return { + inputTokens: state.stats.inputTokens ?? 0, + outputTokens: state.stats.outputTokens ?? 
0, + contextTokens: state.lastStepInputTokens, + startedAt, + firstTokenAt, + finishedAt: finishedAtValue, + contextWindow: state.turn.resolved.contextWindow, + contextWindowIsDefault: state.turn.resolved.contextWindowIsDefault, + }; + }; + const result = streamText({ ...modelArgs, onStepFinish: (step) => onStep(step), + // TTFT: stamp the first text token here (fires before the `finish` event), + // so the stats are complete by the time messageMetadata emits them. + onChunk: ({ chunk }) => { + if (!firstTokenAt && chunk.type === "text-delta") { + firstTokenAt = new Date().toISOString(); + } + }, }); // Build the UI message stream and tee it. The response body consumes // one branch; we drain the other server-side so a disconnected // client (cancelling the response branch) doesn't propagate back to // the source. The source keeps pulling as long as the snapshot - // branch is being read, so `onFinish` only fires on natural - // completion — not when the consumer cancels with partial state. + // branch is being read. const uiStream = result.toUIMessageStream({ originalMessages: input.messages, generateMessageId: createIdGenerator({ prefix: "msg", size: 16 }), - messageMetadata: () => - state.turn?.resolved.agentId + // Emit the ADR-0012 §Context-usage ring / §Per-message stats with the `finish` event so the client gets them on + // the final stream chunk — the (i) stats action then appears the instant + // the answer completes, not a DB-refetch round-trip later. `start` carries + // only agentId (timing/usage don't exist yet). The post-stream stamp in + // the finally still writes them to the persisted message for reload. + messageMetadata: ({ part }) => { + const agentId = state.turn?.resolved.agentId ? 
{ agentId: state.turn.resolved.agentId } - : undefined, - onError: (error) => formatStreamError(error), - onFinish: async ({ messages: finalMessages }) => { - state.messages = finalMessages; - let status: RunStatus = "succeeded"; - let err: Error | undefined; - if (handle.signal.aborted) { - const reason: unknown = handle.signal.reason; - if (reason instanceof TimeoutError) { - status = "failed"; - err = reason; - } else { - status = "cancelled"; - } + : undefined; + if (part.type === "finish") { + finishedAt = new Date().toISOString(); + const stats = buildMessageStats(finishedAt); + return stats ? { ...agentId, stats } : agentId; } - await finalize(status, err); + return agentId; }, + onError: (error) => formatStreamError(error), }); - const [forResponse, forSnapshot] = uiStream.tee(); + // ADR-0012 §Compaction trace in the timeline: if Tier 1 compaction fired this turn, prepend synthetic + // compact_context tool-call + tool-result chunks so the compaction is + // visible in the chat timeline. Injected after the 'start' event so the + // AI SDK builds them into the same assistant message as the response. + const tracedStream: ReadableStream = state.turn + ?.compactionTrace + ? prependCompactionChunks( + uiStream as ReadableStream, + state.turn.compactionTrace, + ) + : (uiStream as ReadableStream); + + const { stream: timedStream, completions } = + withToolTimestamps(tracedStream); + const [forResponse, forSnapshot] = timedStream.tee(); // Read the snapshot branch as message snapshots and keep `state.messages` // up to date. ChatSink's FlushScheduler then writes the in-progress // assistant message to the DB on each onProgress bump, so a user who // reconnects mid-run sees the partial answer (not just their own // input message). 
+ // + // finalize is called here (not in toUIMessageStream's onFinish) so that + // state.messages reflects the fully-drained stream — including the tool + // `completedAt` timestamps and ADR-0012 §Context-usage ring / §Per-message stats applied below — before the sink + // persists it. + // An error chunk (model/tool failure surfaced via formatStreamError) or + // an internal stream fault ends the for-await without throwing, because + // readUIMessageStream defaults terminateOnError=false. Capture it so the + // finally finalizes "failed" instead of silently persisting a partial + // message as "succeeded". + let streamError: unknown; void (async () => { try { for await (const message of readUIMessageStream({ stream: forSnapshot, - onError: (err) => + onError: (err) => { + streamError = err; logger.error( { err, runId: input.runId }, "Snapshot stream parse error", - ), + ); + }, })) { state.messages = [...input.messages, message]; } } catch (err) { + streamError = err; logger.error( { err, runId: input.runId }, "Server-side UI stream consumer error", ); + } finally { + // Reuse the finish-event timestamp when present so the persisted stats + // match what was streamed; fall back if the stream ended without one. + const finishedAtFinal = finishedAt ?? new Date().toISOString(); + applyToolCompletions(state.messages, completions); + const stats = buildMessageStats(finishedAtFinal); + if (stats) applyMessageStats(state.messages, stats); + let status: RunStatus = "succeeded"; + let err: Error | undefined; + if (handle.signal.aborted) { + const reason: unknown = handle.signal.reason; + if (reason instanceof TimeoutError) { + status = "failed"; + err = reason; + } else { + status = "cancelled"; + } + } else if (streamError !== undefined) { + // The stream errored (model/tool rejection or internal fault) but did + // not abort — record the run as failed rather than succeeded. + status = "failed"; + err = + streamError instanceof Error + ? 
streamError + : new Error( + typeof streamError === "string" + ? streamError + : "Server-side UI stream error", + ); + } + await finalize(status, err); } })(); @@ -488,6 +840,22 @@ export class AgentRunner { } } +/** + * Wraps the turn's model with the context-overflow recovery middleware (ADR-0012 §Recovery): every model call — first call and every tool-loop step, stream and + * generate alike — gets one trim-and-retry on a provider "context too long" + * rejection. Always on; the ADR-0012 §Config & kill switch does not gate it. + */ +const withOverflowRecovery = (turn: ChatTurn): LanguageModel => + wrapLanguageModel({ + // turn.stream.model is typed `LanguageModel` (string | model spec); at this + // point it is always a resolved model object, never a string id — narrow to + // the spec form wrapLanguageModel requires. + model: turn.stream.model as Parameters< + typeof wrapLanguageModel + >[0]["model"], + middleware: contextOverflowRecoveryMiddleware(turn.recovery), + }); + /** * Converts AI SDK errors into user-facing strings for the UI message stream. * Behaviour-preserving copy of the previous inline `onError` handler. @@ -497,6 +865,11 @@ const formatStreamError = (error: unknown): string => { if (LoadAPIKeyError.isInstance(error)) { return "AI provider API key is missing or not configured."; } + // Reaching here means recovery (ADR-0012 §Recovery) already trimmed and retried once and the + // provider still rejected the prompt — surface the actionable dead end. + if (isContextOverflowError(error)) { + return "Conversation too large for the model's context window even after trimming — start a new chat or reduce attachments."; + } if (APICallError.isInstance(error)) { if (error.statusCode === 401 || error.statusCode === 403) { return "AI provider authentication failed. 
Your API key may be invalid or expired."; diff --git a/apps/backend/src/runs/compaction.test.ts b/apps/backend/src/runs/compaction.test.ts new file mode 100644 index 00000000..49ff6e5d --- /dev/null +++ b/apps/backend/src/runs/compaction.test.ts @@ -0,0 +1,1335 @@ +import { describe, it, expect, vi } from "vitest"; + +vi.mock("../index.ts", () => ({ db: {} })); // drizzle store unused in these tests +vi.mock("../logger.ts", () => ({ + logger: { warn: vi.fn(), info: vi.fn(), error: vi.fn(), debug: vi.fn() }, +})); + +import { + commitWatermark, + compactUIMessages, + compactModelMessages, + editToolResults, + elidedToolPlaceholder, + pickKeepBoundary, + softTrim, + type CompactionStore, + type CompactionState, + type WatermarkPatch, +} from "./compaction.ts"; +import { logger } from "../logger.ts"; +import type { ModelMessage } from "ai"; +import type { PlatypusUIMessage } from "../types.ts"; + +/** + * In-memory store. Since JS is single-threaded, the version check in `casWrite` + * is atomic per call — exactly the guarantee Postgres gives via the `version` + * predicate. `readState` returns a snapshot copy, so a version bump that happens + * after a read (a racing winner) makes that reader's snapshot stale → CAS fails. + */ +class FakeStore implements CompactionStore { + state: CompactionState; + casCalls = 0; + + constructor(init: Partial = {}) { + this.state = { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + ...init, + }; + } + + readState() { + return Promise.resolve({ ...this.state }); + } + + casWrite( + _chatId: string, + expectVersion: number, + patch: WatermarkPatch, + ): Promise { + this.casCalls++; + if (this.state.version !== expectVersion) return Promise.resolve(false); + if ("watermark" in patch) + this.state.summaryWatermark = patch.watermark ?? null; + if ("summary" in patch) this.state.contextSummary = patch.summary ?? null; + if ("dirty" in patch) this.state.compactionDirty = patch.dirty ?? 
false; + this.state.version = expectVersion + 1; + return Promise.resolve(true); + } +} + +describe("casWrite — version-gated CAS (ADR-0012 §One durable writer)", () => { + it("applies and bumps version when the expected version matches", async () => { + const store = new FakeStore({ version: 3 }); + const won = await store.casWrite("c", 3, { summary: "s", watermark: "m1" }); + expect(won).toBe(true); + expect(store.state.version).toBe(4); + expect(store.state.contextSummary).toBe("s"); + expect(store.state.summaryWatermark).toBe("m1"); + }); + + it("two writers on the same version: one wins, the other loses", async () => { + const store = new FakeStore({ version: 0 }); + const first = await store.casWrite("c", 0, { summary: "A" }); + const second = await store.casWrite("c", 0, { summary: "B" }); + expect(first).toBe(true); + expect(second).toBe(false); // version is now 1, expected 0 + expect(store.state.contextSummary).toBe("A"); + }); + + it("an explicit null clears a field; an absent key leaves it untouched", async () => { + const store = new FakeStore({ + version: 1, + contextSummary: "old", + summaryWatermark: "m5", + }); + await store.casWrite("c", 1, { summary: null }); // reset summary only + expect(store.state.contextSummary).toBeNull(); + expect(store.state.summaryWatermark).toBe("m5"); // untouched + }); +}); + +describe("commitWatermark — loser logic (ADR-0012 §One durable writer)", () => { + it("applies a write on an uncontended commit", async () => { + const store = new FakeStore({ version: 2 }); + const res = await commitWatermark(store, "c", () => ({ + kind: "write", + patch: { summary: "sum", watermark: "m9" }, + })); + expect(res).toEqual({ status: "applied", version: 3 }); + expect(store.state.summaryWatermark).toBe("m9"); + }); + + it("skips immediately when the decision is a no-op", async () => { + const store = new FakeStore({ version: 0 }); + const res = await commitWatermark(store, "c", () => ({ + kind: "skip", + reason: "no-op", + })); + 
expect(res).toEqual({ status: "skipped", reason: "no-op" }); + expect(store.casCalls).toBe(0); + }); + + it("re-reads after a CAS conflict and succeeds on the retry", async () => { + const store = new FakeStore({ version: 0 }); + let firstDecision = true; + const res = await commitWatermark(store, "c", (state) => { + if (firstDecision) { + firstDecision = false; + // Simulate a racing winner committing between our read and write. + store.state.version = 1; + store.state.summaryWatermark = "winner"; + } + // Decide by the (re-read) version, not the watermark value. + return { kind: "write", patch: { summary: `at-v${state.version}` } }; + }); + expect(res.status).toBe("applied"); + // First attempt CAS expected v0 but row is v1 → lost; retry expects v1 → wins. + expect(store.state.version).toBe(2); + expect(store.state.contextSummary).toBe("at-v1"); + }); + + it("decides 'covered' on the retry and skips (winner already did the work)", async () => { + const store = new FakeStore({ version: 0, summaryWatermark: "m1" }); + let first = true; + const res = await commitWatermark(store, "c", (state) => { + if (first) { + first = false; + store.state.version = 1; + store.state.summaryWatermark = "m20"; // winner advanced past our prefix + return { kind: "write", patch: { summary: "mine", watermark: "m10" } }; + } + // On re-read we see the winner covered us → skip (decide by version). + expect(state.version).toBe(1); + return { kind: "skip", reason: "covered" }; + }); + expect(res).toEqual({ status: "skipped", reason: "covered" }); + expect(store.state.summaryWatermark).toBe("m20"); // winner's value preserved + }); + + it("gives up as 'contended' after two conflicts — no livelock", async () => { + const store = new FakeStore({ version: 0 }); + let decideCalls = 0; + const res = await commitWatermark(store, "c", (state) => { + decideCalls++; + // Every decision races a winner → both CAS attempts fail. 
+ store.state.version = state.version + 1; + return { kind: "write", patch: { summary: "x" } }; + }); + expect(res).toEqual({ status: "skipped", reason: "contended" }); + expect(decideCalls).toBe(2); // exactly MAX_ATTEMPTS, then stop + }); +}); + +// --- Slice 2b: compaction primitives ------------------------------------ + +function uiText( + id: string, + role: "user" | "assistant", + text: string, +): PlatypusUIMessage { + return { id, role, parts: [{ type: "text", text }] }; +} + +function uiTool(id: string, output: unknown): PlatypusUIMessage { + return { + id, + role: "assistant", + parts: [ + { + type: "tool-doThing", + toolCallId: `${id}-call`, + state: "output-available", + input: {}, + output, + }, + ], + } as unknown as PlatypusUIMessage; +} + +const noopSummarize = () => Promise.resolve("SUMMARY"); + +describe("softTrim", () => { + it("keeps short text untouched", () => { + expect(softTrim("short", 500)).toBe("short"); + }); + it("trims long text to head+tail with a marker", () => { + const out = softTrim("a".repeat(2000), 100); + expect(out.startsWith("a".repeat(100))).toBe(true); + expect(out).toContain("elided 1800 chars"); + expect(out.length).toBeLessThan(2000); + }); +}); + +describe("pickKeepBoundary", () => { + it("UIMessage: any split is safe", () => { + expect(pickKeepBoundary(5, 2, () => true)).toBe(3); + }); + it("ModelMessage: walks back so recent does not start on an orphan tool result", () => { + const roles = ["user", "assistant", "tool", "user"]; + const safe = (i: number) => i >= roles.length || roles[i] !== "tool"; + // start at 4-2=2 (role "tool", unsafe) → walk back to 1 (assistant, safe) + expect(pickKeepBoundary(4, 2, safe)).toBe(1); + }); +}); + +describe("compactUIMessages (Tier 1)", () => { + const baseOpts = { + keepRecentMessages: 2, + minPrunableChars: 2000, + summarize: noopSummarize, + }; + + it("is a no-op when already within target (hysteresis precondition)", async () => { + const msgs = [uiText("a", "user", "hi"), 
uiText("b", "assistant", "yo")]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + targetTokens: 1000, + }); + expect(res.usedModelCall).toBe(false); + expect(res.messagesDropped).toBe(0); + expect(res.keptMessages).toBe(msgs); + }); + + it("Stage 1 prune reaches target WITHOUT a model call", async () => { + const summarize = vi.fn(noopSummarize); + const msgs = [ + uiTool("big", "X".repeat(4000)), // ~1000 tokens, prunes to ~250 + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize, + targetTokens: 300, + }); + expect(res.usedModelCall).toBe(false); + expect(summarize).not.toHaveBeenCalled(); + expect(res.watermarkId).toBeNull(); + expect(res.keptMessages).toHaveLength(3); // pruned prefix stays visible + expect(res.estimatedTokens).toBeLessThanOrEqual(300); + }); + + it("Stage 2 summarizes when pruning is insufficient (text-heavy prefix)", async () => { + const summarize = vi.fn(noopSummarize); + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize, + targetTokens: 300, + }); + expect(res.usedModelCall).toBe(true); + expect(summarize).toHaveBeenCalledOnce(); + expect(res.summaryText).toBe("SUMMARY"); + expect(res.watermarkId).toBe("p2"); // last folded message + expect(res.keptMessages).toHaveLength(2); // only recent kept + expect(res.estimatedTokens).toBeLessThanOrEqual(300); + }); + + it("does NOT re-fire next turn: feeding the result back is a no-op (ADR-0012 §Tier 1 (hysteresis))", async () => { + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + const target = 300; + const first = await compactUIMessages(msgs, { + 
...baseOpts, + targetTokens: target, + }); + expect(first.usedModelCall).toBe(true); + + const second = await compactUIMessages(first.keptMessages, { + ...baseOpts, + targetTokens: target, + priorSummary: first.summaryText, + }); + expect(second.usedModelCall).toBe(false); // already within target + expect(second.messagesDropped).toBe(0); + }); + + it("map-reduces an oversized prefix (ADR-0012 §Tier 1 (summarizer model & map-reduce))", async () => { + const summarize = vi.fn(noopSummarize); + const msgs = [ + uiText("p1", "user", "Z".repeat(4000)), // ~1000 tokens of transcript + uiText("r1", "user", "hello"), + uiText("r2", "assistant", "world"), + ]; + await compactUIMessages(msgs, { + ...baseOpts, + summarize, + targetTokens: 50, + summarizerWindow: 100, // 400-char chunks → several chunk calls + 1 reduce + }); + expect(summarize.mock.calls.length).toBeGreaterThan(1); + }); + + it("Stage 2 prunes large tool results in kept (recent) messages", async () => { + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(12000)), // big tool result in recent + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, // 12000-char output exceeds threshold + }); + expect(res.usedModelCall).toBe(true); + expect(res.keptMessages).toHaveLength(2); // r1 + r2 + // Tool result in r1 should be trimmed (soft-trim produces head+tail, not full string) + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(typeof toolPart?.output).toBe("string"); + expect((toolPart?.output as string).length).toBeLessThan(12000); + }); + + it("Stage 2 does not prune recent tool results below minRecentPrunableChars", async () => { + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", 
"assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(3000)), // below threshold of 20000 + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 20000, // threshold above 3000 → no pruning + }); + expect(res.usedModelCall).toBe(true); + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + // Output unchanged — 3000 chars below threshold + expect(toolPart?.output).toBe("X".repeat(3000)); + }); + + it("prunes large recent tool results when the prefix is empty (no summary)", async () => { + // Whole history fits within keepRecentMessages (2) but a huge tool result + // pushes it over target. boundary=0 → empty prefix → no model call, but the + // outlier in recent must still be trimmed (Finding 1 gap). + const msgs = [ + uiTool("r1", "X".repeat(12000)), // big tool result, no prefix to summarize + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + }); + expect(res.usedModelCall).toBe(false); // empty prefix → no summarize + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(typeof toolPart?.output).toBe("string"); + expect((toolPart?.output as string).length).toBeLessThan(12000); + }); + + it("ADR-0012 §Hard window wall: keeps recent VERBATIM in the empty-prefix path when within inputBudget", async () => { + // Whole history fits within keepRecentMessages (2) → empty prefix, no model + // call. Over the soft target but under the wall → outlier must stay untouched. 
+ const msgs = [ + uiTool("r1", "X".repeat(12000)), + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + inputBudget: 100000, // wall far above → no recent trim + }); + expect(res.usedModelCall).toBe(false); + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(toolPart?.output).toBe("X".repeat(12000)); // untouched + }); + + it("warns (no wall) when Stage 2 result still exceeds 2× targetTokens after pruning", async () => { + const warn = vi.spyOn(logger, "warn").mockReturnValue(undefined); + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + // recent messages are huge text (not tool), cannot be pruned + uiText("r1", "user", "R".repeat(8000)), + uiText("r2", "assistant", "S".repeat(8000)), + ]; + await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 50, // recent alone is ~4000 tokens → well over 2×50 + // no inputBudget → warn falls back to the target*2 heuristic + }); + expect(warn).toHaveBeenCalledWith( + expect.objectContaining({ targetTokens: 50 }), + expect.stringContaining("recent messages exceed the window"), + ); + warn.mockRestore(); + }); + + it("ADR-0012 §Hard window wall: does NOT warn on a soft-target miss when recent is under the wall", async () => { + const warn = vi.spyOn(logger, "warn").mockReturnValue(undefined); + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiText("r1", "user", "R".repeat(8000)), + uiText("r2", "assistant", "S".repeat(8000)), + ]; + await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 50, // way over target... 
+ inputBudget: 100000, // ...but well under the hard wall → no warn + }); + expect(warn).not.toHaveBeenCalled(); + warn.mockRestore(); + }); + + it("ADR-0012 §Hard window wall: keeps recent tool results VERBATIM when within inputBudget", async () => { + // Over the soft target (300) so Stage 2 fires, but the kept view (summary + + // recent) stays under the hard wall → recent must NOT be trimmed. + const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(12000)), // ~3000 tokens in recent + uiText("r2", "user", "done"), + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + inputBudget: 100000, // wall far above the kept view → no recent trim + }); + expect(res.usedModelCall).toBe(true); + const toolPart = res.keptMessages[0].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined; + expect(toolPart?.output).toBe("X".repeat(12000)); // untouched + }); + + it("ADR-0012 §Hard window wall: trims recent (except newest) when the kept view breaches inputBudget", async () => { + // Two big tool results in recent; the kept view breaches the wall → trim the + // older one, exempt the single newest message even though it is bulky. 
+ const msgs = [ + uiText("p1", "user", "P".repeat(4000)), + uiText("p2", "assistant", "Q".repeat(4000)), + uiTool("r1", "X".repeat(12000)), // older recent → trimmed + uiTool("r2", "Y".repeat(12000)), // newest → exempt + ]; + const res = await compactUIMessages(msgs, { + ...baseOpts, + summarize: noopSummarize, + targetTokens: 300, + minRecentPrunableChars: 5000, + inputBudget: 100, // wall well below the kept view → trim + }); + expect(res.usedModelCall).toBe(true); + const out = (i: number) => + ( + res.keptMessages[i].parts?.find((p) => + (p as { type: string }).type.startsWith("tool-"), + ) as { output?: string } | undefined + )?.output; + expect((out(0) as string).length).toBeLessThan(12000); // r1 trimmed + expect(out(1)).toBe("Y".repeat(12000)); // r2 (newest) exempt + }); +}); + +describe("compactModelMessages (Tier 2 / recovery)", () => { + const baseOpts = { + keepRecentMessages: 2, + minPrunableChars: 2000, + summarize: noopSummarize, + }; + + it("is a no-op when within target", async () => { + const msgs: ModelMessage[] = [ + { role: "user", content: "hi" }, + { role: "assistant", content: "yo" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + targetTokens: 1000, + }); + expect(res.usedModelCall).toBe(false); + expect(res.messages).toBe(msgs); + }); + + it("summarizes and prepends one synthetic message, preserving tool pairing", async () => { + const msgs: ModelMessage[] = [ + { role: "user", content: "P".repeat(4000) }, + { + role: "assistant", + content: [ + { type: "tool-call", toolCallId: "t1", toolName: "f", input: {} }, + ], + }, + { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "t1", + toolName: "f", + output: { type: "json", value: { ok: true } }, + }, + ], + }, + { role: "user", content: "recent" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + targetTokens: 50, + }); + expect(res.usedModelCall).toBe(true); + // First message is the synthetic summary (user-framed). 
+ expect(res.messages[0].role).toBe("user"); + expect(JSON.stringify(res.messages[0].content)).toContain( + "Summary of earlier conversation", + ); + // The assistant tool-call and its tool result stay adjacent (not split). + const roles = res.messages.map((m) => m.role); + const toolIdx = roles.indexOf("tool"); + expect(roles[toolIdx - 1]).toBe("assistant"); + }); + + it("force bypasses BOTH no-op gates so recovery never retries byte-identically (ADR-0012 §Recovery)", async () => { + // Estimator says we are within target AND nothing is prunable (small, + // non-bulky messages). Without force both the whole-message gate and the + // post-prune gate would no-op → recovery would retry the exact same prompt + // and fail again. force must push through to a real summarize. + const msgs: ModelMessage[] = [ + { role: "user", content: "a" }, + { role: "assistant", content: "b" }, + { role: "user", content: "recent-1" }, + { role: "assistant", content: "recent-2" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + targetTokens: 100000, // estimator is well under target + force: true, + }); + expect(res.usedModelCall).toBe(true); + expect(res.messagesDropped).toBeGreaterThan(0); + expect(res.messages).not.toBe(msgs); + }); + + it("force with an empty prefix is a no-op, not a prompt-growing summary (ADR-0012 §Tier 1, model-side)", async () => { + // recent alone exceeds keepRecentMessages → prefix is empty. Summarizing + // nothing would ADD a synthetic message and grow the prompt, never + // converging. Surface the overflow instead. 
+ const msgs: ModelMessage[] = [ + { role: "user", content: "only-1" }, + { role: "assistant", content: "only-2" }, + ]; + const res = await compactModelMessages(msgs, { + ...baseOpts, + keepRecentMessages: 2, + targetTokens: 1, + force: true, + }); + expect(res.usedModelCall).toBe(false); + expect(res.messages.length).toBe(msgs.length); + }); +}); + +// --- Slice 2c: Tier 1 orchestration ------------------------------------- + +import { + applyTier1Compaction, + buildCompactionTraceMessage, + computeBudget, + invalidateCompaction, + affectedBelowWatermark, + summaryUIMessage, + DEFAULT_COMPACTION_CONFIG, + type Budget, + type CompactionConfig, +} from "./compaction.ts"; + +describe("buildCompactionTraceMessage (ADR-0012 §Force-compact on demand)", () => { + it("builds an assistant message with a completed compact_context tool part", () => { + const msg = buildCompactionTraceMessage( + { messagesDropped: 7, summaryExcerpt: "did things" }, + "msg-abc", + ); + expect(msg.id).toBe("msg-abc"); + expect(msg.role).toBe("assistant"); + expect(msg.parts).toHaveLength(1); + const part = msg.parts[0] as { + type: string; + state: string; + toolCallId: string; + output: unknown; + }; + expect(part.type).toBe("tool-compact_context"); + expect(part.state).toBe("output-available"); + expect(part.toolCallId).toBe("msg-abc-call"); + expect(part.output).toEqual({ + messagesDropped: 7, + summaryExcerpt: "did things", + }); + }); + + it("omits summaryExcerpt from the output when absent", () => { + const msg = buildCompactionTraceMessage({ messagesDropped: 1 }, "msg-x"); + const part = msg.parts[0] as { output: unknown }; + expect(part.output).toEqual({ messagesDropped: 1 }); + }); +}); + +function storeFromState(state: Partial<CompactionState>): FakeStore { + return new FakeStore(state); +} + +const cfg = (over: Partial<CompactionConfig> = {}): CompactionConfig => ({ + ...DEFAULT_COMPACTION_CONFIG, + keepRecentMessages: 2, + ...over, +}); + +describe("computeBudget (ADR-0012 §Tier 1 (budget math) — subtract both 
reserves)", () => { + it("subtracts output + safety reserve before applying ratios", () => { + const b = computeBudget( + 10000, + 2000, + cfg({ reserveRatio: 0.05, triggerRatio: 0.8, targetRatio: 0.5 }), + ); + expect(b.inputBudget).toBe(7500); // 10000 - 2000 - 500 + expect(b.triggerTokens).toBe(6000); + expect(b.targetTokens).toBe(3750); + }); + it("uses a conservative output reserve when maxOutputTokens is unknown", () => { + const b = computeBudget(10000, undefined, cfg({ reserveRatio: 0.05 })); + expect(b.inputBudget).toBe(7000); // 10000 - min(4096, 2500) - 500 + }); + + it("caps the output reserve at half the window so inputBudget can't collapse (ADR-0012 §Tier 1 (budget math))", () => { + // A bogus registry entry where max_output >= the input-scoped window would + // otherwise drive inputBudget toward 1 and thrash. The cap keeps it sane. + const b = computeBudget(10000, 20000, cfg({ reserveRatio: 0.05 })); + // reserve capped at 5000 (half), safety 500 → 10000 - 5000 - 500 = 4500. 
+ expect(b.inputBudget).toBe(4500); + }); +}); + +const bigText = (id: string, role: "user" | "assistant") => + uiText(id, role, "X".repeat(4000)); + +describe("applyTier1Compaction", () => { + const baseBudget: Budget = { + inputBudget: 100, + triggerTokens: 50, + targetTokens: 50, + }; + + it("under trigger: reconstructs the persisted view, no write", async () => { + const store = storeFromState({ + version: 2, + summaryWatermark: "m2", + contextSummary: "PRIOR", + }); + const messages = ["m1", "m2", "m3", "m4"].map((id) => + uiText(id, "user", "hi"), + ); + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 2, + summaryWatermark: "m2", + contextSummary: "PRIOR", + compactionDirty: false, + }, + budget: { + inputBudget: 100000, + triggerTokens: 100000, + targetTokens: 50000, + }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(out.messages[0]).toEqual(summaryUIMessage("PRIOR")); // re-injected summary + expect(out.messages.map((m) => m.id)).toEqual([ + "context-summary", + "m3", + "m4", + ]); // dropped ≤ watermark + expect(store.casCalls).toBe(0); // nothing persisted + }); + + it("over trigger: compacts, persists summary+watermark, clears dirty, fires event", async () => { + const store = storeFromState({ version: 0 }); + const onEvent = vi.fn(); + const messages = [ + bigText("p1", "user"), + bigText("p2", "assistant"), + uiText("r1", "user", "a"), + uiText("r2", "assistant", "b"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: baseBudget, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + onEvent, + }); + expect(out.compacted).toBe(true); + expect(store.state.contextSummary).toBe("SUMMARY"); + expect(store.state.summaryWatermark).toBe("p2"); + 
expect(store.state.compactionDirty).toBe(false); + expect(store.state.version).toBe(1); + expect(out.messages[0].id).toBe("context-summary"); + expect(onEvent).toHaveBeenCalledOnce(); + // ADR-0012 §Compaction trace in the timeline: a summary ran → a trace is surfaced with the dropped count and a + // summary excerpt. + expect(out.compactionTrace).toEqual({ + messagesDropped: 2, + summaryExcerpt: "SUMMARY", + }); + }); + + it("disabled + not dirty: no compaction even when over the trigger", async () => { + const store = storeFromState({ version: 0 }); + const messages = [ + bigText("p1", "user"), + bigText("p2", "assistant"), + uiText("r1", "user", "a"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: baseBudget, + config: cfg({ compactionEnabled: false }), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(store.casCalls).toBe(0); + }); + + it("dirty forces compaction even when proactive is disabled (ADR-0012 §Recovery is the net recovery hand-off)", async () => { + const store = storeFromState({ version: 0, compactionDirty: true }); + const messages = [ + bigText("p1", "user"), + bigText("p2", "assistant"), + uiText("r1", "user", "a"), + uiText("r2", "assistant", "b"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: true, + }, + budget: baseBudget, + config: cfg({ compactionEnabled: false }), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(true); + expect(store.state.compactionDirty).toBe(false); + }); + + it("dirty but already within target: just clears the flag (no summary)", async () => { + const store = storeFromState({ version: 0, compactionDirty: true }); + const messages = [ + 
uiText("r1", "user", "a"), + uiText("r2", "assistant", "b"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: true, + }, + budget: { + inputBudget: 100000, + triggerTokens: 100000, + targetTokens: 100000, + }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(store.state.compactionDirty).toBe(false); // flag cleared + expect(store.state.contextSummary).toBeNull(); // no summary written + expect(store.state.version).toBe(1); + // ADR-0012 §Compaction trace in the timeline: no model summary ran → no trace (would be an empty timeline entry). + expect(out.compactionTrace).toBeUndefined(); + }); + + it("under trigger: no trace surfaced", async () => { + const store = storeFromState({ version: 0 }); + const messages = [uiText("r1", "user", "a")]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: { + inputBudget: 100000, + triggerTokens: 100000, + targetTokens: 50000, + }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + }); + expect(out.compacted).toBe(false); + expect(out.compactionTrace).toBeUndefined(); + }); +}); + +describe("invalidateCompaction (ADR-0012 §Summary invalidation)", () => { + const ordered = ["m1", "m2", "m3", "m4"]; + + it("resets summary + watermark when a message at/below the watermark changes", async () => { + const store = storeFromState({ + version: 5, + summaryWatermark: "m2", + contextSummary: "S", + }); + const res = await invalidateCompaction(store, "c", ["m2"], ordered); + expect(res.status).toBe("applied"); + expect(store.state.summaryWatermark).toBeNull(); + expect(store.state.contextSummary).toBeNull(); + expect(store.state.version).toBe(6); // bumped so a racing 
compaction loses (ADR-0012 §One durable writer) + }); + + it("is a no-op when the edit is entirely above the watermark", async () => { + const store = storeFromState({ + version: 5, + summaryWatermark: "m2", + contextSummary: "S", + }); + const res = await invalidateCompaction(store, "c", ["m4"], ordered); + expect(res).toEqual({ status: "skipped", reason: "no-op" }); + expect(store.state.contextSummary).toBe("S"); + }); + + it("resets when an affected message was deleted (missing from ordering)", async () => { + const store = storeFromState({ + version: 1, + summaryWatermark: "m3", + contextSummary: "S", + }); + const res = await invalidateCompaction(store, "c", ["gone"], ordered); + expect(res.status).toBe("applied"); + expect(store.state.summaryWatermark).toBeNull(); + }); + + it("is a no-op when there is no summary/watermark to invalidate", async () => { + const store = storeFromState({ version: 0 }); + const res = await invalidateCompaction(store, "c", ["m1"], ordered); + expect(res).toEqual({ status: "skipped", reason: "no-op" }); + }); +}); + +describe("affectedBelowWatermark (ADR-0012 §Summary invalidation divergence detection)", () => { + const persisted = [ + uiText("m1", "user", "one"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "three"), + ]; + + it("returns [] when the prefix is unchanged", () => { + const incoming = [ + uiText("m1", "user", "one"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "x"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual([]); + }); + + it("flags a content edit at/below the watermark", () => { + const incoming = [ + uiText("m1", "user", "EDITED"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "three"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual(["m1"]); + }); + + it("flags a deleted message below the watermark", () => { + const incoming = [ + uiText("m2", "assistant", "two"), + uiText("m3", "user", "three"), + ]; + 
expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual(["m1"]); + }); + + it("flags when the watermark message itself is gone from canonical history", () => { + expect(affectedBelowWatermark(persisted, persisted, "ghost")).toEqual([ + "ghost", + ]); + }); + + it("ignores edits strictly above the watermark", () => { + const incoming = [ + uiText("m1", "user", "one"), + uiText("m2", "assistant", "two"), + uiText("m3", "user", "CHANGED"), + ]; + expect(affectedBelowWatermark(persisted, incoming, "m2")).toEqual([]); + }); +}); + +// --- ADR-0012 §Tier 1 (trigger projection) / §Token estimation (cold-start margin): trigger projection + recovery dirty-flag producer ----- + +import { + projectTier1Tokens, + setCompactionDirty, + COLD_START_MARGIN, +} from "./compaction.ts"; + +describe("projectTier1Tokens (ADR-0012 §Tier 1 (trigger projection) / §Token estimation (cold-start margin))", () => { + it("applies the cold-start margin when no provider baseline exists (ADR-0012 §Token estimation (cold-start margin))", () => { + expect( + projectTier1Tokens({ messageTokens: 100, priorSummaryTokens: 0 }), + ).toBe(Math.ceil(100 * COLD_START_MARGIN)); + }); + + it("counts the per-turn overhead toward the trigger (ADR-0012 §Tier 1 (trigger projection))", () => { + expect( + projectTier1Tokens({ + messageTokens: 100, + priorSummaryTokens: 20, + overheadTokens: 50, + }), + ).toBe(Math.ceil(170 * COLD_START_MARGIN)); + }); + + it("uses the provider-reported count as a floor when available", () => { + // The observed live gap: char/4 said ~986, the provider said 8888. 
+ expect( + projectTier1Tokens({ + messageTokens: 986, + priorSummaryTokens: 0, + lastInputTokens: 8888, + }), + ).toBe(8888); + }); + + it("drops the margin when a provider baseline is present", () => { + expect( + projectTier1Tokens({ + messageTokens: 100, + priorSummaryTokens: 0, + lastInputTokens: 50, + }), + ).toBe(100); + }); + + it("treats a 0 provider count as no baseline and keeps the margin (ADR-0012 §Tier 1 (trigger projection))", () => { + // Usage-less providers persist contextTokens=0; a bare `== null` check would + // skip the margin AND no-op the max(), leaving the raw char/4 with no buffer. + expect( + projectTier1Tokens({ + messageTokens: 100, + priorSummaryTokens: 0, + lastInputTokens: 0, + }), + ).toBe(Math.ceil(100 * COLD_START_MARGIN)); + }); +}); + +describe("applyTier1Compaction — overhead in the trigger (ADR-0012 §Tier 1 (trigger projection))", () => { + it("fires on system/tool overhead even when messages alone are under trigger", async () => { + const store = storeFromState({ version: 0 }); + // ~4 tokens of messages — far under the 50-token trigger on their own. 
+ const messages = [ + uiText("p1", "user", "aaaa"), + uiText("p2", "assistant", "bbbb"), + uiText("r1", "user", "cccc"), + uiText("r2", "assistant", "dddd"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: { inputBudget: 100, triggerTokens: 50, targetTokens: 25 }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + overheadTokens: 60, // tool schemas + system prompt dominate + }); + expect(out.compacted).toBe(true); + expect(store.state.summaryWatermark).toBe("p2"); + }); + + it("does not fire when messages + overhead stay under the trigger", async () => { + const store = storeFromState({ version: 0 }); + const messages = [ + uiText("r1", "user", "cccc"), + uiText("r2", "assistant", "dddd"), + ]; + const out = await applyTier1Compaction({ + chatId: "c", + messages, + state: { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }, + budget: { inputBudget: 100, triggerTokens: 50, targetTokens: 25 }, + config: cfg(), + imageProvider: "default", + summarize: noopSummarize, + store, + overheadTokens: 10, + }); + expect(out.compacted).toBe(false); + expect(store.casCalls).toBe(0); + }); +}); + +describe("setCompactionDirty (ADR-0012 §Recovery producer)", () => { + it("sets the flag through the CAS writer", async () => { + const store = storeFromState({ version: 3 }); + const res = await setCompactionDirty(store, "c"); + expect(res).toEqual({ status: "applied", version: 4 }); + expect(store.state.compactionDirty).toBe(true); + }); + + it("is a no-op when already dirty (no version churn)", async () => { + const store = storeFromState({ version: 3, compactionDirty: true }); + const res = await setCompactionDirty(store, "c"); + expect(res).toEqual({ status: "skipped", reason: "no-op" }); + expect(store.casCalls).toBe(0); + expect(store.state.version).toBe(3); + 
/**
 * Test fixture: an assistant message holding exactly one finished tool part.
 * The part type is `tool-<name>`, the call id is `<id>-call`, and `output` is
 * stored as given.
 */
function toolMsg(id: string, name: string, output: unknown): PlatypusUIMessage {
  const toolPart = {
    type: `tool-${name}`,
    toolCallId: `${id}-call`,
    state: "output-available",
    input: { q: "x" },
    output,
  };
  const message = { id, role: "assistant", parts: [toolPart] };
  return message as unknown as PlatypusUIMessage;
}

/** Test fixture: a bulky tool output — `n` repetitions of "D" (200 by default). */
function bigOut(n = 200): string {
  return "D".repeat(n);
}

/** Test helper: reads the `output` of a message's first part. */
function outputOf(m: PlatypusUIMessage): unknown {
  const firstPart = m.parts[0] as { output?: unknown };
  return firstPart.output;
}
+ expect(res.resultsElided).toBe(2); + expect(outputOf(res.messages[0])).toBe( + elidedToolPlaceholder("search", 200), + ); + expect(outputOf(res.messages[2])).toBe( + elidedToolPlaceholder("search", 200), + ); + expect(outputOf(res.messages[3])).toBe(bigOut()); // t3 within keep-window + expect(res.messages[1]).toBe(messages[1]); // text untouched (same ref) + }); + + it("keeps results within keepRecentToolResults verbatim", () => { + const messages = [ + toolMsg("t1", "f", bigOut()), + toolMsg("t2", "f", bigOut()), + toolMsg("t3", "f", bigOut()), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 2, + minEditableToolChars: 100, + }); + expect(res.resultsElided).toBe(1); // only t1 + expect(outputOf(res.messages[0])).toBe(elidedToolPlaceholder("f", 200)); + expect(outputOf(res.messages[1])).toBe(bigOut()); + expect(outputOf(res.messages[2])).toBe(bigOut()); + }); + + it("exempts the newest message even with keepRecentToolResults=0", () => { + const messages = [ + toolMsg("t1", "f", bigOut()), + toolMsg("t2", "f", bigOut()), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 100, + }); + expect(res.resultsElided).toBe(1); // t1 only; t2 is the newest message + expect(outputOf(res.messages[0])).toBe(elidedToolPlaceholder("f", 200)); + expect(outputOf(res.messages[1])).toBe(bigOut()); + }); + + it("size gate: leaves results at/under minEditableToolChars untouched", () => { + const messages = [ + toolMsg("small", "f", bigOut(50)), // ≤ gate + toolMsg("big", "f", bigOut(200)), // > gate + uiText("u1", "user", "tail"), // newest, so both tools are candidates + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 100, + }); + expect(res.resultsElided).toBe(1); + expect(outputOf(res.messages[0])).toBe(bigOut(50)); // small kept + expect(outputOf(res.messages[1])).toBe(elidedToolPlaceholder("f", 200)); + }); + + it("pairing: keeps the tool-call part, swaps only 
the output body", () => { + const messages = [ + toolMsg("t1", "search", bigOut()), + uiText("u1", "user", "x"), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 100, + }); + const part = res.messages[0].parts[0] as Record; + expect(part.type).toBe("tool-search"); + expect(part.toolCallId).toBe("t1-call"); + expect(part.input).toEqual({ q: "x" }); + expect(part.state).toBe("output-available"); + expect(part.output).toBe(elidedToolPlaceholder("search", 200)); + }); + + it("is deterministic/monotonic: feeding the edited view back elides nothing new", () => { + const messages = [ + toolMsg("t1", "f", bigOut()), + toolMsg("t2", "f", bigOut()), + uiText("u1", "user", "tail"), + ]; + const first = editToolResults(messages, opts); + expect(first.resultsElided).toBeGreaterThan(0); + const second = editToolResults(first.messages, opts); + expect(second.resultsElided).toBe(0); + expect(second.messages).toBe(first.messages); // stable ⇒ cache-friendly + }); + + it("grow-guard: never elides when the placeholder would be longer than the output", () => { + // Tiny gate picks a result just over it, but shorter than the ~140-char + // placeholder ⇒ eliding would inflate the prompt. Must skip (no negative + // reclaim, no churn, no-op identity). 
+ const shortOut = "D".repeat(30); // > gate 10, < placeholder length + const messages = [ + toolMsg("t1", "f", shortOut), + uiText("u1", "user", "tail"), + ]; + const res = editToolResults(messages, { + keepRecentToolResults: 0, + minEditableToolChars: 10, + }); + expect(res.resultsElided).toBe(0); + expect(res.charsReclaimed).toBe(0); + expect(res.messages).toBe(messages); + }); + + it("no-op identity: returns the same array reference when nothing qualifies", () => { + const messages = [ + toolMsg("t1", "f", bigOut(50)), // under gate + uiText("u1", "user", "hi"), + ]; + const res = editToolResults(messages, opts); + expect(res.resultsElided).toBe(0); + expect(res.charsReclaimed).toBe(0); + expect(res.messages).toBe(messages); + }); +}); + +describe("applyTier1Compaction — Stage 0 avoids summarization (ADR-0012 §Stage 0 — context editing)", () => { + const hugeTool = (id: string) => toolMsg(id, "dump", "Z".repeat(8000)); + // High minPrunableChars so Stage 1 prefix-pruning does NOT rescue the no-edit + // case — it must reach Stage 2 (the model call) to make Stage 0's avoidance of + // it the real discriminator. + const editCfg = cfg({ + keepRecentToolResults: 1, + minEditableToolChars: 100, + keepRecentMessages: 2, + minPrunableChars: 100000, + }); + // Trigger sits between the post-edit size (~one big tool left) and the + // pre-edit size (~two big tools). 
+ const budget: Budget = { + inputBudget: 100000, + triggerTokens: 3000, + targetTokens: 1500, + }; + const state: CompactionState = { + version: 0, + summaryWatermark: null, + contextSummary: null, + compactionDirty: false, + }; + const messages = () => [ + hugeTool("bt1"), + hugeTool("bt2"), + uiText("r1", "user", "ok"), + uiText("r2", "assistant", "done"), + ]; + + it("elides the old dump, drops under trigger, skips the model call", async () => { + const summarize = vi.fn(() => Promise.resolve("SUMMARY")); + const out = await applyTier1Compaction({ + chatId: "c", + messages: messages(), + state, + budget, + config: editCfg, + imageProvider: "default", + summarize, + store: storeFromState({ version: 0 }), + }); + expect(summarize).not.toHaveBeenCalled(); + expect(out.compacted).toBe(false); + // Stage 0 still leaned the view: the old dump (bt1) is a placeholder, the + // recent dump (bt2, within keep) stays verbatim. + expect(outputOf(out.messages[0])).toBe(elidedToolPlaceholder("dump", 8000)); + expect(outputOf(out.messages[1])).toBe("Z".repeat(8000)); + }); + + it("without context editing the same chat triggers summarization", async () => { + const summarize = vi.fn(() => Promise.resolve("SUMMARY")); + const out = await applyTier1Compaction({ + chatId: "c", + messages: messages(), + state, + budget, + config: cfg({ + contextEditingEnabled: false, + keepRecentMessages: 2, + minPrunableChars: 100000, + }), + imageProvider: "default", + summarize, + store: storeFromState({ version: 0 }), + }); + expect(summarize).toHaveBeenCalledOnce(); + expect(out.compacted).toBe(true); + }); +}); diff --git a/apps/backend/src/runs/compaction.ts b/apps/backend/src/runs/compaction.ts new file mode 100644 index 00000000..bc13df15 --- /dev/null +++ b/apps/backend/src/runs/compaction.ts @@ -0,0 +1,1497 @@ +/** + * Context compaction (ADR-0012 §Tier 1 / §Tier 2). + * + * This module owns durable compaction state and the message-shaping primitives. 
+ * Slice 2a (this section) is the **single durable writer** (principle ADR-0012 §One durable writer): every + * mutation of `summaryWatermark` / `contextSummary` / `compactionDirty` flows + * through {@link CompactionStore.casWrite}, a version-gated compare-and-swap. + * + * Why versioned CAS and not "compare the watermark value" (ADR-0012 §One durable writer): history + * edits (ADR-0012 §Tier 1 invalidation) move the watermark **backward**. A loser that compared + * watermark values could mistake a reset for "not yet advanced" and write a stale + * summary over mutated history. Deciding by `version` removes the monotonicity + * assumption entirely — any concurrent mutation bumps the version, so a racing + * write simply loses the CAS and re-reads the truth. + */ + +import { and, eq } from "drizzle-orm"; +import type { ModelMessage, PrepareStepFunction } from "ai"; +import { db } from "../index.ts"; +import { chat as chatTable } from "../db/schema.ts"; +import { logger } from "../logger.ts"; +import type { PlatypusUIMessage } from "../types.ts"; +import { + estimateTokens, + stableStringify, + uiMessagesToCountUnits, + modelMessagesToCountUnits, + CHARS_PER_TOKEN, + type ImageProvider, +} from "./token-estimate.ts"; + +/** Durable compaction state on the chat row. */ +export type CompactionState = { + version: number; + summaryWatermark: string | null; + contextSummary: string | null; + compactionDirty: boolean; +}; + +/** + * A patch to the compaction fields. Only the keys present are written; absent + * keys are left untouched. `version` is always bumped by the writer (not here). + */ +export type WatermarkPatch = { + watermark?: string | null; + summary?: string | null; + dirty?: boolean; +}; + +/** + * The durable-state seam. Production wires this to Drizzle + * ({@link drizzleCompactionStore}); tests pass an in-memory implementation so + * the CAS algorithm is exercised without Postgres. 
/**
 * The durable-state seam. Production wires this to Drizzle
 * ({@link drizzleCompactionStore}); tests pass an in-memory implementation so
 * the CAS algorithm is exercised without Postgres.
 */
export type CompactionStore = {
  /** Reads the compaction fields of one chat; resolves to null when no row matches. */
  readState(chatId: string): Promise<CompactionState | null>;
  /**
   * Version-gated compare-and-swap. Applies `patch` and sets
   * `version = expectVersion + 1` **only if** the row's current version still
   * equals `expectVersion`. Returns true iff exactly one row was updated
   * (i.e. this writer won). The single durable writer (ADR-0012 §One durable writer).
   */
  casWrite(
    chatId: string,
    expectVersion: number,
    patch: WatermarkPatch,
  ): Promise<boolean>;
};

/** Drizzle-backed {@link CompactionStore} over the `chat` table. */
export const drizzleCompactionStore: CompactionStore = {
  async readState(chatId) {
    const rows = await db
      .select({
        version: chatTable.version,
        summaryWatermark: chatTable.summaryWatermark,
        contextSummary: chatTable.contextSummary,
        compactionDirty: chatTable.compactionDirty,
      })
      .from(chatTable)
      .where(eq(chatTable.id, chatId))
      .limit(1);
    return rows[0] ?? null;
  },

  async casWrite(chatId, expectVersion, patch) {
    // Version and timestamp are always written; the compaction fields only when
    // the patch names them.
    const set: Record<string, unknown> = {
      version: expectVersion + 1,
      updatedAt: new Date(),
    };
    // Only touch the fields named in the patch — `in` so an explicit null
    // (clear summary / reset watermark) is distinguishable from "leave alone".
    if ("watermark" in patch) set.summaryWatermark = patch.watermark;
    if ("summary" in patch) set.contextSummary = patch.summary;
    if ("dirty" in patch) set.compactionDirty = patch.dirty;

    // The version predicate is the compare half of the CAS: a row whose version
    // has moved on matches nothing, so `updated` comes back empty.
    const updated = await db
      .update(chatTable)
      .set(set)
      .where(
        and(eq(chatTable.id, chatId), eq(chatTable.version, expectVersion)),
      )
      .returning({ id: chatTable.id });
    return updated.length === 1;
  },
};

/** Outcome of {@link commitWatermark}. */
export type CommitResult =
  | { status: "applied"; version: number }
  | { status: "skipped"; reason: "no-op" | "covered" | "contended" };
/**
 * Decision an attempt makes against the freshly-read state: either write a patch
 * or skip (a no-op, or because a concurrent winner already covered this work).
 */
export type WatermarkDecision =
  | { kind: "write"; patch: WatermarkPatch }
  | { kind: "skip"; reason: "no-op" | "covered" };

/**
 * The single entry point for mutating compaction state (ADR-0012 §One durable writer).
 *
 * Reads the current state, asks `decide` what to do, and CAS-writes it. On a
 * CAS conflict it re-reads and retries the decision **once**; a second conflict
 * terminates as `skipped: "contended"` — never a recompute loop, so there is no
 * livelock. Because `decide` is re-run against the re-read state, a racing
 * invalidation (which bumps version + resets the watermark) is seen on the
 * retry, and `decide` can choose to skip rather than write a stale summary.
 *
 * @param store  Durable-state seam (Drizzle in production, in-memory in tests).
 * @param chatId Chat whose compaction fields are being mutated.
 * @param decide Pure decision over the freshly-read state.
 * @returns `applied` with the new version, or `skipped` with the reason. A
 *          missing chat row is reported as `skipped: "no-op"`.
 */
export async function commitWatermark(
  store: CompactionStore,
  chatId: string,
  decide: (state: CompactionState) => WatermarkDecision,
): Promise<CommitResult> {
  const MAX_ATTEMPTS = 2;
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    const state = await store.readState(chatId);
    if (!state) return { status: "skipped", reason: "no-op" };

    const decision = decide(state);
    if (decision.kind === "skip") {
      return { status: "skipped", reason: decision.reason };
    }

    const won = await store.casWrite(chatId, state.version, decision.patch);
    if (won) return { status: "applied", version: state.version + 1 };
    // Lost the CAS — a concurrent writer moved the version. Loop to re-read and
    // re-decide. The decision compares VERSION (via the re-read), not watermark
    // values, so a backward watermark reset cannot be misread.
    logger.info(
      { metric: "cas.conflict", chatId, attempt, version: state.version },
      "cas.conflict",
    );
  }

  logger.warn(
    { metric: "cas.conflict", chatId, contended: true },
    "compaction CAS contended past retry — skipping (safe no-op)",
  );
  return { status: "skipped", reason: "contended" };
}

// ===========================================================================
// Slice 2b — compaction primitives (the message-shaping leaves)
//
// Two adapters share the same staged, cheap-first strategy (LibreChat pattern):
//   Stage 1 — prune bulky tool results (no model call). Often enough.
//   Stage 2 — summarize the older prefix into one synthetic summary (model call).
// `compactUIMessages` (Tier 1, durable) and `compactModelMessages` (Tier 2 +
// recovery, throwaway) differ only in message shape and the tool-pairing rule.
// Token counting is the ONE estimator from token-estimate.ts (ADR-0012 §One estimator).
// ===========================================================================

/** Summarizes a transcript into a compact paragraph. Injected (the task model). */
export type Summarize = (text: string) => Promise<string>;

/** Rough token count of a bare string (summary text) — the same char/4 rule. */
function textTokens(text: string): number {
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}

/**
 * Soft-trims an over-long string to head+tail with an elision marker, so a bulky
 * tool result keeps some signal instead of vanishing entirely.
 *
 * @param text         String to trim.
 * @param keepEachSide Chars kept at each end (default 500). Values ≤ 0 keep
 *                     nothing and yield only the elision marker.
 * @returns `text` unchanged when it already fits in `2 * keepEachSide` chars,
 *          otherwise `head + marker + tail`.
 */
export function softTrim(text: string, keepEachSide = 500): string {
  const keep = Math.max(0, keepEachSide);
  if (text.length <= keep * 2) return text;
  const head = text.slice(0, keep);
  // Index from the front: `slice(-keep)` with keep === 0 is `slice(0)`, i.e. the
  // WHOLE string, which would make the "trimmed" result longer than the input.
  const tail = text.slice(text.length - keep);
  const elided = text.length - keep * 2;
  return `${head}\n…[elided ${elided} chars]…\n${tail}`;
}
/**
 * Chooses where `prefix = [0, boundary)` ends and `recent = [boundary, total)`
 * begins. The first candidate is `total - keepRecent` (floored at 0); candidates
 * are then tried in descending order until `isSafeBoundary` accepts one, so a
 * tool-call/result pair is never split. Index 0 is accepted without consulting
 * the predicate.
 */
export function pickKeepBoundary(
  total: number,
  keepRecent: number,
  isSafeBoundary: (index: number) => boolean,
): number {
  for (
    let candidate = Math.max(0, total - keepRecent);
    candidate > 0;
    candidate--
  ) {
    if (isSafeBoundary(candidate)) return candidate;
  }
  return 0;
}

// --- Tier 1: UIMessage shape ---------------------------------------------

/**
 * Soft-trims every bulky tool-result `output` of one UIMessage. Tool parts are
 * kept (never dropped); only an `output` whose serialized form is longer than
 * `minPrunableChars` is replaced by its soft-trimmed text. The input message is
 * not mutated: when something was trimmed a shallow copy with a new `parts`
 * array is returned, otherwise the original object is returned as-is.
 */
function pruneUIMessage(
  message: PlatypusUIMessage,
  minPrunableChars: number,
): { message: PlatypusUIMessage; changed: boolean } {
  type Part = PlatypusUIMessage["parts"][number];
  const nextParts: Part[] = [];
  let changed = false;

  for (const part of message.parts ?? []) {
    const candidate = part as { type: string; output?: unknown };
    const carriesResult =
      (candidate.type === "dynamic-tool" ||
        candidate.type.startsWith("tool-")) &&
      candidate.output !== undefined;
    if (!carriesResult) {
      nextParts.push(part);
      continue;
    }

    const body =
      typeof candidate.output === "string"
        ? candidate.output
        : JSON.stringify(candidate.output);
    if (body.length <= minPrunableChars) {
      nextParts.push(part);
      continue;
    }

    changed = true;
    nextParts.push({ ...candidate, output: softTrim(body) } as unknown as Part);
  }

  if (!changed) return { message, changed };
  return {
    message: { ...message, parts: nextParts } as PlatypusUIMessage,
    changed,
  };
}
/** Leading marker of every elided-result placeholder; also used to recognise one. */
const ELIDED_PLACEHOLDER_PREFIX = '[Tool result for "';

/**
 * Builds the placeholder that stands in for an elided tool result
 * (ADR-0012 §Stage 0 — context editing). The text is deliberately explicit so
 * that small/weak models understand it: it names the tool, states how many
 * chars were dropped, and says the tool can be called again for the full result.
 */
export function elidedToolPlaceholder(toolName: string, chars: number): string {
  const subject = `${ELIDED_PLACEHOLDER_PREFIX}${toolName}"`;
  const size = `(${chars} chars)`;
  const hint =
    "The full result is still available — call the tool again with the same input if you need it.";
  return `${subject} omitted to save context ${size}. ${hint}]`;
}

/** Tuning knobs for {@link editToolResults}. */
export type EditToolResultsOptions = {
  /** Exempt the last N tool results (most recent) from elision. */
  keepRecentToolResults: number;
  /** Only elide a tool result whose serialized output exceeds this many chars. */
  minEditableToolChars: number;
};

/** What {@link editToolResults} produced. */
export type EditToolResultsResult = {
  messages: PlatypusUIMessage[];
  resultsElided: number;
  /** Net chars removed (original output length − placeholder length), for metrics. */
  charsReclaimed: number;
};
/**
 * Stage 0 (ADR-0012 §Stage 0 — context editing): replaces the `output` of OLD
 * bulky tool-result parts with a short placeholder, keeping the tool part itself
 * and all other parts intact. Pure and deterministic — no model call; the input
 * array and its messages are never mutated.
 *
 * Recency is by COUNT of tool results: the last `keepRecentToolResults` results
 * are exempt, and every result in the newest message is exempt regardless. A
 * result is elided only when its serialized `output` exceeds
 * `minEditableToolChars`, is not already a placeholder, and is longer than the
 * placeholder that would replace it.
 *
 * @returns The edited view plus counters. When nothing qualified, `messages` is
 *          the SAME array reference that was passed in.
 */
export function editToolResults(
  messages: PlatypusUIMessage[],
  opts: EditToolResultsOptions,
): EditToolResultsResult {
  type ToolishPart = { type: string; output?: unknown; toolName?: string };

  // Enumerate every tool-result-bearing part in order so "keep the last N" is a
  // simple tail cut. A single message can carry several tool parts.
  const toolResultLocs: Array<{ mi: number; pi: number }> = [];
  messages.forEach((m, mi) => {
    (m.parts ?? []).forEach((part, pi) => {
      const ap = part as ToolishPart;
      const isTool = ap.type === "dynamic-tool" || ap.type.startsWith("tool-");
      if (isTool && ap.output !== undefined) toolResultLocs.push({ mi, pi });
    });
  });

  // Candidates = all but the last `keepRecentToolResults`. Clamp to the list
  // length: a negative setting would otherwise index past the end and crash.
  const keepFrom = Math.min(
    toolResultLocs.length,
    Math.max(0, toolResultLocs.length - opts.keepRecentToolResults),
  );
  const newestMessageIndex = messages.length - 1;
  const elideAt = new Map<string, string>(); // "mi:pi" -> placeholder
  let charsReclaimed = 0;

  for (let k = 0; k < keepFrom; k++) {
    const loc = toolResultLocs[k];
    if (loc.mi === newestMessageIndex) continue; // newest message exempt
    const ap = (messages[loc.mi].parts ?? [])[loc.pi] as ToolishPart;
    const serialized =
      typeof ap.output === "string" ? ap.output : JSON.stringify(ap.output);
    // JSON.stringify yields undefined for functions/symbols; such an output has
    // no measurable size, so leave it alone instead of crashing on `.length`.
    if (typeof serialized !== "string") continue;
    // Size gate: leave trivial results untouched — no churn.
    if (serialized.length <= opts.minEditableToolChars) continue;
    // Idempotency guard: never re-elide our own placeholder, which a tiny gate
    // would otherwise pick up again on every turn.
    if (
      typeof ap.output === "string" &&
      ap.output.startsWith(ELIDED_PLACEHOLDER_PREFIX)
    ) {
      continue;
    }
    const toolName =
      ap.type === "dynamic-tool"
        ? (ap.toolName ?? "unknown")
        : ap.type.slice("tool-".length);
    const placeholder = elidedToolPlaceholder(toolName, serialized.length);
    // Grow-guard: eliding a result shorter than its placeholder would INFLATE
    // the prompt (negative reclaim). Skip — never grow.
    if (placeholder.length >= serialized.length) continue;
    elideAt.set(`${loc.mi}:${loc.pi}`, placeholder);
    charsReclaimed += serialized.length - placeholder.length;
  }

  // Nothing qualified ⇒ hand back the original reference (no copy allocated).
  if (elideAt.size === 0) {
    return { messages, resultsElided: 0, charsReclaimed: 0 };
  }

  // Copy only the messages that actually contain an elided part; every other
  // message keeps its identity.
  const out = messages.map((m, mi) => {
    const parts = m.parts ?? [];
    if (!parts.some((_, pi) => elideAt.has(`${mi}:${pi}`))) return m;
    const newParts = parts.map((part, pi) => {
      const placeholder = elideAt.get(`${mi}:${pi}`);
      if (placeholder === undefined) return part;
      const ap = part as { output?: unknown };
      return { ...ap, output: placeholder };
    });
    return { ...m, parts: newParts } as PlatypusUIMessage;
  });

  return { messages: out, resultsElided: elideAt.size, charsReclaimed };
}
[]) + .map((p) => { + const ap = p as { type: string; text?: string; output?: unknown }; + if (ap.type === "text") return ap.text ?? ""; + if (ap.type === "dynamic-tool" || ap.type.startsWith("tool-")) { + const out = + typeof ap.output === "string" + ? ap.output + : ap.output !== undefined + ? JSON.stringify(ap.output) + : ""; + return `[tool ${ap.type}] ${softTrim(out, 200)}`; + } + return ""; + }) + .filter(Boolean) + .join("\n"); + return `${m.role}: ${text}`; + }); +} + +export type UICompactOptions = { + /** Reduce the model view to at most this many tokens (hysteresis target). */ + targetTokens: number; + keepRecentMessages: number; + minPrunableChars: number; + /** Threshold for pruning tool results in kept (recent) messages after Stage 2. + * Defaults to minPrunableChars * 5 when omitted. */ + minRecentPrunableChars?: number; + /** + * The HARD window wall (ADR-0012 §Hard window wall): the kept view's tokens + * above which the call would actually overflow (already net of per-turn + * overhead by the caller). Recent (kept) tool results are trimmed ONLY when + * the kept view breaches this wall — a mere `targetTokens` (hysteresis) miss + * is cheap (it re-compacts next turn) and is not worth gutting active data the + * user is asking about. The single newest message is always exempt regardless. + * When omitted, recent results are always trimmed once over target (the + * behaviour predating ADR-0012 §Hard window wall) — safer than never trimming for callers that cannot + * supply the wall. + */ + inputBudget?: number; + imageProvider?: ImageProvider; + /** Existing durable summary to fold the new prefix into (incremental). */ + priorSummary?: string | null; + summarize: Summarize; + /** Token budget of one summarize call; larger prefixes are map-reduced (ADR-0012 §Tier 1 (summarizer model & map-reduce)). */ + summarizerWindow?: number; + /** + * Bypass the no-op estimate gate and force compaction even when char/4 says + * we are within budget. 
Used for dirty-forced Tier 1 (ADR-0012 §Recovery): recovery sets + * the dirty flag AFTER a provider rejection, so the estimator already failed; + * re-using it as the no-op gate causes an infinite overflow→dirty→no-op loop. + */ + force?: boolean; + /** + * Pre-computed estimate of `messages`. The caller's trigger projection + * already ran the char/4 pass over this exact set, so reuse it instead of + * re-estimating the full history a second time on the hot path. + */ + knownEstimate?: number; +}; + +export type UICompactionResult = { + /** Messages to send to the model (recent verbatim; pruned prefix if no summary). */ + keptMessages: PlatypusUIMessage[]; + /** New folded summary, or unchanged prior summary, or null. */ + summaryText: string | null; + /** Id of the last message folded into the summary (the new watermark), or null. */ + watermarkId: string | null; + messagesDropped: number; + usedModelCall: boolean; + /** Post-compaction estimate incl. the summary — should be ≤ targetTokens (ADR-0012 §Tier 1 (hysteresis)). */ + estimatedTokens: number; +}; + +/** + * Summarizes a prefix transcript, map-reducing when it exceeds the summarizer's + * own window (ADR-0012 §Tier 1 (summarizer model & map-reduce) — a huge cold-start history can't be sent whole). + */ +/** + * Packs per-message transcript segments into chunks that each fit `windowTokens`, + * splitting only on MESSAGE boundaries — never mid-message. A lone segment larger + * than the window (a single oversized message) is char-sliced as a last resort, + * which is unavoidable for one message that cannot fit whole. 
/**
 * Packs per-message transcript segments into chunks that each fit `windowTokens`,
 * splitting only on MESSAGE boundaries — never mid-message. A lone segment larger
 * than the window is char-sliced as a last resort, which is unavoidable for one
 * message that cannot fit whole.
 *
 * The slice width is clamped to at least one char, so a zero, negative or
 * fractional window still terminates instead of looping forever.
 */
function packSegments(segments: string[], windowTokens: number): string[] {
  // Loop-invariant; for a positive integer window this is exactly
  // windowTokens * CHARS_PER_TOKEN.
  const charBudget = Math.max(1, Math.floor(windowTokens * CHARS_PER_TOKEN));
  const chunks: string[] = [];
  let cur = "";
  const flush = () => {
    if (cur) {
      chunks.push(cur);
      cur = "";
    }
  };
  for (const seg of segments) {
    if (textTokens(seg) > windowTokens) {
      flush();
      for (let i = 0; i < seg.length; i += charBudget) {
        chunks.push(seg.slice(i, i + charBudget));
      }
      continue;
    }
    const next = cur ? `${cur}\n\n${seg}` : seg;
    if (textTokens(next) > windowTokens) {
      flush();
      cur = seg;
    } else {
      cur = next;
    }
  }
  flush();
  return chunks;
}

/**
 * Summarizes a prefix transcript, map-reducing when it exceeds the summarizer's
 * own window (ADR-0012 §Tier 1 (summarizer model & map-reduce)).
 *
 * @param segments         One transcript string per message.
 * @param priorSummary     Existing summary to fold the new text into, if any.
 * @param summarize        The injected summarizer.
 * @param summarizerWindow Token budget of one summarize call. Undefined, zero or
 *                         negative means "no limit" (single pass).
 * @returns The folded summary text.
 */
async function summarizePrefix(
  segments: string[],
  priorSummary: string | null | undefined,
  summarize: Summarize,
  summarizerWindow: number | undefined,
): Promise<string> {
  const fold = (prior: string | null | undefined, body: string) =>
    prior ? `Previous summary:\n${prior}\n\nNewer messages:\n${body}` : body;

  // Single pass when everything — prior summary AND fold framing included —
  // fits the window. Checking the *folded* size (not the bare body) covers the
  // case where a large prior summary overflows an otherwise-fitting prefix.
  const joined = segments.join("\n\n");
  const folded = fold(priorSummary, joined);
  if (
    !summarizerWindow ||
    summarizerWindow < 0 ||
    textTokens(folded) <= summarizerWindow
  ) {
    return summarize(folded);
  }

  // Map: summarize each window-sized chunk (message-boundary aligned).
  const chunks = packSegments(segments, summarizerWindow);
  const chunkSummaries: string[] = [];
  for (const chunk of chunks) chunkSummaries.push(await summarize(chunk));

  // Termination guard. Another pass can only help if the map step produced
  // several summaries that are, together, shorter than what went in. Otherwise
  // (the prior summary alone overflows the window, or the summarizer did not
  // shrink its input) recursing would repeat the same model calls forever, so
  // fold once and stop — best effort, bounded cost.
  const reduced = chunkSummaries.join("\n\n");
  if (chunkSummaries.length <= 1 || reduced.length >= joined.length) {
    return summarize(fold(priorSummary, reduced));
  }

  // Reduce: the joined chunk summaries (+ prior) can THEMSELVES exceed the
  // window when there are many chunks, so recurse rather than summarizing them
  // whole. Each level strictly shrinks the text, so this converges.
  return summarizePrefix(
    chunkSummaries,
    priorSummary,
    summarize,
    summarizerWindow,
  );
}
null, + watermarkId: null, + messagesDropped: 0, + usedModelCall: false, + estimatedTokens: initialEstimate + priorTokens, + }; + } + + const boundary = pickKeepBoundary( + messages.length, + opts.keepRecentMessages, + () => true, // UIMessage tool-call+result live in one message — any split is safe + ); + const prefix = messages.slice(0, boundary); + const recent = messages.slice(boundary); + + // Stage 1 — prune bulky tool results in the prefix (no model call). + const prunedPrefix = prefix.map( + (m) => pruneUIMessage(m, opts.minPrunableChars).message, + ); + const prunedAll = [...prunedPrefix, ...recent]; + if (!opts.force && estimate(prunedAll) + priorTokens <= opts.targetTokens) { + return { + keptMessages: prunedAll, + summaryText: opts.priorSummary ?? null, + watermarkId: null, // pruning advances no watermark (no new summary) + messagesDropped: 0, + usedModelCall: false, + estimatedTokens: estimate(prunedAll) + priorTokens, + }; + } + + // Past this point we are over target. Recent (kept) messages stay in the model + // view, so extreme outliers (e.g. large MCP tool dumps) bloat tokensAfter. + // The hard window wall (ADR-0012 §Hard window wall): trim them ONLY when the kept view would breach + // the hard window wall (`inputBudget`); a soft `targetTokens` miss is left at + // full fidelity and just re-compacts next turn (cheap). The newest message is + // always exempt — it is the data the current turn is actively about. + const recentThreshold = + opts.minRecentPrunableChars ?? 
opts.minPrunableChars * 5; + const pruneRecentExemptNewest = ( + msgs: PlatypusUIMessage[], + ): { messages: PlatypusUIMessage[]; changed: boolean } => { + let changed = false; + const messages = msgs.map((m, i) => { + if (i === msgs.length - 1) return m; // newest always exempt + const pruned = pruneUIMessage(m, recentThreshold); + if (pruned.changed) changed = true; + return pruned.message; + }); + return { messages, changed }; + }; + // Decides whether to keep `recent` verbatim or trim it (ADR-0012 §Hard window wall). Returns the + // kept messages and their token estimate (reused for `afterEstimate` so the + // recent set is never re-estimated). `fixedTokens` is the kept view's NON-recent + // part (pruned prefix and/or folded summary). When `inputBudget` is omitted the + // wall is unknown → always trim once over target (guard predating ADR-0012 §Hard window wall). + const keepRecentWithinWall = ( + fixedTokens: number, + recentMsgs: PlatypusUIMessage[], + ): { messages: PlatypusUIMessage[]; recentTokens: number } => { + const recentTokens = estimate(recentMsgs); + if ( + opts.inputBudget !== undefined && + fixedTokens + recentTokens <= opts.inputBudget + ) { + return { messages: recentMsgs, recentTokens }; // within wall — full fidelity + } + const trimmed = pruneRecentExemptNewest(recentMsgs); + // Nothing prunable (no tool outputs over threshold) → reuse the estimate. + return { + messages: trimmed.messages, + recentTokens: trimmed.changed ? estimate(trimmed.messages) : recentTokens, + }; + }; + + // Warn only when the kept view still breaches the HARD wall after trimming — + // i.e. recent genuinely couldn't be brought under the window (one oversized + // result; ingestion-cap territory). Under ADR-0012 §Hard window wall a soft `targetTokens` + // miss is by design (recent kept verbatim below the wall), so it is NOT a + // warning. Falls back to the old `target * 2` heuristic when no wall is supplied. 
+ const warnIfOverWall = (afterEstimate: number) => { + const over = + opts.inputBudget !== undefined + ? afterEstimate > opts.inputBudget + : afterEstimate > opts.targetTokens * 2; + if (over) { + logger.warn( + { + afterEstimate, + targetTokens: opts.targetTokens, + inputBudget: opts.inputBudget, + keepRecentMessages: opts.keepRecentMessages, + }, + "compaction fired but recent messages exceed the window — a single oversized tool result may be uncompactable (see ingestion cap)", + ); + } + }; + + // ADR-0012 §Tier 1: nothing to summarize when the prefix is empty (history fits within + // keepRecentMessages). Also bail when the boundary message has no id — we + // cannot anchor a watermark there, and committing a watermark:null + + // non-null summary would orphan the summary (viewAfterWatermark ignores + // contextSummary when the watermark is null, so the previously-summarised + // prefix reappears every turn). + const watermarkId = + prefix.length > 0 ? (prefix[prefix.length - 1].id ?? null) : null; + if (prefix.length === 0 || watermarkId === null) { + const prunedPrefixTokens = estimate(prunedPrefix) + priorTokens; + const keptRecent = keepRecentWithinWall(prunedPrefixTokens, recent); + const kept = [...prunedPrefix, ...keptRecent.messages]; + const afterEstimate = prunedPrefixTokens + keptRecent.recentTokens; + warnIfOverWall(afterEstimate); + return { + keptMessages: kept, + summaryText: opts.priorSummary ?? null, + watermarkId: null, + messagesDropped: 0, + usedModelCall: false, + estimatedTokens: afterEstimate, + }; + } + + // Stage 2 — summarize the pruned prefix into one synthetic summary. 
+ const summaryText = await summarizePrefix( + renderUIMessageList(prunedPrefix), + opts.priorSummary, + opts.summarize, + opts.summarizerWindow, + ); + + const summaryTokens = textTokens(summaryText); + const keptRecent = keepRecentWithinWall(summaryTokens, recent); + const afterEstimate = keptRecent.recentTokens + summaryTokens; + warnIfOverWall(afterEstimate); + + return { + keptMessages: keptRecent.messages, + summaryText, + watermarkId, + messagesDropped: prefix.length, + usedModelCall: true, + estimatedTokens: afterEstimate, + }; +} + +// --- Tier 2 / recovery: ModelMessage shape ------------------------------- + +/** Soft-trims bulky tool-result parts in a ModelMessage (role "tool"). */ +function pruneModelMessage( + message: ModelMessage, + minPrunableChars: number, +): ModelMessage { + if (message.role !== "tool" || typeof message.content === "string") { + return message; + } + const content = message.content.map((part) => { + if (part.type !== "tool-result") return part; + const output = part.output; + if (output.type === "text" || output.type === "error-text") { + if (output.value.length > minPrunableChars) { + return { + ...part, + output: { ...output, value: softTrim(output.value) }, + }; + } + return part; + } + if (output.type === "json" || output.type === "error-json") { + const serialized = JSON.stringify(output.value); + if (serialized.length > minPrunableChars) { + return { + ...part, + output: { type: "text" as const, value: softTrim(serialized) }, + }; + } + } + // ADR-0012 §Tier 1 (Stage 1 prune): @ai-sdk/mcp emits {type:"content"} for essentially every MCP tool + // result. Without this branch Stage 1 reclaims zero tokens from the bulkiest + // payloads and their text is invisible to the summarizer. + if (output.type === "content" && Array.isArray(output.value)) { + type ContentItem = { type: string; text?: string }; + const items = output.value as ContentItem[]; + const text = items + .filter((i) => i.type === "text") + .map((i) => i.text ?? 
"") + .join("\n"); + const mediaCount = items.filter((i) => i.type !== "text").length; + const marker = mediaCount > 0 ? `\n[${mediaCount} media item(s)]` : ""; + // Trim the text BEFORE appending the media marker so a huge text payload + // can never truncate the "[N media item(s)]" signal. + if (text.length + marker.length > minPrunableChars) { + return { + ...part, + output: { + type: "content" as const, + value: [ + { type: "text" as const, text: `${softTrim(text)}${marker}` }, + ], + }, + }; + } + } + return part; + }); + return { ...message, content }; +} + +/** Per-message transcript strings (one entry per message). See renderUIMessageList. */ +function renderModelMessageList(messages: ModelMessage[]): string[] { + return messages.map((m) => { + if (typeof m.content === "string") return `${m.role}: ${m.content}`; + const text = m.content + .map((p) => { + if (p.type === "text") return p.text; + if (p.type === "tool-call") return `[tool-call ${p.toolName}]`; + if (p.type === "tool-result") { + const o = p.output; + let v: string; + if (o.type === "text" || o.type === "error-text") { + v = o.value; + } else if (o.type === "json" || o.type === "error-json") { + v = JSON.stringify(o.value); + } else if (o.type === "content") { + // ADR-0012 §Tier 1 (Stage 1 prune): extract text items from content-type MCP output. + type ContentItem = { type: string; text?: string }; + v = (o.value as ContentItem[]) + .filter((i) => i.type === "text") + .map((i) => i.text ?? "") + .join("\n"); + } else { + v = ""; + } + return `[tool-result] ${softTrim(v, 200)}`; + } + return ""; + }) + .filter(Boolean) + .join("\n"); + return `${m.role}: ${text}`; + }); +} + +/** A synthetic summary as a model message. User-role + clear framing is the most + * broadly accepted shape (avoids mid-array system-message restrictions). 
+ */
+export function summaryModelMessage(text: string): ModelMessage {
+  return {
+    role: "user",
+    content: [
+      { type: "text", text: `[Summary of earlier conversation]\n${text}` },
+    ],
+  };
+}
+
+/** Options accepted by {@link compactModelMessages}. */
+export type ModelCompactOptions = {
+  /** Estimated-token ceiling both no-op gates compare against. */
+  targetTokens: number;
+  /** Passed to pickKeepBoundary to size the kept (recent) tail. */
+  keepRecentMessages: number;
+  /** Stage 1 threshold handed to pruneModelMessage. */
+  minPrunableChars: number;
+  /** Forwarded to the token estimator; falls back to "default" when omitted. */
+  imageProvider?: ImageProvider;
+  /** Summarizer callback forwarded to summarizePrefix for Stage 2. */
+  summarize: Summarize;
+  /** Summarizer window forwarded to summarizePrefix for Stage 2. */
+  summarizerWindow?: number;
+  /** Bypass the no-op estimate gate (same semantics as UICompactOptions.force). */
+  force?: boolean;
+  /**
+   * Estimate of `messages` the caller already computed (e.g. the Tier 2
+   * prepareStep trigger check). Reuses it for gate 1 instead of re-running a
+   * full estimate pass over the same messages.
+   */
+  knownEstimate?: number;
+};
+
+/** What {@link compactModelMessages} resolves to. */
+export type ModelCompactionResult = {
+  /** The list to send: the input, the pruned input, or summary + recent. */
+  messages: ModelMessage[];
+  /** Prefix messages replaced by the summary; 0 when no summary was made. */
+  messagesDropped: number;
+  /** True only on the Stage 2 (summarize) path. */
+  usedModelCall: boolean;
+  /** Token estimate of `messages` as returned. */
+  estimatedTokens: number;
+};
+
+/**
+ * Tier 2 (intra-turn) / recovery compaction over ModelMessages. Throwaway — the
+ * SDK keeps its canonical list; this only keeps a heavy response executable.
+ * Pairing rule differs from Tier 1: an assistant tool-call and its following
+ * `role:"tool"` result are separate messages and must not be split.
+ *
+ * NOTE(review): the `Promise` return annotation below has no type argument
+ * (it looks lost in extraction); every return has the ModelCompactionResult
+ * fields — confirm against the original file.
+ */
+export async function compactModelMessages(
+  messages: ModelMessage[],
+  opts: ModelCompactOptions,
+): Promise {
+  const provider = opts.imageProvider ?? "default";
+  const estimate = (msgs: ModelMessage[]) =>
+    estimateTokens(modelMessagesToCountUnits(msgs, provider));
+
+  // Gate 1: already within target and not forced → hand the input back untouched.
+  const initialEstimate = opts.knownEstimate ?? estimate(messages);
+  if (!opts.force && initialEstimate <= opts.targetTokens) {
+    return {
+      messages,
+      messagesDropped: 0,
+      usedModelCall: false,
+      estimatedTokens: initialEstimate,
+    };
+  }
+
+  // A boundary is unsafe if it would start `recent` on a tool result orphaned
+  // from its assistant tool-call (which would sit in the dropped prefix).
+  const boundary = pickKeepBoundary(
+    messages.length,
+    opts.keepRecentMessages,
+    (i) => i >= messages.length || messages[i].role !== "tool",
+  );
+  const prefix = messages.slice(0, boundary);
+  const recent = messages.slice(boundary);
+
+  // Stage 1 — prune.
+  const prunedPrefix = prefix.map((m) =>
+    pruneModelMessage(m, opts.minPrunableChars),
+  );
+  const prunedAll = [...prunedPrefix, ...recent];
+  // Force-guarded like gate 1 (ADR-0012 §Recovery): when recovery forces a trim the provider
+  // already rejected this prompt, so the estimator proved wrong — re-trusting
+  // it here would return a byte-identical prompt and burn the single retry.
+  if (!opts.force && estimate(prunedAll) <= opts.targetTokens) {
+    return {
+      messages: prunedAll,
+      messagesDropped: 0,
+      usedModelCall: false,
+      estimatedTokens: estimate(prunedAll),
+    };
+  }
+
+  // ADR-0012 §Tier 1 (model-side): nothing to summarize when the prefix is empty (recent
+  // alone exceeds keepRecentMessages). Summarizing an empty prefix would add a
+  // synthetic message and GROW the prompt — never converges. Surface the
+  // overflow instead (recovery retries once, then propagates).
+  if (prefix.length === 0) {
+    return {
+      messages: prunedAll,
+      messagesDropped: 0,
+      usedModelCall: false,
+      estimatedTokens: estimate(prunedAll),
+    };
+  }
+
+  // Stage 2 — summarize the pruned prefix into one synthetic message.
+  // No prior summary is folded in here (null is passed).
+  const summaryText = await summarizePrefix(
+    renderModelMessageList(prunedPrefix),
+    null,
+    opts.summarize,
+    opts.summarizerWindow,
+  );
+  const compacted = [summaryModelMessage(summaryText), ...recent];
+  return {
+    messages: compacted,
+    messagesDropped: prefix.length,
+    usedModelCall: true,
+    estimatedTokens: estimate(compacted),
+  };
+}
+
+// ===========================================================================
+// Slice 2c — Tier 1 orchestration (budget, view reconstruction, persist)
+//
+// `applyTier1Compaction` is the durable, cross-turn entry point invoked from
+// `prepareChatTurn`. It is dependency-injected (store + summarizer) so it is
+// unit-testable without standing up the full turn machinery. It:
+// 1. Reconstructs the compacted VIEW from persisted state every turn (ADR-0012 §View, not delete) —
+//    drop messages up to the watermark, re-inject the stored summary.
+// 2. Triggers a fresh compaction when the projected size crosses the trigger
+//    ratio, OR when `compactionDirty` forces it (recovery hand-off, ADR-0012 §Recovery).
+// 3. Persists any new summary/watermark + clears dirty via the single CAS
+//    writer (ADR-0012 §One durable writer), the loser skipping safely on contention.
+// ===========================================================================
+
+/** Resolved per-turn compaction config (ADR-0012 §Config & kill switch), defaults applied. */
+export type CompactionConfig = {
+  compactionEnabled: boolean;
+  triggerRatio: number;
+  targetRatio: number;
+  reserveRatio: number;
+  keepRecentMessages: number;
+  minPrunableChars: number;
+  /** Threshold for pruning tool results in the kept (recent) messages after
+   * Stage 2 summarization. Higher than minPrunableChars — we trim extreme
+   * outliers (e.g. huge MCP tool dumps) without destroying useful context. */
+  minRecentPrunableChars: number;
+  /** Stage 0 context editing (ADR-0012 §Stage 0 — context editing): elide OLD bulky tool results to a
+   * placeholder before the trigger check, so a leaned view can avoid summarizing
+   * entirely. Gated alongside the COMPACTION_ENABLED kill switch. */
+  contextEditingEnabled: boolean;
+  /** Stage 0: exempt the last N tool results from elision (recency, by count). */
+  keepRecentToolResults: number;
+  /** Stage 0: only elide a tool result whose serialized output exceeds this. */
+  minEditableToolChars: number;
+};
+
+/** Built-in values for every {@link CompactionConfig} field. */
+export const DEFAULT_COMPACTION_CONFIG: CompactionConfig = {
+  compactionEnabled: true,
+  triggerRatio: 0.8,
+  targetRatio: 0.5,
+  reserveRatio: 0.05,
+  keepRecentMessages: 10,
+  minPrunableChars: 2000,
+  minRecentPrunableChars: 10000,
+  contextEditingEnabled: true,
+  keepRecentToolResults: 4,
+  // 50k chars ≈ 12.5k tokens — matches LibreChat's minPrunableToolChars, the only
+  // direct per-result char-gate analog. High enough to spare medium results (less
+  // cache churn) while still catching the ~160k-char mempalace dump.
+  minEditableToolChars: 50000,
+};
+
+/** Token budget derived by {@link computeBudget}. */
+export type Budget = {
+  /** Context window minus the output and safety reserves, never below 1. */
+  inputBudget: number;
+  /** `triggerRatio * inputBudget` (not rounded). */
+  triggerTokens: number;
+  /** `targetRatio * inputBudget` (not rounded). */
+  targetTokens: number;
+};
+
+/**
+ * Budget math (ADR-0012 §Tier 1 (budget math)): the trigger/target are fractions of the INPUT budget —
+ * the window minus the output reservation and a safety headroom — not of the raw
+ * window. When the resolved max output is unknown, reserve a conservative slice.
+ *
+ * @param contextWindow Model context window, in tokens.
+ * @param maxOutputTokens Resolved max output; when undefined the reserve is
+ *   `min(4096, floor(contextWindow * 0.25))`.
+ * @param config Supplies `reserveRatio`, `triggerRatio` and `targetRatio`.
+ * @returns The input budget and the trigger/target thresholds derived from it.
+ */
+export function computeBudget(
+  contextWindow: number,
+  maxOutputTokens: number | undefined,
+  config: CompactionConfig,
+): Budget {
+  const rawOutputReserve =
+    maxOutputTokens ?? Math.min(4096, Math.floor(contextWindow * 0.25));
+  // Cap the output reservation at half the window (ADR-0012 §Tier 1 (budget math)). litellm's
+  // `max_input_tokens` (which feeds `contextWindow`) is already input-scoped for
+  // some providers, so subtracting a large `max_output_tokens` again can collapse
+  // `inputBudget` toward 1 — making trigger/target ≈ 0 and thrashing. Capping
+  // keeps the (otherwise-safe) over-reservation from degenerating.
+  const maxOutputReserve = Math.min(
+    rawOutputReserve,
+    Math.floor(contextWindow * 0.5),
+  );
+  const safetyReserve = Math.floor(config.reserveRatio * contextWindow);
+  const inputBudget = Math.max(
+    1,
+    contextWindow - maxOutputReserve - safetyReserve,
+  );
+  return {
+    inputBudget,
+    triggerTokens: config.triggerRatio * inputBudget,
+    targetTokens: config.targetRatio * inputBudget,
+  };
+}
+
+/**
+ * First-turn safety margin on the char/4 projection (ADR-0012 §Token estimation (cold-start margin)): char/4
+ * under-counts CJK, dense JSON, and tool chatter, and on a cold start there is
+ * no provider-reported `usage.inputTokens` to correct it.
+ */
+export const COLD_START_MARGIN = 1.15;
+
+/**
+ * The Tier 1 trigger projection (ADR-0012 §Tier 1 (trigger projection)): what THIS turn is about to put on
+ * the wire, not just the stored messages. `overheadTokens` carries the
+ * estimated system prompt + tool schemas + skill payload — invisible to a
+ * message-only estimate but sent to the model on every turn (the observed
+ * live-test gap: provider reported 8888 input tokens vs ~986 message-only).
+ * `lastInputTokens` is the provider-reported count from the prior turn — the
+ * corrective baseline for turns ≥ 2 (threaded in the ADR-0012 §Context-usage ring usage-metadata chunk).
+ * When it is absent the whole char/4 projection is inflated by
+ * {@link COLD_START_MARGIN} (ADR-0012 §Token estimation (cold-start margin)).
+ */
+export function projectTier1Tokens(args: {
+  messageTokens: number;
+  priorSummaryTokens: number;
+  overheadTokens?: number;
+  lastInputTokens?: number;
+}): number {
+  const { messageTokens, priorSummaryTokens, overheadTokens, lastInputTokens } =
+    args;
+  // Fresh char/4 view of this turn: messages + stored summary + per-turn overhead.
+  const charBased = messageTokens + priorSummaryTokens + (overheadTokens ?? 0);
+
+  // A missing OR non-positive provider count means "no baseline"
+  // (ADR-0012 §Tier 1 (trigger projection)): some OpenAI-compatible / vLLM
+  // gateways omit `usage.inputTokens`, which is persisted as
+  // `contextTokens = 0`. A plain null check would let that 0 through, skipping
+  // the cold-start margin and turning the max() into a no-op, so those
+  // providers would get the raw char/4 figure with no buffer on every turn.
+  //
+  // With a baseline there are two independent estimates: `charBased` (current
+  // but chronically low) and `lastInputTokens` (accurate but stale — it lacks
+  // the messages appended since). The larger one wins; over-counting only
+  // triggers compaction earlier, never an overflow.
+  return lastInputTokens == null || lastInputTokens <= 0
+    ? Math.ceil(charBased * COLD_START_MARGIN)
+    : Math.max(Math.ceil(charBased), lastInputTokens);
+}
+
+/**
+ * Wraps the persisted summary text in the synthetic user-role UIMessage that
+ * is injected at the head of the view. The id is the fixed "context-summary".
+ */
+export function summaryUIMessage(text: string): PlatypusUIMessage {
+  const framed = `[Summary of earlier conversation]\n${text}`;
+  return {
+    id: "context-summary",
+    role: "user",
+    parts: [{ type: "text", text: framed }],
+  };
+}
+
+/** Fail-loud event so the transcript shows compaction happened (ADR-0012 §Tier 1).
+ */
+export type CompactionEvent = {
+  /** Discriminator; always "context-compacted". */
+  type: "context-compacted";
+  /** Messages folded into the summary this turn. */
+  messagesDropped: number;
+  /** Estimated tokens before compaction. */
+  tokensBefore: number;
+  /** Estimated tokens after compaction. */
+  tokensAfter: number;
+};
+
+/** Everything `applyTier1Compaction` needs for one turn. */
+export type Tier1Input = {
+  chatId: string;
+  /** Full durable history (post-`inlineFileUrls`, ADR-0012 §Token estimation). */
+  messages: PlatypusUIMessage[];
+  state: CompactionState;
+  budget: Budget;
+  config: CompactionConfig;
+  imageProvider: ImageProvider;
+  summarize: Summarize;
+  store: CompactionStore;
+  summarizerWindow?: number;
+  /**
+   * Estimated tokens of the per-turn payload that lives OUTSIDE `messages` —
+   * system prompt, tool schemas, skill list (ADR-0012 §Tier 1 (trigger projection)). It counts toward
+   * the trigger and is subtracted from the compaction target, because
+   * compaction cannot shrink it and hysteresis has to leave room for it
+   * (ADR-0012 §Tier 1 (hysteresis)).
+   */
+  overheadTokens?: number;
+  /** Provider-reported `usage.inputTokens` from the prior turn (ADR-0012 §Tier 1 (trigger projection), via ADR-0012 §Context-usage ring). */
+  lastInputTokens?: number;
+  /** Optional sink for the {@link CompactionEvent}. */
+  onEvent?: (event: CompactionEvent) => void;
+};
+
+/** Payload of the user-visible compaction trace. */
+export type CompactionTrace = {
+  /** Number of messages that were folded into the summary. */
+  messagesDropped: number;
+  /** First ~120 chars of the LLM-generated summary. */
+  summaryExcerpt?: string;
+};
+
+/** Tool name for the synthetic compaction-trace tool-call/result pair (ADR-0012 §Compaction trace in the timeline).
+ * Shared by the stream-trace producer (agent-runner), the strip filter that
+ * keeps it out of the model payload, the ADR-0012 §Force-compact on demand persisted-message builder, and the
+ * frontend display-name mapping. */
+export const COMPACT_CONTEXT_TOOL_NAME = "compact_context";
+
+/**
+ * Builds a standalone synthetic assistant message whose single part is a
+ * `compact_context` tool-call/result pair carrying the compaction trace
+ * (ADR-0012 §Force-compact on demand — forced compaction has no live stream to
+ * inject into, so the trace is persisted as its own message instead). The
+ * message is always appended ABOVE the watermark, so it is never itself
+ * summarized; the strip filter keeps it out of the model payload on subsequent
+ * turns.
+ *
+ * @param trace Dropped-message count and optional summary excerpt.
+ * @param id Message id; the tool call id is derived from it as `${id}-call`.
+ */
+export function buildCompactionTraceMessage(
+  trace: CompactionTrace,
+  id: string,
+): PlatypusUIMessage {
+  const { messagesDropped, summaryExcerpt } = trace;
+
+  // The excerpt key is present only when there is a non-empty excerpt.
+  const output: { messagesDropped: number; summaryExcerpt?: string } = {
+    messagesDropped,
+  };
+  if (summaryExcerpt) output.summaryExcerpt = summaryExcerpt;
+
+  const tracePart = {
+    type: `tool-${COMPACT_CONTEXT_TOOL_NAME}`,
+    toolCallId: `${id}-call`,
+    state: "output-available",
+    input: { messagesDropped },
+    output,
+  };
+
+  return {
+    id,
+    role: "assistant",
+    parts: [tracePart],
+  } as unknown as PlatypusUIMessage;
+}
+
+/** Result of `applyTier1Compaction`. */
+export type Tier1Output = {
+  /** The compacted view to send to the model (summary message + recent). */
+  messages: PlatypusUIMessage[];
+  /** True when a new summary was produced and persisted this turn. */
+  compacted: boolean;
+  commit?: CommitResult;
+  /**
+   * Set ONLY when a model summary was produced this turn — the user-visible
+   * "compaction happened" signal (ADR-0012 §Compaction trace in the timeline). Left undefined on
+   * prune-only and force-dirty-within-target no-op turns: those drop 0 messages
+   * and have no excerpt, so a trace would render an empty/confusing timeline
+   * entry.
+   */
+  compactionTrace?: CompactionTrace;
+};
+
+/** Splits history at the watermark message id. Returns the messages after it and
+ * whether the stored summary is still trustworthy (watermark id still present).
+ */
+function viewAfterWatermark(
+  messages: PlatypusUIMessage[],
+  state: CompactionState,
+): { afterWatermark: PlatypusUIMessage[]; priorSummary: string | null } {
+  // No watermark stored → nothing was summarized; any stored summary is ignored.
+  if (!state.summaryWatermark) {
+    return { afterWatermark: messages, priorSummary: null };
+  }
+  const idx = messages.findIndex((m) => m.id === state.summaryWatermark);
+  if (idx === -1) {
+    // Watermark message is gone (edited/deleted before invalidation landed):
+    // distrust the summary and fall back to the full history (defensive ADR-0012 §Summary invalidation).
+    return { afterWatermark: messages, priorSummary: null };
+  }
+  // The watermark message itself is part of the summarized prefix (idx + 1).
+  return {
+    afterWatermark: messages.slice(idx + 1),
+    priorSummary: state.contextSummary,
+  };
+}
+
+/**
+ * Tier 1 entry point: rebuilds the model view from persisted compaction state
+ * and, when triggered, compacts it and persists the outcome.
+ *
+ * Flow, as implemented below:
+ *  1. Drop everything up to the stored watermark (viewAfterWatermark).
+ *  2. Optionally run Stage 0 editing (editToolResults) on what remains.
+ *  3. Project the turn size; return the un-compacted view unless the
+ *     projection reaches `budget.triggerTokens` (with compaction enabled) or
+ *     `state.compactionDirty` is set.
+ *  4. Run compactUIMessages with target/wall values net of `overheadTokens`.
+ *  5. Write summary/watermark/dirty through commitWatermark, pinned to the
+ *     `state.version` read at the start.
+ *
+ * NOTE(review): the `Promise` return annotation below has no type argument
+ * (it looks lost in extraction); the resolved object has the Tier1Output
+ * fields — confirm against the original file.
+ */
+export async function applyTier1Compaction(
+  input: Tier1Input,
+): Promise {
+  const { messages, state, budget, config, imageProvider } = input;
+  const estimate = (msgs: PlatypusUIMessage[]) =>
+    estimateTokens(uiMessagesToCountUnits(msgs, imageProvider));
+
+  const { afterWatermark, priorSummary } = viewAfterWatermark(messages, state);
+  const priorSummaryTokens = priorSummary ? textTokens(priorSummary) : 0;
+
+  // Stage 0 — context editing (ADR-0012 §Stage 0 — context editing): elide OLD bulky tool results to
+  // placeholders BEFORE the trigger projection, so a leaned view can drop under
+  // the trigger and skip summarization entirely. Pure/deterministic, no durable
+  // state (ADR-0012 §View, not delete). Gated by the COMPACTION_ENABLED kill switch (recovery stays the
+  // net, ADR-0012 §Recovery is the net) AND the per-feature `contextEditingEnabled`. Returns the same array
+  // reference when nothing qualified, so the no-op case re-estimates nothing.
+  // NB (ADR-0012 §Stage 0 — context editing): the elided placeholders also flow into the prefix that
+  // Stage 2 would summarize, so a summarized result keeps only its placeholder —
+  // an accepted fidelity trade-off (a 40k dump's head+tail is poor summary fodder
+  // and the raw stays in the DB).
+  const contextEditing =
+    config.compactionEnabled && config.contextEditingEnabled
+      ? editToolResults(afterWatermark, {
+          keepRecentToolResults: config.keepRecentToolResults,
+          minEditableToolChars: config.minEditableToolChars,
+        })
+      : { messages: afterWatermark, resultsElided: 0, charsReclaimed: 0 };
+  const editedView = contextEditing.messages;
+  if (contextEditing.resultsElided > 0) {
+    logger.info(
+      {
+        metric: "context_edited",
+        chatId: input.chatId,
+        resultsElided: contextEditing.resultsElided,
+        charsReclaimed: contextEditing.charsReclaimed,
+      },
+      "context_edited",
+    );
+  }
+
+  // Prepends the synthetic summary message when there is a summary to show.
+  const inject = (summary: string | null, msgs: PlatypusUIMessage[]) =>
+    summary ? [summaryUIMessage(summary), ...msgs] : msgs;
+
+  // The view that would be sent if we did nothing more this turn.
+  const baseView = inject(priorSummary, editedView);
+  const overheadTokens = input.overheadTokens ?? 0;
+  // Compute the char/4 pass over the unsummarized view once and reuse it
+  // for both the trigger projection and compactUIMessages' no-op gate.
+  const messageTokens = estimate(editedView);
+  const projected = projectTier1Tokens({
+    messageTokens,
+    priorSummaryTokens,
+    overheadTokens,
+    lastInputTokens: input.lastInputTokens,
+  });
+
+  // The dirty flag triggers compaction even when `compactionEnabled` is false.
+  const forceCompact = state.compactionDirty;
+  const triggered =
+    forceCompact ||
+    (config.compactionEnabled && projected >= budget.triggerTokens);
+
+  logger.info(
+    {
+      metric: "compaction.check",
+      chatId: input.chatId,
+      compactionEnabled: config.compactionEnabled,
+      projected,
+      triggerTokens: budget.triggerTokens,
+      targetTokens: budget.targetTokens,
+      inputBudget: budget.inputBudget,
+      triggered,
+      forceCompact,
+      messageTokens,
+      priorSummaryTokens,
+      overheadTokens: input.overheadTokens ?? 0,
+      lastInputTokens: input.lastInputTokens,
+    },
+    "compaction.check",
+  );
+
+  if (!triggered) {
+    return { messages: baseView, compacted: false };
+  }
+
+  // Compaction can only shrink the messages, never the per-turn overhead, so
+  // the target the messages must fit in is reduced by it (ADR-0012 §Tier 1 (hysteresis)). When the
+  // overhead alone exhausts the target, hysteresis is impossible — warn loudly
+  // (compaction will re-fire every turn) but still compact: recovery is the
+  // only other net.
+  const effectiveTarget = Math.max(0, budget.targetTokens - overheadTokens);
+  if (overheadTokens >= budget.targetTokens) {
+    logger.warn(
+      { chatId: input.chatId, overheadTokens, target: budget.targetTokens },
+      "system/tool overhead alone exceeds the compaction target — compaction will re-fire each turn",
+    );
+  }
+
+  // The hard wall the kept view must fit under (ADR-0012 §Hard window wall), net of the per-turn
+  // overhead compaction cannot shrink — mirrors how effectiveTarget adjusts the
+  // soft target. Recent tool results are trimmed only when this is breached.
+  const effectiveInputBudget = Math.max(0, budget.inputBudget - overheadTokens);
+
+  const result = await compactUIMessages(editedView, {
+    targetTokens: effectiveTarget,
+    inputBudget: effectiveInputBudget,
+    keepRecentMessages: config.keepRecentMessages,
+    minPrunableChars: config.minPrunableChars,
+    minRecentPrunableChars: config.minRecentPrunableChars,
+    imageProvider,
+    priorSummary,
+    summarize: input.summarize,
+    summarizerWindow: input.summarizerWindow,
+    // When dirty-forced the estimator already proved wrong (ADR-0012 §Recovery): bypass the
+    // no-op gate so recovery's dirty flag actually shrinks the history.
+    force: forceCompact,
+    // The no-op gate estimates this exact set; reuse the value above.
+    knownEstimate: messageTokens,
+  });
+
+  // Built before the commit below, so the returned view carries the new
+  // summary whether or not the CAS write ends up being skipped.
+  const view = inject(result.summaryText ?? priorSummary, result.keptMessages);
+
+  // Persist through the single CAS writer (ADR-0012 §One durable writer). The decision is gated on the
+  // version we read; if a concurrent writer advanced it, we skip rather than
+  // recompute (the wasted summarize is bounded, never corrupting). The
+  // version-pinning gate is shared so both write paths decide identically.
+  const capturedVersion = state.version;
+  // On a version mismatch we skip as "covered" WITHOUT clearing dirty (ADR-0012
+  // §One durable writer). Clearing on skip is only safe when the winner actually
+  // compacted; a concurrent invalidateCompaction also advances the version yet
+  // leaves dirty set on purpose (it resets the summary, it does not shrink
+  // history) — clearing dirty here would then drop the forced compaction the
+  // overflow demanded. Leaving dirty set is strictly safe: worst case is one
+  // extra compaction next turn.
+  const pinnedWrite = (patch: WatermarkPatch) =>
+    commitWatermark(input.store, input.chatId, (latest) =>
+      latest.version === capturedVersion
+        ? { kind: "write", patch }
+        : { kind: "skip", reason: "covered" },
+    );
+  let commit: CommitResult | undefined;
+
+  if (result.usedModelCall) {
+    // Same-basis before/after for the user-visible reduction: both are
+    // char/4 message estimates plus the per-turn overhead. The trigger
+    // `projected` mixes in the provider's `lastInputTokens` floor and is NOT
+    // comparable to the message-only post estimate, so reporting it as "before"
+    // overstated the drop. Computed only on the model-call path (the only place
+    // these are reported).
+    const tokensBefore = messageTokens + priorSummaryTokens + overheadTokens;
+    const tokensAfter = result.estimatedTokens + overheadTokens;
+
+    commit = await pinnedWrite({
+      summary: result.summaryText,
+      watermark: result.watermarkId,
+      dirty: false,
+    });
+    logger.info(
+      {
+        metric: "compaction.fired",
+        tier: 1,
+        chatId: input.chatId,
+        tokensBefore,
+        tokensAfter,
+        // Keep the raw trigger projection for correlation with compaction.check.
+        projected,
+        messagesDropped: result.messagesDropped,
+      },
+      "compaction.fired",
+    );
+    input.onEvent?.({
+      type: "context-compacted",
+      messagesDropped: result.messagesDropped,
+      tokensBefore,
+      tokensAfter,
+    });
+  } else if (state.compactionDirty) {
+    // Forced by recovery but pruning/within-target sufficed: just clear the flag.
+    commit = await pinnedWrite({ dirty: false });
+  }
+
+  // Only surface a trace when an actual model summary was produced. Prune-only
+  // and force-dirty-within-target runs drop 0 messages with no excerpt — a
+  // trace there would be an empty, confusing timeline entry (ADR-0012 §Compaction trace in the timeline).
+  const compactionTrace: CompactionTrace | undefined =
+    result.usedModelCall && result.summaryText
+      ? {
+          messagesDropped: result.messagesDropped,
+          summaryExcerpt: result.summaryText.slice(0, 120),
+        }
+      : undefined;
+
+  // `compacted` mirrors `usedModelCall`; whether the write landed is in `commit`.
+  return {
+    messages: view,
+    compacted: result.usedModelCall,
+    commit,
+    compactionTrace,
+  };
+}
+
+/**
+ * Detects which summarized messages (at/below the watermark) the freshly
+ * submitted history changed or dropped — the ADR-0012 §Summary invalidation trigger. Because the client
+ * resubmits the full message array each turn (there is no separate edit/delete
+ * endpoint), divergence is found by comparing the persisted canonical history
+ * against the incoming one up to the watermark. Returns the ids that an
+ * edit/delete/regenerate touched; empty means the summary is still valid.
+ */ +export function affectedBelowWatermark( + persisted: PlatypusUIMessage[], + incoming: PlatypusUIMessage[], + watermarkId: string | null, +): string[] { + if (!watermarkId) return []; + const wmIdx = persisted.findIndex((m) => m.id === watermarkId); + if (wmIdx === -1) return [watermarkId]; // watermark message gone entirely + const incomingById = new Map(incoming.map((m) => [m.id, m])); + const affected: string[] = []; + for (let i = 0; i <= wmIdx; i++) { + const p = persisted[i]; + if (!p.id) continue; + const inc = incomingById.get(p.id); + if (!inc || stableStringify(inc.parts) !== stableStringify(p.parts)) { + affected.push(p.id); + } + } + return affected; +} + +/** + * Persists `compactionDirty = true` after a context-overflow recovery (ADR-0012 §Recovery). + * Recovery never writes summary/watermark — it only flags; the next + * `prepareChatTurn` sees the flag, forces Tier 1, and clears it inside the same + * CAS write that advances the watermark. Goes through the single writer (ADR-0012 §One durable writer); + * already-dirty is a no-op. + */ +export async function setCompactionDirty( + store: CompactionStore, + chatId: string, +): Promise { + return commitWatermark(store, chatId, (state) => + state.compactionDirty + ? { kind: "skip", reason: "no-op" } + : { kind: "write", patch: { dirty: true } }, + ); +} + +export async function invalidateCompaction( + store: CompactionStore, + chatId: string, + affectedIds: string[], + orderedIds: string[], +): Promise { + return commitWatermark(store, chatId, (state) => { + if (!state.summaryWatermark && !state.contextSummary) { + return { kind: "skip", reason: "no-op" }; + } + const wmIndex = state.summaryWatermark + ? orderedIds.indexOf(state.summaryWatermark) + : orderedIds.length; // null watermark ⇒ everything is "summarized-from-start" + const affectsSummarized = affectedIds.some((id) => { + const i = orderedIds.indexOf(id); + // Affected message is missing (deleted) or sits at/below the watermark. 
+ return i === -1 || (wmIndex !== -1 && i <= wmIndex); + }); + if (!affectsSummarized) return { kind: "skip", reason: "no-op" }; + return { kind: "write", patch: { summary: null, watermark: null } }; + }); +} + +// --- Tier 2 in-turn compaction (ADR-0012 §Tier 2) --- + +/** + * Per-turn Tier 2 compaction context (ADR-0012 §Tier 2). Null when the ADR-0012 §Config & kill switch or + * agent config disables proactive compaction. Sub-agents also receive Tier 2 + * (ADR-0012 §Sub-agents / §Tier 2 — they have no durable history for Tier 1, but their tool loop + * can bloat intra-turn). + */ +export type Tier2Context = { + triggerTokens: number; + targetTokens: number; + keepRecentMessages: number; + minPrunableChars: number; + imageProvider: ImageProvider; + summarize: Summarize; + summarizerWindow?: number; +}; + +/** + * Builds the Tier 2 in-turn compaction `prepareStep` callback (ADR-0012 §Tier 2). Fires + * before each step of a tool loop when the accumulated model messages exceed + * `triggerTokens` — compacts via `compactModelMessages` and returns the + * trimmed messages. Returns `undefined` when below the threshold so the SDK + * proceeds unchanged (ADR-0012 §Sub-agents / §Tier 2: no per-step overhead when the loop is small). + */ +export function buildTier2PrepareStep(ctx: Tier2Context): PrepareStepFunction { + return async ({ messages }) => { + const estimate = estimateTokens( + modelMessagesToCountUnits(messages, ctx.imageProvider), + ); + if (estimate < ctx.triggerTokens) return undefined; + + const result = await compactModelMessages(messages, { + targetTokens: ctx.targetTokens, + keepRecentMessages: ctx.keepRecentMessages, + minPrunableChars: ctx.minPrunableChars, + imageProvider: ctx.imageProvider, + summarize: ctx.summarize, + summarizerWindow: ctx.summarizerWindow, + // Reuse the trigger-check estimate; skips a redundant full pass. 
+ knownEstimate: estimate, + }); + + if (result.messagesDropped === 0) return undefined; + + logger.info( + { + messagesDropped: result.messagesDropped, + estimatedTokensBefore: estimate, + estimatedTokensAfter: result.estimatedTokens, + }, + "Tier 2 in-turn compaction fired", + ); + + return { messages: result.messages }; + }; +} diff --git a/apps/backend/src/runs/context-window.test.ts b/apps/backend/src/runs/context-window.test.ts new file mode 100644 index 00000000..27c63a9a --- /dev/null +++ b/apps/backend/src/runs/context-window.test.ts @@ -0,0 +1,350 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; + +vi.mock("../logger.ts", () => ({ + logger: { warn: vi.fn(), info: vi.fn(), error: vi.fn(), debug: vi.fn() }, +})); + +import { + ContextWindowResolver, + lookupRegistry, + DEFAULT_CONTEXT_WINDOW, + type Registry, + type ProviderWindowInput, +} from "./context-window.ts"; + +const REGISTRY: Registry = { + "gpt-4o": { max_input_tokens: 128000, max_output_tokens: 16384 }, + "claude-3-5-sonnet-20240620": { + max_input_tokens: 200000, + max_output_tokens: 8192, + }, + "anthropic.claude-3-5-sonnet-20240620-v1:0": { + max_input_tokens: 200000, + max_output_tokens: 4096, + }, + "legacy-model": { max_tokens: 4096 }, +}; + +const loadRegistry = () => Promise.resolve(REGISTRY); + +function resolver() { + return new ContextWindowResolver({ loadRegistry }); +} + +const openai: ProviderWindowInput = { + id: "prov-openai", + providerType: "OpenAI", + baseUrl: null, + apiKey: "sk-x", +}; + +describe("lookupRegistry — key normalization (ADR-0012 §Window resolution (key normalization))", () => { + it("exact match", () => { + expect(lookupRegistry(REGISTRY, "gpt-4o")?.max_input_tokens).toBe(128000); + }); + + it("strips a provider prefix", () => { + expect(lookupRegistry(REGISTRY, "openai/gpt-4o")?.max_input_tokens).toBe( + 128000, + ); + }); + + it("lowercases", () => { + expect(lookupRegistry(REGISTRY, "GPT-4o")?.max_input_tokens).toBe(128000); + }); + + 
it("uses the alias map for an Azure deployment name", () => { + expect( + lookupRegistry(REGISTRY, "my-azure-deploy", { + "my-azure-deploy": "gpt-4o", + })?.max_input_tokens, + ).toBe(128000); + }); + + it("resolves a Bedrock ARN to its vendor.model id", () => { + const arn = + "arn:aws:bedrock:us-east-1::foundation-model/anthropic.claude-3-5-sonnet-20240620-v1:0"; + expect(lookupRegistry(REGISTRY, arn)?.max_output_tokens).toBe(4096); + }); + + it("family heuristic: dated suffix matches the base key", () => { + // "gpt-4o-2024-11-20" → longest prefix key "gpt-4o" + expect( + lookupRegistry(REGISTRY, "gpt-4o-2024-11-20")?.max_input_tokens, + ).toBe(128000); + }); + + it("returns undefined on a true MISS", () => { + expect(lookupRegistry(REGISTRY, "totally-unknown-xyz")).toBeUndefined(); + }); +}); + +describe("resolveContextWindow — resolution order", () => { + beforeEach(() => vi.clearAllMocks()); + + it("1. manual override wins over everything", async () => { + const r = resolver(); + const out = await r.resolve( + { + ...openai, + modelMeta: { + "gpt-4o": { contextWindow: 64000, maxOutputTokens: 2048 }, + }, + }, + "gpt-4o", + ); + expect(out).toEqual({ + contextWindow: 64000, + maxOutputTokens: 2048, + source: "override", + }); + }); + + it("3. falls to the litellm registry when no override / API", async () => { + const r = resolver(); + const out = await r.resolve({ ...openai }, "gpt-4o"); + expect(out).toEqual({ + contextWindow: 128000, + maxOutputTokens: 16384, + source: "registry", + }); + }); + + it("ignores litellm max_tokens (output cap, not window) → default (ADR-0012 §Window resolution)", async () => { + // "legacy-model" has only max_tokens; that is the OUTPUT cap, so it must NOT + // be read as the context window. Falls through to the conservative default. 
+ const r = resolver(); + const out = await r.resolve({ ...openai }, "legacy-model"); + expect(out.contextWindow).toBe(DEFAULT_CONTEXT_WINDOW); + expect(out.source).toBe("default"); + }); + + it("merges a maxOutputTokens-only override onto a registry window (ADR-0012 §Window resolution)", async () => { + const r = resolver(); + const out = await r.resolve( + { ...openai, modelMeta: { "gpt-4o": { maxOutputTokens: 999 } } }, + "gpt-4o", + ); + // No contextWindow override → window from registry, but output cap overridden. + expect(out).toEqual({ + contextWindow: 128000, + maxOutputTokens: 999, + source: "registry", + }); + }); + + it("4. conservative default + source=default on a MISS (ADR-0012 §Context-usage ring)", async () => { + const r = resolver(); + const out = await r.resolve({ ...openai }, "unknown-model-zzz"); + expect(out).toEqual({ + contextWindow: DEFAULT_CONTEXT_WINDOW, + maxOutputTokens: undefined, + source: "default", + }); + }); +}); + +describe("API auto-detect parsers", () => { + it("Google: inputTokenLimit / outputTokenLimit", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + inputTokenLimit: 1048576, + outputTokenLimit: 8192, + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { + id: "g", + providerType: "Google", + baseUrl: "https://gen.example", + apiKey: "k", + }, + "gemini-1.5-pro", + ); + expect(out).toEqual({ + contextWindow: 1048576, + maxOutputTokens: 8192, + source: "api", + }); + expect(httpGetJson).toHaveBeenCalledWith( + "https://gen.example/v1beta/models/gemini-1.5-pro", + { "x-goog-api-key": "k" }, + ); + }); + + it("OpenRouter: matches id → context_length", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + data: [ + { id: "other", context_length: 1 }, + { + id: "meta-llama/llama-3.1-70b", + context_length: 131072, + top_provider: { max_completion_tokens: 4096 }, + }, + ], + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson 
}); + const out = await r.resolve( + { + id: "or", + providerType: "OpenRouter", + baseUrl: "https://openrouter.ai", + }, + "meta-llama/llama-3.1-70b", + ); + expect(out).toEqual({ + contextWindow: 131072, + maxOutputTokens: 4096, + source: "api", + }); + }); + + it("vLLM / OpenAI-compatible: max_model_len from a custom baseUrl", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + data: [{ id: "my-vllm-model", max_model_len: 32768 }], + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { + id: "v", + providerType: "OpenAI", + baseUrl: "http://localhost:8000", + apiKey: "x", + }, + "my-vllm-model", + ); + expect(out.contextWindow).toBe(32768); + expect(out.source).toBe("api"); + }); + + it("vLLM: a baseUrl already ending in /v1 probes /v1/models, not /v1/v1/models", async () => { + const httpGetJson = vi.fn().mockResolvedValue({ + data: [{ id: "qwen36", max_model_len: 262144 }], + }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { + id: "v", + providerType: "OpenAI", + baseUrl: "http://localhost:8000/v1", + apiKey: "x", + }, + "qwen36", + ); + expect(out.contextWindow).toBe(262144); + expect(out.source).toBe("api"); + expect(httpGetJson).toHaveBeenCalledWith( + "http://localhost:8000/v1/models", + expect.anything(), + ); + }); + + it("official OpenAI (no baseUrl) skips the probe and falls to registry", async () => { + const httpGetJson = vi.fn(); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve({ ...openai, baseUrl: null }, "gpt-4o"); + expect(httpGetJson).not.toHaveBeenCalled(); + expect(out.source).toBe("registry"); + }); + + it("a failing API probe falls through to the registry", async () => { + const httpGetJson = vi.fn().mockRejectedValue(new Error("boom")); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const out = await r.resolve( + { id: "g", providerType: 
"Google", baseUrl: "https://gen.example" }, + "gpt-4o", + ); + expect(out.source).toBe("registry"); + }); +}); + +describe("registry load failure (ADR-0012 §Window resolution)", () => { + it("a throwing loader degrades to empty registry → default, no reject", async () => { + const r = new ContextWindowResolver({ + loadRegistry: () => Promise.reject(new Error("bad vendored json")), + }); + const out = await r.resolve({ ...openai }, "gpt-4o"); + expect(out.source).toBe("default"); + expect(out.contextWindow).toBe(DEFAULT_CONTEXT_WINDOW); + }); +}); + +describe("cache + evict (ADR-0012 §Window resolution (caching & eviction))", () => { + it("caches within the TTL (one probe), evict forces a re-probe", async () => { + const httpGetJson = vi + .fn() + .mockResolvedValue({ data: [{ id: "m", max_model_len: 1000 }] }); + const r = new ContextWindowResolver({ loadRegistry, httpGetJson }); + const p = { + id: "v", + providerType: "OpenAI", + baseUrl: "http://x", + apiKey: "k", + }; + + await r.resolve(p, "m"); + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(1); // second hit served from cache + + r.evict("v"); + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(2); // evict busted the cache + }); + + it("the cached value expires after the TTL", async () => { + let now = 1000; + const httpGetJson = vi + .fn() + .mockResolvedValue({ data: [{ id: "m", max_model_len: 1000 }] }); + const r = new ContextWindowResolver({ + loadRegistry, + httpGetJson, + ttlMs: 100, + now: () => now, + }); + const p = { + id: "v", + providerType: "OpenAI", + baseUrl: "http://x", + apiKey: "k", + }; + + await r.resolve(p, "m"); + now += 50; + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(1); // still within TTL + + now += 100; // past TTL + await r.resolve(p, "m"); + expect(httpGetJson).toHaveBeenCalledTimes(2); + }); + + it("a default-source result is cached briefly, not for the full TTL", async () => { + let now = 0; + // API probe 
never yields a window and the model is not in the registry → + // every resolve falls to source:"default". + const httpGetJson = vi.fn().mockResolvedValue({ data: [] }); + const r = new ContextWindowResolver({ + loadRegistry, + httpGetJson, + ttlMs: 60 * 60 * 1000, // full TTL is an hour + now: () => now, + }); + const p = { + id: "v", + providerType: "OpenAI", + baseUrl: "http://x", + apiKey: "k", + }; + + const first = await r.resolve(p, "unknown-model"); + expect(first.source).toBe("default"); + + now += 30 * 1000; // within the 60 s default-source TTL + await r.resolve(p, "unknown-model"); + expect(httpGetJson).toHaveBeenCalledTimes(1); // still cached + + now += 40 * 1000; // 70 s total — past the short TTL, far short of the hour + await r.resolve(p, "unknown-model"); + expect(httpGetJson).toHaveBeenCalledTimes(2); // re-probed, blip not pinned + }); +}); diff --git a/apps/backend/src/runs/context-window.ts b/apps/backend/src/runs/context-window.ts new file mode 100644 index 00000000..4e42f621 --- /dev/null +++ b/apps/backend/src/runs/context-window.ts @@ -0,0 +1,474 @@ +/** + * Context-window resolution (ADR-0012 §Window resolution). + * + * Resolves the usable context window (and max output tokens) for a + * provider+model, in this order: + * + * 1. Manual override — `provider.modelMeta[modelId]`. + * 2. API auto-detect — Google / OpenRouter / vLLM expose the window. + * 3. litellm registry — community model price/context JSON (covers + * OpenAI / Anthropic / Bedrock, which don't expose it). + * 4. Conservative default — {@link DEFAULT_CONTEXT_WINDOW} (8192). + * + * A fall-through to the default, and every registry key MISS, is `log.warn`'d: + * the window is then unknown and the ring must render neutral + * (ADR-0012 §Context-usage ring). + * + * Results are cached per `providerId:modelId` with a TTL. 
Editing a `modelMeta` + * override must call {@link ContextWindowResolver.evict} immediately so the + * override takes effect without waiting for the TTL + * (ADR-0012 §Window resolution (caching & eviction)). + * + * The registry lookup and HTTP probe are injected so this module is unit + * testable without network or a vendored multi-MB JSON file + * (ADR-0012 §Window resolution (key normalization) cases are + * exercised against small fixture registries). + */ + +import { logger } from "../logger.ts"; + +/** Conservative window when nothing else resolves. */ +export const DEFAULT_CONTEXT_WINDOW = 8192; + +/** Default cache TTL: API-detected windows can drift, the override path evicts. */ +export const DEFAULT_CACHE_TTL_MS = 60 * 60 * 1000; // 1 hour + +/** + * Short TTL for `source: "default"` resolutions (ADR-0012 §Window resolution (caching & eviction)). A registry + * MISS or a transient API failure falls to 8192; caching that for the full hour + * pins a wrong window long after the blip clears. A 60 s TTL lets the next turn + * re-probe while still collapsing a burst of same-turn lookups. + */ +export const DEFAULT_SOURCE_CACHE_TTL_MS = 60 * 1000; // 1 minute + +/** Where a resolved window came from — drives ring neutrality (ADR-0012 §Context-usage ring). */ +export type WindowSource = "override" | "api" | "registry" | "default"; + +export type ResolvedWindow = { + contextWindow: number; + maxOutputTokens?: number; + source: WindowSource; +}; + +/** The slice of a provider row this module needs. */ +export type ProviderWindowInput = { + id: string; + providerType: string; + baseUrl?: string | null; + apiKey?: string | null; + modelMeta?: Record< + string, + { contextWindow?: number; maxOutputTokens?: number } + > | null; +}; + +/** A litellm registry entry (subset of the fields we read). 
/**
 * One litellm registry entry, reduced to the fields this module reads.
 * `max_tokens` is declared because litellm ships it, but it is never used as a
 * context window (see {@link windowFromRegistryEntry}).
 */
export type RegistryEntry = {
  max_input_tokens?: number;
  max_output_tokens?: number;
  max_tokens?: number;
};

/** litellm registry: model key → entry. */
export type Registry = Record<string, RegistryEntry>;

/** Fetches and parses JSON from a URL. Injected so tests avoid network. */
export type HttpGetJson = (
  url: string,
  headers?: Record<string, string>,
) => Promise<unknown>;

export type ResolverDeps = {
  /** Provides the litellm registry (lazy; may be empty until vendored). */
  loadRegistry?: () => Promise<Registry>;
  /** model id → registry key aliases (Bedrock ARNs, Azure deployments, …). */
  aliasMap?: Record<string, string>;
  httpGetJson?: HttpGetJson;
  ttlMs?: number;
  now?: () => number;
};

// ---------------------------------------------------------------------------
// litellm registry key normalization (ADR-0012 §Window resolution (key normalization))
// ---------------------------------------------------------------------------

/** Strips a Bedrock ARN down to its `vendor.model` id, if it is one. */
function bedrockModelFromArn(modelId: string): string | undefined {
  const match = /foundation-model\/(.+)$/.exec(modelId);
  return match?.[1];
}

/**
 * Reads `table[key]` only when `key` is an own property.
 *
 * Model ids are caller-supplied strings; a plain `table[key]` would hand back
 * inherited `Object.prototype` members for ids such as "constructor" or
 * "toString" and treat them as registry hits.
 */
function ownValue<T>(table: Record<string, T>, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : undefined;
}

/**
 * True when `keyLower` names the model family that `idLower` belongs to:
 * either the two are equal, or `idLower` continues `keyLower` after a
 * separator ("-", ":", "/", or a "." that does not start a version number).
 *
 * A "." followed by a digit is a version continuation ("gpt-4" vs "gpt-4.5",
 * "claude-2" vs "claude-2.1"), i.e. a different model, so it is not a match.
 * Both arguments must already be lowercased.
 */
function isFamilyPrefix(keyLower: string, idLower: string): boolean {
  if (idLower === keyLower) return true;
  if (!idLower.startsWith(keyLower)) return false;
  const next = idLower[keyLower.length];
  if (next === "-" || next === ":" || next === "/") return true;
  if (next !== ".") return false;
  const afterDot = idLower[keyLower.length + 1];
  return afterDot === undefined || afterDot < "0" || afterDot > "9";
}

/**
 * Resolves a registry entry for a model id via the normalization chain:
 * exact → strip provider prefix → lowercase → alias map → Bedrock ARN →
 * family heuristic (longest registry key that prefixes the id) → MISS.
 *
 * Every table read is an own-property read, so ids that collide with
 * `Object.prototype` member names resolve to a MISS instead of a bogus entry.
 *
 * @param registry  litellm registry to search.
 * @param modelId   model id as configured on the agent/provider.
 * @param aliasMap  optional model id → registry key aliases.
 * @returns the matching entry, or `undefined` on a MISS.
 */
export function lookupRegistry(
  registry: Registry,
  modelId: string,
  aliasMap: Record<string, string> = {},
): RegistryEntry | undefined {
  // "openai/gpt-4o" → "gpt-4o" (only the first path segment is dropped).
  const slash = modelId.indexOf("/");
  const stripped = slash >= 0 ? modelId.slice(slash + 1) : modelId;

  // 1–3. exact, provider prefix stripped, then the lowercase form of each.
  const directCandidates = [
    modelId,
    stripped,
    modelId.toLowerCase(),
    stripped.toLowerCase(),
  ];
  for (const candidate of directCandidates) {
    const hit = ownValue(registry, candidate);
    if (hit) return hit;
  }

  // 4. alias map (Azure deployment names, custom vLLM names, …)
  const alias = ownValue(aliasMap, modelId);
  if (alias) {
    const hit = ownValue(registry, alias);
    if (hit) return hit;
  }

  // 5. Bedrock ARN → vendor.model, tried bare and under the "bedrock/" prefix,
  //    each also lowercased (registry keys for Bedrock are lowercase; ARNs are
  //    not guaranteed to be).
  const bedrock = bedrockModelFromArn(modelId);
  if (bedrock) {
    const bedrockCandidates = [
      bedrock,
      `bedrock/${bedrock}`,
      bedrock.toLowerCase(),
      `bedrock/${bedrock.toLowerCase()}`,
    ];
    for (const candidate of bedrockCandidates) {
      const hit = ownValue(registry, candidate);
      if (hit) return hit;
    }
  }

  // 6. family heuristic — longest registry key that prefixes the id at a
  //    separator boundary, so a dated suffix ("gpt-4o-2024-11-20") resolves to
  //    its base key while "gpt-4" does NOT match "gpt-4.5…"
  //    (ADR-0012 §Window resolution (key normalization)).
  //    Case-insensitive so mixed-case registry keys ("Qwen/…", "meta-llama/…")
  //    still match lowercase ids from providers that normalize model names.
  const strippedLower = stripped.toLowerCase();
  let bestKey: string | undefined;
  for (const key of Object.keys(registry)) {
    if (!isFamilyPrefix(key.toLowerCase(), strippedLower)) continue;
    if (bestKey === undefined || key.length > bestKey.length) bestKey = key;
  }

  // 7. MISS when no family key matched.
  return bestKey === undefined ? undefined : registry[bestKey];
}

/**
 * Maps a registry entry to the window fields the resolver uses.
 *
 * Only the explicit input limit is trusted. litellm's `max_tokens` is the
 * OUTPUT cap (not the context window); using it would silently under-size the
 * window and cause constant over-compaction (ADR-0012 §Window resolution).
 * When `max_input_tokens` is absent no window is returned, so the caller falls
 * to the conservative default, which at least surfaces a warn + neutral ring
 * rather than a wrong number.
 */
function windowFromRegistryEntry(entry: RegistryEntry): {
  contextWindow?: number;
  maxOutputTokens?: number;
} {
  return {
    contextWindow: entry.max_input_tokens,
    maxOutputTokens: entry.max_output_tokens,
  };
}

// ---------------------------------------------------------------------------
// API auto-detect parsers
// ---------------------------------------------------------------------------

/** Removes any trailing slashes from a URL. */
function trimSlash(url: string): string {
  return url.replace(/\/+$/, "");
}

/**
 * Google: GET `{base}/v1beta/models/{modelId}` and read `inputTokenLimit` /
 * `outputTokenLimit`. Returns `undefined` when the response carries no numeric
 * input limit. Errors from `httpGetJson` propagate to the caller.
 */
async function detectGoogle(
  provider: ProviderWindowInput,
  modelId: string,
  httpGetJson: HttpGetJson,
): Promise<ResolvedWindow | undefined> {
  const base = trimSlash(
    provider.baseUrl || "https://generativelanguage.googleapis.com",
  );
  const headers = provider.apiKey
    ? { "x-goog-api-key": provider.apiKey }
    : undefined;
  const body = (await httpGetJson(
    `${base}/v1beta/models/${modelId}`,
    headers,
  )) as {
    inputTokenLimit?: number;
    outputTokenLimit?: number;
  };
  if (typeof body?.inputTokenLimit === "number") {
    return {
      contextWindow: body.inputTokenLimit,
      maxOutputTokens: body.outputTokenLimit,
      source: "api",
    };
  }
  return undefined;
}

/**
 * OpenRouter: GET `{base}/api/v1/models`, find the entry whose `id` equals
 * `modelId`, and read `context_length` (plus the top provider's completion
 * cap). Returns `undefined` when the model is not listed.
 */
async function detectOpenRouter(
  provider: ProviderWindowInput,
  modelId: string,
  httpGetJson: HttpGetJson,
): Promise<ResolvedWindow | undefined> {
  const base = trimSlash(provider.baseUrl || "https://openrouter.ai");
  const body = (await httpGetJson(`${base}/api/v1/models`)) as {
    data?: Array<{
      id?: string;
      context_length?: number;
      top_provider?: { max_completion_tokens?: number };
    }>;
  };
  const entry = body?.data?.find((m) => m.id === modelId);
  if (entry && typeof entry.context_length === "number") {
    return {
      contextWindow: entry.context_length,
      maxOutputTokens: entry.top_provider?.max_completion_tokens,
      source: "api",
    };
  }
  return undefined;
}

/**
 * vLLM / OpenAI-compatible servers: GET `{root}/v1/models` and read
 * `max_model_len` for the matching id. Skipped (returns `undefined`) when the
 * provider has no `baseUrl`, because official OpenAI omits the field.
 */
async function detectOpenAiCompatible(
  provider: ProviderWindowInput,
  modelId: string,
  httpGetJson: HttpGetJson,
): Promise<ResolvedWindow | undefined> {
  if (!provider.baseUrl) return undefined; // official OpenAI omits the field
  // baseUrl conventionally ends in "/v1" (the OpenAI SDK needs it that way for
  // chat calls), but the models endpoint is "{root}/v1/models" — strip a
  // trailing "/v1" first so we don't request "/v1/v1/models" (404 → the window
  // silently falls to the default and the usage ring renders "unknown").
  const base = trimSlash(provider.baseUrl).replace(/\/v1$/, "");
  const headers = provider.apiKey
    ? { authorization: `Bearer ${provider.apiKey}` }
    : undefined;
  const body = (await httpGetJson(`${base}/v1/models`, headers)) as {
    data?: Array<{ id?: string; max_model_len?: number }>;
  };
  const entry = body?.data?.find((m) => m.id === modelId);
  // vLLM and most OpenAI-compatible servers expose `max_model_len`.
  if (entry && typeof entry.max_model_len === "number") {
    return { contextWindow: entry.max_model_len, source: "api" };
  }
  return undefined;
}

/**
 * Dispatches to the provider-specific detector by `providerType`.
 * Never rejects: any detector error is logged at warn level and reported as
 * `undefined` so resolution falls through to the registry.
 */
async function detectViaApi(
  provider: ProviderWindowInput,
  modelId: string,
  httpGetJson: HttpGetJson,
): Promise<ResolvedWindow | undefined> {
  try {
    switch (provider.providerType) {
      case "Google":
        return await detectGoogle(provider, modelId, httpGetJson);
      case "OpenRouter":
        return await detectOpenRouter(provider, modelId, httpGetJson);
      case "OpenAI":
        return await detectOpenAiCompatible(provider, modelId, httpGetJson);
      default:
        return undefined; // Anthropic / Bedrock — no API window, use registry
    }
  } catch (error) {
    logger.warn(
      {
        error,
        providerId: provider.id,
        modelId,
        providerType: provider.providerType,
      },
      "context-window API auto-detect failed; falling through",
    );
    return undefined;
  }
}

// ---------------------------------------------------------------------------
// Resolver (cache + evict)
// ---------------------------------------------------------------------------

/** ADR-0012 §Window resolution (caching & eviction): 5 s hard cap so a hung provider endpoint never blocks turns for ~300 s. */
/**
 * Timeout in milliseconds applied to every auto-detect HTTP probe issued by
 * {@link defaultHttpGetJson}, so an unresponsive endpoint fails fast.
 */
const API_DETECT_TIMEOUT_MS = 5000;

/**
 * Default {@link HttpGetJson}: a `fetch` GET bounded by
 * {@link API_DETECT_TIMEOUT_MS}.
 *
 * @throws Error when the response status is not OK, or when `fetch` itself
 *         fails or is aborted by the timeout.
 */
const defaultHttpGetJson: HttpGetJson = async (url, headers) => {
  const res = await fetch(url, {
    headers,
    signal: AbortSignal.timeout(API_DETECT_TIMEOUT_MS),
  });
  if (!res.ok) throw new Error(`GET ${url} → ${res.status}`);
  return res.json();
};

/** A cached resolution and the clock value at which it stops being served. */
type CacheEntry = { value: ResolvedWindow; expiresAt: number };

/**
 * Resolves and caches context windows per `providerId:modelId`.
 *
 * Resolution order: manual override → API auto-detect → litellm registry →
 * conservative default. Results are cached with a TTL (a shorter one for
 * `source: "default"`), concurrent lookups for one key share a single
 * in-flight resolution, and {@link ContextWindowResolver.evict} drops both the
 * cached and the in-flight state for a provider.
 */
export class ContextWindowResolver {
  #cache = new Map<string, CacheEntry>();
  /** ADR-0012 §Window resolution (caching & eviction): single-flight — concurrent callers for the same key share one fetch. */
  #inflight = new Map<string, Promise<ResolvedWindow>>();
  #loadRegistry: () => Promise<Registry>;
  /** Loaded lazily on the first registry lookup; `undefined` until then. */
  #registry: Registry | undefined;
  #aliasMap: Record<string, string>;
  #httpGetJson: HttpGetJson;
  #ttlMs: number;
  #now: () => number;

  /**
   * @param deps injectable collaborators; every field is optional and falls
   *             back to an empty registry, no aliases, the `fetch`-based probe,
   *             the default TTL and `Date.now`.
   */
  constructor(deps: ResolverDeps = {}) {
    this.#loadRegistry =
      deps.loadRegistry ?? ((): Promise<Registry> => Promise.resolve({}));
    this.#aliasMap = deps.aliasMap ?? {};
    this.#httpGetJson = deps.httpGetJson ?? defaultHttpGetJson;
    this.#ttlMs = deps.ttlMs ?? DEFAULT_CACHE_TTL_MS;
    this.#now = deps.now ?? (() => Date.now());
  }

  /** Drops all cached windows for a provider — call on `modelMeta` edit (ADR-0012 §Window resolution (caching & eviction)). */
  evict(providerId: string): void {
    const prefix = `${providerId}:`;
    for (const key of this.#cache.keys()) {
      if (key.startsWith(prefix)) this.#cache.delete(key);
    }
    // Also forget any in-flight resolution for this provider so the next call
    // re-resolves with the updated modelMeta rather than caching a stale result.
    for (const key of this.#inflight.keys()) {
      if (key.startsWith(prefix)) this.#inflight.delete(key);
    }
  }

  /** Looks `modelId` up in the lazily loaded registry. Never rejects. */
  async #registryEntry(modelId: string): Promise<RegistryEntry | undefined> {
    if (this.#registry === undefined) {
      // A failing loader (bad vendored JSON, fs error) must not reject the whole
      // resolution — degrade to an empty registry + warn (ADR-0012 §Window resolution).
      try {
        this.#registry = await this.#loadRegistry();
      } catch (error) {
        logger.warn(
          { error },
          "litellm registry load failed; treating as empty",
        );
        this.#registry = {};
      }
    }
    return lookupRegistry(this.#registry, modelId, this.#aliasMap);
  }

  /**
   * Returns the window for `provider` + `modelId`, served from cache when a
   * live entry exists and otherwise resolved once per key.
   *
   * @param provider the provider fields needed for resolution.
   * @param modelId  model id as configured.
   * @returns the resolved window together with its source.
   */
  async resolve(
    provider: ProviderWindowInput,
    modelId: string,
  ): Promise<ResolvedWindow> {
    const cacheKey = `${provider.id}:${modelId}`;
    const cached = this.#cache.get(cacheKey);
    if (cached && cached.expiresAt > this.#now()) return cached.value;

    // ADR-0012 §Window resolution (caching & eviction): single-flight — reuse an in-flight promise rather than spawning a
    // second fetch for the same key (cold-cache stampede protection).
    const existing = this.#inflight.get(cacheKey);
    if (existing) return existing;

    const promise: Promise<ResolvedWindow> = this.#resolveUncached(
      provider,
      modelId,
    ).then((value) => {
      // Only write the cache if this promise is still the live in-flight one.
      // An evict() during the fetch deletes the inflight entry; without this
      // guard the resolving promise would repopulate the cache with the stale
      // pre-update value and defeat the eviction for a full TTL.
      if (this.#inflight.get(cacheKey) === promise) {
        // A default-source result (MISS or transient API failure) gets a short
        // TTL so a blip doesn't pin the default window for the full TTL.
        const ttl =
          value.source === "default"
            ? Math.min(DEFAULT_SOURCE_CACHE_TTL_MS, this.#ttlMs)
            : this.#ttlMs;
        this.#cache.set(cacheKey, { value, expiresAt: this.#now() + ttl });
        this.#inflight.delete(cacheKey);
      }
      return value;
    });
    // Store before awaiting so concurrent callers see the same promise.
    this.#inflight.set(cacheKey, promise);
    try {
      return await promise;
    } catch (err) {
      // Same identity guard as the success path: after an evict() a newer
      // resolution may own this key, and removing its entry would stop its
      // result from being cached and break single-flight for later callers.
      if (this.#inflight.get(cacheKey) === promise) {
        this.#inflight.delete(cacheKey);
      }
      throw err;
    }
  }

  /** Runs the four-step resolution chain without consulting the cache. */
  async #resolveUncached(
    provider: ProviderWindowInput,
    modelId: string,
  ): Promise<ResolvedWindow> {
    // 1. Manual override
    const override = provider.modelMeta?.[modelId];
    if (override?.contextWindow) {
      return {
        contextWindow: override.contextWindow,
        maxOutputTokens: override.maxOutputTokens,
        source: "override",
      };
    }

    // 2. API auto-detect (an output-cap-only override still wins over the API cap)
    const api = await detectViaApi(provider, modelId, this.#httpGetJson);
    if (api?.contextWindow) {
      return {
        contextWindow: api.contextWindow,
        maxOutputTokens: override?.maxOutputTokens ?? api.maxOutputTokens,
        source: "api",
      };
    }

    // 3. litellm registry
    const entry = await this.#registryEntry(modelId);
    if (entry) {
      const { contextWindow, maxOutputTokens } = windowFromRegistryEntry(entry);
      if (contextWindow) {
        return {
          contextWindow,
          maxOutputTokens: override?.maxOutputTokens ?? maxOutputTokens,
          source: "registry",
        };
      }
    } else {
      logger.warn(
        {
          metric: "litellm.key_miss",
          providerId: provider.id,
          modelId,
          providerType: provider.providerType,
        },
        "litellm registry key MISS — falling to default window",
      );
    }

    // 4. Conservative default
    logger.warn(
      {
        metric: "context_window.fell_to_default",
        providerId: provider.id,
        modelId,
        default: DEFAULT_CONTEXT_WINDOW,
      },
      "context window unresolved — using conservative default (ring neutral)",
    );
    return {
      contextWindow: DEFAULT_CONTEXT_WINDOW,
      maxOutputTokens: override?.maxOutputTokens,
      source: "default",
    };
  }
}

/** Process-wide resolver. Routes use this; tests construct their own. */
*/ +import { loadBuiltinRegistry } from "./litellm-registry.ts"; +export const contextWindowResolver = new ContextWindowResolver({ + loadRegistry: loadBuiltinRegistry, +}); diff --git a/apps/backend/src/runs/litellm-registry.ts b/apps/backend/src/runs/litellm-registry.ts new file mode 100644 index 00000000..8d217a75 --- /dev/null +++ b/apps/backend/src/runs/litellm-registry.ts @@ -0,0 +1,346 @@ +/** + * Minimal vendored subset of the litellm model_prices_and_context_window.json + * (MIT licence — https://github.com/BerriAI/litellm). + * + * Only includes `max_input_tokens` and `max_output_tokens` — the two fields + * {@link ContextWindowResolver} reads. Covers providers whose context window is + * not available via a live API call (OpenAI, Anthropic, Bedrock). Google and + * OpenRouter are auto-detected at runtime and do not need entries here. + * + * Keys follow the litellm naming convention — bare model ids without a provider + * prefix. The registry lookup in context-window.ts tries exact → stripped → + * lowercase → alias → Bedrock ARN → family heuristic before a MISS. + * + * Keep sorted alphabetically within each vendor section for easier diffing. + * Update when models whose windows differ from their family default are released. 
/**
 * Built-in context-window table: model key → `max_input_tokens` /
 * `max_output_tokens`. Entry order is preserved from the original table; only
 * the two Bedrock Llama 3 (8B / 70B) input windows changed.
 */
const REGISTRY: Registry = {
  // ---------------------------------------------------------------------------
  // OpenAI
  // ---------------------------------------------------------------------------
  "chatgpt-4o-latest": { max_input_tokens: 128000, max_output_tokens: 16384 },
  "gpt-3.5-turbo": { max_input_tokens: 16385, max_output_tokens: 4096 },
  "gpt-3.5-turbo-0125": { max_input_tokens: 16385, max_output_tokens: 4096 },
  "gpt-3.5-turbo-16k": { max_input_tokens: 16385, max_output_tokens: 4096 },
  "gpt-4": { max_input_tokens: 8192, max_output_tokens: 8192 },
  "gpt-4-0125-preview": { max_input_tokens: 128000, max_output_tokens: 4096 },
  "gpt-4-1106-preview": { max_input_tokens: 128000, max_output_tokens: 4096 },
  "gpt-4-turbo": { max_input_tokens: 128000, max_output_tokens: 4096 },
  "gpt-4-turbo-preview": { max_input_tokens: 128000, max_output_tokens: 4096 },
  "gpt-4-vision-preview": { max_input_tokens: 128000, max_output_tokens: 4096 },
  "gpt-4.1": { max_input_tokens: 1047576, max_output_tokens: 32768 },
  "gpt-4.1-mini": { max_input_tokens: 1047576, max_output_tokens: 32768 },
  "gpt-4.1-nano": { max_input_tokens: 1047576, max_output_tokens: 32768 },
  "gpt-4.5-preview": { max_input_tokens: 128000, max_output_tokens: 16384 },
  "gpt-4o": { max_input_tokens: 128000, max_output_tokens: 16384 },
  "gpt-4o-2024-05-13": { max_input_tokens: 128000, max_output_tokens: 4096 },
  "gpt-4o-2024-08-06": { max_input_tokens: 128000, max_output_tokens: 16384 },
  "gpt-4o-2024-11-20": { max_input_tokens: 128000, max_output_tokens: 16384 },
  "gpt-4o-audio-preview": {
    max_input_tokens: 128000,
    max_output_tokens: 16384,
  },
  "gpt-4o-mini": { max_input_tokens: 128000, max_output_tokens: 16384 },
  "gpt-4o-mini-2024-07-18": {
    max_input_tokens: 128000,
    max_output_tokens: 16384,
  },
  "gpt-4o-mini-audio-preview": {
    max_input_tokens: 128000,
    max_output_tokens: 16384,
  },
  o1: { max_input_tokens: 200000, max_output_tokens: 100000 },
  "o1-mini": { max_input_tokens: 128000, max_output_tokens: 65536 },
  "o1-preview": { max_input_tokens: 128000, max_output_tokens: 32768 },
  o3: { max_input_tokens: 200000, max_output_tokens: 100000 },
  "o3-mini": { max_input_tokens: 200000, max_output_tokens: 100000 },
  "o4-mini": { max_input_tokens: 200000, max_output_tokens: 100000 },

  // ---------------------------------------------------------------------------
  // Anthropic (direct API — also covered under bedrock/ below)
  // ---------------------------------------------------------------------------
  "claude-2": { max_input_tokens: 100000, max_output_tokens: 4096 },
  "claude-2.1": { max_input_tokens: 200000, max_output_tokens: 4096 },
  "claude-3-haiku-20240307": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "claude-3-opus-20240229": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "claude-3-sonnet-20240229": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "claude-3-5-haiku-20241022": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "claude-3-5-sonnet-20240620": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "claude-3-5-sonnet-20241022": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "claude-3-7-sonnet-20250219": {
    max_input_tokens: 200000,
    max_output_tokens: 128000,
  },
  "claude-haiku-4-5-20251001": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "claude-opus-4-5": { max_input_tokens: 200000, max_output_tokens: 32000 },
  "claude-opus-4-8": { max_input_tokens: 200000, max_output_tokens: 32000 },
  "claude-sonnet-4-5": { max_input_tokens: 200000, max_output_tokens: 64000 },
  "claude-sonnet-4-6": { max_input_tokens: 200000, max_output_tokens: 64000 },
  "claude-instant-1": { max_input_tokens: 100000, max_output_tokens: 4096 },
  "claude-instant-1.2": { max_input_tokens: 100000, max_output_tokens: 4096 },

  // ---------------------------------------------------------------------------
  // Bedrock — Anthropic models
  // ---------------------------------------------------------------------------
  "bedrock/anthropic.claude-instant-v1": {
    max_input_tokens: 100000,
    max_output_tokens: 4096,
  },
  "bedrock/anthropic.claude-v2": {
    max_input_tokens: 100000,
    max_output_tokens: 4096,
  },
  "bedrock/anthropic.claude-v2:1": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "bedrock/anthropic.claude-3-haiku-20240307-v1:0": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "bedrock/anthropic.claude-3-sonnet-20240229-v1:0": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "bedrock/anthropic.claude-3-opus-20240229-v1:0": {
    max_input_tokens: 200000,
    max_output_tokens: 4096,
  },
  "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0": {
    max_input_tokens: 200000,
    max_output_tokens: 8192,
  },
  "bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0": {
    max_input_tokens: 200000,
    max_output_tokens: 128000,
  },

  // ---------------------------------------------------------------------------
  // Bedrock — Meta Llama
  // ---------------------------------------------------------------------------
  // Llama 3 (pre-3.1) is an 8K-context family — the 128K window arrived with
  // Llama 3.1. These two entries now agree with the direct-API
  // "meta-llama/Meta-Llama-3-*-Instruct" rows further down; overstating the
  // window would postpone compaction until the provider rejects the prompt.
  "bedrock/meta.llama3-8b-instruct-v1:0": {
    max_input_tokens: 8192,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-70b-instruct-v1:0": {
    max_input_tokens: 8192,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-1-8b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-1-70b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-1-405b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-2-1b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-2-3b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-2-11b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },
  "bedrock/meta.llama3-2-90b-instruct-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 8192,
  },

  // ---------------------------------------------------------------------------
  // Bedrock — Amazon Titan/Nova
  // ---------------------------------------------------------------------------
  "bedrock/amazon.nova-lite-v1:0": {
    max_input_tokens: 300000,
    max_output_tokens: 5120,
  },
  "bedrock/amazon.nova-micro-v1:0": {
    max_input_tokens: 128000,
    max_output_tokens: 5120,
  },
  "bedrock/amazon.nova-pro-v1:0": {
    max_input_tokens: 300000,
    max_output_tokens: 5120,
  },
  "bedrock/amazon.titan-text-express-v1": {
    max_input_tokens: 8192,
    max_output_tokens: 8192,
  },
  "bedrock/amazon.titan-text-lite-v1": {
    max_input_tokens: 4096,
    max_output_tokens: 4096,
  },
  "bedrock/amazon.titan-text-premier-v1:0": {
    max_input_tokens: 32000,
    max_output_tokens: 3072,
  },

  // ---------------------------------------------------------------------------
  // Bedrock — Mistral
  // ---------------------------------------------------------------------------
  "bedrock/mistral.mistral-7b-instruct-v0:2": {
    max_input_tokens: 32768,
    max_output_tokens: 8192,
  },
  "bedrock/mistral.mistral-large-2402-v1:0": {
    max_input_tokens: 32768,
    max_output_tokens: 8192,
  },
  "bedrock/mistral.mistral-large-2407-v1:0": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "bedrock/mistral.mixtral-8x7b-instruct-v0:1": {
    max_input_tokens: 32768,
    max_output_tokens: 8192,
  },

  // ---------------------------------------------------------------------------
  // Mistral (direct API)
  // ---------------------------------------------------------------------------
  "mistral-large": { max_input_tokens: 131072, max_output_tokens: 4096 },
  "mistral-large-latest": { max_input_tokens: 131072, max_output_tokens: 4096 },
  "mistral-medium": { max_input_tokens: 32768, max_output_tokens: 4096 },
  "mistral-small": { max_input_tokens: 32768, max_output_tokens: 4096 },
  "mistral-small-latest": { max_input_tokens: 32768, max_output_tokens: 4096 },
  "mistral-tiny": { max_input_tokens: 32768, max_output_tokens: 4096 },
  "mixtral-8x7b": { max_input_tokens: 32768, max_output_tokens: 4096 },
  "mixtral-8x22b": { max_input_tokens: 65536, max_output_tokens: 4096 },

  // ---------------------------------------------------------------------------
  // Meta Llama (direct / OpenAI-compat, e.g. Together.ai, Fireworks)
  // ---------------------------------------------------------------------------
  "meta-llama/Llama-2-7b-chat-hf": {
    max_input_tokens: 4096,
    max_output_tokens: 4096,
  },
  "meta-llama/Llama-2-13b-chat-hf": {
    max_input_tokens: 4096,
    max_output_tokens: 4096,
  },
  "meta-llama/Llama-2-70b-chat-hf": {
    max_input_tokens: 4096,
    max_output_tokens: 4096,
  },
  "meta-llama/Meta-Llama-3-8B-Instruct": {
    max_input_tokens: 8192,
    max_output_tokens: 8192,
  },
  "meta-llama/Meta-Llama-3-70B-Instruct": {
    max_input_tokens: 8192,
    max_output_tokens: 8192,
  },
  "meta-llama/Meta-Llama-3.1-8B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Meta-Llama-3.1-70B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Meta-Llama-3.1-405B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Llama-3.2-1B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Llama-3.2-3B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Llama-3.2-11B-Vision-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Llama-3.2-90B-Vision-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Llama-3.3-70B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "meta-llama/Llama-4-Scout-17B-16E-Instruct": {
    max_input_tokens: 10000000,
    max_output_tokens: 16384,
  },
  "meta-llama/Llama-4-Maverick-17B-128E-Instruct": {
    max_input_tokens: 1000000,
    max_output_tokens: 16384,
  },

  // ---------------------------------------------------------------------------
  // Qwen (via OpenAI-compat, e.g. vLLM / Together)
  // ---------------------------------------------------------------------------
  "Qwen/Qwen2-7B-Instruct": {
    max_input_tokens: 32768,
    max_output_tokens: 8192,
  },
  "Qwen/Qwen2-72B-Instruct": {
    max_input_tokens: 32768,
    max_output_tokens: 8192,
  },
  "Qwen/Qwen2.5-7B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "Qwen/Qwen2.5-14B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "Qwen/Qwen2.5-72B-Instruct": {
    max_input_tokens: 131072,
    max_output_tokens: 8192,
  },
  "Qwen/Qwen3-8B": { max_input_tokens: 131072, max_output_tokens: 8192 },
  "Qwen/Qwen3-14B": { max_input_tokens: 131072, max_output_tokens: 8192 },
  "Qwen/Qwen3-32B": { max_input_tokens: 131072, max_output_tokens: 8192 },
};
/**
 * Returns the built-in minimal registry.
 *
 * The value is wrapped in a Promise so the signature matches the injected
 * `loadRegistry` slot and leaves room for a future async fetch path.
 *
 * @returns A promise resolving to the module-level `REGISTRY` object itself
 *   (the same reference on every call, not a copy).
 */
export function loadBuiltinRegistry(): Promise<Registry> {
  // A bare `Promise` return type does not type-check; the resolved value is
  // the `Registry`-typed table declared above.
  return Promise.resolve(REGISTRY);
}
Please reduce the length of the messages.", + type: "invalid_request_error", + code: "context_length_exceeded", + }, + }), + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the Anthropic phrasing", () => { + const err = apiError({ + statusCode: 400, + message: "prompt is too long: 210042 tokens > 200000 maximum", + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the vLLM / OpenAI-compatible phrasing", () => { + const err = apiError({ + statusCode: 400, + responseBody: + '{"object":"error","message":"This model\'s maximum context length is 40960 tokens. However, you requested 45123 tokens (40123 in the messages, 5000 in the completion). Please reduce the length of the messages or completion.","code":40303}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the Google phrasing", () => { + const err = apiError({ + statusCode: 400, + responseBody: + '{"error":{"code":400,"message":"The input token count (1200000) exceeds the maximum number of tokens allowed (1048576).","status":"INVALID_ARGUMENT"}}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches the Bedrock ValidationException phrasing", () => { + const err = apiError({ + statusCode: 400, + responseBody: '{"message":"Input is too long for requested model."}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("matches a 413 payload-too-large with a token message", () => { + const err = apiError({ + statusCode: 413, + responseBody: '{"error":"too many tokens in request"}', + }); + expect(isContextOverflowError(err)).toBe(true); + }); + + it("rejects a 400 that is not about context (validation error)", () => { + const err = apiError({ + statusCode: 400, + responseBody: + '{"error":{"message":"Invalid value for temperature: must be between 0 and 2."}}', + }); + expect(isContextOverflowError(err)).toBe(false); + }); + + it("rejects 429 / 401 / 5xx regardless of body", () => { + for (const 
/** Minimal structural stand-in for a prompt message used by these tests. */
type PromptMsg = { role: string; content: unknown };

/** Builds a message for `role` whose content is a single text part. */
const text = (role: "user" | "assistant", t: string): PromptMsg => ({
  role,
  content: [{ type: "text", text: t }],
});

/** system + 2 big + 2 small messages: prune can't help (no tool results), so
 * the trim must go through the shared summarize stage (ADR-0012 §Recovery). */
const overflowPrompt = (): PromptMsg[] => [
  { role: "system", content: "SYS" },
  text("user", "X".repeat(4000)),
  text("assistant", "Y".repeat(4000)),
  text("user", "recent question"),
  text("assistant", "recent answer"),
];

/**
 * Default {@link RecoveryContext} fixture; `over` overrides individual fields.
 * `Partial` needs its type argument — without it the parameter does not
 * type-check.
 */
const ctx = (over: Partial<RecoveryContext> = {}): RecoveryContext => ({
  chatId: "chat-1",
  imageProvider: "default",
  targetTokens: 100,
  keepRecentMessages: 4, // recovery halves this → keep 2
  minPrunableChars: 2000,
  summarize: () => Promise.resolve("RSUM"),
  ...over,
});

/** A provider 400 whose body carries the OpenAI-style overflow code. */
const overflow = () =>
  apiError({
    statusCode: 400,
    responseBody: '{"error":{"code":"context_length_exceeded"}}',
  });
/**
 * Fake V3 model capturing retry params.
 *
 * Every `doGenerate` / `doStream` call is recorded in `calls`. The call then
 * rejects with `fail` when `fail` is truthy, otherwise resolves to `result`.
 */
const fakeModel = (result: unknown = "RETRIED", fail?: unknown) => {
  const calls: Array<{ prompt: PromptMsg[] }> = [];
  const impl = (params: { prompt: PromptMsg[] }) => {
    calls.push(params);
    if (fail) return Promise.reject(fail);
    return Promise.resolve(result);
  };
  return { calls, model: { doGenerate: impl, doStream: impl } };
};

/**
 * Invokes the middleware's `wrapGenerate` with the given arguments plus a
 * `doStream` stub that always rejects (the generate path must not touch it).
 *
 * `ReturnType` and `Promise` need their type arguments to type-check; the
 * result is deliberately `unknown` because the tests only compare it by value.
 */
const runWrapGenerate = (
  mw: ReturnType<typeof contextOverflowRecoveryMiddleware>,
  args: {
    doGenerate: () => Promise<unknown>;
    params: { prompt: PromptMsg[] };
    model: unknown;
  },
) =>
  (mw.wrapGenerate as (o: unknown) => Promise<unknown>)({
    doStream: () => Promise.reject(new Error("unused")),
    ...args,
  });
+ expect(markDirty).toHaveBeenCalledTimes(1); + }); + + it("propagates the second overflow — no infinite retry", async () => { + const markDirty = vi.fn(() => Promise.resolve(undefined)); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const second = overflow(); + const { model } = fakeModel(undefined, second); + + await expect( + runWrapGenerate(mw, { + doGenerate: () => Promise.reject(overflow()), + params: { prompt: overflowPrompt() }, + model, + }), + ).rejects.toBe(second); + // Flag persisted anyway: the NEXT turn must compact durably (ADR-0012 §Recovery). + expect(markDirty).toHaveBeenCalledTimes(1); + }); + + it("rethrows non-overflow errors without retrying or flagging", async () => { + const markDirty = vi.fn(() => Promise.resolve(undefined)); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const { calls, model } = fakeModel(); + const authError = apiError({ statusCode: 401, message: "bad key" }); + + await expect( + runWrapGenerate(mw, { + doGenerate: () => Promise.reject(authError), + params: { prompt: overflowPrompt() }, + model, + }), + ).rejects.toBe(authError); + expect(calls).toHaveLength(0); + expect(markDirty).not.toHaveBeenCalled(); + }); + + it("still retries when persisting the dirty flag fails (best-effort)", async () => { + const markDirty = vi.fn(() => Promise.reject(new Error("db down"))); + const mw = contextOverflowRecoveryMiddleware(ctx({ markDirty })); + const { calls, model } = fakeModel(); + + const result = await runWrapGenerate(mw, { + doGenerate: () => Promise.reject(overflow()), + params: { prompt: overflowPrompt() }, + model, + }); + expect(result).toBe("RETRIED"); + expect(calls).toHaveLength(1); + }); + + it("surfaces the ORIGINAL overflow when the trim itself fails", async () => { + const first = overflow(); + const mw = contextOverflowRecoveryMiddleware( + ctx({ + summarize: () => Promise.reject(new Error("summarizer down")), + }), + ); + const { calls, model } = fakeModel(); + + 
await expect( + runWrapGenerate(mw, { + doGenerate: () => Promise.reject(first), + params: { prompt: overflowPrompt() }, + model, + }), + ).rejects.toBe(first); + expect(calls).toHaveLength(0); + }); + + it("covers the stream path: doStream rejection is trimmed and retried", async () => { + const mw = contextOverflowRecoveryMiddleware(ctx()); + const { calls, model } = fakeModel("STREAMED"); + + const result = await (mw.wrapStream as (o: unknown) => Promise)({ + doGenerate: () => Promise.reject(new Error("unused")), + doStream: () => Promise.reject(overflow()), + params: { prompt: overflowPrompt() }, + model, + }); + expect(result).toBe("STREAMED"); + expect(calls).toHaveLength(1); + expect(calls[0].prompt[0]).toEqual({ role: "system", content: "SYS" }); + }); +}); + +describe("trimOverflowingPrompt", () => { + it("pins multiple leading system messages and halves keep-recent", async () => { + const prompt: PromptMsg[] = [ + { role: "system", content: "S1" }, + { role: "system", content: "S2" }, + text("user", "A".repeat(4000)), + text("assistant", "B".repeat(4000)), + text("user", "u2"), + text("assistant", "a2"), + ]; + const { prompt: out, messagesDropped } = await trimOverflowingPrompt( + prompt, + ctx(), // keepRecentMessages 4 → recovery keeps 2 + ); + expect(out[0]).toEqual({ role: "system", content: "S1" }); + expect(out[1]).toEqual({ role: "system", content: "S2" }); + expect(messagesDropped).toBe(2); // the two big messages summarized away + expect(out.at(-2)).toEqual(text("user", "u2")); + expect(out.at(-1)).toEqual(text("assistant", "a2")); + }); + + it("never orphans a tool result at the keep boundary", async () => { + const toolCall: PromptMsg = { + role: "assistant", + content: [ + { type: "tool-call", toolCallId: "t1", toolName: "search", input: {} }, + ], + }; + const toolResult: PromptMsg = { + role: "tool", + content: [ + { + type: "tool-result", + toolCallId: "t1", + toolName: "search", + output: { type: "text", value: "Z".repeat(4000) }, + }, + 
], + }; + const prompt: PromptMsg[] = [ + { role: "system", content: "SYS" }, + text("user", "Q".repeat(4000)), + toolCall, + toolResult, // boundary at keep-2 would start recent here — must walk back + text("assistant", "done"), + ]; + const { prompt: out } = await trimOverflowingPrompt(prompt, ctx()); + const firstNonSystem = out.findIndex((m) => m.role !== "system"); + // Recent must not begin with an orphaned role:"tool" message. + expect(out[firstNonSystem].role).not.toBe("tool"); + const toolIdx = out.findIndex((m) => m.role === "tool"); + if (toolIdx !== -1) { + expect(out[toolIdx - 1].role).toBe("assistant"); + } + }); +}); diff --git a/apps/backend/src/runs/recovery.ts b/apps/backend/src/runs/recovery.ts new file mode 100644 index 00000000..ad94ad34 --- /dev/null +++ b/apps/backend/src/runs/recovery.ts @@ -0,0 +1,230 @@ +/** + * Context-overflow recovery (ADR-0012 §Recovery). + * + * Recovery is the NET, proactive compaction is the plan: even when Tier 1/2 are + * disabled (kill switch — ADR-0012 §Config & kill switch) or their estimates were wrong, a provider 400/413 + * "context too long" must not hard-fail the turn. The middleware here wraps the + * language model so EVERY individual model call — the first call of a turn and + * every later step of a tool loop, in both the stream and generate paths — gets + * one trim-and-retry: + * + * 1. Detect the overflow ({@link isContextOverflowError}, per-provider body + * matrix — ADR-0012 §Recovery). + * 2. Persist `compactionDirty = true` through the single CAS writer so the + * NEXT `prepareChatTurn` forces a durable Tier 1 compaction (ADR-0012 + * §Recovery — recovery never writes summary/watermark itself; it only flags). + * 3. Trim in-memory via {@link compactModelMessages} — the shared Tier 2 + * adapter, NOT a bespoke trim (ADR-0012 §Recovery) — and retry the call once. + * 4. A second failure propagates; {@link formatStreamError} in agent-runner + * surfaces the "conversation too large" message. 
/**
 * Everything the middleware needs to trim and retry, resolved once per turn by
 * `prepareChatTurn`. `markDirty` is absent for headless runs (triggers,
 * sub-agents) — they have no durable chat row to flag.
 */
export type RecoveryContext = {
  /** Chat id, for log correlation only. Absent on headless runs. */
  chatId?: string;
  /** Forwarded unchanged to `compactModelMessages`. */
  imageProvider: ImageProvider;
  /** Trim down to this many tokens (the Tier 1 hysteresis target). */
  targetTokens: number;
  /** The configured keep-recent; recovery halves it (aggressive trim, ADR-0012 §Recovery). */
  keepRecentMessages: number;
  /** Forwarded unchanged to `compactModelMessages`. */
  minPrunableChars: number;
  /** Summarizer callback handed to `compactModelMessages`. */
  summarize: Summarize;
  /** Optional; forwarded unchanged to `compactModelMessages`. */
  summarizerWindow?: number;
  /**
   * Persists `compactionDirty = true` (via the single CAS writer). Called as
   * soon as an overflow is DETECTED — before the retry — so the next turn
   * compacts durably even if this retry fails. Best-effort: a failure here
   * never blocks the retry.
   *
   * The resolved value is ignored by the middleware, so it is typed as
   * `unknown` (a bare `Promise` does not type-check).
   */
  markDirty?: () => Promise<unknown>;
};
/**
 * Known provider wordings for "the prompt does not fit the context window"
 * (ADR-0012 §Recovery). Tested case-insensitively against the error message
 * joined with the raw response body:
 * - OpenAI / vLLM / OpenAI-compatible: "This model's maximum context length is
 *   N tokens…" + code "context_length_exceeded"
 * - Anthropic: "prompt is too long: N tokens > N maximum"
 * - Google: "The input token count (N) exceeds the maximum number of tokens
 *   allowed (N)"
 * - Bedrock: "Input is too long for requested model." (ValidationException)
 * - Generic gateways: "too many tokens", "exceed context limit"
 */
const CONTEXT_OVERFLOW_PATTERN =
  /context[ _]length|context_length_exceeded|prompt is too long|too many tokens|maximum context|exceeds the (?:maximum|max)(?: number of)? (?:input )?tokens|input is too long|exceeds? (?:the )?context limit/i;

/**
 * Decides whether `error` is a provider context-overflow rejection.
 *
 * Only an `APICallError` with HTTP status 400 or 413 qualifies, and only when
 * its message or (string) response body matches
 * {@link CONTEXT_OVERFLOW_PATTERN}. Every other status — 429, 401/403, 5xx —
 * returns `false` regardless of the body, as does any non-`APICallError` value.
 *
 * @param error - Any thrown value.
 * @returns `true` only for a recognised overflow rejection.
 */
export function isContextOverflowError(error: unknown): boolean {
  if (!APICallError.isInstance(error)) {
    return false;
  }

  const { statusCode, message, responseBody } = error;
  const isOverflowStatus = statusCode === 400 || statusCode === 413;
  if (!isOverflowStatus) {
    return false;
  }

  // A non-string body contributes nothing to the searched text.
  const body = typeof responseBody === "string" ? responseBody : "";
  return CONTEXT_OVERFLOW_PATTERN.test(`${message ?? ""}\n${body}`);
}

/** A V3 prompt message — structurally compatible with ModelMessage (see header). */
type PromptMessage = { role: string; content: unknown };
/**
 * Trims an overflowing prompt via the shared Tier 2 adapter
 * ({@link compactModelMessages}). The system head (leading `role:"system"`
 * messages) is split off before compaction and re-attached verbatim in front
 * of the compacted remainder. Exported for unit testing.
 *
 * @typeParam T - Concrete prompt-message type; preserved in the result.
 * @param prompt - The prompt the provider rejected. Not mutated.
 * @param ctx - Per-turn recovery settings.
 * @returns The trimmed prompt and the `messagesDropped` count reported by
 *   `compactModelMessages`.
 */
export async function trimOverflowingPrompt<T extends PromptMessage>(
  prompt: T[],
  ctx: RecoveryContext,
): Promise<{ prompt: T[]; messagesDropped: number }> {
  let systemEnd = 0;
  while (systemEnd < prompt.length && prompt[systemEnd].role === "system") {
    systemEnd++;
  }
  const systemHead = prompt.slice(0, systemEnd);
  const rest = prompt.slice(systemEnd) as unknown as ModelMessage[];

  const result = await compactModelMessages(rest, {
    // Aggressive: halve the configured keep-recent (ADR-0012 §Recovery), floor of 2 so a
    // user/assistant pair survives.
    keepRecentMessages: Math.max(2, Math.ceil(ctx.keepRecentMessages / 2)),
    targetTokens: ctx.targetTokens,
    minPrunableChars: ctx.minPrunableChars,
    imageProvider: ctx.imageProvider,
    summarize: ctx.summarize,
    summarizerWindow: ctx.summarizerWindow,
    // The provider already rejected this prompt, so the estimator is wrong;
    // bypass the no-op gate or the retry will be byte-identical (ADR-0012 §Recovery).
    force: true,
  });

  return {
    prompt: [...systemHead, ...(result.messages as unknown as T[])],
    messagesDropped: result.messagesDropped,
  };
}

/**
 * Wraps both `doGenerate` and `doStream` with the detect → flag → trim → retry-
 * once sequence. Apply via `wrapLanguageModel({ model, middleware })` in
 * agent-runner. Note a stream that overflows MID-stream (after chunks started
 * flowing) is not recoverable — providers reject oversized prompts up front, so
 * the rejection surfaces from the `doStream()` promise itself, which is caught.
 *
 * @param ctx - Per-turn recovery settings, captured by both wrappers.
 * @returns A v3 language-model middleware.
 */
export function contextOverflowRecoveryMiddleware(
  ctx: RecoveryContext,
): LanguageModelMiddleware {
  // Shared by both wrappers: returns the retried params, or rethrows.
  // `P` keeps the caller's concrete call-options type; the constraint is the
  // minimum this helper reads (`params.prompt`).
  const recoverParams = async <P extends { prompt: PromptMessage[] }>(
    error: unknown,
    params: P,
  ): Promise<P> => {
    if (!isContextOverflowError(error)) throw error;

    logger.warn(
      {
        metric: "recovery.overflow_detected",
        chatId: ctx.chatId,
        error: String(error),
      },
      "context overflow detected; trimming and retrying once",
    );

    // Flag durable compaction for the NEXT turn first (ADR-0012 §Recovery) — even if the
    // retry below fails, the next prepareChatTurn must force Tier 1.
    if (ctx.markDirty) {
      try {
        await ctx.markDirty();
      } catch (err) {
        // Deliberately best-effort: log and carry on with the retry.
        logger.error(
          { err, chatId: ctx.chatId },
          "failed to persist compactionDirty after overflow",
        );
      }
    }

    try {
      const { prompt, messagesDropped } = await trimOverflowingPrompt(
        params.prompt,
        ctx,
      );
      logger.info(
        { metric: "recovery.retry", chatId: ctx.chatId, messagesDropped },
        "overflow recovery trim complete; retrying model call",
      );
      return { ...params, prompt };
    } catch (trimError) {
      // The trim itself failed (e.g. the summarize call errored). Surface the
      // ORIGINAL overflow so the user sees the actionable message.
      logger.error(
        { err: trimError, chatId: ctx.chatId },
        "overflow recovery trim failed",
      );
      throw error;
    }
  };

  // Runs the single retry and logs recovery.failed if the provider rejects the
  // trimmed prompt too (the dead end formatStreamError then surfaces to the user).
  const retry = async <R>(op: () => PromiseLike<R>): Promise<R> => {
    try {
      return await op();
    } catch (retryError) {
      logger.error(
        {
          metric: "recovery.failed",
          chatId: ctx.chatId,
          error: String(retryError),
        },
        "overflow recovery retry still rejected by provider",
      );
      throw retryError;
    }
  };

  return {
    specificationVersion: "v3",
    wrapGenerate: async ({ doGenerate, params, model }) => {
      try {
        return await doGenerate();
      } catch (error) {
        const next = await recoverParams(error, params);
        return retry(() => model.doGenerate(next));
      }
    },
    wrapStream: async ({ doStream, params, model }) => {
      try {
        return await doStream();
      } catch (error) {
        const next = await recoverParams(error, params);
        return retry(() => model.doStream(next));
      }
    },
  };
}
// Builds a 24-byte PNG header fixture: the 8-byte signature, the IHDR chunk
// length and type, then big-endian width at offset 16 and height at offset 20.
function fakePng(width: number, height: number): Uint8Array {
  const header = [
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, // signature
    0x00, 0x00, 0x00, 0x0d, // IHDR length (13)
    0x49, 0x48, 0x44, 0x52, // "IHDR"
  ];
  const png = new Uint8Array(24);
  png.set(header);
  const view = new DataView(png.buffer);
  view.setUint32(16, width);
  view.setUint32(20, height);
  return png;
}
// Builds a 12-byte JPEG fixture: SOI, then one SOF0 marker with its segment
// length and sample precision, followed by big-endian height (offset 7) and
// width (offset 9). The final byte is left as zero padding.
function fakeJpeg(width: number, height: number): Uint8Array {
  const buffer = new ArrayBuffer(12);
  const view = new DataView(buffer);
  const prefix = [0xff, 0xd8, 0xff, 0xc0, 0x00, 0x11, 0x08]; // SOI + SOF0 + len + prec
  prefix.forEach((byte, index) => view.setUint8(index, byte));
  view.setUint16(7, height);
  view.setUint16(9, width);
  return new Uint8Array(buffer);
}
"low" }, + ], + }, + ]; + expect(estimateTokens(withDims)).toBe(85); + + const noDims: CountUnit[] = [ + { + role: "user", + text: "", + nonText: [{ provider: "openai", detail: "low" }], + }, + ]; + expect(estimateTokens(noDims)).toBe(85); + }); + + it("missing dimensions use a pessimistic per-provider ceiling (ADR-0012 §Token estimation)", () => { + // Providers with a real per-image cost would be UNDER-counted by the flat + // 1200 default when bytes/dims are unavailable (hosted URL), so they fall to + // a pessimistic ceiling near each provider's post-resize max instead. + const anthropic: CountUnit[] = [ + { role: "user", text: "", nonText: [{ provider: "anthropic" }] }, + ]; + expect(estimateTokens(anthropic)).toBe(1600); + + const openaiHigh: CountUnit[] = [ + { role: "user", text: "", nonText: [{ provider: "openai" }] }, + ]; + expect(estimateTokens(openaiHigh)).toBe(2000); + + // The unknown ("default") provider keeps the conservative flat default. + const unknown: CountUnit[] = [ + { role: "user", text: "", nonText: [{ provider: "default" }] }, + ]; + expect(estimateTokens(unknown)).toBe(DEFAULT_NONTEXT_TOKENS); + }); + + it("unknown provider falls to the conservative default", () => { + const units: CountUnit[] = [ + { + role: "user", + text: "", + nonText: [{ provider: "default", width: 100, height: 100 }], + }, + ]; + expect(estimateTokens(units)).toBe(DEFAULT_NONTEXT_TOKENS); + }); + + it("an image is NOT counted as char/4 of its base64 bytes", () => { + const png = fakePng(64, 64); + const ui: PlatypusUIMessage[] = [ + { + id: "m1", + role: "user", + parts: [{ type: "file", mediaType: "image/png", url: dataUrl(png) }], + }, + ]; + const tokens = estimateTokens(uiMessagesToCountUnits(ui, "anthropic")); + // char/4 of the base64 data URL would be far larger than the table cost. 
+ const charsIfNaive = Math.ceil(dataUrl(png).length / CHARS_PER_TOKEN); + expect(tokens).toBe(Math.ceil((64 * 64) / 750)); + expect(tokens).toBeLessThan(charsIfNaive); + }); +}); + +describe("parseImageDimensions (cheap header parse)", () => { + it("reads PNG IHDR dimensions", () => { + expect(parseImageDimensions(fakePng(800, 600))).toEqual({ + width: 800, + height: 600, + }); + }); + + it("reads JPEG SOF dimensions", () => { + expect(parseImageDimensions(fakeJpeg(320, 240))).toEqual({ + width: 320, + height: 240, + }); + }); + + it("returns undefined for unrecognized bytes", () => { + expect(parseImageDimensions(new Uint8Array([1, 2, 3, 4]))).toBeUndefined(); + }); +}); + +describe("MODEL_BOUND filter (ADR-0012 §One estimator — UI-only parts excluded)", () => { + it("counts text but ignores reasoning / source / step-start / data parts", () => { + const ui: PlatypusUIMessage[] = [ + { + id: "m1", + role: "assistant", + parts: [ + { type: "reasoning", text: "thinking hard about it" }, + { type: "text", text: "hello" }, + { type: "step-start" }, + { type: "source-url", sourceId: "s1", url: "https://example.com" }, + { type: "data-custom", data: { hidden: "payload" } }, + ], + } as unknown as PlatypusUIMessage, + ]; + const units = uiMessagesToCountUnits(ui); + expect(units).toHaveLength(1); + expect(units[0].text).toBe("hello"); + expect(units[0].nonText).toHaveLength(0); + }); + + it("only text/file UI part types are model-bound (the documented set)", () => { + expect([...MODEL_BOUND_UI_PART_TYPES]).toEqual(["text", "file"]); + // The UI-only types the adapter must drop are NOT in the model-bound set. 
+ for (const uiOnly of [ + "reasoning", + "source-url", + "source-document", + "step-start", + "data-custom", + ]) { + expect(MODEL_BOUND_UI_PART_TYPES).not.toContain(uiOnly); + } + }); +}); + +describe("tool-result output variants (model adapter)", () => { + const unit = (output: unknown): CountUnit => { + const msg = { + role: "tool", + content: [ + { type: "tool-result", toolCallId: "c1", toolName: "t", output }, + ], + } as unknown as ModelMessage; + return modelMessagesToCountUnits([msg])[0]; + }; + + it("folds text / json / content value into char/4 text", () => { + expect(unit({ type: "text", value: "hello world" }).text).toContain( + "hello", + ); + expect(unit({ type: "json", value: { a: 1 } }).text).toContain('"a"'); + expect( + unit({ type: "content", value: [{ type: "text", text: "deep" }] }).text, + ).toContain("deep"); + }); + + it("uses the reason (not a value) for execution-denied", () => { + expect( + unit({ type: "execution-denied", reason: "blocked" }).text, + ).toContain("blocked"); + }); +}); + +describe("adapter equality (ADR-0012 §One estimator — one estimate across both shapes)", () => { + it("estimate(UI) === estimate(convertToModelMessages(UI)) exactly", async () => { + const png = fakePng(128, 128); + const ui: UIMessage[] = [ + { + id: "s", + role: "system", + parts: [{ type: "text", text: "You are helpful." }], + }, + { + id: "u", + role: "user", + parts: [ + { type: "text", text: "What is the weather and look at this image?" }, + { type: "file", mediaType: "image/png", url: dataUrl(png) }, + ], + }, + { + id: "a", + role: "assistant", + parts: [ + { type: "text", text: "Let me check." }, + { + type: "tool-getWeather", + toolCallId: "call-1", + state: "output-available", + input: { city: "San Francisco", units: "metric" }, + output: { temperatureC: 18, condition: "foggy" }, + }, + ], + } as unknown as UIMessage, + { + id: "a2", + role: "assistant", + parts: [{ type: "text", text: "It is 18C and foggy." 
}], + }, + ]; + + const model = await convertToModelMessages(ui); + + const uiTokens = estimateTokens( + uiMessagesToCountUnits(ui, "openai"), + ); + const modelTokens = estimateTokens( + modelMessagesToCountUnits(model, "openai"), + ); + + expect(uiTokens).toBe(modelTokens); + expect(uiTokens).toBeGreaterThan(0); + }); +}); + +describe("imageProviderFor", () => { + it("maps provider types to cost families", () => { + expect(imageProviderFor("Anthropic")).toBe("anthropic"); + expect(imageProviderFor("Bedrock")).toBe("anthropic"); + expect(imageProviderFor("OpenAI")).toBe("openai"); + expect(imageProviderFor("OpenRouter")).toBe("default"); + expect(imageProviderFor("Google")).toBe("default"); + }); +}); + +// --- estimateOverheadTokens (ADR-0012 §Tier 1 (trigger projection)) -------- + +import { z } from "zod"; +import { tool } from "ai"; +import { estimateOverheadTokens } from "./token-estimate.ts"; + +describe("estimateOverheadTokens (ADR-0012 §Tier 1 (trigger projection))", () => { + it("counts the system prompt at char/4", () => { + const sys = "S".repeat(400); + expect(estimateOverheadTokens(sys, {})).toBe(100); + }); + + it("handles missing system prompt and tools", () => { + expect(estimateOverheadTokens(undefined, undefined)).toBe(0); + }); + + it("counts tool name, description, and serialized JSON schema", () => { + const sys = "system"; + const base = estimateOverheadTokens(sys, {}); + const withTool = estimateOverheadTokens(sys, { + searchDocuments: tool({ + description: + "Searches the workspace document store and returns ranked matches.", + inputSchema: z.object({ + query: z.string().describe("Full-text query string"), + limit: z.number().optional().describe("Maximum results to return"), + }), + }), + }); + // Name + description alone are ~20 tokens; the serialized schema (with + // property names and descriptions) must push it well past that. 
+ expect(withTool).toBeGreaterThan(base + 40); + }); + + it("falls back to a conservative flat cost for unserializable schemas", () => { + const tokens = estimateOverheadTokens("", { + weird: { description: "", inputSchema: 42 } as never, + }); + // Either the fallback constant fired or some serialization succeeded — + // never zero, never a throw. + expect(tokens).toBeGreaterThanOrEqual(2); // ≥ name chars / 4 + expect(Number.isFinite(tokens)).toBe(true); + }); + + it("scales with a realistic multi-tool agent (the 8888-vs-986 gap)", () => { + const sys = "You are a helpful agent.\n".repeat(40); // ~1k chars + const tools = Object.fromEntries( + Array.from({ length: 8 }, (_, i) => [ + `tool_${i}`, + tool({ + description: + "A realistically verbose tool description explaining inputs, outputs, constraints, and error behaviour for the model.", + inputSchema: z.object({ + target: z.string().describe("The resource identifier to act on"), + options: z + .object({ + recursive: z.boolean().optional(), + depth: z.number().optional(), + filter: z.string().optional(), + }) + .optional(), + }), + }), + ]), + ); + // The point of ADR-0012 §Tier 1 (trigger projection): this payload is large + // even with a short history. + expect(estimateOverheadTokens(sys, tools)).toBeGreaterThan(500); + }); + + it("is stable across repeated calls (schema-cache must not change counts)", () => { + const sys = "system prompt"; + const tools = { + lookup: tool({ + description: "Look something up by id.", + inputSchema: z.object({ id: z.string().describe("identifier") }), + }), + }; + const first = estimateOverheadTokens(sys, tools); + // Same tool objects → WeakMap hit on the second call; the memoized schema + // length must reproduce the exact token count, never drift. 
+ expect(estimateOverheadTokens(sys, tools)).toBe(first); + }); +}); diff --git a/apps/backend/src/runs/token-estimate.ts b/apps/backend/src/runs/token-estimate.ts new file mode 100644 index 00000000..62ea28d3 --- /dev/null +++ b/apps/backend/src/runs/token-estimate.ts @@ -0,0 +1,557 @@ +/** + * The single token estimator (ADR-0012 §One estimator). + * + * Token counting lives in **exactly one** function — {@link estimateTokens} — + * over **one** neutral structure ({@link CountUnit}). Tier 1 operates on + * UIMessages and Tier 2 on ModelMessages; both normalize into `CountUnit[]` via + * the adapters here, so the two tiers can never diverge on a count + * (ADR-0012 §One estimator). + * + * Hard rules baked in: + * - **char/4 applies to text only.** Tool-call inputs and tool-result outputs + * are text-like to the model, so they fold into a unit's `text`. Image / + * binary bytes are NEVER char/4'd — they go through the modality table + * ({@link nonTextTokens}, ADR-0012 §Token estimation). + * - **UI-only parts are excluded on both sides.** `reasoning`, `source-url`, + * `source-document`, `step-start`, and `data-*` never reach the model, so + * they are dropped by both adapters (ADR-0012 §One estimator). + * - The estimate is content-only — **no per-message role framing overhead** — + * so the total is invariant to how messages are grouped. That is what lets + * the UIMessage and ModelMessage adapters agree exactly even though + * `convertToModelMessages` splits one UI message into several model messages. + * + * The char/4 estimate runs every turn. The provider-reported + * `usage.inputTokens` from the prior turn acts as a corrective baseline when + * available (`Tier1Input.lastInputTokens` — threaded by the ADR-0012 + * §Context-usage ring); until then the cold-start margin + * (ADR-0012 §Token estimation (cold-start margin)) compensates for under-counts. 
+ */ + +import { + asSchema, + type ModelMessage, + type Tool, + type ToolResultPart, + type DataContent, +} from "ai"; +import type { PlatypusUIMessage } from "../types.ts"; + +/** Number of characters approximated as one token (text only). */ +export const CHARS_PER_TOKEN = 4; + +/** + * Conservative flat cost for a non-text part whose true cost we cannot compute + * (unknown provider, missing image dimensions, non-image binary file). Over- + * counting beats overflow (ADR-0012 §Token estimation). + */ +export const DEFAULT_NONTEXT_TOKENS = 1200; + +/** OpenAI's flat cost for a `detail: "low"` image, independent of size. */ +const OPENAI_LOW_DETAIL_TOKENS = 85; + +/** + * No-dimension fallbacks for providers with a real per-image cost (ADR-0012 §Token estimation). When the + * bytes are absent (hosted http(s) URL — and note `inlineFileUrls` turns every + * stored attachment into one) or the header can't be parsed, we have no pixels + * to plug into the formula. The flat {@link DEFAULT_NONTEXT_TOKENS} (1200) + * under-counts a large image on these providers, defeating "over-count beats + * overflow" exactly where it matters. Use a pessimistic value near each + * provider's effective per-image ceiling after its own resize: + * - Anthropic resizes to ≤1.15 MP ⇒ ~1600 tokens max. + * - OpenAI high-detail tiling tops out a few thousand; 2000 is a safe ceiling + * for the common ≤2048² case. + */ +const ANTHROPIC_NO_DIMS_TOKENS = 1600; +const OPENAI_HIGH_NO_DIMS_TOKENS = 2000; + +/** + * The provider families with a known image-cost formula. Everything else maps + * to `"default"` and pays the conservative flat cost. + */ +export type ImageProvider = "anthropic" | "openai" | "default"; + +/** + * A non-text, model-bound part reduced to what the estimator needs: which + * provider formula applies, and (when known) the decoded pixel dimensions. + * `width`/`height` undefined → the provider's missing-dimension fallback. 
+ */ +export type NonTextPart = { + provider: ImageProvider; + width?: number; + height?: number; + /** OpenAI image detail hint. Unset is treated as `"high"` (over-count). */ + detail?: "low" | "high"; +}; + +/** Message role, neutral across UIMessage and ModelMessage shapes. */ +export type CountRole = "system" | "user" | "assistant" | "tool"; + +/** + * The neutral counting structure. One per source message. `text` is the + * char/4'd blob (text parts + serialized tool input/output); `nonText` holds + * images/binaries counted via the modality table. + */ +export type CountUnit = { + role: CountRole; + text: string; + nonText: NonTextPart[]; +}; + +/** + * UIMessage part `type`s that reach the model and are therefore counted. Kept + * as data so the test can assert the UI-only parts are excluded + * (ADR-0012 §One estimator). + * Tool parts are matched separately by the `tool-`/`dynamic-tool` prefix. + */ +export const MODEL_BOUND_UI_PART_TYPES = ["text", "file"] as const; + +// --------------------------------------------------------------------------- +// The estimator (the one function — ADR-0012 §One estimator) +// --------------------------------------------------------------------------- + +function nonTextTokens(part: NonTextPart): number { + const { provider, width, height, detail } = part; + + if (width == null || height == null) { + // Dimensions unknown. OpenAI low-detail has a flat cost even without dims; + // providers with a real per-image cost get a pessimistic ceiling (ADR-0012 §Token estimation); + // everything else falls to the conservative default. + if (provider === "openai" && detail === "low") + return OPENAI_LOW_DETAIL_TOKENS; + if (provider === "anthropic") return ANTHROPIC_NO_DIMS_TOKENS; + if (provider === "openai") return OPENAI_HIGH_NO_DIMS_TOKENS; + return DEFAULT_NONTEXT_TOKENS; + } + + switch (provider) { + case "anthropic": + // Anthropic's documented approximation: tokens ≈ (w × h) / 750. 
/**
 * Deterministic JSON with sorted object keys, so the same value serializes to
 * the same string from either adapter (the UIMessage and ModelMessage shapes
 * must agree exactly — ADR-0012 §One estimator).
 *
 * Apart from key order, the output matches `JSON.stringify`:
 *  - object members whose value is `undefined`, a function, or a symbol are
 *    omitted (previously they produced invalid text such as `{"a":}`);
 *  - the same values inside an array serialize as `null`;
 *  - objects exposing `toJSON` (e.g. `Date`) are serialized through it.
 * A value therefore yields the same string — and the same char count — before
 * and after a JSON round trip.
 *
 * @param value Any JSON-like value.
 * @returns The stable JSON text, or `""` when there is nothing to serialize
 *   (top-level `undefined`, function, or symbol).
 */
export function stableStringify(value: unknown): string {
  return stableJson(value) ?? "";
}

/**
 * Recursive worker for {@link stableStringify}. Returns `undefined` exactly
 * where `JSON.stringify` would (undefined / function / symbol) so callers can
 * decide between omitting the member and emitting `null`.
 */
function stableJson(input: unknown): string | undefined {
  let value = input;
  // Apply toJSON once per node, as JSON.stringify does; the result is then
  // serialized structurally, which prevents a self-returning toJSON from looping.
  if (
    value !== null &&
    typeof value === "object" &&
    typeof (value as { toJSON?: unknown }).toJSON === "function"
  ) {
    value = (value as { toJSON: () => unknown }).toJSON();
  }

  if (value === null || typeof value !== "object") {
    // Primitives defer to the built-in, which handles escaping and returns
    // undefined for values JSON cannot represent.
    return JSON.stringify(value) as string | undefined;
  }

  if (Array.isArray(value)) {
    // Array.from visits holes too, so sparse slots become `null` like in JSON.
    const items = Array.from(value, (item) => stableJson(item) ?? "null");
    return `[${items.join(",")}]`;
  }

  const record = value as Record<string, unknown>;
  const members: string[] = [];
  for (const key of Object.keys(record).sort()) {
    const encoded = stableJson(record[key]);
    if (encoded !== undefined) {
      members.push(`${JSON.stringify(key)}:${encoded}`);
    }
  }
  return `{${members.join(",")}}`;
}
+ if ( + bytes.length >= 24 && + bytes[0] === 0x89 && + bytes[1] === 0x50 && + bytes[2] === 0x4e && + bytes[3] === 0x47 + ) { + const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength); + return { width: view.getUint32(16), height: view.getUint32(20) }; + } + + // JPEG: 0xFFD8 start, then walk segment markers to the SOF that carries dims. + if (bytes.length >= 4 && bytes[0] === 0xff && bytes[1] === 0xd8) { + let offset = 2; + while (offset + 9 < bytes.length) { + if (bytes[offset] !== 0xff) { + offset++; + continue; + } + const marker = bytes[offset + 1]; + // 0xFF fill bytes pad before a real marker; consume one and re-read so a + // run of fill bytes doesn't get mistaken for a segment. + if (marker === 0xff) { + offset++; + continue; + } + // 0xFF00 is a stuffed data byte inside entropy-coded data, not a marker. + if (marker === 0x00) { + offset += 2; + continue; + } + // SOF0..SOF15 carry frame dimensions, excluding DHT(C4)/JPG(C8)/DAC(CC). + const isSof = + marker >= 0xc0 && + marker <= 0xcf && + marker !== 0xc4 && + marker !== 0xc8 && + marker !== 0xcc; + if (isSof) { + const view = new DataView( + bytes.buffer, + bytes.byteOffset, + bytes.byteLength, + ); + const height = view.getUint16(offset + 5); + const width = view.getUint16(offset + 7); + return { width, height }; + } + // Standalone markers with no length payload: SOI(D8), EOI(D9), + // RSTn(D0-D7), TEM(01). Skip the 2-byte marker. + if ( + marker === 0xd8 || + marker === 0xd9 || + marker === 0x01 || + (marker >= 0xd0 && marker <= 0xd7) + ) { + offset += 2; + continue; + } + const segLength = (bytes[offset + 2] << 8) | bytes[offset + 3]; + if (segLength < 2) return undefined; + offset += 2 + segLength; + } + } + + return undefined; +} + +/** + * Upper bound on bytes decoded from a data URL for header parsing. PNG + * dimensions live in the first 24 bytes; a JPEG SOF marker is almost always + * within the first few KB. 
/**
 * Decodes the bytes behind a UIMessage file URL when it is a base64 data URL.
 * Hosted (http/https) URLs — and data URLs that are not base64-encoded —
 * return undefined: there are no bytes in hand, so the caller falls to the
 * conservative constant.
 *
 * Only the first `HEADER_DECODE_MAX_B64_CHARS` payload characters are decoded,
 * because callers only read image headers. The header is located with
 * `indexOf(",")` rather than a whole-string regex, so a multi-MB payload is
 * never scanned. Media-type parameters before the `;base64` marker
 * (`data:image/png;name=a.png;base64,…`) are accepted.
 *
 * @param url The file part URL.
 * @returns The decoded (possibly truncated) bytes, or undefined when `url` is
 *   not a base64 data URL or decoding is unavailable.
 */
function bytesFromUrl(url: string): Uint8Array | undefined {
  const scheme = "data:";
  if (!url.startsWith(scheme)) return undefined;

  // Everything between "data:" and the first comma is the header
  // (`[mediatype][;param=value]*[;base64]`).
  const comma = url.indexOf(",");
  if (comma === -1) return undefined;
  const header = url.slice(scheme.length, comma);
  if (!header.endsWith(";base64")) return undefined;

  try {
    const start = comma + 1;
    const b64 = url.slice(start, start + HEADER_DECODE_MAX_B64_CHARS);
    return new Uint8Array(Buffer.from(b64, "base64"));
  } catch {
    // Best effort: any decoding problem falls back to "no bytes".
    return undefined;
  }
}
[]) { + const type = part.type; + + if (type === "text") { + text += (part as { text: string }).text; + continue; + } + + if (type === "file") { + const file = part as { mediaType?: string; url: string }; + const bytes = bytesFromUrl(file.url); + if (isImageMediaType(file.mediaType)) { + nonText.push(imagePart(provider, bytes)); + } else { + nonText.push(binaryPart()); + } + continue; + } + + // Tool invocations (`tool-` and `dynamic-tool`) are model-bound and + // text-like: fold their input + output into the char/4 blob. + if (type === "dynamic-tool" || type.startsWith("tool-")) { + const tool = part as { + input?: unknown; + output?: unknown; + errorText?: string; + }; + if (tool.input !== undefined) text += stableStringify(tool.input); + // Count the output OR the error text — `convertToModelMessages` maps an + // `output-error` UI part to a `tool-result` with `output: {type:"error-text", + // value: errorText}`, which the model adapter counts via `toolResultOutputText`. + // Skipping errorText here would make the UI side count 0 for a failed tool call + // while the model side counts the error string — breaking the §One estimator + // equality (a tier could fire on a number the other never sees). + if (tool.output !== undefined) { + text += stableStringify(tool.output); + } else if (tool.errorText !== undefined) { + text += stableStringify(tool.errorText); + } + continue; + } + + // Everything else (reasoning, source-url, source-document, step-start, + // data-*) is UI-only and excluded on both sides (ADR-0012 §One estimator). + } + + return { role: message.role, text, nonText }; +} + +/** Tier 1 adapter: UIMessages → neutral count units. 
/**
 * Returns the model-visible string of a tool-result output wrapper.
 *
 * There are only two shapes to handle: `execution-denied` carries a `reason`
 * (an absent reason counts as the empty string), and every other variant
 * carries a `value`. Whichever applies is serialized with `stableStringify`,
 * the same serializer the UI adapter applies to raw tool output.
 */
function toolResultOutputText(output: ToolResultPart["output"]): string {
  if (output.type === "execution-denied") {
    return stableStringify(output.reason ?? "");
  }
  return stableStringify(output.value);
}
/**
 * Estimates the tokens of the per-turn payload that is NOT in the message
 * history: the system prompt plus every tool's name, description, and JSON
 * input schema (ADR-0012 §Tier 1 (trigger projection)). Uses the same char/4
 * rule as the single estimator.
 *
 * @param systemPrompt Rendered system prompt; `undefined` counts as empty.
 * @param tools Tool map keyed by tool name; `undefined` counts as no tools.
 * @returns The estimated overhead in tokens. A tool whose schema cannot be
 *   measured contributes `TOOL_SCHEMA_FALLBACK_TOKENS` on top of its
 *   name/description cost.
 */
export function estimateOverheadTokens(
  systemPrompt: string | undefined,
  tools: Record<string, Tool> | undefined,
): number {
  let tokens = Math.ceil((systemPrompt ?? "").length / CHARS_PER_TOKEN);
  for (const [name, tool] of Object.entries(tools ?? {})) {
    const t = tool as { description?: string; inputSchema?: unknown };
    const schemaLen =
      t.inputSchema == null ? 0 : serializedSchemaLength(t.inputSchema);
    // Unmeasurable schema: charge the conservative flat cost instead.
    if (schemaLen === undefined) tokens += TOOL_SCHEMA_FALLBACK_TOKENS;
    // Concatenated length == sum of lengths, so this stays numerically identical
    // to folding the schema string into `text` before the single char/4 divide.
    const baseLen = (name + (t.description ?? "")).length + (schemaLen ?? 0);
    tokens += Math.ceil(baseLen / CHARS_PER_TOKEN);
  }
  return tokens;
}

/**
 * Returns the char length of a tool input schema's serialized JSON schema,
 * memoized per schema object, or `undefined` when it cannot be measured
 * synchronously (conversion throws, or the SDK hands back a thenable).
 */
function serializedSchemaLength(inputSchema: unknown): number | undefined {
  // Objects AND functions are valid WeakMap keys; function-typed (lazy)
  // schemas were previously re-converted on every call.
  const key =
    typeof inputSchema === "object" || typeof inputSchema === "function"
      ? (inputSchema as object)
      : undefined;
  const cached = key ? schemaLenCache.get(key) : undefined;
  if (cached !== undefined) return cached;

  try {
    // asSchema is the SDK's own conversion to the wire-format JSON schema.
    const jsonSchema: unknown = asSchema(inputSchema as never).jsonSchema;
    const then = (jsonSchema as { then?: unknown } | null | undefined)?.then;
    if (typeof then === "function") {
      // NOTE(review): assumes the installed SDK may type `jsonSchema` as
      // PromiseLike — confirm. A thenable would stringify as `{}` and be
      // silently counted as 2 chars; this estimator is synchronous, so fall
      // back instead. Mark the promise handled so a later rejection cannot
      // surface as an unhandled rejection.
      void (jsonSchema as PromiseLike<unknown>).then(undefined, () => undefined);
      return undefined;
    }
    const length = stableStringify(jsonSchema).length;
    if (key) schemaLenCache.set(key, length);
    return length;
  } catch {
    return undefined;
  }
}
+const { mockGenerateText } = vi.hoisted(() => ({ + mockGenerateText: vi.fn(), +})); +vi.mock("ai", async (importActual) => { + const actual = await importActual(); + return { ...actual, generateText: mockGenerateText }; +}); + import { prepareChatTurn, + buildCompactionRuntime, NotFoundError, ValidationError, createToolHeartbeat, shouldInjectNativeSearch, } from "./chat-execution.ts"; import { createInMemoryChatTurnQueries } from "./chat-execution.test-fixtures.ts"; +import { logger } from "../logger.ts"; +import { contextWindowResolver } from "../runs/context-window.ts"; const baseProvider = { id: "p1", @@ -650,4 +663,110 @@ describe("chat-execution", () => { ).toBe(true); }); }); + + describe("buildCompactionRuntime summarize (ADR-0012 §Summarizer hardening / review Fix B)", () => { + const buildRuntime = (signal?: AbortSignal, onActivity?: () => void) => + buildCompactionRuntime({ + chatId: "chat-1", + provider: baseProvider, + resolvedModelId: "gpt-4", + opened: { + languageModel: vi.fn(() => ({ modelId: "task-model" })), + } as never, + onActivity, + signal, + }); + + beforeEach(() => { + mockGenerateText.mockReset(); + vi.spyOn(contextWindowResolver, "resolve").mockResolvedValue({ + contextWindow: 128_000, + maxOutputTokens: 4096, + source: "registry", + } as never); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + it("threads the abort signal, output ceiling, and ordered prompt into generateText", async () => { + mockGenerateText.mockResolvedValue({ + text: "SUMMARY", + usage: {}, + finishReason: "stop", + }); + const controller = new AbortController(); + const runtime = await buildRuntime(controller.signal); + + const out = await runtime.summarize("history text"); + + expect(out).toBe("SUMMARY"); + expect(mockGenerateText).toHaveBeenCalledTimes(1); + const arg = mockGenerateText.mock.calls[0][0] as { + maxOutputTokens?: number; + abortSignal?: AbortSignal; + prompt?: string; + system: string; + }; + 
expect(arg.maxOutputTokens).toBe(4000); + expect(arg.abortSignal).toBe(controller.signal); + expect(arg.prompt).toBe("history text"); + expect(arg.system).toContain("context checkpoint compaction"); + // Sections ordered most-critical-first so truncation drops the tail + // (file/tool detail), not intent or next step. + const intentIdx = arg.system.indexOf("Intent & open requests"); + const nextStepIdx = arg.system.indexOf("Current state & next step"); + const filesIdx = arg.system.indexOf("Files & tools touched"); + expect(intentIdx).toBeGreaterThanOrEqual(0); + expect(nextStepIdx).toBeGreaterThan(intentIdx); + expect(filesIdx).toBeGreaterThan(nextStepIdx); + }); + + it("warns but still returns the summary when the output ceiling is hit", async () => { + mockGenerateText.mockResolvedValue({ + text: "TRUNCATED", + usage: {}, + finishReason: "length", + }); + const warn = vi.spyOn(logger, "warn").mockImplementation(() => {}); + const runtime = await buildRuntime(); + + const out = await runtime.summarize("x"); + + expect(out).toBe("TRUNCATED"); + expect(warn).toHaveBeenCalledWith( + expect.objectContaining({ maxTokens: 4000 }), + expect.stringContaining("maxOutputTokens ceiling"), + ); + }); + + it("bumps onActivity on each heartbeat tick while summarize runs, then stops", async () => { + let resolveGen: (v: unknown) => void = () => {}; + mockGenerateText.mockImplementation( + () => + new Promise((resolve) => { + resolveGen = resolve; + }), + ); + const onActivity = vi.fn(); + const runtime = await buildRuntime(undefined, onActivity); + + vi.useFakeTimers(); + const pending = runtime.summarize("x"); + + // Two heartbeat intervals (10 s each) elapse mid-call. + await vi.advanceTimersByTimeAsync(25_000); + expect(onActivity).toHaveBeenCalledTimes(2); + + resolveGen({ text: "S", usage: {}, finishReason: "stop" }); + await pending; + + // Interval cleared in the finally block — no further bumps. 
+ onActivity.mockClear(); + await vi.advanceTimersByTimeAsync(30_000); + expect(onActivity).not.toHaveBeenCalled(); + }); + }); }); diff --git a/apps/backend/src/services/chat-execution.ts b/apps/backend/src/services/chat-execution.ts index 6d5ee3d6..20dea580 100644 --- a/apps/backend/src/services/chat-execution.ts +++ b/apps/backend/src/services/chat-execution.ts @@ -3,7 +3,7 @@ import { type MCPClient, } from "@ai-sdk/mcp"; import { openProvider } from "./provider.ts"; -import { and, eq, or, inArray } from "drizzle-orm"; +import { and, eq, or, inArray, sql } from "drizzle-orm"; import { db } from "../index.ts"; import { agent as agentTable, @@ -27,11 +27,46 @@ import { type MemorySummary, } from "./memory-retrieval.ts"; import type { Provider, Skill } from "@platypus/schemas"; -import type { LanguageModel, Tool } from "ai"; +import { + createIdGenerator, + generateText, + type LanguageModel, + type Tool, +} from "ai"; import { logger } from "../logger.ts"; import { buildMcpTransportConfig } from "./mcp-oauth-provider.ts"; import { inlineFileUrls } from "../storage/utils.ts"; import type { PlatypusUIMessage } from "../types.ts"; +import { chat as chatTable } from "../db/schema.ts"; +import { + contextWindowResolver, + DEFAULT_CONTEXT_WINDOW, +} from "../runs/context-window.ts"; +import { + estimateTokens, + estimateOverheadTokens, + imageProviderFor, + uiMessagesToCountUnits, + type ImageProvider, +} from "../runs/token-estimate.ts"; +import { + applyTier1Compaction, + affectedBelowWatermark, + buildCompactionTraceMessage, + buildTier2PrepareStep, + computeBudget, + drizzleCompactionStore, + invalidateCompaction, + DEFAULT_COMPACTION_CONFIG, + setCompactionDirty, + type Budget, + type CompactionConfig, + type CompactionState, + type CompactionTrace, + type Summarize, + type Tier2Context, +} from "../runs/compaction.ts"; +import type { RecoveryContext } from "../runs/recovery.ts"; // --- Errors --- @@ -91,6 +126,13 @@ type GenerationConfig = { * messages) — those 
arrive as separate `PrepareChatTurnInput` fields. */ export type ChatTurnRequest = { + /** + * Chat id. Present for interactive chat turns (the chatSubmit payload); + * absent for headless callers (triggers, sub-agents) whose `request` carries + * no chat. Tier 1 compaction keys on it — see the skip guard in + * `prepareChatTurn` (ADR-0012 §Sub-agents: headless runs are Tier 2 only). + */ + id?: string; agentId?: string; providerId?: string; modelId?: string; @@ -118,6 +160,12 @@ export type ChatTurn = { presencePenalty?: number; seed?: number; }; + /** + * Set when Tier 1 compaction fired this turn (ADR-0012 §Compaction trace in the timeline). agent-runner emits + * a synthetic compact_context tool-call + tool-result pair into the stream so + * the compaction is visible in the chat timeline. + */ + compactionTrace?: CompactionTrace; resolved: { agentId?: string; providerId: string; @@ -129,7 +177,23 @@ export type ChatTurn = { frequencyPenalty?: number; presencePenalty?: number; seed?: number; + /** Resolved context window for the main model (ADR-0012 §Context-usage ring, ADR-0012 §Per-message stats). */ + contextWindow: number; + /** True when contextWindow fell to the conservative default (ADR-0012 §Context-usage ring: ring → neutral). */ + contextWindowIsDefault: boolean; }; + /** + * Context-overflow recovery wiring (ADR-0012 §Recovery, ADR-0012 §Recovery is the net). Always present — recovery is + * the safety net and stays on even when proactive compaction is disabled. + * agent-runner wraps the model with the recovery middleware using this. + */ + recovery: RecoveryContext; + /** + * Tier 2 in-turn compaction config (ADR-0012 §Tier 2). Null when proactive compaction is + * disabled (ADR-0012 §Config & kill switch or agent override). agent-runner builds the + * prepareStep callback from this and wires it into streamText/generateText. 
// --- Tier 1 context compaction (ADR-0012) ---

/** State used when a chat has no persisted compaction state to read. */
const EMPTY_COMPACTION_STATE: CompactionState = {
  version: 0,
  summaryWatermark: null,
  contextSummary: null,
  compactionDirty: false,
};

/**
 * Resolves the effective global compaction config from DEFAULT_COMPACTION_CONFIG
 * + env overrides (ADR-0012 §Config & kill switch). Extracted so both
 * buildCompactionRuntime and the context-window endpoint (which surfaces
 * keepRecentMessages to the force-compact confirm gate) share one source of
 * truth. Depends only on process.env (and logs a warning for rejected values).
 *
 * Unset, blank (including whitespace-only) and out-of-range values all leave
 * the default in place.
 *
 * @returns A fresh config object; DEFAULT_COMPACTION_CONFIG is not mutated.
 */
export function resolveCompactionConfig(): CompactionConfig {
  const config = { ...DEFAULT_COMPACTION_CONFIG };

  // Global kill switch (ADR-0012 §Config & kill switch) gates proactive
  // compaction; recovery is unaffected.
  if (process.env.COMPACTION_ENABLED === "false") {
    config.compactionEnabled = false;
  }

  // Reads + RANGE-VALIDATES a numeric env override. An out-of-range or
  // non-finite value is rejected (warn + fall back to the default) rather than
  // silently applied: a finite-only check would let `0` and negatives through,
  // so `COMPACTION_KEEP_RECENT=0` would summarize the current message away and
  // `COMPACTION_TRIGGER_RATIO=0` would fire on empty chats.
  const numEnv = (
    name: string,
    opts: { min?: number; max?: number; integer?: boolean } = {},
  ): number | undefined => {
    const raw = process.env[name];
    // `Number("   ")` is 0, so a whitespace-only value must be treated as
    // blank here or it would be applied as 0 wherever `min` is 0.
    if (raw == null || raw.trim() === "") return undefined;

    const parsed = Number(raw);
    // Math.floor leaves NaN / ±Infinity untouched, so the finiteness check
    // below still rejects them.
    const n = opts.integer ? Math.floor(parsed) : parsed;
    const invalid =
      !Number.isFinite(n) ||
      (opts.min !== undefined && n < opts.min) ||
      (opts.max !== undefined && n > opts.max);
    if (invalid) {
      logger.warn(
        { env: name, raw, ...opts },
        "ignoring out-of-range compaction env override; using default",
      );
      return undefined;
    }
    return n;
  };

  // Optional env overrides for the global ceiling. Intended for tuning the
  // trigger on test deployments without a code change.
  const RATIO = { min: 0.01, max: 1 };
  const POSITIVE_INT = { min: 1, integer: true };

  config.triggerRatio =
    numEnv("COMPACTION_TRIGGER_RATIO", RATIO) ?? config.triggerRatio;
  config.targetRatio =
    numEnv("COMPACTION_TARGET_RATIO", RATIO) ?? config.targetRatio;
  config.reserveRatio =
    numEnv("COMPACTION_RESERVE_RATIO", { min: 0, max: 0.9 }) ??
    config.reserveRatio;
  config.keepRecentMessages =
    numEnv("COMPACTION_KEEP_RECENT", POSITIVE_INT) ?? config.keepRecentMessages;
  config.minPrunableChars =
    numEnv("COMPACTION_MIN_PRUNABLE_CHARS", POSITIVE_INT) ??
    config.minPrunableChars;
  config.minRecentPrunableChars =
    numEnv("COMPACTION_MIN_RECENT_PRUNABLE_CHARS", POSITIVE_INT) ??
    config.minRecentPrunableChars;

  // ADR-0012 §Stage 0 — context editing. Disabled via
  // COMPACTION_CONTEXT_EDITING_ENABLED=false; recency/size gates tunable.
  if (process.env.COMPACTION_CONTEXT_EDITING_ENABLED === "false") {
    config.contextEditingEnabled = false;
  }
  config.keepRecentToolResults =
    numEnv("COMPACTION_KEEP_RECENT_TOOL_RESULTS", { min: 0, integer: true }) ??
    config.keepRecentToolResults;
  config.minEditableToolChars =
    numEnv("COMPACTION_MIN_EDITABLE_TOOL_CHARS", POSITIVE_INT) ??
    config.minEditableToolChars;

  // Hysteresis backstop (ADR-0012 §Tier 1 (hysteresis)): target must stay below
  // trigger or compaction re-fires every turn. Clamp here so an operator who
  // sets COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO still runs safely.
  if (config.targetRatio >= config.triggerRatio) {
    const clamped = config.triggerRatio * 0.9;
    logger.warn(
      {
        targetRatio: config.targetRatio,
        triggerRatio: config.triggerRatio,
        clamped,
      },
      "COMPACTION_TARGET_RATIO >= COMPACTION_TRIGGER_RATIO; clamping target to triggerRatio*0.9 (hysteresis)",
    );
    config.targetRatio = clamped;
  }

  return config;
}

/**
 * Loads the canonical (raw) persisted history for a chat. Exported so
 * agent-runner can snapshot it BEFORE `ChatSink.onStart` overwrites the row —
 * that snapshot is the ADR-0012 §Summary invalidation baseline (onStart runs
 * before prepareChatTurn, so a read inside applyTier1IfNeeded would see the
 * just-submitted messages).
 *
 * @param chatId Chat row id.
 * @returns The stored messages, or `[]` when the row is missing or its
 *   `messages` column is null.
 */
export async function loadChatMessages(
  chatId: string,
): Promise<PlatypusUIMessage[]> {
  const rows = await db
    .select({ messages: chatTable.messages })
    .from(chatTable)
    .where(eq(chatTable.id, chatId))
    .limit(1);
  return (rows[0]?.messages as PlatypusUIMessage[] | null) ?? [];
}
/**
 * Walks the history from newest to oldest and returns the first POSITIVE
 * provider-reported `metadata.stats.contextTokens` found on an assistant
 * message (the ADR-0012 §Context-usage ring stat).
 *
 * Two kinds of assistant message are passed over because they would otherwise
 * shadow the real baseline:
 *   - the ADR-0012 §Force-compact on demand standalone trace message, which
 *     carries no `metadata.stats`;
 *   - a turn from a usage-less provider stamped `contextTokens = 0`
 *     (ADR-0012 §Tier 1 (trigger projection)).
 *
 * @param messages Chat history, oldest first.
 * @returns The most recent positive `contextTokens`, or `undefined` if none.
 */
function findLastInputTokens(
  messages: PlatypusUIMessage[],
): number | undefined {
  let idx = messages.length;
  while (idx-- > 0) {
    const candidate = messages[idx];
    if (candidate.role !== "assistant") continue;

    const meta = candidate.metadata as
      | { stats?: { contextTokens?: number } }
      | undefined;
    const reported = meta?.stats?.contextTokens;
    if (typeof reported === "number" && reported > 0) return reported;
  }
  return undefined;
}

/**
 * Everything the compaction machinery needs that is resolved once per turn:
 * the budget (from the resolved context window), the effective config, the
 * summarizer, and the summarizer's own window
 * (ADR-0012 §Tier 1 (summarizer model & map-reduce)). Shared by Tier 1 and the
 * recovery middleware (ADR-0012 §Recovery) so the two never disagree.
 */
type CompactionRuntime = {
  budget: Budget;
  config: CompactionConfig;
  imageProvider: ImageProvider;
  summarize: Summarize;
  summarizerWindow?: number;
  /** Resolved context window for the main model (ADR-0012 §Context-usage ring). */
  contextWindow: number;
  /** True when the window fell to the conservative default (ADR-0012 §Context-usage ring: ring → neutral). */
  contextWindowIsDefault: boolean;
};

/**
 * Safety ceiling on summarizer output (ADR-0012 §Summarizer hardening).
 * Prevents a runaway model from producing a summary longer than its input. The
 * system prompt hard-limits to 1500 tokens; this 4000 backstop catches models
 * that ignore the instruction (e.g. qwen36 on large tool-heavy inputs).
 */
const SUMMARIZE_MAX_OUTPUT_TOKENS = 4000;
/**
 * Heartbeat interval while the summarizer runs (ADR-0012 §Summarizer
 * hardening). Resets the per-step stall watchdog so a slow summarize call is
 * not misidentified as a frozen run and killed before it returns.
 */
const SUMMARIZE_HEARTBEAT_INTERVAL_MS = 10_000;

/**
 * System prompt for the summarizer (ADR-0012 §Summarizer hardening): a
 * structured handoff — sections reduce loss across repeated re-compactions
 * (Codex CLI pattern) — with an explicit concision limit that pairs with the
 * output ceiling. Sections are ordered most-critical-first: if the output is
 * truncated at the ceiling (finishReason === "length"), the tail that drops is
 * the least resume-critical (file/tool detail), not intent or next step.
 */
const SUMMARIZE_SYSTEM_PROMPT = `You are performing a context checkpoint compaction. Another instance of this assistant will resume using ONLY your summary plus the most recent messages — earlier history will be gone. Write a dense markdown handoff under these headings, in this order (omit one only if truly empty). Front-load the most important facts within each section — if you run long, later detail may be cut:

- **Intent & open requests** — what the user wants, the latest explicit request, pending tasks.
- **Current state & next step** — where things stand and the immediate next action.
- **Decisions & facts** — conclusions, confirmed values/IDs/paths, constraints and user preferences (preserve any security-relevant instruction verbatim).
- **Files & tools touched** — what was read/changed and why.

If a prior summary appears in the history, integrate it — don't drop facts it captured. Be concise: hard limit 1500 tokens maximum. Output only the summary.`;

/**
 * Builds the per-turn compaction runtime. A failed window resolution does not
 * reject: the main window falls back to the conservative default so recovery
 * (ADR-0012 §Recovery is the net) always has a working configuration, and a
 * failed summarizer window simply leaves `summarizerWindow` undefined.
 *
 * @param args.chatId Used only to tag log lines.
 * @param args.provider Provider row; `taskModelId` (when set) selects the
 *   summarizer model, otherwise the main model is used.
 * @param args.resolvedModelId Main model id the budget is computed for.
 * @param args.opened Opened provider used to instantiate the summarizer model.
 * @param args.onActivity When present, called every ~10 s during `summarize`
 *   to keep the per-step stall watchdog alive.
 * @param args.signal Run abort signal, threaded into the summarizer
 *   `generateText` so a cancelled / timed-out run aborts the call (review Fix B).
 * @returns Budget, config, image cost family, summarizer and window info.
 */
export async function buildCompactionRuntime(args: {
  chatId?: string;
  provider: Provider;
  resolvedModelId: string;
  opened: ReturnType<typeof openProvider>;
  onActivity?: () => void;
  signal?: AbortSignal;
}): Promise<CompactionRuntime> {
  const { chatId, provider, resolvedModelId, opened, onActivity, signal } =
    args;

  const config = resolveCompactionConfig();

  // ADR-0012 §Window resolution: resolve both windows concurrently (they are
  // independent). Each promise handles its own rejection, so Promise.all
  // cannot reject here.
  const taskModelId = provider.taskModelId || resolvedModelId;
  const [mainWindow, summarizerWindowResult] = await Promise.all([
    contextWindowResolver.resolve(provider, resolvedModelId).catch((error) => {
      logger.error(
        { error, chatId, resolvedModelId },
        "context window resolution failed; using conservative default",
      );
      return null;
    }),
    contextWindowResolver.resolve(provider, taskModelId).catch((error) => {
      // Still best-effort, but no longer silent.
      logger.warn(
        { error, chatId, taskModelId },
        "summarizer context window resolution failed; continuing without summarizerWindow",
      );
      return null;
    }),
  ]);

  const contextWindow = mainWindow?.contextWindow ?? DEFAULT_CONTEXT_WINDOW;
  const maxOutputTokens = mainWindow?.maxOutputTokens;
  const budget = computeBudget(contextWindow, maxOutputTokens, config);

  const summarizerWindow = summarizerWindowResult
    ? computeBudget(
        summarizerWindowResult.contextWindow,
        summarizerWindowResult.maxOutputTokens,
        config,
      ).inputBudget
    : undefined;

  // Summarizer uses the provider's task model, falling back to the main model
  // when unset (ADR-0012 §Tier 1 (summarizer model)). generateText is one-shot,
  // no tools.
  const summarize: Summarize = async (text) => {
    const startedAt = Date.now();
    // Keep the per-step stall watchdog alive while the summarizer runs. Tier-1
    // compaction is legitimate long work, not a stall; without this ping the
    // 120 s watchdog fires and kills the run.
    const heartbeat = onActivity
      ? setInterval(onActivity, SUMMARIZE_HEARTBEAT_INTERVAL_MS)
      : null;
    try {
      const result = await generateText({
        model: opened.languageModel(taskModelId),
        system: SUMMARIZE_SYSTEM_PROMPT,
        prompt: text,
        // Hard ceiling: the prompt limits to 1500 tokens; this backstop
        // catches models that ignore the instruction.
        maxOutputTokens: SUMMARIZE_MAX_OUTPUT_TOKENS,
        // Fix B (review): the heartbeat above keeps the per-step watchdog from
        // firing, so without the signal a hung summarize would run until the
        // 10 min per-run timeout.
        abortSignal: signal,
      });
      const { text: summary, usage, finishReason } = result;
      logger.info(
        {
          metric: "summarize.latency_ms",
          latencyMs: Date.now() - startedAt,
          chatId,
          taskModelId,
          usage,
          finishReason,
          hitOutputCeiling: finishReason === "length",
        },
        "context compaction summarize",
      );
      if (finishReason === "length") {
        // Truncated output is still returned; the caller gets a partial summary.
        logger.warn(
          { chatId, taskModelId, maxTokens: SUMMARIZE_MAX_OUTPUT_TOKENS },
          "summarize hit maxOutputTokens ceiling — summary may be truncated",
        );
      }
      return summary;
    } finally {
      // Runs on success, failure and abort alike, so the timer never leaks.
      if (heartbeat !== null) clearInterval(heartbeat);
    }
  };

  return {
    budget,
    config,
    imageProvider: imageProviderFor(provider.providerType),
    summarize,
    summarizerWindow,
    contextWindow,
    contextWindowIsDefault: !mainWindow || mainWindow.source === "default",
  };
}
/** Inputs for the best-effort Tier 1 pass run once per interactive chat turn. */
type ApplyTier1Args = {
  chatId: string;
  runtime: CompactionRuntime;
  /** Post-inlineFileUrls messages — used for the compaction itself (ADR-0012 §Token estimation). */
  messages: PlatypusUIMessage[];
  /**
   * Pre-inlineFileUrls messages from this submission — used as the incoming
   * side of the ADR-0012 §Summary invalidation divergence check. Must NOT be
   * inlined: the persisted side also uses storage:// / http:// URLs, so both
   * sides are comparable.
   */
  rawMessages: PlatypusUIMessage[];
  /**
   * Messages as they were in the DB BEFORE this submission's onStart overwrote
   * them (ADR-0012 §Summary invalidation). When absent, the check falls back to
   * a fresh DB read, which returns the post-overwrite state and therefore
   * never detects edits.
   */
  priorMessages?: PlatypusUIMessage[];
  /** Estimated system-prompt + tool-schema payload for this turn (ADR-0012 §Tier 1 (trigger projection)). */
  overheadTokens: number;
  /** Provider-reported `usage.inputTokens` from the prior turn (ADR-0012 §Tier 1 (trigger projection), ADR-0012 §Context-usage ring). */
  lastInputTokens?: number;
};

/** Messages to send to the model, plus the trace when compaction produced one. */
type Tier1IfNeededResult = {
  messages: PlatypusUIMessage[];
  compactionTrace?: CompactionTrace;
};

/**
 * Reconstructs/advances the compacted view and persists any new summary — all
 * best-effort. Any throw is logged and degrades to the uncompacted messages
 * (recovery, ADR-0012 §Recovery, remains the safety net).
 *
 * @returns The messages to send to the model plus an optional compactionTrace
 *   for the stream trace (ADR-0012 §Compaction trace in the timeline).
 */
async function applyTier1IfNeeded(
  args: ApplyTier1Args,
): Promise<Tier1IfNeededResult> {
  const { chatId, runtime, messages, rawMessages } = args;
  try {
    const store = drizzleCompactionStore;
    let state = (await store.readState(chatId)) ?? EMPTY_COMPACTION_STATE;

    // ADR-0012 §Summary invalidation: if the submitted history changed at/below
    // the watermark (edit/delete/regenerate), reset the stale summary before
    // compacting. The single submit endpoint is the only "edit handler" in this
    // architecture.
    //
    // The baseline must be the DB state BEFORE this submission's onStart
    // overwrote the row; agent-runner reads it before calling onStart and
    // threads it here as `priorMessages`. The pre-inline (`rawMessages`) side
    // is compared so file-URL inlining doesn't trigger false positives.
    if (state.summaryWatermark || state.contextSummary) {
      const persisted = args.priorMessages ?? (await loadChatMessages(chatId));
      const affected = affectedBelowWatermark(
        persisted,
        rawMessages,
        state.summaryWatermark,
      );
      if (affected.length > 0) {
        const orderedIds = rawMessages
          .map((m) => m.id)
          .filter((id): id is string => Boolean(id));
        await invalidateCompaction(store, chatId, affected, orderedIds);
        // Re-read after the reset. If nothing comes back, start from the empty
        // state (as the first read does) — never from the pre-invalidation
        // `state`, whose summary was just invalidated.
        state = (await store.readState(chatId)) ?? EMPTY_COMPACTION_STATE;
      }
    }

    const result = await applyTier1Compaction({
      chatId,
      messages,
      state,
      budget: runtime.budget,
      config: runtime.config,
      imageProvider: runtime.imageProvider,
      summarize: runtime.summarize,
      summarizerWindow: runtime.summarizerWindow,
      overheadTokens: args.overheadTokens,
      lastInputTokens: args.lastInputTokens,
      store,
      onEvent: (event) =>
        logger.info({ chatId, ...event }, "context-compacted"),
    });

    return {
      messages: result.messages,
      compactionTrace: result.compactionTrace,
    };
  } catch (error) {
    logger.error(
      { error, chatId },
      "Tier 1 compaction failed; sending uncompacted history",
    );
    return { messages };
  }
}
+ const compactionRuntime = await buildCompactionRuntime({ + chatId: request.id, + provider, + resolvedModelId, + opened, + // Thread the activity callback so the summarizer heartbeat can bump the + // per-step stall watchdog (ADR-0012 §Summarizer hardening). `onActivity` accepts an + // optional event, so it satisfies the `() => void` heartbeat signature + // directly — the interval invokes it with no event (timer-only bump). + onActivity, + // Thread the abort signal so a cancelled/timed-out run aborts summarize + // instead of leaking past the heartbeat-suppressed watchdog (review Fix B). + signal, + }); + + // Per-turn overhead: system prompt + tool schemas, sent on every turn but + // invisible to a message-only estimate (ADR-0012 §Tier 1 (trigger projection)). + const overheadTokens = estimateOverheadTokens(systemPrompt, wrappedTools); + + // Tier 1 is best-effort: a failure here must never break the turn — recovery + // (ADR-0012 §Recovery) is the net. Runs AFTER inlineFileUrls so the estimate sees the real + // payload (ADR-0012 §Token estimation). Cross-turn durable compaction is keyed by chat id; headless + // runs (triggers, sub-agents) carry no chat id and have no durable history to + // compact (ADR-0012 §Sub-agents — they are Tier 2 only), so send messages uncompacted. + const chatId = request.id; + const tier1Result = chatId + ? await applyTier1IfNeeded({ + chatId, + runtime: compactionRuntime, + messages: inlinedMessages, + // Pre-inline messages for ADR-0012 §Summary invalidation comparison (ADR-0012 §Summary invalidation): both sides must use the + // same URL format (storage:// / http://) to avoid false positives. + rawMessages: messages, + // Pre-overwrite baseline threaded from agent-runner (ADR-0012 §Summary invalidation). 
+ priorMessages: input.priorMessages, + overheadTokens, + // Prior turn's provider-reported input token count (ADR-0012 §Tier 1 (trigger projection) / ADR-0012 §Context-usage ring): the + // corrective baseline for the Tier 1 trigger projection on turns ≥ 2. + // Absent on turn 1 → cold-start margin applies. + lastInputTokens: findLastInputTokens(messages), + }) + : { messages: inlinedMessages }; + const compactedMessages = tier1Result.messages; + + // Recovery (ADR-0012 §Recovery, ADR-0012 §Recovery is the net): always wired, even when proactive compaction is off. + // Headless runs get trim+retry but no dirty flag (no durable chat row). + const recovery: RecoveryContext = { + chatId, + imageProvider: compactionRuntime.imageProvider, + // ADR-0012 §Tier 1 (budget math): subtract the per-turn overhead so recovery uses the same effective + // target as Tier 1. Without this, a large overhead (e.g. 65%+ of the window) + // means the recovery retry still overflows even after trimming. + targetTokens: Math.max( + 0, + compactionRuntime.budget.targetTokens - overheadTokens, + ), + keepRecentMessages: compactionRuntime.config.keepRecentMessages, + minPrunableChars: compactionRuntime.config.minPrunableChars, + summarize: compactionRuntime.summarize, + summarizerWindow: compactionRuntime.summarizerWindow, + markDirty: chatId + ? () => setCompactionDirty(drizzleCompactionStore, chatId) + : undefined, + }; + return { stream: { model, tools: wrappedTools, system: systemPrompt, - messages: inlinedMessages, + messages: compactedMessages, maxSteps: resolvedMaxSteps, temperature: generation.temperature, topP: generation.topP, @@ -584,7 +1143,35 @@ export const prepareChatTurn = async ( frequencyPenalty: agent ? undefined : generation.frequencyPenalty, presencePenalty: agent ? undefined : generation.presencePenalty, seed: agent ? 
undefined : request.seed, + contextWindow: compactionRuntime.contextWindow, + contextWindowIsDefault: compactionRuntime.contextWindowIsDefault, }, + compactionTrace: tier1Result.compactionTrace, + recovery, + tier2: compactionRuntime.config.compactionEnabled + ? { + // ADR-0012 §Tier 1 (budget math) (Tier 2): the prepareStep estimate counts ModelMessages only — + // system prompt + tool schemas go as separate streamText params and + // are invisible to it, yet they consume the same window. Subtract the + // per-turn overhead so the trigger/target reflect the real wire + // payload (mirrors the Tier 1 and recovery targets above). Without + // this, a large overhead lets the payload blow past the budget before + // Tier 2 ever fires — exactly the tool-heavy case it exists for. + triggerTokens: Math.max( + 0, + compactionRuntime.budget.triggerTokens - overheadTokens, + ), + targetTokens: Math.max( + 0, + compactionRuntime.budget.targetTokens - overheadTokens, + ), + keepRecentMessages: compactionRuntime.config.keepRecentMessages, + minPrunableChars: compactionRuntime.config.minPrunableChars, + imageProvider: compactionRuntime.imageProvider, + summarize: compactionRuntime.summarize, + summarizerWindow: compactionRuntime.summarizerWindow, + } + : null, dispose, }; }; @@ -913,20 +1500,107 @@ const loadSubAgents = async ( description: sa.description, })); + // Provider lookups are memoized so the Tier 2 loop below and the + // createModelFn callback don't each re-fetch + re-open the same provider + // (F1): one getProvider + openProvider per distinct providerId per turn. + const providerCache = new Map< + string, + { provider: Provider; opened: ReturnType } | null + >(); + const resolveSubProvider = async (providerId: string) => { + if (!providerCache.has(providerId)) { + const p = await queries.getProvider(providerId, orgId, workspaceId); + providerCache.set( + providerId, + p ? 
{ provider: p, opened: openProvider(p) } : null, + ); + } + return providerCache.get(providerId) ?? null; + }; + + // Tier 2 only for sub-agents (ADR-0012 §Sub-agents: no durable history for Tier 1). + // Resolve per-sub-agent compaction runtime so each sub-agent's tool loop + // gets a prepareStep calibrated to its own model's context window. + const subAgentPrepareSteps = new Map< + string, + import("ai").PrepareStepFunction + >(); + // Per-sub-agent overflow recovery (ADR-0012 §Sub-agents). Built ALWAYS — recovery (ADR-0012 §Recovery is the net) is + // the net even when the ADR-0012 §Config & kill switch disables proactive compaction, exactly + // as on the main path. Tier 2 (below) is the only part gated by the switch. + const subAgentRecoveries = new Map(); + await Promise.all( + subAgentRecords.map(async (sa) => { + try { + const resolved = await resolveSubProvider(sa.providerId); + if (!resolved) return; + const runtime = await buildCompactionRuntime({ + // Sub-agents have no chat row; tag logs with the sub-agent id (F3). + chatId: sa.id, + provider: resolved.provider, + resolvedModelId: sa.modelId, + opened: resolved.opened, + }); + // ADR-0012 §Tier 1 (budget math): subtract the sub-agent's per-turn + // overhead so its recovery/Tier 2 targets match the main path. The + // sub-agent's tool schemas resolve lazily at invocation and aren't + // available here, so the system prompt — the dominant, predictable + // component — is the floor; under-counting overhead only trims slightly + // less aggressively, and recovery's force-halving still backstops it. + const subOverheadTokens = estimateOverheadTokens( + sa.systemPrompt ?? undefined, + undefined, + ); + // Recovery net first (not gated by compactionEnabled). No markDirty — + // sub-agents have no durable chat row to flag. 
+ subAgentRecoveries.set(sa.id, { + chatId: sa.id, + imageProvider: runtime.imageProvider, + targetTokens: Math.max( + 0, + runtime.budget.targetTokens - subOverheadTokens, + ), + keepRecentMessages: runtime.config.keepRecentMessages, + minPrunableChars: runtime.config.minPrunableChars, + summarize: runtime.summarize, + summarizerWindow: runtime.summarizerWindow, + }); + if (!runtime.config.compactionEnabled) return; + const tier2: Tier2Context = { + triggerTokens: Math.max( + 0, + runtime.budget.triggerTokens - subOverheadTokens, + ), + targetTokens: Math.max( + 0, + runtime.budget.targetTokens - subOverheadTokens, + ), + keepRecentMessages: runtime.config.keepRecentMessages, + minPrunableChars: runtime.config.minPrunableChars, + imageProvider: runtime.imageProvider, + summarize: runtime.summarize, + summarizerWindow: runtime.summarizerWindow, + }; + subAgentPrepareSteps.set(sa.id, buildTier2PrepareStep(tier2)); + } catch (error) { + logger.warn( + { error, subAgentId: sa.id }, + "Failed to build Tier 2 for sub-agent; skipping", + ); + } + }), + ); + const subAgentMcpClients: MCPClient[] = []; const subAgentTools = await createSubAgentTools( subAgentRecords, async (providerId: string, modelId: string) => { - const subProvider = await queries.getProvider( - providerId, - orgId, - workspaceId, - ); - if (!subProvider) { + const resolved = await resolveSubProvider(providerId); + if (!resolved) { throw new Error(`Provider '${providerId}' not found for sub-agent`); } - return openProvider(subProvider).languageModel(modelId); + return resolved.opened.languageModel(modelId); }, async (subAgentId: string, toolSetIds: string[]) => { const subAgentRecord = subAgentRecords.find((sa) => sa.id === subAgentId); @@ -941,7 +1615,164 @@ const loadSubAgents = async ( return subTools; }, onProgress, + (id) => subAgentPrepareSteps.get(id), + (id) => subAgentRecoveries.get(id), ); return { subAgents, subAgentTools, subAgentMcpClients }; }; + +// --- Force-compact endpoint (ADR-0012 
§Force-compact on demand) --- + +/** + * Runs Tier 1 compaction unconditionally for a chat (ADR-0012 §Force-compact on demand: clickable ring). + * Forces the compaction regardless of the token threshold by injecting + * compactionDirty=true so the ADR-0012 §Recovery force path bypasses the estimate gate. + * Called from `POST /chats/:id/compact`; the route guards against concurrent + * runs before calling here. + */ +export async function forceCompactChat( + chatId: string, + workspaceId: string, + orgId: string, +): Promise<{ + estimatedTokens: number; + /** Message-only estimate of the history BEFORE compaction (same basis as estimatedTokens). */ + tokensBefore: number; + /** Number of prefix messages folded into the summary this run (0 if no summary). */ + messagesDropped: number; + /** The config keep-recent count — the client compares messagesDropped against it. */ + keepRecentMessages: number; + contextWindow: number; + contextWindowIsDefault: boolean; + /** ADR-0012 §Compaction trace in the timeline — the persisted synthetic trace message, when a summary was produced. */ + traceMessage?: PlatypusUIMessage; +}> { + // Load the chat record (workspace-scoped). + const chatRows = await db + .select({ + agentId: chatTable.agentId, + providerId: chatTable.providerId, + modelId: chatTable.modelId, + }) + .from(chatTable) + .where( + and(eq(chatTable.id, chatId), eq(chatTable.workspaceId, workspaceId)), + ) + .limit(1); + if (chatRows.length === 0) throw new NotFoundError("Chat not found"); + const chatRow = chatRows[0]; + + // Resolve provider + model via the shared query layer (respects org-scoped + // Shared resources and the ADR-0007 attachment gate). 
+ let provider: Provider; + let resolvedModelId: string; + + if (chatRow.agentId) { + const agentRow = await drizzleChatTurnQueries.getAgent( + chatRow.agentId, + orgId, + workspaceId, + ); + if (!agentRow) throw new NotFoundError("Agent not found"); + resolvedModelId = agentRow.modelId; + const providerRow = await drizzleChatTurnQueries.getProvider( + agentRow.providerId, + orgId, + workspaceId, + ); + if (!providerRow) throw new NotFoundError("Provider not found"); + provider = providerRow; + } else if (chatRow.providerId && chatRow.modelId) { + const providerRow = await drizzleChatTurnQueries.getProvider( + chatRow.providerId, + orgId, + workspaceId, + ); + if (!providerRow) throw new NotFoundError("Provider not found"); + provider = providerRow; + resolvedModelId = chatRow.modelId; + } else { + throw new ValidationError("Chat has no provider/model configured"); + } + + const opened = openProvider(provider); + const runtime = await buildCompactionRuntime({ + chatId, + provider, + resolvedModelId, + opened, + }); + + const messages = await loadChatMessages(chatId); + const rawState = + (await drizzleCompactionStore.readState(chatId)) ?? EMPTY_COMPACTION_STATE; + + // Force-trigger by marking dirty in the in-memory copy (ADR-0012 §Recovery: bypass the + // estimate gate so the compaction actually shrinks the history). + const forcedState: CompactionState = { ...rawState, compactionDirty: true }; + + const result = await applyTier1Compaction({ + chatId, + messages, + state: forcedState, + budget: runtime.budget, + config: runtime.config, + imageProvider: runtime.imageProvider, + summarize: runtime.summarize, + store: drizzleCompactionStore, + summarizerWindow: runtime.summarizerWindow, + }); + + // Message-only estimate (no per-turn system/tool overhead): the ring uses it + // as a transient post-compact value that the next response's provider count + // supersedes. 
It therefore reads slightly low vs the live ring numerator + // (which includes overhead) — acceptable for an immediate visual refresh. + const estimatedTokens = estimateTokens( + uiMessagesToCountUnits(result.messages, runtime.imageProvider), + ); + // Pre-compaction estimate (same basis) so the client can decide whether the + // drop is significant enough to confirm — ADR-0012 §Force-compact on demand. + const tokensBefore = estimateTokens( + uiMessagesToCountUnits(messages, runtime.imageProvider), + ); + + // ADR-0012 §Compaction trace in the timeline: a forced compaction has no live stream to inject the trace into, so + // persist it as a standalone synthetic assistant message. Appended after the + // last real message — above the watermark (which already advanced inside + // applyTier1Compaction), so it is never itself summarized. The strip filter + // keeps it out of the model payload on subsequent turns. Only written when a + // model summary was actually produced (result.compactionTrace is undefined + // otherwise — see Tier1Output). + let traceMessage: PlatypusUIMessage | undefined; + if (result.compactionTrace) { + traceMessage = buildCompactionTraceMessage( + result.compactionTrace, + createIdGenerator({ prefix: "msg", size: 16 })(), + ); + // Atomic jsonb append: concatenate at the DB rather than overwrite + // the whole column from the in-memory `messages` snapshot loaded earlier. + // The route guards with runRegistry.has(chatId), but a run that registers in + // the has()→write window — or a second concurrent POST /compact — would + // otherwise be clobbered by this stale array. `||` appends to whatever is + // stored now, so no concurrently-written messages are lost. 
+ await db + .update(chatTable) + .set({ + messages: sql`coalesce(${chatTable.messages}, '[]'::jsonb) || ${JSON.stringify([traceMessage])}::jsonb`, + }) + .where( + and(eq(chatTable.id, chatId), eq(chatTable.workspaceId, workspaceId)), + ); + } + + return { + estimatedTokens, + tokensBefore, + messagesDropped: result.compactionTrace?.messagesDropped ?? 0, + keepRecentMessages: runtime.config.keepRecentMessages, + contextWindow: runtime.contextWindow, + contextWindowIsDefault: runtime.contextWindowIsDefault, + traceMessage, + }; +} diff --git a/apps/backend/src/tools/sub-agent.test.ts b/apps/backend/src/tools/sub-agent.test.ts index 5af36d14..3ab227c9 100644 --- a/apps/backend/src/tools/sub-agent.test.ts +++ b/apps/backend/src/tools/sub-agent.test.ts @@ -43,13 +43,16 @@ function createMockFullStream( }; } -const { mockStream, MockToolLoopAgent } = vi.hoisted(() => { +const { mockStream, MockToolLoopAgent, capturedSettings } = vi.hoisted(() => { const mockStream = vi.fn(); + const capturedSettings: Record[] = []; class MockToolLoopAgent { - constructor() {} + constructor(settings: Record) { + capturedSettings.push(settings); + } stream = mockStream; } - return { mockStream, MockToolLoopAgent }; + return { mockStream, MockToolLoopAgent, capturedSettings }; }); vi.mock("ai", async () => { @@ -74,6 +77,25 @@ describe("createSubAgentTool", () => { tools: {}, }; + beforeEach(() => { + capturedSettings.length = 0; + }); + + describe("Tier 2 prepareStep (ADR-0012 §Sub-agents)", () => { + it("passes prepareStep to ToolLoopAgent when provided", () => { + const mockPrepareStep = vi.fn(); + createSubAgentTool({ ...baseOptions, prepareStep: mockPrepareStep }); + expect(capturedSettings[0]).toMatchObject({ + prepareStep: mockPrepareStep, + }); + }); + + it("passes undefined prepareStep when not provided", () => { + createSubAgentTool(baseOptions); + expect(capturedSettings[0].prepareStep).toBeUndefined(); + }); + }); + describe("toolName generation", () => { it("generates 
PascalCase delegateTo prefix", () => { const { toolName } = createSubAgentTool(baseOptions); @@ -398,4 +420,34 @@ describe("createSubAgentTools", () => { expect(Object.keys(result)).toHaveLength(1); }); + + it("threads prepareStepFn to ToolLoopAgent for each sub-agent (ADR-0012 §Sub-agents)", async () => { + capturedSettings.length = 0; + const subAgents = [ + { id: "sa-1", name: "Alpha", providerId: "p1", modelId: "m1" }, + { id: "sa-2", name: "Beta", providerId: "p1", modelId: "m1" }, + ]; + const mockStep1 = vi.fn(); + const mockStep2 = vi.fn(); + const prepareStepFn = vi + .fn() + .mockImplementation((id: string) => + id === "sa-1" ? mockStep1 : mockStep2, + ); + + const createModelFn = vi.fn().mockResolvedValue({}); + const loadToolsFn = vi.fn().mockResolvedValue({}); + + await createSubAgentTools( + subAgents, + createModelFn, + loadToolsFn, + undefined, + prepareStepFn, + ); + + expect(capturedSettings).toHaveLength(2); + expect(capturedSettings[0].prepareStep).toBe(mockStep1); + expect(capturedSettings[1].prepareStep).toBe(mockStep2); + }); }); diff --git a/apps/backend/src/tools/sub-agent.ts b/apps/backend/src/tools/sub-agent.ts index 0a83e86b..52f13f55 100644 --- a/apps/backend/src/tools/sub-agent.ts +++ b/apps/backend/src/tools/sub-agent.ts @@ -2,11 +2,17 @@ import { stepCountIs, tool, ToolLoopAgent, + wrapLanguageModel, type LanguageModel, + type PrepareStepFunction, type Tool, } from "ai"; import { z } from "zod"; import { logger } from "../logger.ts"; +import { + contextOverflowRecoveryMiddleware, + type RecoveryContext, +} from "../runs/recovery.ts"; /** * Single source of truth for the sub-agent delegation tool name. @@ -49,6 +55,17 @@ interface SubAgentToolOptions { maxSteps?: number; /** Called on each activity update from the sub-agent. Used to reset the parent run's per-step timeout. */ onProgress?: () => void; + /** Tier 2 in-turn compaction callback (ADR-0012 §Tier 2 / §Sub-agents). Undefined when compaction is disabled. 
*/ + prepareStep?: PrepareStepFunction; + /** + * Context-overflow recovery (ADR-0012 §Recovery) for the sub-agent's own model calls. + * Sub-agents run a ToolLoopAgent OUTSIDE the parent run's recovery-wrapped + * model, so without this their only overflow protection is Tier 2 — which + * fires late (its trigger omits the sub-agent's tool/prompt overhead) and has + * no net behind it. Wrapping here gives every sub-agent step one trim+retry, + * matching the main path (ADR-0012 §Sub-agents). `markDirty` is omitted (no chat row). + */ + recovery?: RecoveryContext; } /** @@ -68,17 +85,36 @@ export const createSubAgentTool = (options: SubAgentToolOptions) => { tools, maxSteps = 50, onProgress, + prepareStep, + recovery, } = options; const toolName = subAgentToolName({ name }); + // Wrap the sub-agent model with the overflow-recovery middleware (ADR-0012 §Sub-agents) so + // a step that overflows gets one trim+retry instead of hard-failing the task. + // Guard on `typeof model !== "string"`: `wrapLanguageModel` needs a model + // INSTANCE, and `LanguageModel` permits a bare string id. The factory returns + // an instance today, but a string would otherwise throw here and the catch in + // `createSubAgentTools` would silently drop the whole sub-agent — so degrade to + // the unwrapped model instead. The remaining cast only reconciles the + // V2/V3 instance union (wrapLanguageModel accepts both at runtime). + const recoveredModel: LanguageModel = + recovery && typeof model !== "string" + ? wrapLanguageModel({ + model: model as Parameters[0]["model"], + middleware: contextOverflowRecoveryMiddleware(recovery), + }) + : model; + const agent = new ToolLoopAgent({ - model, + model: recoveredModel, instructions: systemPrompt || `You are a specialized sub-agent named "${name}". 
Complete the task you are given thoroughly and accurately.`, tools, stopWhen: [stepCountIs(maxSteps)], + prepareStep, }); return { @@ -192,6 +228,8 @@ export const createSubAgentTools = async ( toolSetIds: string[], ) => Promise>, onProgress?: () => void, + prepareStepFn?: (id: string) => PrepareStepFunction | undefined, + recoveryFn?: (id: string) => RecoveryContext | undefined, ): Promise> => { const tools: Record = {}; @@ -216,6 +254,8 @@ export const createSubAgentTools = async ( tools: subAgentTools, maxSteps: subAgent.maxSteps || 50, onProgress, + prepareStep: prepareStepFn?.(subAgent.id), + recovery: recoveryFn?.(subAgent.id), }); tools[toolName] = tool; diff --git a/apps/frontend/components/ai-elements/tool.tsx b/apps/frontend/components/ai-elements/tool.tsx index fee20b81..aebb14b3 100644 --- a/apps/frontend/components/ai-elements/tool.tsx +++ b/apps/frontend/components/ai-elements/tool.tsx @@ -7,6 +7,7 @@ import { CollapsibleTrigger, } from "@/components/ui/collapsible"; import { cn } from "@/lib/utils"; +import { useToolDuration } from "@/hooks/use-tool-completed-at"; import type { ToolUIPart } from "ai"; import { ArrowRightLeftIcon, @@ -44,6 +45,9 @@ import { CodeBlock } from "./code-block"; export function humanizeToolType(type: string): string { // Strip the "tool-" prefix const name = type.startsWith("tool-") ? type.slice(5) : type; + // Synthetic compaction trace (§K/11c) — render a human label instead of the + // raw, underscore-laden function name. + if (name === "compact_context") return "Context compaction"; // Split on camelCase boundaries const words = name.replace(/([a-z])([A-Z])/g, "$1 $2").split(" "); // Capitalise the first word, lowercase the rest @@ -172,6 +176,10 @@ export type ToolHeaderProps = { label?: string; type: ToolUIPart["type"]; state: ToolUIPart["state"]; + /** ISO timestamp of when this tool call began, if known. */ + startedAt?: string; + /** ISO timestamp of when this tool call completed, if known. 
*/ + completedAt?: string; className?: string; }; @@ -210,8 +218,11 @@ export const ToolHeader = ({ label, type, state, + startedAt, + completedAt, ...props }: ToolHeaderProps) => { + const duration = useToolDuration(state, startedAt, completedAt); // getToolIcon returns a stable module-level Lucide icon; render via // createElement so the dynamic selection isn't flagged as a component // created during render. @@ -237,6 +248,11 @@ export const ToolHeader = ({ )} {getStatusBadge(state)} + {duration && ( + + {duration} + + )} diff --git a/apps/frontend/components/chat-message.tsx b/apps/frontend/components/chat-message.tsx index 102e8bb1..b709af0a 100644 --- a/apps/frontend/components/chat-message.tsx +++ b/apps/frontend/components/chat-message.tsx @@ -35,7 +35,7 @@ import { TextUIPart, type ChatStatus, } from "ai"; -import { Agent } from "@platypus/schemas"; +import { Agent, type MessageStats } from "@platypus/schemas"; import { BotIcon, CheckIcon, @@ -44,10 +44,87 @@ import { TrashIcon, RefreshCwIcon, XIcon, + InfoIcon, } from "lucide-react"; import { Textarea } from "./ui/textarea"; import { LoadSkillTool } from "./load-skill-tool"; import { SubAgentTool } from "./sub-agent-tool"; +import { Button } from "./ui/button"; +import { Popover, PopoverContent, PopoverTrigger } from "./ui/popover"; +import { Tooltip, TooltipContent, TooltipTrigger } from "./ui/tooltip"; +import { formatDurationMs } from "@/lib/utils"; + +const getToolStartedAt = (part: unknown): string | undefined => { + const raw = (part as { toolMetadata?: { startedAt?: unknown } })?.toolMetadata + ?.startedAt; + return typeof raw === "string" ? raw : undefined; +}; + +const getToolCompletedAt = (part: unknown): string | undefined => { + const raw = (part as { toolMetadata?: { completedAt?: unknown } }) + ?.toolMetadata?.completedAt; + return typeof raw === "string" ? raw : undefined; +}; + +function MessageStatsPopover({ stats }: { stats: MessageStats }) { + const ttft = stats.firstTokenAt + ? 
formatDurationMs( + new Date(stats.firstTokenAt).getTime() - + new Date(stats.startedAt).getTime(), + ) + : undefined; + const total = formatDurationMs( + new Date(stats.finishedAt).getTime() - new Date(stats.startedAt).getTime(), + ); + return ( + + + + + + + + +

+

+ Response stats +

+

+ In:{" "} + {stats.inputTokens.toLocaleString()}{" "} + Out:{" "} + {stats.outputTokens.toLocaleString()} +

+ {ttft && ( +

+ TTFT: {ttft} +

+ )} + {total && ( +

+ Total: {total} +

+ )} +
+ + + + In: {stats.inputTokens.toLocaleString()} · Out:{" "} + {stats.outputTokens.toLocaleString()} + {ttft ? ` · TTFT: ${ttft}` : ""} + {total ? ` · Total: ${total}` : ""} + + + ); +} interface ChatMessageProps { /** The message object to render */ @@ -120,6 +197,7 @@ export const ChatMessage = memo(function ChatMessage({ )); + const fileParts = message.parts?.filter( (part): part is FileUIPart => part.type === "file" && !part.mediaType?.startsWith("image/"), @@ -134,6 +212,11 @@ export const ChatMessage = memo(function ChatMessage({ .map((part) => part.text) .join("") || ""; + const assistantStats = + message.role === "assistant" + ? (message.metadata as { stats?: MessageStats } | undefined)?.stats + : undefined; + return ( {fileParts && fileParts.length > 0 && ( @@ -154,7 +237,17 @@ export const ChatMessage = memo(function ChatMessage({ )} {message.parts?.map((part, i) => { - if (part.type === "text") { + if (part.type === "step-start") { + // The SDK emits step-start at every round boundary. We don't render + // it — tool-call timestamps appear inside the tool header below. + return null; + } else if (part.type === "text") { + const partText = (part as TextUIPart).text; + + // Skip empty text parts on assistant messages — the SDK emits them + // between steps; rendering would leave a bare avatar bubble. 
+ if (message.role === "assistant" && !partText.trim()) return null; + if (isEditing) { const isFirstTextPart = i === message.parts.findIndex((p) => p.type === "text"); @@ -186,7 +279,7 @@ export const ChatMessage = memo(function ChatMessage({ avatar={assistantAvatar} > - {(part as TextUIPart).text} + {partText} ); @@ -212,6 +305,8 @@ export const ChatMessage = memo(function ChatMessage({ @@ -251,6 +346,8 @@ export const ChatMessage = memo(function ChatMessage({ state={toolPart.state} type={toolPart.type} label={toolLabel} + startedAt={getToolStartedAt(toolPart)} + completedAt={getToolCompletedAt(toolPart)} /> @@ -354,6 +451,7 @@ export const ChatMessage = memo(function ChatMessage({ )} + {assistantStats && } ))} diff --git a/apps/frontend/components/chat.tsx b/apps/frontend/components/chat.tsx index cfc38d1a..d9fe1a19 100644 --- a/apps/frontend/components/chat.tsx +++ b/apps/frontend/components/chat.tsx @@ -32,6 +32,7 @@ import { Agent, ToolSet, Skill, + type MessageStats, } from "@platypus/schemas"; import { type PlatypusUIMessage } from "@platypus/backend/src/types"; import useSWR from "swr"; @@ -55,6 +56,7 @@ import { TooltipTrigger, } from "@/components/ui/tooltip"; import { ChatMessage } from "./chat-message"; +import { ContextUsageRing } from "./context-usage-ring"; import { ModelSelectorDialog } from "./model-selector-dialog"; import { toast } from "sonner"; @@ -378,19 +380,46 @@ export const Chat = ({ [messages, setMessages], ); - // TODO: Ideally show a loading indicator here - if (isLoading || !providersData) return null; + // Resolve the effective provider+model for the ring (ADR-0012 §Context-usage ring: use selected + // model's window, not last message's window). When an agent is selected we + // look up its provider/model; otherwise use the directly selected values. + const effectiveRingProviderId = agentId + ? (agents.find((a) => a.id === agentId)?.providerId ?? "") + : providerId; + const effectiveRingModelId = agentId + ? 
(agents.find((a) => a.id === agentId)?.modelId ?? "") + : modelId; + + // Fetch resolved context window for the currently-selected model (cached on + // the backend). Returns null contextWindow when source = "default" so the ring + // renders neutral (ADR-0012 §Context-usage ring). Re-fetches automatically on model/agent change. + const { data: contextWindowData } = useSWR<{ + contextWindow: number | null; + source: string; + keepRecentMessages?: number; + }>( + backendUrl && user && effectiveRingProviderId && effectiveRingModelId + ? joinUrl( + backendUrl, + `/organizations/${orgId}/workspaces/${workspaceId}/providers/${effectiveRingProviderId}/context-window?modelId=${encodeURIComponent(effectiveRingModelId)}`, + ) + : null, + fetcher, + ); - // Show alert if no providers are configured - if (providers.length === 0) { - return ( -
-
- -
-
- ); - } + // Stats from the last completed assistant message for the ring (ADR-0012 §Context-usage ring) and + // per-message stats popover (ADR-0012 §Per-message stats). + const lastAssistantStats = useMemo(() => { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + const stats = (msg.metadata as { stats?: MessageStats } | undefined) + ?.stats; + if (msg.role === "assistant" && stats) { + return stats; + } + } + return null; + }, [messages]); const selectedAgent = agentId ? agents.find((a) => a.id === agentId) : null; // Resolve the provider backing the current selection, whether that's a raw @@ -416,6 +445,145 @@ export const Chat = ({ chatData?.status === "running" && status === "ready"; const effectiveStatus = isReconnectedToRunningRun ? "streaming" : status; + // ADR-0012 §Force-compact on demand — state for pending (deferred while streaming), + // in-flight compaction spinner, and the post-compact token estimate that + // refreshes the ring immediately (before the next completed message). + const [compactPending, setCompactPending] = useState(false); + const [isCompacting, setIsCompacting] = useState(false); + // Stable count of assistant messages — unaffected by optimistic user-message + // pushes (ADR-0012 §Context-usage ring). Used to tag post-compact estimates so the ring doesn't + // snap back to the old value when the user hits Send. + const assistantMessageCount = useMemo( + () => messages.filter((m) => m.role === "assistant").length, + [messages], + ); + + // Post-compact estimate, tagged with the assistant message count at + // compaction time so it auto-expires once a new assistant message arrives + // (the next provider count is authoritative). Using assistantMessageCount + // instead of messages.length fixes the ring-jump bug (ADR-0012 §Context-usage ring): an optimistic user + // message increments messages.length but not assistantMessageCount, so the + // compacted estimate stays valid until the real response lands. 
+ const [compacted, setCompacted] = useState<{ + atAssistantMessageCount: number; + tokens: number; + } | null>(null); + + const runCompact = useCallback(async () => { + if (!backendUrl) return; + setIsCompacting(true); + try { + const res = await fetch( + joinUrl( + backendUrl, + `/organizations/${orgId}/workspaces/${workspaceId}/chat/${chatId}/compact`, + ), + { method: "POST", credentials: "include" }, + ); + if (!res.ok) { + const body = await res.json().catch(() => ({})); + toast.error((body as { error?: string }).error ?? "Compact failed"); + return; + } + // Refresh the ring immediately from the post-compact estimate (ADR-0012 §Force-compact on demand). This + // is a message-only char/4 estimate (no per-turn system/tool overhead), + // so it reads slightly low until the next real response replaces it with + // the provider's authoritative count. + const body = (await res.json().catch(() => ({}))) as { + inputTokens?: number; + traceMessage?: PlatypusUIMessage; + }; + if (typeof body.inputTokens === "number") { + setCompacted({ + atAssistantMessageCount: assistantMessageCount, + tokens: body.inputTokens, + }); + } + // ADR-0012 §Compaction trace in the timeline: append the persisted compaction-trace message so it shows in the + // timeline immediately. It carries the id the backend persisted, so a + // later SWR revalidation reconciles rather than duplicating it. + if (body.traceMessage) { + const traceMessage = body.traceMessage; + setMessages((prev) => + prev.some((m) => m.id === traceMessage.id) + ? prev + : [...prev, traceMessage], + ); + } + toast.success("Context compacted"); + } catch { + toast.error("Compact request failed"); + } finally { + setIsCompacting(false); + } + }, [ + backendUrl, + orgId, + workspaceId, + chatId, + assistantMessageCount, + setMessages, + ]); + + const handleCompact = useCallback(() => { + // ADR-0012 §Force-compact on demand: confirm ONLY when the drop is significant; + // below that, run immediately. 
The summarized prefix is everything before the + // keep-recent boundary, so messagesDropped ≈ messages.length − keepRecent, and + // the ADR's "messagesDropped > keepRecentMessages" criterion reduces to the + // pre-run-computable "messages.length > 2 × keepRecent". (The >30%-reduction + // criterion needs the post-run summary size; we don't gate on it here — the op + // is non-destructive either way per ADR-0012 §View, not delete.) + // Confirm at click time (not after the deferred run fires) so the prompt never + // surprises the user mid-stream. + const keepRecent = contextWindowData?.keepRecentMessages ?? 10; + const significant = messages.length > keepRecent * 2; + if ( + significant && + !window.confirm( + "This will summarize older messages to reduce context usage. The full conversation history is preserved. Continue?", + ) + ) { + return; + } + if (effectiveStatus === "streaming" || effectiveStatus === "submitted") { + setCompactPending(true); + } else { + void runCompact(); + } + }, [contextWindowData, messages.length, effectiveStatus, runCompact]); + + // Fire deferred compact once streaming finishes (ADR-0012 §Force-compact on demand). Already confirmed + // at click time, so this just runs. + useEffect(() => { + if ( + compactPending && + effectiveStatus !== "streaming" && + effectiveStatus !== "submitted" + ) { + // Reacting to a streaming→idle transition to fire a queued action is the + // intended use of an effect here; clearing the flag prevents a re-fire. + // eslint-disable-next-line react-hooks/set-state-in-effect + setCompactPending(false); + void runCompact(); + } + }, [compactPending, effectiveStatus, runCompact]); + + // Early returns live below ALL hooks so hook order stays unconditional + // (react-hooks/rules-of-hooks). The ADR-0012 §Context-usage ring / §Force-compact ring hooks above must always run. 
+ // TODO: Ideally show a loading indicator here + if (isLoading || !providersData) return null; + + // Show alert if no providers are configured + if (providers.length === 0) { + return ( +
+
+ +
+
+ ); + } + const handleSubmit = async (message: PromptInputMessage) => { // Stop the stream if currently streaming or submitted if (effectiveStatus === "streaming" || effectiveStatus === "submitted") { @@ -576,6 +744,22 @@ export const Chat = ({ Search )} + - - - - - + + + + + + + + + + {selectedAgent.description?.trim() || "Agent info"} + + = 0.7, red >= 0.9. + * Shows neutral grey with no percentage when contextWindow is unknown/default + * (ADR-0012 §Context-usage ring) or when no run has completed yet. + * + * When `onClick` is provided the ring is clickable (ADR-0012 §Force-compact on demand). + * - While `isPending` (click queued, waiting for streaming to finish): shows + * a pending badge and is disabled (ADR-0012 §Force-compact on demand). + * - While `isCompacting`: shows a spinner. + * - `isStreaming` disables clicks entirely (frontend defers via pending flag). + */ +export function ContextUsageRing({ + usedTokens, + contextWindow, + onClick, + isStreaming, + isPending, + isCompacting, +}: { + usedTokens?: number; + contextWindow?: number | null; + onClick?: () => void; + isStreaming?: boolean; + isPending?: boolean; + isCompacting?: boolean; +}) { + const r = 7; + const circumference = 2 * Math.PI * r; + // Amber has no semantic token (unlike primary/destructive); use Tailwind v4's + // default-palette CSS var so the threshold colour isn't a bare hex literal. + const amber = "var(--color-amber-500, #f59e0b)"; + + const isNeutral = !contextWindow || usedTokens === undefined; + const fill = isNeutral + ? 0 + : Math.min(1, Math.max(0, usedTokens / contextWindow)); + + const color = isNeutral + ? "var(--color-muted-foreground)" + : fill >= 0.9 + ? "var(--color-destructive)" + : fill >= 0.7 + ? 
amber + : "var(--color-primary)"; + + const isDisabled = isPending || isCompacting || isStreaming || !onClick; + const isClickable = !!onClick && !isDisabled; + + // Append the compact affordance whenever the ring is actually clickable — + // including the neutral (unknown-window) state, where the user can still + // force a compaction even though no fill is shown. + const clickHint = isClickable ? " · click to compact" : ""; + let tooltipLabel: string; + if (isPending) { + tooltipLabel = "Will compact when response finishes"; + } else if (isCompacting) { + tooltipLabel = "Compacting…"; + } else if (isNeutral) { + if (!contextWindow) { + tooltipLabel = `Context window unknown · model not found in provider registry${clickHint}`; + } else { + tooltipLabel = `No messages yet · ${contextWindow.toLocaleString()} token window${clickHint}`; + } + } else { + tooltipLabel = `Last response: ${usedTokens!.toLocaleString()} / ${contextWindow!.toLocaleString()} (${Math.round(fill * 100)}%) · current input not yet counted${clickHint}`; + } + + return ( + + +
{ + if (e.key === "Enter" || e.key === " ") { + e.preventDefault(); + onClick!(); + } + } + : undefined + } + tabIndex={isClickable ? 0 : undefined} + role={isClickable ? "button" : undefined} + aria-label={tooltipLabel} + aria-disabled={isDisabled || undefined} + > + {isCompacting ? ( + + ) : ( + + {/* Track */} + + {/* Fill */} + + {/* Pending dot */} + {isPending && } + + )} +
+
+ + {tooltipLabel} + +
+ ); +} diff --git a/apps/frontend/components/dynamic-tool-header.tsx b/apps/frontend/components/dynamic-tool-header.tsx index db72bdbf..766402fb 100644 --- a/apps/frontend/components/dynamic-tool-header.tsx +++ b/apps/frontend/components/dynamic-tool-header.tsx @@ -3,6 +3,7 @@ import { Badge } from "@/components/ui/badge"; import { CollapsibleTrigger } from "@/components/ui/collapsible"; import { cn } from "@/lib/utils"; +import { useToolDuration } from "@/hooks/use-tool-completed-at"; import type { DynamicToolUIPart } from "ai"; import { CheckCircleIcon, @@ -17,6 +18,10 @@ import type { ReactNode } from "react"; export type DynamicToolHeaderProps = { title: string; state: DynamicToolUIPart["state"]; + /** ISO timestamp of when this tool call began, if known. */ + startedAt?: string; + /** ISO timestamp of when this tool call completed, if known. */ + completedAt?: string; className?: string; }; @@ -53,20 +58,30 @@ export const DynamicToolHeader = ({ className, title, state, + startedAt, + completedAt, ...props -}: DynamicToolHeaderProps) => ( - -
- - {title} - {getStatusBadge(state)} -
- -
-); +}: DynamicToolHeaderProps) => { + const duration = useToolDuration(state, startedAt, completedAt); + return ( + +
+ + {title} + {getStatusBadge(state)} + {duration && ( + + {duration} + + )} +
+ +
/** True once a tool part's state carries the `output-` prefix (its terminal states). */
const isTerminalState = (state: string): boolean => state.startsWith("output-");

/**
 * Parses an ISO timestamp into epoch milliseconds.
 * Returns undefined when the value is missing or does not parse to a valid date.
 */
const toMs = (iso?: string): number | undefined => {
  if (!iso) return undefined;
  const t = new Date(iso).getTime();
  return Number.isNaN(t) ? undefined : t;
};

/**
 * Resolves a tool call's run-duration string for the tool header.
 *
 * - While the tool is running it shows a live elapsed timer, ticking once a
 *   second from when the tool was first observed running by this hook.
 * - When it turns terminal it freezes: the server-measured span if both
 *   `startedAt`/`completedAt` are present and valid, otherwise a span built
 *   from whichever server timestamp parses plus the client-observed clock.
 *
 * A client clock is only used when the tool was actually seen running by this
 * hook instance, so a tool that is already terminal at mount never shows a
 * client-derived value — it relies on the server timestamps or shows nothing.
 *
 * @param state - The tool part state; anything not starting with `output-` counts as running.
 * @param startedAt - ISO timestamp of when the tool call began, if known.
 * @param completedAt - ISO timestamp of when the tool call completed, if known.
 * @returns The formatted duration, or undefined when there is nothing
 *   meaningful to show (e.g. no timestamps and the tool was never seen running).
 */
export function useToolDuration(
  state: string,
  startedAt?: string,
  completedAt?: string,
): string | undefined {
  const running = !isTerminalState(state);

  // All render-visible values are state, never refs or live Date.now() reads
  // during render. Every write is deferred into a timer callback rather than
  // performed synchronously inside an effect body.
  //
  // The explicit type arguments matter: a bare `useState()` infers `undefined`
  // as the state type, which rejects the numeric timestamps written below.
  const [clientStart, setClientStart] = useState<number | undefined>();
  const [clientEnd, setClientEnd] = useState<number | undefined>();
  const [elapsedMs, setElapsedMs] = useState(0);

  // While running: record the client-observed start once and tick the elapsed
  // time every second. `start` is captured in the effect body; the setState
  // calls run in deferred callbacks.
  useEffect(() => {
    if (!running) return;
    const start = Date.now();
    // `prev ?? start` keeps the first observed start if the effect re-runs.
    const startTimer = setTimeout(
      () => setClientStart((prev) => prev ?? start),
      0,
    );
    const id = setInterval(() => setElapsedMs(Date.now() - start), 1000);
    return () => {
      clearTimeout(startTimer);
      clearInterval(id);
    };
  }, [running]);

  // First terminal transition after we saw it running: freeze the end time.
  // Deferred to a timer callback so it is not a synchronous effect-body write.
  useEffect(() => {
    if (running || clientStart === undefined) return;
    const endTimer = setTimeout(
      () => setClientEnd((prev) => prev ?? Date.now()),
      0,
    );
    return () => clearTimeout(endTimer);
  }, [running, clientStart]);

  // Live elapsed timer while running. `clientStart` is written by a zero-delay
  // timer after the first effect pass, so the very first render returns
  // undefined; `elapsedMs` then stays 0 until the first one-second tick.
  if (running) {
    if (clientStart === undefined) return undefined;
    return formatDurationMs(elapsedMs);
  }

  // Terminal: exact server span if available, else the client-observed span.
  const serverDuration = formatToolDuration(startedAt, completedAt);
  if (serverDuration) return serverDuration;

  const startMs = toMs(startedAt) ?? clientStart;
  // `clientEnd` is only ever written once `clientStart` is set (see the effect
  // above), so it needs no extra guard here.
  const endMs = toMs(completedAt) ?? clientEnd;
  if (startMs !== undefined && endMs !== undefined && endMs >= startMs) {
    return formatDurationMs(endMs - startMs);
  }
  return undefined;
}
/**
 * Formats a tool call's run duration from its start/end ISO timestamps.
 *
 * @param startedAt - ISO timestamp of when the tool call began.
 * @param completedAt - ISO timestamp of when the tool call completed.
 * @returns The formatted span (`950ms`, `1.2s`, `1m 3s`), or undefined when
 *   either timestamp is missing or unparseable, or when the end precedes the
 *   start, so the UI can simply render nothing.
 */
export function formatToolDuration(
  startedAt?: string,
  completedAt?: string,
): string | undefined {
  if (!startedAt || !completedAt) return undefined;
  const start = new Date(startedAt).getTime();
  const end = new Date(completedAt).getTime();
  if (Number.isNaN(start) || Number.isNaN(end) || end < start) return undefined;
  return formatDurationMs(end - start);
}

/**
 * Formats an elapsed millisecond span: `950ms`, `1.2s`, `1m 3s`.
 *
 * The value is rounded to the precision of each unit *before* the unit is
 * chosen, so a span that rounds up across a boundary is promoted to the next
 * unit instead of producing `1000ms`, `60.0s` or `1m 60s`.
 *
 * @param ms - Elapsed time in milliseconds.
 * @returns The formatted span, or undefined for negative or non-finite input.
 */
export function formatDurationMs(ms: number): string | undefined {
  if (!Number.isFinite(ms) || ms < 0) return undefined;

  // Sub-second: whole milliseconds.
  const wholeMs = Math.round(ms);
  if (wholeMs < 1000) return `${wholeMs}ms`;

  // Sub-minute: seconds with one decimal, computed in integer tenths so the
  // boundary check sees the already-rounded value (600 tenths = 60.0 s).
  const tenths = Math.round(ms / 100);
  if (tenths < 600) return `${(tenths / 10).toFixed(1)}s`;

  // A minute or more: round to whole seconds first, then split, so the
  // remainder is always in the range 0–59.
  const totalSeconds = Math.round(ms / 1000);
  const minutes = Math.floor(totalSeconds / 60);
  const remSeconds = totalSeconds % 60;
  return `${minutes}m ${remSeconds}s`;
}
+ +It is self-contained: every decision, mechanism, and trade-off the code cites +lives in a section below. Code comments reference this ADR by section name (e.g. +_"ADR-0012 §Tier 1"_, _"ADR-0012 §Summary invalidation"_) rather than by any +external plan or chunk number. + +If a future change forces a different choice, supersede with a new ADR rather +than editing this one. + +## Context + +The AI SDK (`ai@6`) reports real token usage **after** each call +(`usage.inputTokens`/`outputTokens`/`totalTokens`) but exposes **no** +context-window metadata on the model interface and **no** pre-call tokenizer. +Providers diverge on whether the window is discoverable: Google +(`inputTokenLimit`), OpenRouter (`context_length`), and vLLM/OpenAI-compatible +(`max_model_len`) expose it via API; OpenAI, Anthropic, and Bedrock do not. +Error handling previously covered only auth/rate-limit/5xx — a context-overflow +rejection killed the turn. Top-level chats and sub-agents both run through the +shared `agent-runner`/`ToolLoopAgent`, so one implementation covers both. + +## Decision + +A **two-tier, view-not-delete** compaction model, fed by a **single token +estimator**, with all durable state mutated through a **single versioned CAS +writer**, an always-on **recovery net** for overflow errors the proactive path +misses, and a deterministic **context-editing** pass that prunes stale bulky +tool results without a model call. + +## Principles (load-bearing) + +### View, not delete + +The watermark + summary change _what is sent to the model_, never _what is +stored_. Raw messages persist in the DB untouched. Forced/automatic compaction +is therefore non-destructive in the data sense — a user can still read full +history; a future "expand summary" UI is free — which reduces "irreversible data +loss" objections to a UX-courtesy confirmation rather than a correctness concern. +Never hard-delete a summarized message. 
+ +### One estimator + +Token counting lives in exactly one function over one neutral structure +(`CountUnit[]`). Tier 1 (UIMessages) and Tier 2 (ModelMessages) both normalize +into it, counting only **model-bound** parts (`text`, `tool-call`, +`tool-result`, `file`, `image`). UI-only parts (`reasoning`, `source`, +`step-start`, `data-*`) are excluded on both sides. Divergence between the tiers +is impossible by construction rather than monitored — a tier cannot fire on a +number the other never sees. + +### One durable writer + +All mutations of compaction state (`summaryWatermark`, `contextSummary`, +`compactionDirty`) go through a single compare-and-swap function keyed on a +`version` column. Concurrent runs on one chat (e.g. a trigger run and a user +run), and the interaction between compaction and history-edit invalidation, are +resolved by **version**, not by comparing watermark values — so a watermark that +moves _backward_ on an edit cannot be misread as "not yet advanced" and produce a +stale summary over mutated history. On a CAS conflict the loser re-reads the row: +if the winner already covered its prefix it **skips** (safe no-op); otherwise it +retries **once**, then skips with a contended warning. No recompute-loop, no +livelock. A covered-skip deliberately does **not** clear `compactionDirty`: a +concurrent invalidation also bumps the version but intentionally leaves dirty set +(it resets the summary without shrinking history), so clearing dirty on a skip +could drop a forced compaction the overflow demanded. Leaving it set is strictly +safe — worst case is one extra compaction next turn. + +### Recovery is the net + +A `400/413` context-overflow error is caught, the messages aggressively trimmed +in-memory (via the same Tier 2 adapter — no bespoke trimmer), and the call +retried **once**. 
Recovery never writes durable summary/watermark state directly — +it flags `compactionDirty` on detection (before the retry outcome), and the next +turn's Tier 1 does the durable compaction. Recovery stays on even when proactive +compaction is globally disabled; it is the last line of defense, not a risk +surface. + +## Mechanisms + +### Window resolution + +`resolveContextWindow(provider, modelId)` resolves per-model in order: manual +override (`provider.modelMeta`) → provider API auto-detect (Google / OpenRouter / +vLLM) → the community-maintained **litellm registry** JSON (covers +OpenAI/Anthropic/Bedrock, which don't expose it) → a conservative `8192` default. +We do **not** maintain our own context-window table. + +- **Key normalization.** Registry keys don't match `resolvedModelId` 1:1. Lookup + order: `exact → strip provider prefix → lowercase → alias map → family +heuristic → MISS`. The family heuristic uses boundary-safe separators + (`"-"`, `"."`, `":"`, `"/"`) so `gpt-4.5-preview` never resolves via a stale + `gpt-4` entry. Every MISS warns (it falls to default — must be visible). +- **Caching & eviction.** Results cache in-memory per provider+model with a TTL. + Editing a `modelMeta` override **immediately** evicts (`evict(providerId)`) in + the provider PUT handler — TTL is only a backstop. `source:"default"` results + use a short TTL (60 s) so a registry miss or transient API blip doesn't pin + 8192 for the full hour. API fetches use a 5 s timeout and single-flight + (`#inflight`) to avoid a cold-cache stampede. +- When the window is default/unknown the ring renders **neutral**, never a + guessed ramp. `maxOutputTokens` is resolved the same way (needed for the budget + math). + +### Token estimation + +Char/4 over **text parts only** (never char/4 a base64 image); a modality table +sizes non-text parts (`anthropic`/`openai`/`default` constants, dimensions from a +cheap PNG/JPEG header parse when bytes are in hand). 
Used **only on the first +turn** before any provider `usage` exists; every later turn uses the +provider-reported real `usage.inputTokens`. We accept first-turn imprecision +(guarded by a 1.15 cold-start margin and the recovery net) rather than ship a +per-provider tokenizer. The Tier 1 estimate runs **after** file inlining so the +payload counted is the real one. Where image `detail` is unset (the common case) +we assume `high` — **over-counting beats overflow**. A turn-2 divergence check is +a designed-in feedback hook: compare the cold-start estimate against the real +`usage.inputTokens` and warn when they diverge by >50%, to tune the image +constants over time (currently log-only). + +### Tier 1 — cross-turn compaction (durable) + +Runs in `prepareChatTurn` before a response, over durable history (UIMessages). + +- **Budget math.** Trigger and target are fractions of the **input** budget, not + the raw window: `inputBudget = window − maxOutputReserve − safetyReserve` + (safety = `reserveRatio × window`, default 0.05). `triggerTokens = 0.8 × +inputBudget`, `targetTokens = 0.5 × inputBudget`. Per-turn **overhead** (system + prompt + tool schemas + skill list) is counted toward the trigger and + subtracted from the effective target, since it consumes the same window but is + invisible to a message-only estimate. When `maxOutputTokens` is unknown the + output reserve falls back to `min(4096, 0.25 × window)`. +- **Trigger projection.** `projected = max(charBasedEstimate, lastInputTokens)` + where `lastInputTokens` is the prior turn's provider-reported + `usage.inputTokens` (threaded from the last assistant message's + `metadata.stats.contextTokens`). The cold-start ×1.15 margin applies only on + turn 1 when no provider baseline exists. Compact when `projected ≥ trigger`. +- **Hysteresis.** Compaction must reduce the conversation to `≤ targetTokens`, + well below the trigger, so it does **not** re-fire next turn. 
The trigger (0.8) + and target (0.5) ratios are deliberately distinct. Config is global/env-only + (no submitted schema to validate), so the runtime clamps `target → trigger × +0.9` when an operator sets `COMPACTION_TARGET_RATIO ≥ COMPACTION_TRIGGER_RATIO`. +- **Staged, cheap-first.** Stage 1 **prunes** the older prefix without a model + call (soft-trim bulky tool/RAG results to head+tail, then placeholder over + `minPrunableChars`); only if still above target does Stage 2 **summarize** the + prefix into one synthetic summary message. Tool-call/result pairs are atomic and + never split across the keep boundary. Output: `[system, summary, …kept recent]`. + A visible `context-compacted` event makes it fail-loud. +- **Summarizer model & map-reduce.** Summarize uses the task model + (`taskModelId`), falling back to the main model; same-provider only. When the + prefix exceeds the summarizer's own window it is chunked and map-reduced (a + large cold-start/imported history can't be sent whole). Summarization is + **incremental**: each turn only the messages _after_ the watermark are + summarized and folded into the existing summary, then the watermark advances. +- **Summary invalidation.** If a message at/below `summaryWatermark` is + edited/deleted/regenerated the summary is stale. The handler bumps version + + clears `contextSummary` + resets the watermark in one CAS write. Because the CAS + loser compares **version** (not watermark value), a compaction racing an + invalidation sees a conflict and re-reads the reset state — it can never write a + stale summary over mutated history. The invalidation compares the **un-inlined** + submission (file URLs match on both sides) with stable key ordering (jsonb is + re-ordered by Postgres), against the pre-overwrite DB snapshot loaded before the + sink overwrites the row. + +### Tier 2 — intra-turn compaction (in-memory) + +For a single heavy response (many tool/sub-agent calls) that bloats the window +mid-loop. 
Runs in the AI SDK `prepareStep` hook on both `streamText` and +`generateText`, over ModelMessages, summarizing old completed tool results while +keeping recent steps verbatim and preserving call/result pairing. Fires **only +when genuinely near the limit** (no per-step overhead on a small loop). **Not +persisted** — the SDK's canonical message list commits to history as normal, and +next turn Tier 1 folds it into the durable summary. One tier cannot cover both +cases: a single response can blow the window without any cross-turn growth (Tier +2's job), and durable history must be compacted before a turn starts (Tier 1's +job). + +### Recovery + +`isContextOverflowError` matches `APICallError` with status `400/413` and a +per-provider body regex (OpenAI/vLLM, Anthropic, Google, Bedrock — fixture-tested +matrix). The recovery middleware wraps the model in **both** `streamText` and +`generateText`, so every step of a tool loop gets detect → flag `compactionDirty` +(persisted on detection, via the durable writer) → trim via the **same Tier 2 +adapter** (system head pinned, keep-recent halved with a floor of 2, forced past +the estimate gate since the provider already rejected the prompt) → retry once. A +second failure surfaces "Conversation too large — start a new chat". Durable +compaction happens on the **next** `prepareChatTurn`, which sees the dirty flag. +Headless runs (no chat row) still get the in-memory trim + retry, but cannot flag +`compactionDirty` — there is nothing to persist to. + +### Sub-agents + +Sub-agents start fresh each invocation (only a `task` string, no cross-turn +history), so they have nothing for Tier 1 to compact — they use **Tier 2 only**. +Each resolves its own model's window/output and passes it through; recovery covers +them too because `agent-runner` is shared. 
+ +### Config & kill switch + +Compaction behavior is **global** (`DEFAULT_COMPACTION_CONFIG`); only window/output +**size** is per-model (via §Window resolution / `provider.modelMeta`). Per-agent +tuning was shipped and then removed — no surveyed tool (Hermes/Codex/Claude +Code/Cline) exposes per-agent knobs, and the ratios self-normalize to the model +window, so per-agent variance bought nothing measurable. The env +`COMPACTION_ENABLED` (default true) disables **all proactive** compaction (Tier 1 and +Tier 2) in prod without a deploy; **recovery ignores it**. A single-agent +opt-out, if ever needed, would be per-model or per-workspace — not per-agent. + +### Context-usage ring + +The frontend shows a small SVG ring next to the model selector, fill = +`usedTokens / contextWindow`, ramping green → amber (≥0.7) → red (≥0.9), and +**neutral grey when the window is unknown/default**. The window comes from the +**currently selected model** (not the last assistant message's metadata, else it +shows the previous model's window after a switch); the numerator is the last +response's `contextTokens` (the **last step's** `usage.inputTokens` — peak context +fullness, not the run-wide sum, which would over-count multi-step loops). A +required tooltip states the ring reflects the last response, not the unsent +composer input. + +### Per-message stats + +An `(i)` action under each assistant response shows input/output tokens, TTFT, and +total generation time, reusing the existing tool-call timing mechanism. Stats are +stamped on `message.metadata.stats` at the `applyToolCompletions` point (the +`messageMetadata` callback fires at message start, before timing/usage exist). +TTFT/total are server-measured; cost figures use the run-wide token sums. 
+ +### Force-compact on demand + +The ring is clickable: `POST /chats/:id/compact` runs Tier 1 once **regardless of +threshold** (force), persists via the durable writer, and returns the post-compact +usage so the ring refreshes immediately. If a response is streaming the click is +**deferred** (pending badge, disabled, fires on finish); a confirm dialog appears +only when the drop is significant — `messagesDropped > keepRecentMessages` **or** +an estimated reduction `> 30%` of history — below that it runs immediately. Per +§View-not-delete this is not destructive regardless. + +### Compaction trace in the timeline + +Compaction is surfaced as a synthetic `compact_context` tool-call + tool-result +pair, reusing the existing tool-call UI (active spinner → collapsed expander with +a summary excerpt). The Tier 1 path injects the pair into the response stream; the +force-compact path (no live stream) persists a standalone synthetic assistant +message **above** the watermark. The trace part is **stripped before +`convertToModelMessages`** at both call sites so it never replays to the provider +as a phantom tool call; a trace-only message is dropped entirely. The trace is +emitted only when an actual model summary ran (not for prune-only or +dirty-within-target no-ops). + +### Stage 0 — context editing (prune, don't summarize) + +A deterministic, no-model-call pass that runs as **Stage 0 inside +`applyTier1Compaction`, before the trigger decision**, replacing the `output` of +**old bulky** tool results with a self-describing placeholder (names the tool + +elided size; tells the model to re-call). This mirrors Anthropic's +`clear_tool_uses` context editing. 
It keeps the tool-call block (pairing stays +valid), prunes by **recency count** of tool results (`keepRecentToolResults`, +default 4) above a **size gate** (`minEditableToolChars`, default 50 000), exempts +the newest message, and is idempotent + grow-guarded (never re-elides a +placeholder, never inflates a result smaller than the placeholder). Running it +before the trigger lets a lean view **avoid** summarization entirely (cheaper). +It needs **no durable state, no CAS, no version bump** — it is recomputed from raw +messages each turn, a sibling of the trace-stripping transform. Accepted fidelity +loss: an elided placeholder also flows into any prefix Stage 2 later summarizes +(a huge dump's head+tail is poor summary fodder anyway; raw stays in the DB). + +### Hard window wall (recent-trim gate) + +Missing the soft `targetTokens` is cheap (a hysteresis goal). The hard wall is +`inputBudget` (window − output reserve − safety). Recent (kept) messages are +trimmed **only** when `estimate(recent) + summary > inputBudget` (the call would +actually overflow), not on the soft-target miss — below the wall `recent` stays +full-fidelity and simply re-compacts next turn. The single newest message is +always exempt. A single result too large to fit _even as the newest_ is the +unsolved ingestion-cap case (an over-large dump as the last message will +hard-error) — out of scope here, would need an ingestion cap at storage time. + +### Summarizer hardening + +Tier-1 `summarize()` is a long blocking call inside `prepareChatTurn`, before the +response stream opens, and does not bump the per-step stall timer — the 120 s +watchdog once killed a slow summarize mid-call. 
Hardening: a **heartbeat** pings +`onActivity` (~every 10 s) so the watchdog keeps resetting; a `maxOutputTokens` +**ceiling** (≈4 000) backstops a degenerate runaway expansion (`finishReason === +"length"` is logged); the summarize call is **cancellable** (`abortSignal`); and a +**structured handoff prompt** (intent · decisions/facts · files/tools · current +state) with an explicit length target reduces loss across repeated +re-compactions. + +## Considered Options + +- **Single-tier compaction (cross-turn only)** — rejected. Cannot rescue a single + response whose own tool loop overflows the window. +- **One estimator per tier** — rejected. Two estimators over two message shapes + drift; collapsed to one estimator + two adapters. +- **Hard-delete / truncate old messages** — rejected. It is irreversible, and it + reproduces the "drops the middle silently" failure mode seen in gateway + truncation. View-not-delete keeps the data and makes the action auditable. +- **Homegrown context-window lookup table** — rejected. Unmaintainable across + providers; the litellm registry is the industry "don't maintain your own table" + answer. +- **A real pre-call tokenizer** — rejected for v1. A heavy per-provider dependency + for a number the provider returns accurately after the first call. +- **Optimistic concurrency by comparing watermark values** — rejected. Breaks when + invalidation moves the watermark backward; versioned CAS removes the + monotonicity assumption. +- **Compacting to the trigger threshold** — rejected; it re-fires every turn (the + thrash failure). Trigger and target ratios are distinct for hysteresis. +- **Per-agent compaction tuning** — shipped then removed; no real tool exposes it + and the ratios self-normalize to the model window. +- **Summarize-only (no context editing)** — insufficient. Bulky kept tool results + dominate `tokensAfter`; deterministic prune-not-summarize is cheaper and lossless + to the DB. 
+- **A token FLOOR on the trigger** (`max(window × pct, 64000)`) — rejected as an + anti-pattern. It overflows sub-64k models (the trigger never fires). A floor + belongs only on the _window fallback_ (`detected ?? DEFAULT`), never as + `max(detected, FLOOR)`. +- **Sizing the window from litellm `max_tokens`** — rejected. Only + `max_input_tokens` is trusted; `max_tokens` is the output cap, not the context + window, and conflating them mis-sizes the budget. +- **A selectable compaction model in the provider UI** — rejected. Compaction + already runs through `taskModelId` on the chat provider's own client + (same-provider only); on a single-model provider a dropdown is a no-op, and + `workspace.taskModelProviderId` (which routes _other_ task work) deliberately + does not apply to compaction. Not worth the multi-provider wiring now. + +## Open / deferred decisions + +Consciously deferred, with rationale — recorded so the _why-not-yet_ isn't lost: + +- **CAS-contention optimization (the per-turn full-history read + stringify)** is + left unoptimized; the full-prefix compare is already correct, so this is gated on + the `cas.conflict` metric actually showing waste before it's touched. +- **The estimate-vs-real divergence metric** (see §Token estimation) stays log-only + until the image-constant tuning work is picked up. +- **Live `Pending → Running` compaction trace** is deferred; the trace renders + post-hoc "Completed" only. Run/connection decoupling already neutralizes the + data-loss vector, so this is a liveness/UX gap, not correctness. +- **Also deferred:** a message-count force-compact valve (a count-based backstop + independent of the token estimate), a projected-input arc on the ring, persisting + Tier 2 output, model-aware trim aggressiveness, and Anthropic's `count_tokens` for + exact Claude counts — none are needed for the "no more hard fails" goal. 
+- **Latent invariant — content-type tool-result media.** The `content`-variant + tool result currently serializes media bytes into the char/4 text blob. Fixing it + must be **symmetric** across both adapters (extract media into `nonText` on the UI + _and_ Model side) or it breaks the §One estimator equality — the load-bearing + invariant. No current tool emits content-type media; fix it **before** the first + one does, not after. + +## Consequences + +- **Schema additions.** `provider.modelMeta` (JSONB, per-model window/output + overrides); chat/run gain `contextSummary`, `summaryWatermark`, + `compactionDirty`, `version`. All additive, nullable/defaulted. +- **Lazy rollout, no backfill.** Existing chats compact only on their next turn; no + eager backfill job (it would create a thundering herd of summarize calls). +- **A summarize call costs money and latency.** Stage 0 (context editing) and Stage + 1 (prune) run first without a model call; Stage 2 summarizes only when needed. +- **First-turn token estimates are imprecise** (image-heavy/CJK/JSON); the recovery + net absorbs the misses. +- **Cross-tenant safety.** The submit route verifies the body `id` belongs to the + caller's workspace before a run starts — the compaction store is keyed by chat id + only, so an unvalidated id would otherwise let one workspace mutate another's + summary/watermark. +- **A global `COMPACTION_ENABLED` kill switch** disables proactive compaction in + prod without a deploy; recovery is unaffected. +- **Observability is part of the contract** — emitted as structured `metric:`-tagged + log lines: `compaction.fired`, `summarize.latency_ms`, `recovery.*`, + `context_window.fell_to_default`, `litellm.key_miss`, `cas.conflict`, + `context_edited`. +- **Frontend gains a context-usage ring** (window from the selected model, neutral + when unknown), a per-message stats popover, and a `compact_context` timeline + trace, all reusing the existing tool-call timing/rendering mechanism. 
+- **Unsolved: the single oversized newest result.** A tool result too large to fit + even as the newest message hard-errors; the fix is an ingestion cap at storage + time, out of scope here. diff --git a/packages/schemas/index.test.ts b/packages/schemas/index.test.ts index c8a3b881..c4915856 100644 --- a/packages/schemas/index.test.ts +++ b/packages/schemas/index.test.ts @@ -13,7 +13,10 @@ import { sandboxEnvSchema, SANDBOX_ENV_MAX_ENTRIES, SANDBOX_ENV_MAX_VALUE_BYTES, + providerSchema, + providerUpdateSchema, providerCreateSchema, + chatSchema, } from "./index"; describe("Organization Schema", () => { @@ -286,6 +289,106 @@ describe("Agent Schema", () => { }); }); +describe("Provider modelMeta (context-compaction §A)", () => { + const base = { + id: "prov-1", + workspaceId: "ws-1", + name: "My Provider", + providerType: "OpenAI" as const, + apiKey: "sk-x", + modelIds: ["gpt-4o"], + taskModelId: "gpt-4o", + memoryExtractionModelId: "gpt-4o", + createdAt: new Date(), + updatedAt: new Date(), + }; + + it("is valid with modelMeta omitted (additive, optional)", () => { + expect(providerSchema.safeParse(base).success).toBe(true); + }); + + it("accepts per-model contextWindow / maxOutputTokens overrides", () => { + const result = providerSchema.safeParse({ + ...base, + modelMeta: { + "gpt-4o": { contextWindow: 128000, maxOutputTokens: 16384 }, + "o1-mini": { contextWindow: 200000 }, + }, + }); + expect(result.success).toBe(true); + }); + + it("rejects a non-positive contextWindow", () => { + const result = providerSchema.safeParse({ + ...base, + modelMeta: { "gpt-4o": { contextWindow: 0 } }, + }); + expect(result.success).toBe(false); + }); + + it("rejects a non-integer window", () => { + const result = providerSchema.safeParse({ + ...base, + modelMeta: { "gpt-4o": { contextWindow: 1.5 } }, + }); + expect(result.success).toBe(false); + }); + + it("carries modelMeta through the update schema", () => { + const result = providerUpdateSchema.safeParse({ + name: "My 
Provider", + providerType: "OpenAI", + apiKey: "sk-x", + modelIds: ["gpt-4o"], + taskModelId: "gpt-4o", + memoryExtractionModelId: "gpt-4o", + modelMeta: { "gpt-4o": { contextWindow: 128000 } }, + }); + expect(result.success).toBe(true); + }); +}); + +describe("Chat compaction state (context-compaction §C)", () => { + const base = { + id: "chat-1", + workspaceId: "ws-1", + title: "My Chat Title", + status: "succeeded" as const, + isPinned: false, + createdAt: new Date(), + updatedAt: new Date(), + }; + + it("is valid with compaction fields omitted (existing rows)", () => { + expect(chatSchema.safeParse(base).success).toBe(true); + }); + + it("accepts a populated summary + watermark + version", () => { + const result = chatSchema.safeParse({ + ...base, + contextSummary: "Summary of earlier turns.", + summaryWatermark: "msg-42", + compactionDirty: true, + version: 3, + }); + expect(result.success).toBe(true); + }); + + it("accepts an explicitly null summary / watermark", () => { + const result = chatSchema.safeParse({ + ...base, + contextSummary: null, + summaryWatermark: null, + }); + expect(result.success).toBe(true); + }); + + it("rejects a non-integer version", () => { + const result = chatSchema.safeParse({ ...base, version: 1.5 }); + expect(result.success).toBe(false); + }); +}); + describe("Provider Create Schema", () => { const baseProvider = { organizationId: "org-123", diff --git a/packages/schemas/index.ts b/packages/schemas/index.ts index 2086beec..ccfe643b 100644 --- a/packages/schemas/index.ts +++ b/packages/schemas/index.ts @@ -101,6 +101,13 @@ export const chatSchema = z.object({ seed: z.number().optional(), presencePenalty: z.number().optional(), frequencyPenalty: z.number().optional(), + // Context-compaction state (docs/adr/0012). Server-managed; intentionally NOT + // part of chatSubmit/chatUpdate. summaryWatermark is the message id of the + // last summarized message (P1: a view over history, never a delete). 
+ contextSummary: z.string().nullable().optional(), + summaryWatermark: z.string().nullable().optional(), + compactionDirty: z.boolean().optional(), + version: z.number().int().optional(), createdAt: z.date(), updatedAt: z.date(), }); @@ -553,6 +560,19 @@ export const providerApiModeSchema = z.enum(["chat", "responses"]); export type ProviderApiMode = z.infer<typeof providerApiModeSchema>; +// Per-model context-window / output overrides (ADR-0012 §Window resolution). +// Keyed by model id; both fields optional so an override can set just one. +export const modelMetaEntrySchema = z.object({ + contextWindow: z.number().int().positive().optional(), + maxOutputTokens: z.number().int().positive().optional(), +}); + +export type ModelMetaEntry = z.infer<typeof modelMetaEntrySchema>; + +export const modelMetaSchema = z.record(z.string(), modelMetaEntrySchema); + +export type ModelMeta = z.infer<typeof modelMetaSchema>; + const providerBaseSchema = z.object({ id: z.string(), organizationId: z.string().optional(), @@ -592,6 +612,7 @@ const providerBaseSchema = z.object({ .max(4096) .nullable() .optional(), + modelMeta: modelMetaSchema.optional(), createdAt: z.date(), updatedAt: z.date(), }); @@ -643,6 +664,7 @@ export const providerCreateSchema = providerBaseSchema.pick({ memoryExtractionModelId: true, embeddingModelId: true, embeddingDimensions: true, + modelMeta: true, }); // Sandbox @@ -778,6 +800,7 @@ export const providerUpdateSchema = providerBaseSchema.pick({ memoryExtractionModelId: true, embeddingModelId: true, embeddingDimensions: true, + modelMeta: true, }); export type ProviderUpdateData = z.infer<typeof providerUpdateSchema>; @@ -1527,3 +1550,23 @@ export const dashboardUpdateSchema = z.object({ desktopLayout: z.array(rglLayoutItemSchema).optional(), mobileLayout: z.array(rglLayoutItemSchema).optional(), }); + +// Message stats (ADR-0012 §Context-usage ring / §Per-message stats) +// Stamped on the last assistant message's metadata.stats after each stream run. +// Used by the frontend context-usage ring and per-message stats popover. 
+ +export const messageStatsSchema = z.object({ + // Run-wide totals across every step (sum) — for the cost popover (§Per-message stats). + inputTokens: z.number().nonnegative(), + outputTokens: z.number().nonnegative(), + // Input tokens of the LAST model call = peak context fullness — for the + // §Context-usage ring. NOT the run-wide sum (which over-counts on multi-step tool loops). + contextTokens: z.number().nonnegative(), + startedAt: z.string(), + firstTokenAt: z.string().optional(), + finishedAt: z.string(), + contextWindow: z.number().positive(), + contextWindowIsDefault: z.boolean(), +}); + +export type MessageStats = z.infer<typeof messageStatsSchema>;