diff --git a/cli/DEVELOPMENT.md b/cli/DEVELOPMENT.md index f5db2895..c73852aa 100644 --- a/cli/DEVELOPMENT.md +++ b/cli/DEVELOPMENT.md @@ -1,11 +1,11 @@ -# Dirac CLI +# ISAAC CLI -The official CLI for Dirac. Run Dirac tasks directly from the terminal with the same underlying functionality as the VS Code extension. +The official CLI for ISAAC. Run ISAAC tasks directly from the terminal with the same underlying functionality as the VS Code extension. ## Features - **Reuses Core Codebase**: Shares the same Controller, Task, and API handling as the VS Code extension -- **Terminal Output**: Displays Dirac messages directly in your terminal with colored output +- **Terminal Output**: Displays ISAAC messages directly in your terminal with colored output - **Task History**: Access your task history from the command line - **Configurable**: Use custom configuration directories and working directories - **Image Support**: Attach images to your prompts using file paths or inline references @@ -14,7 +14,7 @@ The official CLI for Dirac. Run Dirac tasks directly from the terminal with the - Node.js 20.x or later - npm or yarn -- The parent Dirac project dependencies installed +- The parent ISAAC project dependencies installed ## Installation @@ -43,17 +43,17 @@ npm run cli:link ### Interactive Mode (Default) -When you run `dirac` without any command, it launches an interactive welcome prompt: +When you run `isaac` without any command, it launches an interactive welcome prompt: ```bash # Launch interactive mode -dirac +isaac # Or run a task directly -dirac "Create a hello world function in Python" +isaac "Create a hello world function in Python" # With options -dirac -v --thinking "Analyze this codebase" +isaac -v --thinking "Analyze this codebase" ``` ### Commands @@ -63,8 +63,8 @@ dirac -v --thinking "Analyze this codebase" Run a new task with a prompt. ```bash -dirac task "Create a hello world function in Python" -dirac t "Create a hello world function" +isaac task "Create a hello world function in Python" +isaac t "Create a hello world function" ``` **Options:** @@ -78,29 +78,29 @@ dirac t "Create a hello world function" | `-i, --images ` | Image file paths to include with the task | | `-v, --verbose` | Show verbose output including reasoning | | `-c, --cwd ` | Working directory for the task | -| `--config ` | Path to Dirac configuration directory | +| `--config ` | Path to ISAAC configuration directory | | `-t, --thinking` | Enable extended thinking (1024 token budget) | **Examples:** ```bash # Run in plan mode with verbose output -dirac task -p -v "Design a REST API" +isaac task -p -v "Design a REST API" # Use a specific model with yolo mode -dirac task -m claude-sonnet-4-5-20250929 -y "Refactor this function" +isaac task -m claude-sonnet-4-5-20250929 -y "Refactor this function" # Include images with your prompt -dirac task -i screenshot.png diagram.jpg "Fix the UI based on these images" +isaac task -i screenshot.png diagram.jpg "Fix the UI based on these images" # Or use inline image references in the prompt -dirac task "Fix the layout shown in @./screenshot.png" +isaac task "Fix the layout shown in @./screenshot.png" # Enable extended thinking for complex tasks -dirac task -t "Architect a microservices system" +isaac task -t "Architect a microservices system" # Specify working directory -dirac task -c /path/to/project "Add unit tests" +isaac task -c /path/to/project "Add unit tests" ``` #### `history` (alias: `h`) @@ -108,8 +108,8 @@ dirac task -c /path/to/project "Add unit tests" List task history with pagination support. ```bash -dirac history -dirac h +isaac history +isaac h ``` **Options:** @@ -118,19 +118,19 @@ dirac h |--------|-------------| | `-n, --limit ` | Number of tasks to show (default: 10) | | `-p, --page ` | Page number, 1-based (default: 1) | -| `--config ` | Path to Dirac configuration directory | +| `--config ` | Path to ISAAC configuration directory | **Examples:** ```bash # Show last 10 tasks (default) -dirac history +isaac history # Show 20 tasks -dirac history -n 20 +isaac history -n 20 # Show page 2 with 5 tasks per page -dirac history -n 5 -p 2 +isaac history -n 5 -p 2 ``` #### `config` @@ -138,21 +138,21 @@ dirac history -n 5 -p 2 Show current configuration including global and workspace state. ```bash -dirac config +isaac config ``` **Options:** | Option | Description | |--------|-------------| -| `--config ` | Path to Dirac configuration directory | +| `--config ` | Path to ISAAC configuration directory | #### `auth` Authenticate a provider and configure what model is used. ```bash -dirac auth +isaac auth ``` **Options:** @@ -165,22 +165,22 @@ dirac auth | `-b, --baseurl ` | Base URL (optional, only for openai provider) | | `-v, --verbose` | Show verbose output | | `-c, --cwd ` | Working directory for the task | -| `--config ` | Path to Dirac configuration directory | +| `--config ` | Path to ISAAC configuration directory | **Examples:** ```bash # Interactive authentication -dirac auth +isaac auth # Quick setup with provider and API key -dirac auth -p anthropic -k sk-ant-xxxxx +isaac auth -p anthropic -k sk-ant-xxxxx # Full quick setup with model -dirac auth -p openai-native -k sk-xxxxx -m gpt-4o +isaac auth -p openai-native -k sk-xxxxx -m gpt-4o # OpenAI-compatible provider with custom base URL -dirac auth -p openai -k your-api-key -b https://api.example.com/v1 +isaac auth -p openai -k your-api-key -b https://api.example.com/v1 ``` ### Global Options @@ -191,7 +191,7 @@ These options are available for the default command (running a task directly): |--------|-------------| | `-v, --verbose` | Show verbose output | | `-c, --cwd ` | Working directory | -| `--config ` | Configuration directory | +| `--config ` | ISAAC configuration directory | | `--thinking` | Enable extended thinking (1024 token budget) | ## Development @@ -202,11 +202,11 @@ These options are available for the default command (running a task directly): # 1. Install all dependencies (root, webview-ui, cli) npm run install:all -# 2. Build and link globally so you can run `dirac` from anywhere +# 2. Build and link globally so you can run `isaac` from anywhere npm run cli:link # 3. Test it -dirac --help +isaac --help ``` ### Scripts @@ -218,8 +218,8 @@ Run these from the repository root: | `npm run install:all` | Install deps for root, webview-ui, and cli | | `npm run cli:build` | Generate protos and build CLI | | `npm run cli:build:production` | Production build (minified) | -| `npm run cli:link` | Build and `npm link` so you can run `dirac` from anywhere | -| `npm run cli:unlink` | Remove the global `dirac` symlink | +| `npm run cli:link` | Build and `npm link` so you can run `isaac` from anywhere | +| `npm run cli:unlink` | Remove the global `isaac` symlink | | `npm run cli:dev` | Link + watch mode for development | | `npm run cli:watch` | Watch mode only (no initial build) | | `npm run cli:test` | Run CLI tests | @@ -229,7 +229,7 @@ Run these from the repository root: 1. Run `npm run cli:dev` - this links the CLI globally and starts watch mode 2. Make changes to files in `cli/src/` 3. The build automatically rebuilds on save -4. Test your changes by running `dirac` in another terminal +4. Test your changes by running `isaac` in another terminal 5. When done, run `npm run cli:unlink` to clean up ### Proto Generation @@ -352,7 +352,7 @@ npm run protos npm run cli:build ``` -### "command not found: dirac" +### "command not found: isaac" The CLI isn't linked globally. Run: diff --git a/cli/src/context/StdinContext.test.tsx b/cli/src/context/StdinContext.test.tsx index afaed618..7ed96273 100644 --- a/cli/src/context/StdinContext.test.tsx +++ b/cli/src/context/StdinContext.test.tsx @@ -1,5 +1,6 @@ import { Text } from "ink" import { render } from "ink-testing-library" +// biome-ignore lint/correctness/noUnusedImports: required in scope for the classic JSX runtime (tsconfig jsx: "react") import React from "react" import { afterEach, beforeEach, describe, expect, it, vi } from "vitest" import { StdinProvider } from "./StdinContext" diff --git a/cli/src/context/StdinContext.tsx b/cli/src/context/StdinContext.tsx index 4c0757c7..ba131cc1 100644 --- a/cli/src/context/StdinContext.tsx +++ b/cli/src/context/StdinContext.tsx @@ -1,7 +1,7 @@ /** * Context for tracking stdin raw mode support * Used to conditionally disable input handling when stdin doesn't support raw mode - * (e.g., when input is piped: echo "..." | diracdev) + * (e.g., when input is piped: echo "..." | isaac) */ import { useStdin } from "ink" diff --git a/cli/src/exports.ts b/cli/src/exports.ts index 739ef164..6f4ed71f 100644 --- a/cli/src/exports.ts +++ b/cli/src/exports.ts @@ -1,18 +1,18 @@ /** - * Dirac Library Exports + * ISAAC Library Exports * - * This file exports the public API for programmatic use of Dirac. - * Use these classes and types to embed Dirac into your applications. + * This file exports the public API for programmatic use of ISAAC. + * Use these classes and types to embed ISAAC into your applications. * * @example * ```typescript - * import { DiracAgent } from "dirac" + * import { DiracAgent } from "isaac-cli" * * const agent = new DiracAgent() * await agent.initialize({ clientCapabilities: {} }) * const session = await agent.newSession({ cwd: process.cwd() }) * ``` - * @module dirac + * @module isaac-cli */ export { DiracAgent } from "./agent/DiracAgent.js" diff --git a/cli/src/index.test.ts b/cli/src/index.test.ts index 3a110728..95eb4a0d 100644 --- a/cli/src/index.test.ts +++ b/cli/src/index.test.ts @@ -14,7 +14,7 @@ describe("CLI Commands", () => { beforeEach(() => { // Create a fresh program instance for each test program = new Command() - program.name("dirac").description("Dirac CLI - AI coding assistant").version("0.0.0") + program.name("isaac").description("ISAAC — Intelligence Souveraine Ailiance Agent Codeur").version("0.0.0") program.enablePositionalOptions() // Define commands matching index.ts @@ -67,11 +67,6 @@ describe("CLI Commands", () => { .option("--config ", "Configuration directory") .action(() => {}) - program - .command("kanban") - .description("Run npx kanban --agent dirac") - .action(() => {}) - // Default command for interactive mode program .argument("[prompt]", "Task prompt") @@ -86,7 +81,6 @@ describe("CLI Commands", () => { .option("--auto-condense", "Enable AI-powered context compaction instead of mechanical truncation") .option("--hooks-dir ", "Additional hooks directory") .option("--auto-approve-all", "Enable auto-approve all") - .option("--kanban", "Run npx kanban --agent dirac") .action(() => {}) }) @@ -280,13 +274,6 @@ describe("CLI Commands", () => { }) }) - describe("kanban command", () => { - it("should parse kanban command", () => { - const args = ["node", "cli", "kanban"] - program.parse(args) - }) - }) - describe("auth command", () => { it("should parse auth command", () => { const args = ["node", "cli", "auth"] @@ -382,10 +369,6 @@ describe("CLI Commands", () => { expect(program.opts().autoApproveAll).toBe(true) }) - it("should parse --kanban flag", () => { - program.parse(["node", "cli", "--kanban"]) - expect(program.opts().kanban).toBe(true) - }) }) describe("command structure", () => { @@ -395,7 +378,6 @@ describe("CLI Commands", () => { expect(commandNames).toContain("history") expect(commandNames).toContain("config") expect(commandNames).toContain("auth") - expect(commandNames).toContain("kanban") }) it("should have correct aliases", () => { diff --git a/cli/src/init.ts b/cli/src/init.ts index 9359a867..16c4fba1 100644 --- a/cli/src/init.ts +++ b/cli/src/init.ts @@ -113,7 +113,7 @@ export async function initializeCli(options: InitOptions): Promise { const controller = webview.controller as any await telemetryService.captureExtensionActivated() - await telemetryService.captureHostEvent("dirac_cli", "initialized") + await telemetryService.captureHostEvent("isaac_cli", "initialized") // =============== Symbol Index Service =============== // Initialize symbol index for the project in background diff --git a/cli/src/utils/piped.ts b/cli/src/utils/piped.ts index bb56a79b..734ff218 100644 --- a/cli/src/utils/piped.ts +++ b/cli/src/utils/piped.ts @@ -4,9 +4,9 @@ import * as fs from "node:fs" * Read piped input from stdin (non-blocking) * * This function is designed to work with piped input, including chained commands: - * git diff | dirac 'explain' | dirac 'summarize' + * git diff | isaac 'explain' | isaac 'summarize' * - * The challenge is that when chaining dirac commands, the first command may take + * The challenge is that when chaining isaac commands, the first command may take * several seconds to complete, so we can't use a short timeout. Instead, we wait * for EOF which signals that the previous command has finished writing. */ diff --git a/cli/tests/tracing/JsonlTracer.test.ts b/cli/tests/tracing/JsonlTracer.test.ts index b0c6b7f4..dd4e072c 100644 --- a/cli/tests/tracing/JsonlTracer.test.ts +++ b/cli/tests/tracing/JsonlTracer.test.ts @@ -339,4 +339,38 @@ describe("JsonlTracer", () => { tracer.appendTurn({ phase: "execute" }) tracer.close("aborted", 1) }) + + it("scrubs inline credentials from gateway_url before writing meta.json", () => { + const tracer = new JsonlTracer("task-metascrub", tmpDir) + tracer.writeMeta({ + task: "task-metascrub", + mode: "act", + approval_mode: "manual", + ailiance_agent_version: "0.1.0", + gateway_url: "http://admin:hunter2@studio:9300", + }) + const raw = fs.readFileSync(path.join(tmpDir, TRACING_DIR_NAME, "task-metascrub", "meta.json"), "utf8") + expect(raw).not.toContain("hunter2") + expect(raw).toContain("[REDACTED]") + // in-memory meta stays intact for later merges (only the on-disk copy is scrubbed) + tracer.mergeStats({ turns: 1 }) + const after = JSON.parse(fs.readFileSync(path.join(tmpDir, TRACING_DIR_NAME, "task-metascrub", "meta.json"), "utf8")) + expect(after.stats.turns).toBe(1) + }) + + it("scrubs secrets that surface in error strings", () => { + const tracer = new JsonlTracer("task-errscrub", tmpDir) + tracer.writeMeta({ + task: "task-errscrub", + mode: "act", + approval_mode: "manual", + ailiance_agent_version: "0.1.0", + gateway_url: "http://studio:9300", + }) + tracer.recordPlannerTurn("nope", 3, ["auth failed for token=supersecretvalue"]) + const line = JSON.parse( + fs.readFileSync(path.join(tmpDir, TRACING_DIR_NAME, "task-errscrub", "trace.jsonl"), "utf8").trim().split("\n")[0], + ) + expect(JSON.stringify(line.errors)).not.toContain("supersecretvalue") + }) }) diff --git a/src/core/api/providers/__tests__/fireworks.test.ts b/src/core/api/providers/__tests__/fireworks.test.ts index 8ce4df56..40879b62 100644 --- a/src/core/api/providers/__tests__/fireworks.test.ts +++ b/src/core/api/providers/__tests__/fireworks.test.ts @@ -49,6 +49,7 @@ describe("FireworksHandler", () => { outputTokens: 4, cacheReadTokens: 0, cacheWriteTokens: 0, + totalCost: 0.00003405, }, ]) }) diff --git a/src/core/api/providers/__tests__/openrouter.test.ts b/src/core/api/providers/__tests__/openrouter.test.ts index 588612a6..ea3c7901 100644 --- a/src/core/api/providers/__tests__/openrouter.test.ts +++ b/src/core/api/providers/__tests__/openrouter.test.ts @@ -55,7 +55,7 @@ describe("OpenRouterHandler", () => { cacheReadTokens: 0, inputTokens: 13, outputTokens: 5, - totalCost: 0, + totalCost: 0.000114, }, ]) }) diff --git a/src/core/mcp/McpClientManager.ts b/src/core/mcp/McpClientManager.ts index bd09ed15..fbccc367 100644 --- a/src/core/mcp/McpClientManager.ts +++ b/src/core/mcp/McpClientManager.ts @@ -10,6 +10,7 @@ import { Logger } from "../../shared/services/Logger" import { loadMcpConfigsFromPlugins } from "./McpServerConfigLoader" import type { ConnectedClient, McpServerConfig, McpToolMetadata, McpToolResult } from "./types" import { makeQualifiedToolName } from "./types" +import { assertMcpUrlAllowed } from "./urlSecurity" export interface McpLoadFilter { enabledServers?: string[] @@ -118,6 +119,10 @@ class McpClientManager { let transport: Parameters[0] if (cfg.type === "http") { + // Defense-in-depth: the loader already enforces this, but re-check at + // connect time in case a config reached us through another path. + const verdict = assertMcpUrlAllowed(cfg.url) + if (!verdict.ok) throw new Error(`MCP server "${serverId}": ${verdict.reason}`) // Dynamic import so the streamableHttp module (and its ESM-only // eventsource-parser dep) is pulled in only when an HTTP server connects. // esbuild bundles it into the dist; the MCP unit tests run under vitest diff --git a/src/core/mcp/McpServerConfigLoader.ts b/src/core/mcp/McpServerConfigLoader.ts index 0b7fe5fd..b15e2040 100644 --- a/src/core/mcp/McpServerConfigLoader.ts +++ b/src/core/mcp/McpServerConfigLoader.ts @@ -5,6 +5,7 @@ import { Logger } from "@/shared/services/Logger" import { pluginDiscoveryService } from "../plugins/PluginDiscoveryService" import type { McpServerConfig } from "./types" +import { assertMcpUrlAllowed, shouldSendBearer } from "./urlSecurity" interface RawMcpJson { mcpServers?: Record< @@ -40,13 +41,24 @@ function serverTokenEnv(serverId: string): string { // `Authorization: Bearer ` from ISAAC_MCP__TOKEN when set and no // Authorization header was declared. The token is never persisted (the tool // cache stores only a config hash + the tool list). -function resolveHttpHeaders(serverId: string, raw?: Record): Record | undefined { +function resolveHttpHeaders(serverId: string, url: string, raw?: Record): Record | undefined { const headers: Record = {} for (const [k, v] of Object.entries(raw ?? {})) { headers[k] = expandEnvVars(v) } const token = process.env[serverTokenEnv(serverId)]?.trim() const hasAuth = Object.keys(headers).some((k) => k.toLowerCase() === "authorization") + // Token-gate: a bearer credential (injected from env OR declared in .mcp.json) + // must not leave the machine in cleartext. Fail closed — skip the server + // rather than silently leak the token to an insecure public endpoint. + if ((token && !hasAuth) || hasAuth) { + if (!shouldSendBearer(url)) { + throw new Error( + `refusing to send Authorization over insecure channel to ${url} ` + + `(use https, a private/loopback/Tailscale host, or add the host to ISAAC_MCP_ALLOW_HOSTS)`, + ) + } + } if (token && !hasAuth) { headers.Authorization = `Bearer ${token}` } @@ -86,15 +98,47 @@ export async function loadMcpConfigsFromPlugins(): Promise { const kind = server.type ?? "stdio" if (kind !== "stdio" && kind !== "http") continue - // Validate required fields per transport before claiming the id, so an - // invalid entry doesn't shadow a valid same-id server from a later plugin. - if (kind === "stdio" && !server.command) { - Logger.warn(`[mcp] Server "${serverId}" in plugin ${plugin.manifest.name} has no command, skipping`) - continue - } - if (kind === "http" && !server.url) { - Logger.warn(`[mcp] HTTP server "${serverId}" in plugin ${plugin.manifest.name} has no url, skipping`) - continue + // Validate + resolve per transport BEFORE claiming the id, so an invalid + // or security-rejected entry doesn't shadow a valid same-id server from a + // later plugin (mirrors the dedupe-by-id contract below). + const pluginRoot = plugin.rootDir + let resolved: McpServerConfig | null = null + + if (kind === "stdio") { + if (!server.command) { + Logger.warn(`[mcp] Server "${serverId}" in plugin ${plugin.manifest.name} has no command, skipping`) + continue + } + resolved = { + id: serverId, + pluginName: plugin.manifest.name, + pluginRoot, + type: "stdio", + command: expandPluginRoot(server.command, pluginRoot), + args: (server.args ?? []).map((a) => expandPluginRoot(a, pluginRoot)), + } + } else { + if (!server.url) { + Logger.warn(`[mcp] HTTP server "${serverId}" in plugin ${plugin.manifest.name} has no url, skipping`) + continue + } + const url = expandEnvVars(server.url) + // SSRF guard: refuse cloud-metadata endpoints (unless allowlisted). + const verdict = assertMcpUrlAllowed(url) + if (!verdict.ok) { + Logger.warn(`[mcp] HTTP server "${serverId}" in plugin ${plugin.manifest.name}: ${verdict.reason}, skipping`) + continue + } + let headers: Record | undefined + try { + headers = resolveHttpHeaders(serverId, url, server.headers) + } catch (e) { + Logger.warn( + `[mcp] HTTP server "${serverId}" in plugin ${plugin.manifest.name}: ${(e as Error).message}, skipping`, + ) + continue + } + resolved = { id: serverId, pluginName: plugin.manifest.name, pluginRoot, type: "http", url, headers } } const dupOwner = seenServers.get(serverId) @@ -105,27 +149,7 @@ export async function loadMcpConfigsFromPlugins(): Promise { continue } seenServers.set(serverId, plugin.manifest.name) - - const pluginRoot = plugin.rootDir - if (kind === "http") { - configs.push({ - id: serverId, - pluginName: plugin.manifest.name, - pluginRoot, - type: "http", - url: expandEnvVars(server.url!), - headers: resolveHttpHeaders(serverId, server.headers), - }) - } else { - configs.push({ - id: serverId, - pluginName: plugin.manifest.name, - pluginRoot, - type: "stdio", - command: expandPluginRoot(server.command!, pluginRoot), - args: (server.args ?? []).map((a) => expandPluginRoot(a, pluginRoot)), - }) - } + configs.push(resolved) } } diff --git a/src/core/mcp/__tests__/McpServerConfigLoader.test.ts b/src/core/mcp/__tests__/McpServerConfigLoader.test.ts index 34504cb1..3c8980f7 100644 --- a/src/core/mcp/__tests__/McpServerConfigLoader.test.ts +++ b/src/core/mcp/__tests__/McpServerConfigLoader.test.ts @@ -5,6 +5,7 @@ import path from "path" import { afterEach, beforeEach, describe, it } from "vitest" import type { DiscoveredPlugin } from "../../plugins/PluginDiscoveryService" +import type { McpStdioServerConfig } from "../types" // We test loadMcpConfigsFromPlugins by patching pluginDiscoveryService.discover() // to return fake plugins pointing at a tmpdir, without touching the real ~/.claude/plugins. @@ -83,8 +84,8 @@ describe("McpServerConfigLoader", () => { expect(result[0].id).to.equal("my-server") expect(result[0].pluginName).to.equal("plugin-with-mcp") expect(result[0].type).to.equal("stdio") - expect(result[0].command).to.equal("/usr/bin/node") - expect(result[0].args).to.deep.equal(["server.js", "--port", "3000"]) + expect((result[0] as McpStdioServerConfig).command).to.equal("/usr/bin/node") + expect((result[0] as McpStdioServerConfig).args).to.deep.equal(["server.js", "--port", "3000"]) ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] }) @@ -105,7 +106,7 @@ describe("McpServerConfigLoader", () => { const context7s = result.filter((c) => c.id === "context7") expect(context7s).to.have.length(1) expect(context7s[0].pluginName).to.equal("plugin-a") // first declarer wins - expect(context7s[0].args).to.deep.equal(["-y", "@upstash/context7-mcp@2.1.4"]) + expect((context7s[0] as McpStdioServerConfig).args).to.deep.equal(["-y", "@upstash/context7-mcp@2.1.4"]) ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] }) @@ -131,8 +132,8 @@ describe("McpServerConfigLoader", () => { const { loadMcpConfigsFromPlugins } = loaderModule const result = await loadMcpConfigsFromPlugins() expect(result).to.have.length(1) - expect(result[0].command).to.equal(`${fakePlugin.rootDir}/bin/server`) - expect(result[0].args).to.deep.equal([`--root`, `${fakePlugin.rootDir}/data`]) + expect((result[0] as McpStdioServerConfig).command).to.equal(`${fakePlugin.rootDir}/bin/server`) + expect((result[0] as McpStdioServerConfig).args).to.deep.equal([`--root`, `${fakePlugin.rootDir}/data`]) ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] }) @@ -245,4 +246,66 @@ describe("McpServerConfigLoader", () => { ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] } }) + + it("skips an http server pointing at a cloud-metadata endpoint (SSRF guard)", async () => { + const fakePlugin = await createFakePlugin(tmpDir, "owner", "plugin-ssrf", "1.0.0", { name: "plugin-ssrf" }) + await fs.writeFile( + path.join(fakePlugin.rootDir, ".mcp.json"), + JSON.stringify({ + mcpServers: { evil: { type: "http", url: "http://169.254.169.254/latest/meta-data/" } }, + }), + ) + ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [fakePlugin] + + const { loadMcpConfigsFromPlugins } = loaderModule + const result = await loadMcpConfigsFromPlugins() + expect(result).to.have.length(0) + + ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] + }) + + it("skips an http server when a bearer token would leak over cleartext to a public host (token-gate)", async () => { + const fakePlugin = await createFakePlugin(tmpDir, "owner", "plugin-tokengate", "1.0.0", { name: "plugin-tokengate" }) + await fs.writeFile( + path.join(fakePlugin.rootDir, ".mcp.json"), + JSON.stringify({ + mcpServers: { pub: { type: "http", url: "http://api.public.example/mcp" } }, + }), + ) + ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [fakePlugin] + + process.env.ISAAC_MCP_PUB_TOKEN = "secret-token" + try { + const { loadMcpConfigsFromPlugins } = loaderModule + const result = await loadMcpConfigsFromPlugins() + expect(result).to.have.length(0) + } finally { + delete process.env.ISAAC_MCP_PUB_TOKEN + ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] + } + }) + + it("injects the bearer token for a private host over http (token-gate allows)", async () => { + const fakePlugin = await createFakePlugin(tmpDir, "owner", "plugin-localtoken", "1.0.0", { name: "plugin-localtoken" }) + await fs.writeFile( + path.join(fakePlugin.rootDir, ".mcp.json"), + JSON.stringify({ + mcpServers: { loc: { type: "http", url: "http://127.0.0.1:8765/mcp" } }, + }), + ) + ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [fakePlugin] + + process.env.ISAAC_MCP_LOC_TOKEN = "local-token" + try { + const { loadMcpConfigsFromPlugins } = loaderModule + const result = await loadMcpConfigsFromPlugins() + expect(result).to.have.length(1) + expect((result[0] as { headers?: Record }).headers).to.deep.equal({ + Authorization: "Bearer local-token", + }) + } finally { + delete process.env.ISAAC_MCP_LOC_TOKEN + ;(pluginDiscoveryModule.pluginDiscoveryService as any).discover = async () => [] + } + }) }) diff --git a/src/core/mcp/__tests__/urlSecurity.test.ts b/src/core/mcp/__tests__/urlSecurity.test.ts new file mode 100644 index 00000000..32777f41 --- /dev/null +++ b/src/core/mcp/__tests__/urlSecurity.test.ts @@ -0,0 +1,104 @@ +import { expect } from "chai" +import { afterEach, describe, it } from "vitest" + +import { assertMcpUrlAllowed, isMetadataHost, shouldSendBearer } from "../urlSecurity" + +describe("urlSecurity", () => { + const savedAllow = process.env.ISAAC_MCP_ALLOW_HOSTS + + afterEach(() => { + if (savedAllow === undefined) delete process.env.ISAAC_MCP_ALLOW_HOSTS + else process.env.ISAAC_MCP_ALLOW_HOSTS = savedAllow + }) + + describe("assertMcpUrlAllowed (SSRF guard)", () => { + it("blocks known cloud-metadata endpoints", () => { + for (const u of [ + "http://169.254.169.254/latest/meta-data/", + "http://metadata.google.internal/computeMetadata/v1/", + "http://metadata.goog/", + "http://100.100.100.200/", + "http://[fd00:ec2::254]/", + ]) { + expect(assertMcpUrlAllowed(u).ok, u).to.equal(false) + } + }) + + it("allows public https, loopback and private hosts", () => { + for (const u of [ + "https://api.example.com/mcp", + "http://localhost:3000/mcp", + "http://127.0.0.1:8080/", + "http://10.0.0.5/mcp", + "http://192.168.1.10/mcp", + "http://100.64.0.1/mcp", // Tailscale CGNAT + ]) { + expect(assertMcpUrlAllowed(u).ok, u).to.equal(true) + } + }) + + it("blocks IPv4-mapped IPv6 and trailing-dot evasions of the metadata block", () => { + for (const u of [ + "http://[::ffff:169.254.169.254]/", // dotted IPv4-mapped (Node compresses) + "http://[::ffff:a9fe:a9fe]/", // hextet IPv4-mapped form of 169.254.169.254 + "http://[::ffff:6464:64c8]/", // 100.100.100.200 (Alibaba) mapped + "http://169.254.169.254./", // trailing FQDN dot + "http://metadata.google.internal./", + ]) { + expect(assertMcpUrlAllowed(u).ok, u).to.equal(false) + } + }) + + it("rejects invalid urls and non-http schemes", () => { + expect(assertMcpUrlAllowed("not a url").ok).to.equal(false) + expect(assertMcpUrlAllowed("ftp://example.com").ok).to.equal(false) + expect(assertMcpUrlAllowed("file:///etc/passwd").ok).to.equal(false) + }) + + it("honours ISAAC_MCP_ALLOW_HOSTS to override a metadata block", () => { + expect(assertMcpUrlAllowed("http://169.254.169.254/").ok).to.equal(false) + process.env.ISAAC_MCP_ALLOW_HOSTS = "169.254.169.254, other.host" + expect(assertMcpUrlAllowed("http://169.254.169.254/").ok).to.equal(true) + }) + }) + + describe("shouldSendBearer (token-gate)", () => { + it("allows token over https (any host)", () => { + expect(shouldSendBearer("https://api.example.com/mcp")).to.equal(true) + }) + + it("allows token over http only for private/loopback/CGNAT hosts", () => { + expect(shouldSendBearer("http://localhost:3000/")).to.equal(true) + expect(shouldSendBearer("http://127.0.0.1/")).to.equal(true) + expect(shouldSendBearer("http://10.1.2.3/")).to.equal(true) + expect(shouldSendBearer("http://192.168.0.2/")).to.equal(true) + expect(shouldSendBearer("http://172.16.0.1/")).to.equal(true) + expect(shouldSendBearer("http://100.96.0.7/")).to.equal(true) + }) + + it("recognises IPv4-mapped IPv6 private addresses as token-safe", () => { + expect(shouldSendBearer("http://[::ffff:192.168.1.1]/")).to.equal(true) + expect(shouldSendBearer("http://[::ffff:c0a8:0101]/")).to.equal(true) // hextet form of 192.168.1.1 + }) + + it("refuses token over http to public hosts", () => { + expect(shouldSendBearer("http://api.example.com/mcp")).to.equal(false) + expect(shouldSendBearer("http://8.8.8.8/")).to.equal(false) + // link-local is NOT auto-trusted (ambiguous with metadata) + expect(shouldSendBearer("http://169.254.1.1/")).to.equal(false) + }) + + it("honours ISAAC_MCP_ALLOW_HOSTS for an otherwise-unsafe host", () => { + expect(shouldSendBearer("http://gw.public.example/")).to.equal(false) + process.env.ISAAC_MCP_ALLOW_HOSTS = "gw.public.example" + expect(shouldSendBearer("http://gw.public.example/")).to.equal(true) + }) + }) + + describe("isMetadataHost", () => { + it("matches case-insensitively", () => { + expect(isMetadataHost("METADATA.GOOGLE.INTERNAL")).to.equal(true) + expect(isMetadataHost("example.com")).to.equal(false) + }) + }) +}) diff --git a/src/core/mcp/urlSecurity.ts b/src/core/mcp/urlSecurity.ts new file mode 100644 index 00000000..911e18e0 --- /dev/null +++ b/src/core/mcp/urlSecurity.ts @@ -0,0 +1,139 @@ +// --------------------------------------------------------------------------- +// MCP HTTP URL security +// +// Two independent guards for http(s) MCP servers whose URL + bearer token come +// from a plugin-supplied .mcp.json (an untrusted source): +// +// 1. SSRF guard (assertMcpUrlAllowed): refuse to connect to known cloud +// metadata endpoints (169.254.169.254 & friends). Loopback and private +// ranges stay allowed — local/LAN MCP servers are a legitimate, common +// setup. Hosts listed in ISAAC_MCP_ALLOW_HOSTS bypass the block. +// +// 2. Bearer token-gate (shouldSendBearer): only attach `Authorization: Bearer` +// when the channel is safe — https, OR a loopback / RFC1918 / CGNAT +// (Tailscale 100.64/10) host, OR an explicitly allowlisted host. A token +// must never leave the machine in cleartext to a public http:// endpoint. +// +// Limitation: checks operate on the URL's literal host. DNS rebinding (a public +// name resolving to a private IP at connect time) is out of scope for this +// .mcp.json threat model and documented as such. +// --------------------------------------------------------------------------- + +/** Cloud instance-metadata endpoints — connecting here is almost always SSRF. */ +const METADATA_HOSTS = new Set([ + "169.254.169.254", // AWS / Azure / GCP / OpenStack / DigitalOcean + "metadata.google.internal", // GCP + "metadata.goog", // GCP + "100.100.100.200", // Alibaba Cloud (note: inside CGNAT 100.64/10) + "fd00:ec2::254", // AWS IPv6 IMDS +]) + +/** + * url.hostname keeps the [..] brackets for IPv6 literals; strip them + lowercase. + * Also canonicalise two evasion forms so downstream checks see the real address: + * - trailing FQDN dot ("169.254.169.254." / "metadata.google.internal.") + * - IPv4-mapped IPv6 ("::ffff:a9fe:a9fe" or "::ffff:169.254.169.254") -> dotted + * IPv4, so both the metadata set and the private-range check apply. + */ +function normalizeHost(hostname: string): string { + let h = hostname.replace(/^\[/, "").replace(/\]$/, "").toLowerCase() + if (h.length > 1 && h.endsWith(".")) h = h.slice(0, -1) + const mappedHex = h.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/) + if (mappedHex) { + const hi = Number.parseInt(mappedHex[1], 16) + const lo = Number.parseInt(mappedHex[2], 16) + return `${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}` + } + const mappedDotted = h.match(/^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/) + if (mappedDotted) return mappedDotted[1] + return h +} + +function envAllowlist(): Set { + const raw = process.env.ISAAC_MCP_ALLOW_HOSTS + if (!raw) return new Set() + return new Set( + raw + .split(",") + .map((h) => h.trim().toLowerCase()) + .filter(Boolean), + ) +} + +function parseIpv4(host: string): number[] | null { + const parts = host.split(".") + if (parts.length !== 4) return null + const octets: number[] = [] + for (const p of parts) { + if (!/^\d{1,3}$/.test(p)) return null + const n = Number(p) + if (n > 255) return null + octets.push(n) + } + return octets +} + +/** Loopback / RFC1918 / link-local-ULA / CGNAT — "private enough" to send a token over http. */ +function isPrivateHost(hostname: string): boolean { + const host = hostname.toLowerCase() + if (host === "localhost" || host.endsWith(".localhost")) return true + if (host === "::1") return true + // IPv6 unique-local (fc00::/7 → fc.. / fd..). url.hostname strips [] brackets. + if (/^f[cd][0-9a-f]{0,2}:/.test(host)) return true + + const v4 = parseIpv4(host) + if (!v4) return false + const [a, b] = v4 + if (a === 127) return true // 127.0.0.0/8 loopback + if (a === 10) return true // 10.0.0.0/8 + if (a === 192 && b === 168) return true // 192.168.0.0/16 + if (a === 172 && b >= 16 && b <= 31) return true // 172.16.0.0/12 + // 100.64.0.0/10 CGNAT (Tailscale). NB: 100.100.100.200 (Alibaba metadata) + // falls in this range, but the SSRF guard checks isMetadataHost FIRST and + // blocks the connection outright, so it never reaches the token-gate here. + if (a === 100 && b >= 64 && b <= 127) return true + return false +} + +export function isMetadataHost(hostname: string): boolean { + return METADATA_HOSTS.has(normalizeHost(hostname)) +} + +/** Whether a bearer token may be attached for this URL (token-gate policy). */ +export function shouldSendBearer(rawUrl: string): boolean { + let url: URL + try { + url = new URL(rawUrl) + } catch { + return false + } + const host = normalizeHost(url.hostname) + if (envAllowlist().has(host)) return true + if (url.protocol === "https:") return true + if (url.protocol === "http:" && isPrivateHost(host)) return true + return false +} + +export type McpUrlVerdict = { ok: true } | { ok: false; reason: string } + +/** SSRF guard: block cloud-metadata endpoints unless explicitly allowlisted. */ +export function assertMcpUrlAllowed(rawUrl: string): McpUrlVerdict { + let url: URL + try { + url = new URL(rawUrl) + } catch { + return { ok: false, reason: `invalid url "${rawUrl}"` } + } + if (url.protocol !== "http:" && url.protocol !== "https:") { + return { ok: false, reason: `unsupported scheme "${url.protocol}" (only http/https)` } + } + const host = normalizeHost(url.hostname) + if (envAllowlist().has(host)) return { ok: true } + if (isMetadataHost(host)) { + return { + ok: false, + reason: `blocked cloud-metadata endpoint "${host}" (set ISAAC_MCP_ALLOW_HOSTS to override)`, + } + } + return { ok: true } +} diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-basic.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-basic.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-basic.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-basic.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-no-browser.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-no-browser.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-no-browser.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet-no-browser.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet.tools.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet.tools.snap index c602075a..eccecfe5 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet.tools.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/anthropic_claude_4_5_sonnet.tools.snap @@ -294,6 +294,30 @@ ] } }, + { + "name": "get_tool_result", + "description": "Retrieve the result of an asynchronous tool call previously launched by execute_command, search_files, or list_files (when recursive=true). Use this when one of those tools returned a placeholder of the form 'task_id: \\nstatus: running' instead of an immediate result. You may keep working on independent tasks in parallel — the async work runs in the background.", + "input_schema": { + "type": "object", + "properties": { + "task_id": { + "type": "string", + "description": "The task identifier (ULID) returned by the async tool." + }, + "wait": { + "type": "boolean", + "description": "If true (default), block until the task completes or timeout_ms expires. If false, return current status immediately." + }, + "timeout_ms": { + "type": "integer", + "description": "How long to wait for completion when wait=true. Default 60000ms, hard-capped at 300000ms (5 minutes)." + } + }, + "required": [ + "task_id" + ] + } + }, { "name": "find_symbol_references", "description": "Finds all exact AST references and invocations of one or more functions, classes, or variables across specified files or directories. Returns precise file paths.", diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-basic.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-basic.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-basic.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-basic.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-no-browser.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-no-browser.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-no-browser.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3-no-browser.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3.tools.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3.tools.snap index 9d33f7c4..fb479653 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3.tools.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/gemini_gemini_3.tools.snap @@ -294,6 +294,30 @@ ] } }, + { + "name": "get_tool_result", + "description": "Retrieve the result of an asynchronous tool call previously launched by execute_command, search_files, or list_files (when recursive=true). Use this when one of those tools returned a placeholder of the form 'task_id: \\nstatus: running' instead of an immediate result. You may keep working on independent tasks in parallel — the async work runs in the background.", + "parameters": { + "type": "OBJECT", + "properties": { + "task_id": { + "type": "STRING", + "description": "The task identifier (ULID) returned by the async tool." + }, + "wait": { + "type": "BOOLEAN", + "description": "If true (default), block until the task completes or timeout_ms expires. If false, return current status immediately." + }, + "timeout_ms": { + "type": "NUMBER", + "description": "How long to wait for completion when wait=true. Default 60000ms, hard-capped at 300000ms (5 minutes)." + } + }, + "required": [ + "task_id" + ] + } + }, { "name": "find_symbol_references", "description": "Finds all exact AST references and invocations of one or more functions, classes, or variables across specified files or directories. Returns precise file paths.", diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-basic.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-basic.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-basic.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-basic.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-no-browser.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-no-browser.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-no-browser.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5-no-browser.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5.tools.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5.tools.snap index 2f202f1b..b73ef101 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5.tools.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/openai_gpt_5.tools.snap @@ -3,8 +3,8 @@ "type": "function", "function": { "name": "ask_followup_question", - "description": "Asks the user a clarifying question when you encounter ambiguities or need more details.", "strict": false, + "description": "Asks the user a clarifying question when you encounter ambiguities or need more details.", "parameters": { "type": "object", "properties": { @@ -28,8 +28,8 @@ "type": "function", "function": { "name": "attempt_completion", - "description": "Presents a brief and informative summary of the final result. Keep it concise while covering important changes. Avoid redundant text.", "strict": false, + "description": "Presents a brief and informative summary of the final result. Keep it concise while covering important changes. Avoid redundant text.", "parameters": { "type": "object", "properties": { @@ -53,8 +53,8 @@ "type": "function", "function": { "name": "diagnostics_scan", - "description": "Runs diagnostics (linter and syntax checks) on the specified files and returns the results. This is useful for checking if recent changes introduced any errors or for getting a summary of existing problems in specific files.", "strict": false, + "description": "Runs diagnostics (linter and syntax checks) on the specified files and returns the results. This is useful for checking if recent changes introduced any errors or for getting a summary of existing problems in specific files.", "parameters": { "type": "object", "properties": { @@ -77,8 +77,8 @@ "type": "function", "function": { "name": "browser_action", - "description": "Request to interact with a Puppeteer-controlled browser. Every action, except `close`, will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action per message, and wait for the user's response including a screenshot and logs to determine the next action.\n- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL.\n- While the browser is active, only the `browser_action` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser. For example if you run into an error and need to fix a file, you must close the browser, then use other tools to make the necessary changes, then re-launch the browser to verify the result.\n- The browser window has a resolution of **1280x720** pixels. When performing any click actions, ensure the coordinates are within this resolution range.\n- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges.", "strict": false, + "description": "Request to interact with a Puppeteer-controlled browser. Every action, except `close`, will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action per message, and wait for the user's response including a screenshot and logs to determine the next action.\n- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL.\n- While the browser is active, only the `browser_action` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser. For example if you run into an error and need to fix a file, you must close the browser, then use other tools to make the necessary changes, then re-launch the browser to verify the result.\n- The browser window has a resolution of **1280x720** pixels. When performing any click actions, ensure the coordinates are within this resolution range.\n- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges.", "parameters": { "type": "object", "properties": { @@ -110,8 +110,8 @@ "type": "function", "function": { "name": "edit_file", - "description": "Edit one or more files by replacing, inserting after, or inserting before specific lines.\nRead the files of extract function first to get current anchors. Each file contains an array of edits.\n\nEDIT TYPES:\n1. replace (default): Replaces an inclusive range of lines from anchor to end_anchor.\n2. insert_after: Inserts the provided text immediately after the line specified by anchor. end_anchor is not used.\n3. insert_before: Inserts the provided text immediately before the line specified by anchor. end_anchor is not used.\n\nANCHOR RULES:\n1. Anchors are a single opaque word (e.g., \"AppleBanana\") and basically hashes that carry no meaning, followed by § which is followed by the actual line content.\n2. For 'replace', anchors are inclusive, meaning what you specify as anchor and end_anchor, the lines belonging to both and everything in between will be overwritten.\n3. Anchors are file scoped. \"Apple§\" in one file is different from \"Apple§\" in another file. \n\nWhen replacing multi-line statements, function calls, or dictionaries, you MUST ensure your end_anchor points to precisely the line where the construct ends (e.g. a closing bracket or end of function). Do not leave orphaned closing syntax on the following lines NOR do miss the closing syntax. \n\nTip: if you are stuck updating a single line, try to change it to 'replace' call with start line is few lines before the the target line and end line is a few lines after.\n\nBATCHING RULES:\nYou MUST batch all non-overlapping edits into a single tool call. As long as the edits do not overlap, our backend tooling guarantees safety. Multiple files can be edited in a single call also.", "strict": false, + "description": "Edit one or more files by replacing, inserting after, or inserting before specific lines.\nRead the files of extract function first to get current anchors. Each file contains an array of edits.\n\nEDIT TYPES:\n1. replace (default): Replaces an inclusive range of lines from anchor to end_anchor.\n2. insert_after: Inserts the provided text immediately after the line specified by anchor. end_anchor is not used.\n3. insert_before: Inserts the provided text immediately before the line specified by anchor. end_anchor is not used.\n\nANCHOR RULES:\n1. Anchors are a single opaque word (e.g., \"AppleBanana\") and basically hashes that carry no meaning, followed by § which is followed by the actual line content.\n2. For 'replace', anchors are inclusive, meaning what you specify as anchor and end_anchor, the lines belonging to both and everything in between will be overwritten.\n3. Anchors are file scoped. \"Apple§\" in one file is different from \"Apple§\" in another file. \n\nWhen replacing multi-line statements, function calls, or dictionaries, you MUST ensure your end_anchor points to precisely the line where the construct ends (e.g. a closing bracket or end of function). Do not leave orphaned closing syntax on the following lines NOR do miss the closing syntax. \n\nTip: if you are stuck updating a single line, try to change it to 'replace' call with start line is few lines before the the target line and end line is a few lines after.\n\nBATCHING RULES:\nYou MUST batch all non-overlapping edits into a single tool call. As long as the edits do not overlap, our backend tooling guarantees safety. Multiple files can be edited in a single call also.", "parameters": { "type": "object", "properties": { @@ -181,8 +181,8 @@ "type": "function", "function": { "name": "replace_symbol", - "description": "Replaces one or more symbols (functions, methods, or classes) in one or more files with new code. This is more robust and token-efficient than edit_file because it targets specific AST nodes directly. IMPORTANT: You MUST provide the complete and correct replacement for each symbol, including all its associated JSDoc, comments, decorators, and export keywords. The tool will replace the entire original range of the symbol and its metadata with your provided text.", "strict": false, + "description": "Replaces one or more symbols (functions, methods, or classes) in one or more files with new code. This is more robust and token-efficient than edit_file because it targets specific AST nodes directly. IMPORTANT: You MUST provide the complete and correct replacement for each symbol, including all its associated JSDoc, comments, decorators, and export keywords. The tool will replace the entire original range of the symbol and its metadata with your provided text.", "parameters": { "type": "object", "properties": { @@ -228,8 +228,8 @@ "type": "function", "function": { "name": "rename_symbol", - "description": "Renames ALL occurrences of a symbol (function, class, method, or variable) inside the specified files or directories. This tool can identify precise symbols using a language's AST and is more accurate than a simple search-and-replace because it understands the language structure. For renaming tasks, strongly prefer this as the first pass.", "strict": false, + "description": "Renames ALL occurrences of a symbol (function, class, method, or variable) inside the specified files or directories. This tool can identify precise symbols using a language's AST and is more accurate than a simple search-and-replace because it understands the language structure. For renaming tasks, strongly prefer this as the first pass.", "parameters": { "type": "object", "properties": { @@ -262,8 +262,8 @@ "type": "function", "function": { "name": "execute_command", - "description": "Executes CLI commands or scripts. Use 'commands' for simple sequences of shell operations, can also be a single command. Use 'script' for complex multi-line logic, data processing, or when a high-level language like Python or Node.js is more efficient than shell scripting. Default language is bash'script' are also very useful for combinatorial problems such as looping over 'swap and try' patternScripts have full access to the file system and current environment, be careful. In multi-root workspaces, use the @workspace:command syntax for standard commands. Leverage the full power of the environment's interpreters (bash, python, node, etc.) to accomplish tasks with minimal round-trips.NOTE: provide exactly one of the {commands,script}", "strict": false, + "description": "Executes CLI commands or scripts. Use 'commands' for simple sequences of shell operations, can also be a single command. Use 'script' for complex multi-line logic, data processing, or when a high-level language like Python or Node.js is more efficient than shell scripting. Default language is bash'script' are also very useful for combinatorial problems such as looping over 'swap and try' patternScripts have full access to the file system and current environment, be careful. In multi-root workspaces, use the @workspace:command syntax for standard commands. Leverage the full power of the environment's interpreters (bash, python, node, etc.) to accomplish tasks with minimal round-trips.NOTE: provide exactly one of the {commands,script}", "parameters": { "type": "object", "properties": { @@ -292,8 +292,8 @@ "type": "function", "function": { "name": "get_function", - "description": "Extracts the complete implementation of one or more functions or methods from one or more files. Use this to inspect specific functions' logic without reading the entire files. You can specify multiple files and multiple functions, it will return an all to all lookup result. Use dot-separated path to the function.", "strict": false, + "description": "Extracts the complete implementation of one or more functions or methods from one or more files. Use this to inspect specific functions' logic without reading the entire files. You can specify multiple files and multiple functions, it will return an all to all lookup result. Use dot-separated path to the function.", "parameters": { "type": "object", "properties": { @@ -324,8 +324,8 @@ "type": "function", "function": { "name": "get_file_skeleton", - "description": "Reads the structural outline of one or more files by extracting the lines where classes, functions, and methods are defined (including nested definitions) while stripping all implementation logic. Use this to quickly understand multiple files' structures and APIs before requesting specific functions.", "strict": false, + "description": "Reads the structural outline of one or more files by extracting the lines where classes, functions, and methods are defined (including nested definitions) while stripping all implementation logic. Use this to quickly understand multiple files' structures and APIs before requesting specific functions.", "parameters": { "type": "object", "properties": { @@ -344,12 +344,41 @@ } } }, + { + "type": "function", + "function": { + "name": "get_tool_result", + "strict": false, + "description": "Retrieve the result of an asynchronous tool call previously launched by execute_command, search_files, or list_files (when recursive=true). Use this when one of those tools returned a placeholder of the form 'task_id: \\nstatus: running' instead of an immediate result. You may keep working on independent tasks in parallel — the async work runs in the background.", + "parameters": { + "type": "object", + "properties": { + "task_id": { + "type": "string", + "description": "The task identifier (ULID) returned by the async tool. Example: 01HXYZ…" + }, + "wait": { + "type": "boolean", + "description": "If true (default), block until the task completes or timeout_ms expires. If false, return current status immediately. Example: true or false (optional, default true)" + }, + "timeout_ms": { + "type": "integer", + "description": "How long to wait for completion when wait=true. Default 60000ms, hard-capped at 300000ms (5 minutes). Example: 60000 (optional)" + } + }, + "required": [ + "task_id" + ], + "additionalProperties": false + } + } + }, { "type": "function", "function": { "name": "find_symbol_references", - "description": "Finds all exact AST references and invocations of one or more functions, classes, or variables across specified files or directories. Returns precise file paths.", "strict": false, + "description": "Finds all exact AST references and invocations of one or more functions, classes, or variables across specified files or directories. Returns precise file paths.", "parameters": { "type": "object", "properties": { @@ -389,8 +418,8 @@ "type": "function", "function": { "name": "list_files", - "description": "List files and directories within the specified paths (one or more), or provide information about a specific file if a file path is provided. If recursive is true, it will list all files and directories recursively. Skips non-useful content (.git, node_modules, build artifacts, etc.). Files are sorted by most recently modified first within each directory. The output includes the line count for each file. Do not use this tool to confirm the existence of files you've just created.", "strict": false, + "description": "List files and directories within the specified paths (one or more), or provide information about a specific file if a file path is provided. If recursive is true, it will list all files and directories recursively. Skips non-useful content (.git, node_modules, build artifacts, etc.). Files are sorted by most recently modified first within each directory. The output includes the line count for each file. Do not use this tool to confirm the existence of files you've just created.", "parameters": { "type": "object", "properties": { @@ -417,8 +446,8 @@ "type": "function", "function": { "name": "new_task", - "description": "Creates a new task with preloaded context from the current conversation.", "strict": false, + "description": "Creates a new task with preloaded context from the current conversation.", "parameters": { "type": "object", "properties": { @@ -438,8 +467,8 @@ "type": "function", "function": { "name": "plan_mode_respond", - "description": "Proposes a step-by-step solution plan to the user. Use only in PLAN MODE after exploring the codebase. Avoid repeating the plan in text.", "strict": false, + "description": "Proposes a step-by-step solution plan to the user. Use only in PLAN MODE after exploring the codebase. Avoid repeating the plan in text.", "parameters": { "type": "object", "properties": { @@ -463,8 +492,8 @@ "type": "function", "function": { "name": "read_file", - "description": "Reads the complete contents of one or more files at the specified paths. Automatically extracts raw text from PDF and DOCX files. Returns the hash anchored lines that you can use with the edit_file tool. You can also specify a line range to read only a specific part of the file(s). Examples: { paths: [\"src/main.ts\", \"package.json\"] }, { paths: [\"src/main.ts\"] }, { paths: [\"src/main.ts\"], start_line: 10, end_line: 50 }, { paths: [\"src/main.ts\"], start_line: 100 }, { paths: [\"src/main.ts\"], end_line: 50 }. Consider using surgical tools like get_file_skeleton or get_function over this.", "strict": false, + "description": "Reads the complete contents of one or more files at the specified paths. Automatically extracts raw text from PDF and DOCX files. Returns the hash anchored lines that you can use with the edit_file tool. You can also specify a line range to read only a specific part of the file(s). Examples: { paths: [\"src/main.ts\", \"package.json\"] }, { paths: [\"src/main.ts\"] }, { paths: [\"src/main.ts\"], start_line: 10, end_line: 50 }, { paths: [\"src/main.ts\"], start_line: 100 }, { paths: [\"src/main.ts\"], end_line: 50 }. Consider using surgical tools like get_file_skeleton or get_function over this.", "parameters": { "type": "object", "properties": { @@ -495,8 +524,8 @@ "type": "function", "function": { "name": "search_files", - "description": "Regex search across files in the specified paths (files or directories). Skips non-useful content (.git, node_modules, build artifacts, etc. and all files and directories starting with a dot). Prefer AST tools over this when reasonable.", "strict": false, + "description": "Regex search across files in the specified paths (files or directories). Skips non-useful content (.git, node_modules, build artifacts, etc. and all files and directories starting with a dot). Prefer AST tools over this when reasonable.", "parameters": { "type": "object", "properties": { @@ -532,8 +561,8 @@ "type": "function", "function": { "name": "use_subagents", - "description": "Run upto five focused in-process subagents in parallel. Each subagent gets its own prompt and returns a comprehensive research result. Default timeout is 300 seconds. Particularly effective for investigating multiple independent paths simultaneously without consuming the your context window.", "strict": false, + "description": "Run upto five focused in-process subagents in parallel. Each subagent gets its own prompt and returns a comprehensive research result. Default timeout is 300 seconds. Particularly effective for investigating multiple independent paths simultaneously without consuming the your context window.", "parameters": { "type": "object", "properties": { @@ -581,8 +610,8 @@ "type": "function", "function": { "name": "write_to_file", - "description": "Creates a new file or completely overwrites an existing file. Automatically creates required directories.", "strict": false, + "description": "Creates a new file or completely overwrites an existing file. Automatically creates required directories.", "parameters": { "type": "object", "properties": { diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-basic.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-basic.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-basic.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-basic.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-no-browser.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-no-browser.snap index cf8d58ab..90b6a85f 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-no-browser.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3-no-browser.snap @@ -10,6 +10,36 @@ TOOL USE - Prefer tools for communication; avoid redundant text in assistant responses. +TOOL CONSTRAINTS — READ CAREFULLY + +The ONLY tools you may call are the ones explicitly listed in the tools schema sent with this request. You MUST NOT invent, hallucinate, or guess other tool names. Forbidden examples of non-existent tools you must never call: +- digikey, digikey:search, digikey_search +- bom, bom:create, bom_create +- kicad, kicad:new_project, kicad_new_project +- oscilloscope, oscilloscope:probe, logic_analyzer +- web_search, browser, http_get, curl +- Any name containing ":" or "." is NOT a valid tool name + +If a task seems to require a tool that is not listed, use the available tools instead: +- To create or write a file → write_to_file +- To run any shell command, script, or external program → execute_command +- To read an existing file → read_file +- To search code → search_files or list_files + +If you find yourself wanting to call a non-listed tool name, STOP immediately. Use execute_command with a relevant shell command instead. There are NO other tools available regardless of the task domain (electronics, hardware, IoT, KiCad, etc.). + +ASYNCHRONOUS TOOLS + +The tools execute_command, search_files, and list_files (when recursive=true) may return a placeholder of the form: + + task_id: + status: running + To retrieve the result, call get_tool_result with this task_id. + +When you receive such a placeholder, call get_tool_result(task_id="...") to fetch the actual result. You may continue with other independent work in parallel — the task runs in the background. The result will be available within seconds to minutes depending on the tool. + +If you do not need the result immediately, you can defer the call to get_tool_result until you actually need it. If you are blocked waiting on the result, call get_tool_result with wait=true (default) to block. Use wait=false to peek at the current status without blocking. + ACT MODE VS PLAN MODE In each user message, the environment_details will specify the current mode. There are two modes: @@ -18,12 +48,12 @@ In each user message, the environment_details will specify the current mode. The - In ACT MODE, you use tools to accomplish the user's task. Once you've completed the user's task, you use the attempt_completion tool to present the result of the task to the user. - PLAN MODE: In this special mode, you have access to the plan_mode_respond tool. - In PLAN MODE, start by getting precise understanding of what the user wants in this task. - - In PLAN MODE, the goal is to gather information and get context to create a detailed plan for accomplishing the task, which the user will review and approve before they switch you to ACT MODE to implement the solution. + - In PLAN MODE, the goal is to gather information and present a plan, which the user will review and approve before they switch you to ACT MODE to implement the solution. If it is a simple question, answer promptly. Not all tasks sent to you are deep research tasks. SYSTEM INFO -- Operating System: linux -- Default Shell: /bin/bash +- Operating System: darwin +- Default Shell: /bin/zsh - Current Working Directory: /test/project (this is where all the tools will be executed from) - Available CPU Cores: 1 (Use this value for parallel jobs like 'make -j' instead of 'nproc') diff --git a/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3.tools.snap b/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3.tools.snap index 9d33f7c4..fb479653 100644 --- a/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3.tools.snap +++ b/src/core/prompts/system-prompt/__tests__/__snapshots__/vertex_gemini_3.tools.snap @@ -294,6 +294,30 @@ ] } }, + { + "name": "get_tool_result", + "description": "Retrieve the result of an asynchronous tool call previously launched by execute_command, search_files, or list_files (when recursive=true). Use this when one of those tools returned a placeholder of the form 'task_id: \\nstatus: running' instead of an immediate result. You may keep working on independent tasks in parallel — the async work runs in the background.", + "parameters": { + "type": "OBJECT", + "properties": { + "task_id": { + "type": "STRING", + "description": "The task identifier (ULID) returned by the async tool." + }, + "wait": { + "type": "BOOLEAN", + "description": "If true (default), block until the task completes or timeout_ms expires. If false, return current status immediately." + }, + "timeout_ms": { + "type": "NUMBER", + "description": "How long to wait for completion when wait=true. Default 60000ms, hard-capped at 300000ms (5 minutes)." + } + }, + "required": [ + "task_id" + ] + } + }, { "name": "find_symbol_references", "description": "Finds all exact AST references and invocations of one or more functions, classes, or variables across specified files or directories. Returns precise file paths.", diff --git a/src/core/prompts/system-prompt/spec.ts b/src/core/prompts/system-prompt/spec.ts index 25b7c0eb..7d1a5abb 100644 --- a/src/core/prompts/system-prompt/spec.ts +++ b/src/core/prompts/system-prompt/spec.ts @@ -182,7 +182,7 @@ export function toolSpecFunctionDefinition(tool: DiracToolSpec, context: SystemP type: "object", properties, required, - ...(strict ? { additionalProperties: false } : {}), + additionalProperties: false, }, }, } @@ -417,7 +417,7 @@ export function openAIToolToAnthropic(openAITool: OpenAITool): AnthropicTool { ? { properties: func.parameters?.properties, required: (func.parameters as any)?.required || [], - } + } : {}), }, } diff --git a/src/core/slash-commands/__tests__/index.test.ts b/src/core/slash-commands/__tests__/index.test.ts index 0db59f4b..d31c47d3 100644 --- a/src/core/slash-commands/__tests__/index.test.ts +++ b/src/core/slash-commands/__tests__/index.test.ts @@ -18,9 +18,4 @@ describe("slash-commands", () => { }) expect(result.processedText).to.include("help them create a new task with preloaded context") }) - it("should process /askDirac slash command", async () => { - const text = "" + "/askDirac" + "" - const result = await parseSlashCommands(text, {}, {}, "test-ulid", undefined, [], undefined, "/mock/path", "mock-dist/source") - expect(result.processedText).to.include("The source code is located at: /mock/path/mock-dist/source") - }) -}) \ No newline at end of file +}) diff --git a/src/core/storage/disk.ts b/src/core/storage/disk.ts index bf8c6d79..4f9193d3 100644 --- a/src/core/storage/disk.ts +++ b/src/core/storage/disk.ts @@ -27,11 +27,13 @@ import { StateManager } from "./StateManager" * @param filePath - The target file path * @param data - The data to write */ -async function atomicWriteFile(filePath: string, data: string): Promise { +async function atomicWriteFile(filePath: string, data: string, mode?: number): Promise { const tmpPath = `${filePath}.tmp.${process.pid}.${Date.now()}.${randomUUID()}` try { - // Write to temporary file first - await fs.writeFile(tmpPath, data, "utf8") + // Write to temporary file first. When a mode is given, create the temp + // file with it so the renamed target inherits restrictive permissions + // (used for files that may hold conversation secrets). + await fs.writeFile(tmpPath, data, mode !== undefined ? { encoding: "utf8", mode } : "utf8") // Rename temp file to target (atomic in most cases) await fs.rename(tmpPath, filePath) } catch (error) { @@ -243,7 +245,11 @@ export async function saveApiConversationHistory(taskId: string, apiConversation syncWorker().enqueue(taskId, fileName, data) // Store locally const filePath = path.join(await ensureTaskDirectoryExists(taskId), fileName) - await atomicWriteFile(filePath, data) + // 0600: the conversation history is a plaintext copy of the full LLM + // exchange and may contain secrets the user/tools pasted. It is replayed + // to resume a task, so it is deliberately NOT scrubbed — restrict it to + // the owner instead. + await atomicWriteFile(filePath, data, 0o600) } } catch (error) { // in the off chance this fails, we don't want to stop the task diff --git a/src/core/task/tools/handlers/__tests__/RenameSymbolToolHandler.test.ts b/src/core/task/tools/handlers/__tests__/RenameSymbolToolHandler.test.ts index 7f0aa001..a109cfea 100644 --- a/src/core/task/tools/handlers/__tests__/RenameSymbolToolHandler.test.ts +++ b/src/core/task/tools/handlers/__tests__/RenameSymbolToolHandler.test.ts @@ -8,13 +8,13 @@ import sinon from "sinon" import { HostProvider } from "@/hosts/host-provider" import * as getDiagnosticsProvidersModule from "@/integrations/diagnostics/getDiagnosticsProviders" import { LinterFeedbackProvider } from "@/integrations/diagnostics/LinterFeedbackProvider" +import { SymbolIndexService } from "@/services/symbol-index/SymbolIndexService" import { DiagnosticSeverity } from "@/shared/proto/index.dirac" import { setVscodeHostProviderMock } from "@/test/host-provider-test-utils" import { TaskState } from "../../../TaskState" import { ToolValidator } from "../../ToolValidator" import type { TaskConfig } from "../../types/TaskConfig" import { RenameSymbolToolHandler } from "../RenameSymbolToolHandler" -import { SymbolIndexService } from "@/services/symbol-index/SymbolIndexService" let tmpDir: string @@ -32,10 +32,14 @@ function createConfig() { reset: sinon.stub().resolves(), saveChanges: sinon.stub().callsFake(async () => { if (lastPath && lastContent !== undefined) { - await fs.writeFile(lastPath, lastContent) + // lastPath may be a relative displayPath — resolve against tmpDir + const absPath = path.isAbsolute(lastPath) ? lastPath : path.join(tmpDir, lastPath) + await fs.writeFile(absPath, lastContent) } return { finalContent: lastContent } }), + showReview: sinon.stub().resolves(), + hideReview: sinon.stub().resolves(), } const callbacks = { @@ -478,7 +482,7 @@ describe("RenameSymbolToolHandler", () => { const result = await handler.execute(config, block) assert.strictEqual(result, "No occurrences of symbol 'old' found in the specified paths.") - + // Verify updateFile was not called for non-existent file (it might be called if we don't check existence first, but the handler does check) // Actually, the handler calls fs.stat(absPath) and skips if it fails. }) diff --git a/src/core/task/tools/handlers/__tests__/SubagentToolHandler.test.ts b/src/core/task/tools/handlers/__tests__/SubagentToolHandler.test.ts index 0ec6d7a0..9757ea7f 100644 --- a/src/core/task/tools/handlers/__tests__/SubagentToolHandler.test.ts +++ b/src/core/task/tools/handlers/__tests__/SubagentToolHandler.test.ts @@ -160,7 +160,7 @@ describe("SubagentToolHandler", () => { ) sinon.assert.calledOnce(callbacks.removeLastPartialMessageIfExistsWithType) - sinon.assert.calledWithExactly(callbacks.removeLastPartialMessageIfExistsWithType, "say", "use_subagents") + sinon.assert.calledWithExactly(callbacks.removeLastPartialMessageIfExistsWithType, "say", "use_subagents", undefined) sinon.assert.calledOnce(callbacks.ask) sinon.assert.calledWithMatch(callbacks.ask, "use_subagents", sinon.match.string, true) @@ -188,7 +188,7 @@ describe("SubagentToolHandler", () => { ) sinon.assert.calledOnce(callbacks.removeLastPartialMessageIfExistsWithType) - sinon.assert.calledWithExactly(callbacks.removeLastPartialMessageIfExistsWithType, "ask", "use_subagents") + sinon.assert.calledWithExactly(callbacks.removeLastPartialMessageIfExistsWithType, "ask", "use_subagents", undefined) sinon.assert.calledOnce(callbacks.say) sinon.assert.calledWithMatch(callbacks.say, "use_subagents", sinon.match.string, undefined, undefined, true) @@ -472,6 +472,4 @@ describe("SubagentToolHandler", () => { // Check runner call assert.equal(runStub.firstCall.args[2], 300) }) - - }) diff --git a/src/core/task/tools/handlers/__tests__/debug_syntax.test.ts b/src/core/task/tools/handlers/__tests__/debug_syntax.test.ts index 728b5120..8da25e0f 100644 --- a/src/core/task/tools/handlers/__tests__/debug_syntax.test.ts +++ b/src/core/task/tools/handlers/__tests__/debug_syntax.test.ts @@ -44,7 +44,6 @@ function createConfig() { undoUserEdits: sinon.stub().resolves(), } - const callbacks = { say: sinon.stub().resolves(undefined), ask: sinon.stub().resolves({ response: "yesButtonClicked" }), @@ -59,6 +58,9 @@ function createConfig() { cwd: tmpDir, mode: "act", taskState, + messageState: { + getApiConversationHistory: sinon.stub().returns([]), + }, services: { fileContextTracker: { trackFileContext: sinon.stub().resolves(), diff --git a/src/core/tracing/JsonlTracer.ts b/src/core/tracing/JsonlTracer.ts index b468a86b..d561c63b 100644 --- a/src/core/tracing/JsonlTracer.ts +++ b/src/core/tracing/JsonlTracer.ts @@ -244,7 +244,7 @@ export class JsonlTracer { success: input.tool_execution.success, } : null, - errors: input.errors ?? [], + errors: input.errors ? scrubSecrets(input.errors) : [], } this.queueAppend(this.tracePath, line) return line @@ -315,7 +315,11 @@ export class JsonlTracer { // uses the same pattern. const tmpPath = `${this.metaPath}.tmp` try { - fs.writeFileSync(tmpPath, JSON.stringify(this.meta, null, 2), "utf8") + // Scrub at the single persist point so credential-bearing fields + // (gateway_url with inline creds, worker endpoints) never hit disk. + // scrubSecrets returns a deep copy, so this.meta stays intact for + // later merges (mergeStats / close). + fs.writeFileSync(tmpPath, JSON.stringify(scrubSecrets(this.meta), null, 2), "utf8") fs.renameSync(tmpPath, this.metaPath) } catch (_err) { // swallow — tracing is non-fatal. Best-effort cleanup of the tmp. diff --git a/src/services/local-router/HealthMonitor.ts b/src/services/local-router/HealthMonitor.ts index af3f3dca..a87723de 100644 --- a/src/services/local-router/HealthMonitor.ts +++ b/src/services/local-router/HealthMonitor.ts @@ -38,22 +38,29 @@ export class HealthMonitor { private async check(id: string, w: WorkerEndpoint): Promise { try { - // Try a lightweight /health endpoint first; fall back to /v1/models + // Try a lightweight /health endpoint first; fall back to /v1/models. + // Each attempt gets its OWN controller + timeout: a shared one meant the + // fallback fetch could start with an already-fired 5s timer and abort + // instantly, hiding a worker that was actually up. const base = w.url.replace(/\/v1\/?$/, "").replace(/\/$/, "") - const ctrl = new AbortController() - const timeout = setTimeout(() => ctrl.abort(), 5_000) try { - const r = await fetch(`${base}/health`, { signal: ctrl.signal }) - this.health.set(id, r.ok ? "up" : "down") + this.health.set(id, (await HealthMonitor.pingOnce(`${base}/health`)) ? "up" : "down") } catch { - // try /v1/models - const r = await fetch(`${base}/v1/models`, { signal: ctrl.signal }) - this.health.set(id, r.ok ? "up" : "down") - } finally { - clearTimeout(timeout) + this.health.set(id, (await HealthMonitor.pingOnce(`${base}/v1/models`)) ? "up" : "down") } } catch { this.health.set(id, "down") } } + + private static async pingOnce(url: string): Promise { + const ctrl = new AbortController() + const timeout = setTimeout(() => ctrl.abort(), 5_000) + try { + const r = await fetch(url, { signal: ctrl.signal }) + return r.ok + } finally { + clearTimeout(timeout) + } + } } diff --git a/src/services/local-router/LocalRouter.ts b/src/services/local-router/LocalRouter.ts index a047218d..0f3596cd 100644 --- a/src/services/local-router/LocalRouter.ts +++ b/src/services/local-router/LocalRouter.ts @@ -67,6 +67,12 @@ function combineAbortSignals(signals: (AbortSignal | undefined)[]): { * in state-keys.ts). Kept local so this module stays buildable from contexts * where StateManager isn't initialized (unit tests, CLI bootstrap). */ +// Escape a string for safe interpolation into a RegExp — tool names with regex +// metacharacters (e.g. "file.write") would otherwise corrupt the pattern. +function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") +} + const DEFAULT_LOCAL_ROUTER_TOTAL_TIMEOUT_MS = 60_000 const DEFAULT_LOCAL_ROUTER_IDLE_TIMEOUT_MS = 20_000 @@ -190,19 +196,47 @@ export class LocalRouter { const url = worker.url.replace(/\/$/, "") const body = { ...req, model: worker.modelId, stream: false } - const res = await fetch(`${url}/chat/completions`, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(body), - }) - if (!res.ok) { - const text = await res.text().catch(() => "") - throw new Error(`[LocalRouter] worker ${worker.id} returned ${res.status}: ${text.slice(0, 200)}`) + + // Wall-clock guard. The streaming path has both total + idle timers, but + // the non-streaming path produces no chunks, so only `total` applies here. + // Without it a worker that accepts the POST and never answers hangs the + // agent forever (this was the missing guard vs chatStream). + const totalTimeoutMs = sanitizeTimeout(req.timeoutMs, DEFAULT_LOCAL_ROUTER_TOTAL_TIMEOUT_MS) + const { controller, dispose: detachSignals } = combineAbortSignals([req.signal]) + let timedOut = false + const totalTimer = setTimeout(() => { + timedOut = true + controller.abort(new LocalRouterTimeoutError("total", worker.id, totalTimeoutMs)) + }, totalTimeoutMs) + const cleanup = () => { + clearTimeout(totalTimer) + detachSignals() + if (!controller.signal.aborted) controller.abort() + } + + try { + const res = await fetch(`${url}/chat/completions`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + signal: controller.signal, + }) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`[LocalRouter] worker ${worker.id} returned ${res.status}: ${text.slice(0, 200)}`) + } + const data = (await res.json()) as ChatResponse + this.cache.set(cacheKey, data) + routingObserver.emit({ ts: Date.now(), category, workerId: worker.id, cacheHit: false, estTokens }) + return data + } catch (err) { + // Surface the timeout as the typed error so callers can fall back on it, + // rather than a raw AbortError. + if (timedOut) throw new LocalRouterTimeoutError("total", worker.id, totalTimeoutMs) + throw err + } finally { + cleanup() } - const data = (await res.json()) as ChatResponse - this.cache.set(cacheKey, data) - routingObserver.emit({ ts: Date.now(), category, workerId: worker.id, cacheHit: false, estTokens }) - return data } /** @@ -279,13 +313,17 @@ export class LocalRouter { const command = bashFence[1].trim() if (!command) return null return { + // requires_approval: true — a command parsed out of a model's + // markdown fence is untrusted; it must go through the approval + // gate (e.g. `rm -rf` should never auto-run). Auto-approval modes + // downstream can still bypass this; the safe default does not. match: bashFence, - toolCall: { name: "execute_command", arguments: { command, requires_approval: false } }, + toolCall: { name: "execute_command", arguments: { command, requires_approval: true } }, } } case "plain_func": { for (const toolName of toolNames) { - const re = new RegExp(`\\b${toolName}\\s*\\(([^)]*)\\)`) + const re = new RegExp(`\\b${escapeRegExp(toolName)}\\s*\\(([^)]*)\\)`) const m = buf.match(re) if (!m) continue const args = LocalRouter.parsePlainArgs( diff --git a/src/services/local-router/ResponseCache.ts b/src/services/local-router/ResponseCache.ts index 5594c35f..16564692 100644 --- a/src/services/local-router/ResponseCache.ts +++ b/src/services/local-router/ResponseCache.ts @@ -44,9 +44,17 @@ export class ResponseCache { set(key: string, value: ChatResponse): void { if (this.map.size >= this.maxSize && !this.map.has(key)) { - // evict oldest - const firstKey = this.map.keys().next().value - if (firstKey) this.map.delete(firstKey) + // First reclaim expired entries — otherwise a full cache of stale + // entries would evict a fresh one by LRU while the stale ones linger. + const now = Date.now() + for (const [k, e] of this.map) { + if (e.expiresAt < now) this.map.delete(k) + } + // Still at capacity? evict the oldest (LRU). + if (this.map.size >= this.maxSize) { + const firstKey = this.map.keys().next().value + if (firstKey) this.map.delete(firstKey) + } } this.map.set(key, { value, expiresAt: Date.now() + this.ttlMs }) } diff --git a/src/services/local-router/__tests__/LocalRouter.test.ts b/src/services/local-router/__tests__/LocalRouter.test.ts index 506b5693..ffb12f1e 100644 --- a/src/services/local-router/__tests__/LocalRouter.test.ts +++ b/src/services/local-router/__tests__/LocalRouter.test.ts @@ -395,7 +395,9 @@ describe("LocalRouter", () => { assert.strictEqual(tc.name, "execute_command") const args = JSON.parse(tc.argumentsRaw) as { command: string; requires_approval: boolean } assert.strictEqual(args.command, "ls -la /tmp") - assert.strictEqual(args.requires_approval, false) + // Commands parsed from a model's markdown fence must default to requiring + // approval — they are untrusted and must hit the approval gate. + assert.strictEqual(args.requires_approval, true) router.dispose() })