From 692a2b4c526dad5058ba85a6431508fae7449ecc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 9 Jun 2026 13:06:34 +0000
Subject: [PATCH 01/14] Add local models, cost-aware routing, and compute/cost
 awareness

Make tiny-code more cost-effective by running cheap open-weight models
locally and escalating to frontier models only for heavy work.

- Ollama provider over the OpenAI-compatible API (raw fetch + SSE), no new
  deps. Google Gemma and other local models work as model ids.
- Local-first routing: a heuristic classifier starts each turn on the
  cheap/local model and escalates to a configured frontier model when the
  task is heavy or the local model calls the new `escalate` tool / gets stuck.
- Cost awareness: per-turn + cumulative token usage with estimated $ for
  cloud turns ("no API cost" for local), plus a startup RAM advisory that
  warns when a local model won't fit or is too small to tool-call reliably.
- `/costs` command surfaces usage, spend, and workflow cost-cutting tips.
- Config: provider 'ollama', ollamaBaseUrl, routing, escalateTo; docs updated.

All tests pass (87), coverage >80%, typecheck and lint clean.
---
 .env.example                       |   6 +-
 README.md                          |  63 ++++++++-
 TODO.md                            |  13 ++
 src/agent/loop.ts                  |  60 ++++++++-
 src/agent/router.ts                |  42 ++++++
 src/agent/systemPrompt.ts          |   8 ++
 src/cli.ts                         |   9 +-
 src/config/load.ts                 |  39 +++++-
 src/index.ts                       |  13 +-
 src/providers/index.ts             |   7 +
 src/providers/ollama.ts            | 203 +++++++++++++++++++++++++++++
 src/providers/pricing.ts           |  56 ++++++++
 src/providers/types.ts             |   2 +-
 src/repl.ts                        |  88 ++++++++++++-
 src/system/resources.ts            |  71 ++++++++++
 src/tools/escalate.ts              |  26 ++++
 src/ui/render.ts                   |  54 +++++++-
 tests/agent/loop.test.ts           | 100 +++++++++++++-
 tests/agent/router.test.ts         |  22 ++++
 tests/config/load.test.ts          |  35 +++++
 tests/providers/ollamaSend.test.ts |  90 +++++++++++++
 tests/providers/pricing.test.ts    |  28 ++++
 tests/providers/translate.test.ts  |  23 ++++
 tests/system/resources.test.ts     |  32 +++++
 tests/ui/render.test.ts            |  38 ++++++
 25 files changed, 1099 insertions(+), 29 deletions(-)
 create mode 100644 src/agent/router.ts
 create mode 100644 src/providers/ollama.ts
 create mode 100644 src/providers/pricing.ts
 create mode 100644 src/system/resources.ts
 create mode 100644 src/tools/escalate.ts
 create mode 100644 tests/agent/router.test.ts
 create mode 100644 tests/providers/ollamaSend.test.ts
 create mode 100644 tests/providers/pricing.test.ts
 create mode 100644 tests/system/resources.test.ts

diff --git a/.env.example b/.env.example
index 23d4820..cb72dc1 100644
--- a/.env.example
+++ b/.env.example
@@ -1,7 +1,9 @@
-# Provide at least one. If both are present, Anthropic is used by default.
+# Provide at least one key to use a cloud provider. If both are present,
+# Anthropic is the default. Ollama runs locally and needs no key.
 ANTHROPIC_API_KEY=
 GEMINI_API_KEY=
 
 # Optional overrides (also settable via config file / CLI flags)
-# TINY_CODE_PROVIDER=anthropic   # anthropic | gemini
+# TINY_CODE_PROVIDER=anthropic   # anthropic | gemini | ollama
 # TINY_CODE_MODEL=claude-opus-4-8
+# TINY_CODE_OLLAMA_URL=http://localhost:11434/v1   # Ollama OpenAI-compatible endpoint
diff --git a/README.md b/README.md
index 0f8c0ab..6b73404 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,12 @@
 # tiny-code
 
 A small, extensible CLI coding agent. Interactive terminal REPL, interchangeable
-**Anthropic** and **Gemini** models, and just the core features you actually use:
-read/write/edit files, run shell commands, search code, and a custom
-commands/skills system. No business logic baked in.
+**Anthropic**, **Gemini**, and **local (Ollama)** models, and just the core
+features you actually use: read/write/edit files, run shell commands, search
+code, and a custom commands/skills system. No business logic baked in.
+
+Run cheap, open-weight models locally and **escalate heavy work to a frontier
+model only when needed** — see [Local models & cost-aware routing](#local-models--cost-aware-routing).
 
 > Status: early (v0.x). Published as `@therr/tiny-code`; the binary is
 > `tiny-code`. Names may change before the first npm publish.
@@ -37,15 +40,58 @@ export GEMINI_API_KEY=...
 tiny-code                       # start the REPL (uses an available key)
 tiny-code --provider gemini     # force a provider
 tiny-code --model claude-opus-4-8
+tiny-code --provider ollama --model gemma3:12b   # run a local model (no API cost)
 ```
 
 In the REPL: type a request, watch it work. Mutating actions (writes, edits,
 shell commands) prompt for approval unless pre-approved in config.
 
 - `/help` — list commands
+- `/costs` — session token usage, estimated $ cost, and cost-saving tips
 - `/<name> [args]` — run a custom command (see below)
 - `/exit` — quit
 
+## Local models & cost-aware routing
+
+tiny-code talks to a local [Ollama](https://ollama.com) server over its
+OpenAI-compatible API, so any model you've pulled is available — including
+**Google Gemma 3** (`gemma3:4b`, `gemma3:12b`, `gemma3:27b`) and
+`qwen2.5-coder` (the default is `qwen2.5-coder:7b`, which tool-calls reliably).
+
+```bash
+ollama serve
+ollama pull qwen2.5-coder:7b
+tiny-code --provider ollama --model qwen2.5-coder:7b
+```
+
+**Mind the compute cost.** Local models are free of API charges but use your
+machine's RAM/VRAM. On startup with an Ollama model, tiny-code prints how much
+memory the model needs versus what's free, and warns if it likely won't fit or
+if the model is too small (≤3B parameters) to tool-call reliably. Rough guide (≈Q4 quantization):
+
+| Model        | ~RAM needed | Good for                          |
+| ------------ | ----------- | --------------------------------- |
+| `gemma3:1b`  | ~1 GB       | trivial text (poor at tool calls) |
+| `gemma3:4b`  | ~3 GB       | lightweight edits, search         |
+| `gemma3:12b` | ~7 GB       | most coding tasks                 |
+| `gemma3:27b` | ~16 GB      | stronger reasoning                |
+
+**Local-first routing.** Set `routing` to `local-first` with an `escalateTo`
+target: every turn starts on the cheap/local model, and tiny-code escalates to
+the frontier model when a turn looks heavy (refactors, debugging, multi-file
+work) or when the local model gets stuck and calls the built-in `escalate` tool.
+You get local speed and zero cost for the bulk of the work, and frontier power
+only for the hard parts. Run `/costs` any time for usage, spend, and tips.
+
+```json
+{
+  "provider": "ollama",
+  "model": "qwen2.5-coder:7b",
+  "routing": "local-first",
+  "escalateTo": { "provider": "anthropic", "model": "claude-opus-4-8" }
+}
+```
+
 ## Project context
 
 On start, the agent walks up from the working directory looking for `AGENTS.md`
@@ -81,10 +127,13 @@ CLI flags.
 {
   "provider": "anthropic",
   "model": "claude-opus-4-8",
+  "ollamaBaseUrl": "http://localhost:11434/v1",
   "maxTokens": 16000,
   "thinking": true,
   "effort": "high",
   "maxIterations": 50,
+  "routing": "off",
+  "escalateTo": { "provider": "anthropic", "model": "claude-opus-4-8" },
   "allow": {
     "tools": [],
     "bash": ["npm test", "git status", "git diff"],
@@ -96,6 +145,14 @@ CLI flags.
 `allow` pre-approves mutating actions so they skip the confirmation prompt:
 `bash` matches command prefixes, `write` matches path globs for write/edit.
 
+`routing: "local-first"` plus `escalateTo` enables cost-aware routing (see
+[above](#local-models--cost-aware-routing)). When `routing` is omitted it
+defaults to `local-first` if `escalateTo` is present and to `off` otherwise.
+`ollamaBaseUrl` points at your Ollama server's OpenAI-compatible endpoint.
+
+Approximate cloud pricing used for the `/costs` estimate lives in
+`src/providers/pricing.ts` — edit it to match current vendor rates.
+
 ## Development
 
 ```bash
diff --git a/TODO.md b/TODO.md
index b35c72c..8e933e2 100644
--- a/TODO.md
+++ b/TODO.md
@@ -9,6 +9,19 @@ Explore/Plan agent). **Approach:** a `spawn_agent` tool whose `execute` construc
 a child `AgentLoop` with its own message history and a read-only tool subset,
 returning the child's final text. Keep depth at 1 to start.
 
+> Note: the cheap/expensive model split is now handled by **local-first
+> routing** (`routing: "local-first"` + `escalateTo`): turns start on the
+> local/cheap model and escalate to a frontier model when heavy or stuck (see
+> `src/agent/router.ts`, `src/tools/escalate.ts`, and the loop's escalation
+> logic). Sub-agents remain useful for *parallel* isolated runs.
+
+## More local-model interoperability
+Ollama is wired in via its OpenAI-compatible endpoint (`src/providers/ollama.ts`),
+which already covers LM Studio and vLLM (same wire format) by pointing
+`ollamaBaseUrl`/`TINY_CODE_OLLAMA_URL` at them. **Next:** an optional
+`/api/tags` probe to list locally-installed models and surface tokens/sec in the
+usage line; per-model context-window awareness for the RAM advisory.
+
 ## Web search / fetch
 Let the agent look up docs during a task. **Approach:** add `web_search` and
 `web_fetch` tools. For Anthropic, optionally delegate to the server-side
diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index e482fa2..fe10bdd 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -10,7 +10,10 @@ export interface AgentUI {
   onToolStart(name: string, input: unknown): void;
   onToolResult(name: string, result: ToolResult): void;
   onToolDenied(name: string): void;
-  onUsage(usage: Usage): void;
+  /** `model` identifies which model produced the usage (for accurate pricing). */
+  onUsage(usage: Usage, model?: string): void;
+  /** Fired when local-first routing escalates the turn to the frontier model. */
+  onRoute(provider: string, model: string, reason: string): void;
   onAssistantEnd(): void;
   onMaxIterations(): void;
 }
@@ -23,8 +26,15 @@ export interface AgentLoopOptions {
   ui: AgentUI;
   cwd: string;
   maxIterations?: number;
+  /** Frontier model to escalate heavy/stuck turns to (enables local-first routing). */
+  escalationProvider?: ModelProvider | undefined;
+  /** Classifies a turn up front so heavy tasks start on the frontier model. */
+  router?: ((input: string) => 'light' | 'heavy') | undefined;
 }
 
+/** Consecutive tool-error iterations before auto-escalating a stuck local model. */
+const STUCK_THRESHOLD = 3;
+
 /**
  * The provider-agnostic, UI-agnostic agentic loop: send → stream → run tools →
  * feed results back → repeat until the model stops requesting tools (or the
@@ -38,6 +48,8 @@ export class AgentLoop {
   private readonly ui: AgentUI;
   private readonly cwd: string;
   private readonly maxIterations: number;
+  private readonly escalationProvider: ModelProvider | undefined;
+  private readonly router: ((input: string) => 'light' | 'heavy') | undefined;
   private readonly messages: Message[] = [];
 
   constructor(opts: AgentLoopOptions) {
@@ -48,6 +60,8 @@ export class AgentLoop {
     this.ui = opts.ui;
     this.cwd = opts.cwd;
     this.maxIterations = opts.maxIterations ?? 50;
+    this.escalationProvider = opts.escalationProvider;
+    this.router = opts.router;
   }
 
   /** Conversation history (for inspection / persistence). */
@@ -60,11 +74,15 @@ export class AgentLoop {
     this.messages.push({ role: 'user', content: [{ type: 'text', text: userInput }] });
     const tools = this.registry.toSchemas();
 
+    let active = this.selectInitialProvider(userInput);
+    let escalated = active === this.escalationProvider;
+    let consecutiveErrors = 0;
+
     for (let iteration = 0; iteration < this.maxIterations; iteration += 1) {
       let text = '';
       const toolCalls: ToolUseBlock[] = [];
 
-      for await (const event of this.provider.send({
+      for await (const event of active.send({
         system: this.system,
         messages: [...this.messages],
         tools,
@@ -75,7 +93,7 @@ export class AgentLoop {
         } else if (event.type === 'tool_call') {
           toolCalls.push({ type: 'tool_use', id: event.id, name: event.name, input: event.input });
         } else {
-          this.ui.onUsage(event.usage);
+          this.ui.onUsage(event.usage, active.model);
         }
       }
 
@@ -88,16 +106,50 @@ export class AgentLoop {
 
       if (toolCalls.length === 0) return;
 
+      // The local model can explicitly hand off via the `escalate` tool.
+      if (!escalated && toolCalls.some((c) => c.name === 'escalate')) {
+        active = this.escalate('requested by model');
+        escalated = true;
+      }
+
       const results: ToolResultBlock[] = [];
+      let anyError = false;
       for (const call of toolCalls) {
-        results.push(await this.executeToolCall(call));
+        const result = await this.executeToolCall(call);
+        if (result.isError) anyError = true;
+        results.push(result);
       }
       this.messages.push({ role: 'user', content: results });
+
+      // Auto-escalate a local model that appears stuck (repeated tool errors).
+      if (!escalated) {
+        consecutiveErrors = anyError ? consecutiveErrors + 1 : 0;
+        if (consecutiveErrors >= STUCK_THRESHOLD) {
+          active = this.escalate('stuck — repeated tool errors');
+          escalated = true;
+        }
+      }
     }
 
     this.ui.onMaxIterations();
   }
 
+  /** Pick the provider for a turn: heavy tasks start on the frontier model. */
+  private selectInitialProvider(input: string): ModelProvider {
+    if (this.escalationProvider && this.router && this.router(input) === 'heavy') {
+      this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, 'heavy task');
+      return this.escalationProvider;
+    }
+    return this.provider;
+  }
+
+  /** Switch to the frontier provider mid-turn. Falls back to the primary if unset. */
+  private escalate(reason: string): ModelProvider {
+    if (!this.escalationProvider) return this.provider;
+    this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, reason);
+    return this.escalationProvider;
+  }
+
   private async executeToolCall(call: ToolUseBlock): Promise<ToolResultBlock> {
     const tool = this.registry.get(call.name);
     if (!tool) {
diff --git a/src/agent/router.ts b/src/agent/router.ts
new file mode 100644
index 0000000..1582f2f
--- /dev/null
+++ b/src/agent/router.ts
@@ -0,0 +1,42 @@
+/**
+ * Lightweight, dependency-free task classification for local-first routing.
+ *
+ * The cheap/local model handles each turn by default; this heuristic flags the
+ * turns that are better started on the frontier model. It is intentionally
+ * conservative — when in doubt it returns 'light' and lets the local model
+ * escalate explicitly (via the `escalate` tool) if it gets stuck.
+ */
+export type TaskWeight = 'light' | 'heavy';
+
+const HEAVY_PATTERNS: RegExp[] = [
+  /\brefactor(?:ing|ed)?\b/i,
+  /\barchitect(?:ure|ural)?\b/i,
+  /\bdesign\b/i,
+  /\bdebug(?:ging|ged)?\b/i,
+  /\boptimi[sz]e\b/i,
+  /\bmigrat(?:e|ion|ing)\b/i,
+  /\bimplement\b/i,
+  /\bredesign\b/i,
+  /\broot[- ]?cause\b/i,
+  /\bwhy (?:is|does|are|do|did)\b/i,
+  /\bthink (?:hard|carefully|through|deeply)\b/i,
+  /\bacross (?:the |multiple |several )?(?:files|modules|codebase)\b/i,
+  /\bend[- ]to[- ]end\b/i,
+];
+
+/** Number of file-path-looking tokens above which a turn is considered heavy. */
+const MULTI_FILE_THRESHOLD = 3;
+/** Character length above which a turn is considered heavy. */
+const LONG_INPUT_CHARS = 600;
+
+/** Classify a user turn as 'light' (local) or 'heavy' (escalate to frontier). */
+export function classifyTurn(input: string): TaskWeight {
+  const text = input.trim();
+  if (text.length >= LONG_INPUT_CHARS) return 'heavy';
+  if (HEAVY_PATTERNS.some((re) => re.test(text))) return 'heavy';
+
+  const fileMentions = text.match(/[\w./-]+\.[a-z]{1,5}\b/gi) ?? [];
+  if (fileMentions.length >= MULTI_FILE_THRESHOLD) return 'heavy';
+
+  return 'light';
+}
diff --git a/src/agent/systemPrompt.ts b/src/agent/systemPrompt.ts
index 1ae17d0..d02a7df 100644
--- a/src/agent/systemPrompt.ts
+++ b/src/agent/systemPrompt.ts
@@ -4,8 +4,12 @@ export interface SystemPromptParams {
   cwd: string;
   projectContext: string;
   tools: ToolSchema[];
+  /** When true, this model is the cheap/local model in a local-first setup. */
+  escalation?: boolean;
 }
 
+const ESCALATION_GUIDANCE = `Cost-aware routing: you are running as a fast, low-cost model. Handle routine work yourself — reading, searching, listing, and small, well-scoped edits. If a task needs deep reasoning, a large or multi-file refactor, tricky debugging, or you find yourself stuck or uncertain, call the \`escalate\` tool with a brief reason to hand off to a more capable model. Prefer escalating early over guessing.`;
+
 const BASE_PERSONA = `You are a precise, autonomous coding agent operating in a terminal.
 
 Guidelines:
@@ -25,6 +29,10 @@ export function buildSystemPrompt(params: SystemPromptParams): string {
     `Available tools:\n${toolList}`,
   ];
 
+  if (params.escalation) {
+    sections.push(ESCALATION_GUIDANCE);
+  }
+
   if (params.projectContext.trim().length > 0) {
     sections.push(
       `Project-specific instructions (from the project's context file):\n\n${params.projectContext.trim()}`,
diff --git a/src/cli.ts b/src/cli.ts
index b664658..5c451aa 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -12,8 +12,8 @@ Usage:
   tiny-code [options]
 
 Options:
-  --provider <name>   anthropic | gemini (default: inferred from API keys)
-  --model <id>        Model id override
+  --provider <name>   anthropic | gemini | ollama (default: inferred from API keys)
+  --model <id>        Model id override (e.g. claude-opus-4-8, gemma3:12b)
   --config <path>     Path to a config JSON file
   -v, --version       Print version
   -h, --help          Show this help
@@ -21,6 +21,11 @@ Options:
 Environment:
   ANTHROPIC_API_KEY   Required for the Anthropic provider
   GEMINI_API_KEY      Required for the Gemini provider
+  TINY_CODE_OLLAMA_URL  Ollama OpenAI-compatible base URL (default http://localhost:11434/v1)
+
+Cost-saving: set "routing": "local-first" with an "escalateTo" target in your
+config to run cheap/local models by default and escalate heavy tasks. Run /costs
+in the session for usage and tips.
 `;
 
 function main(): void {
diff --git a/src/config/load.ts b/src/config/load.ts
index f71112b..5c4389a 100644
--- a/src/config/load.ts
+++ b/src/config/load.ts
@@ -3,8 +3,16 @@ import { homedir } from 'node:os';
 import { join } from 'node:path';
 import { z } from 'zod';
 
-export type Provider = 'anthropic' | 'gemini';
+export type Provider = 'anthropic' | 'gemini' | 'ollama';
 export type Effort = 'low' | 'medium' | 'high' | 'xhigh' | 'max';
+export type Routing = 'local-first' | 'off';
+
+/** A frontier model to escalate heavy tasks to under local-first routing. */
+export interface EscalateTarget {
+  provider: Provider;
+  model: string;
+  ollamaBaseUrl?: string | undefined;
+}
 
 /** Auto-approval rules that bypass the interactive permission prompt. */
 export interface AllowRules {
@@ -21,10 +29,16 @@ export interface ResolvedConfig {
   model: string;
   anthropicApiKey: string | undefined;
   geminiApiKey: string | undefined;
+  /** OpenAI-compatible base URL for the Ollama provider. */
+  ollamaBaseUrl: string;
   maxTokens: number;
   thinking: boolean;
   effort: Effort;
   maxIterations: number;
+  /** 'local-first' starts turns on the cheap model and escalates heavy ones. */
+  routing: Routing;
+  /** Frontier model heavy tasks escalate to (only used when routing is 'local-first'). */
+  escalateTo: EscalateTarget | undefined;
   commandDirs: string[];
   allow: AllowRules;
 }
@@ -38,16 +52,28 @@ export interface CliOverrides {
 const DEFAULT_MODELS: Record<Provider, string> = {
   anthropic: 'claude-opus-4-8',
   gemini: 'gemini-2.5-pro',
+  ollama: 'qwen2.5-coder:7b',
 };
 
+const DEFAULT_OLLAMA_URL = 'http://localhost:11434/v1';
+
+const EscalateTargetSchema = z.object({
+  provider: z.enum(['anthropic', 'gemini', 'ollama']),
+  model: z.string(),
+  ollamaBaseUrl: z.string().url().optional(),
+});
+
 const FileConfigSchema = z
   .object({
-    provider: z.enum(['anthropic', 'gemini']).optional(),
+    provider: z.enum(['anthropic', 'gemini', 'ollama']).optional(),
     model: z.string().optional(),
+    ollamaBaseUrl: z.string().url().optional(),
     maxTokens: z.number().int().positive().optional(),
     thinking: z.boolean().optional(),
     effort: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(),
     maxIterations: z.number().int().positive().optional(),
+    routing: z.enum(['local-first', 'off']).optional(),
+    escalateTo: EscalateTargetSchema.optional(),
     commandDirs: z.array(z.string()).optional(),
     allow: z
       .object({
@@ -98,6 +124,12 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
 
   const effort = (env.TINY_CODE_EFFORT as Effort | undefined) ?? file.effort ?? 'high';
 
+  const ollamaBaseUrl = env.TINY_CODE_OLLAMA_URL ?? file.ollamaBaseUrl ?? DEFAULT_OLLAMA_URL;
+
+  const escalateTo = file.escalateTo;
+  // Default to local-first whenever an escalation target is configured.
+  const routing: Routing = file.routing ?? (escalateTo ? 'local-first' : 'off');
+
   const defaultCommandDirs = [
     join(cwd, '.agent', 'commands'),
     join(home, '.config', 'tiny-code', 'commands'),
@@ -108,10 +140,13 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
     model,
     anthropicApiKey,
     geminiApiKey,
+    ollamaBaseUrl,
     maxTokens,
     thinking: file.thinking ?? true,
     effort,
     maxIterations: file.maxIterations ?? 50,
+    routing,
+    escalateTo,
     commandDirs: file.commandDirs ?? defaultCommandDirs,
     allow: {
       tools: file.allow?.tools ?? [],
diff --git a/src/index.ts b/src/index.ts
index 1215b85..1a8ce7f 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -8,19 +8,28 @@ export type { AgentUI, AgentLoopOptions } from './agent/loop.js';
 export { buildSystemPrompt } from './agent/systemPrompt.js';
 export type { SystemPromptParams } from './agent/systemPrompt.js';
 
-export { createProvider, AnthropicProvider, GeminiProvider } from './providers/index.js';
+export { createProvider, AnthropicProvider, GeminiProvider, OllamaProvider } from './providers/index.js';
 export type { ModelProvider, ProviderEvent, SendRequest, ToolSchema, Usage } from './providers/types.js';
+export { toOpenAiMessages, toOpenAiTools } from './providers/ollama.js';
+export { PRICING, estimateCost, formatUsd } from './providers/pricing.js';
+export type { ModelPricing } from './providers/pricing.js';
+
+export { classifyTurn } from './agent/router.js';
+export type { TaskWeight } from './agent/router.js';
+export { checkLocalModel, estimateModelRamGb, MODEL_RAM_GB } from './system/resources.js';
+export type { LocalModelCheck } from './system/resources.js';
 
 export { ALL_TOOLS, createRegistry, toJsonSchema } from './tools/registry.js';
 export type { ToolRegistry } from './tools/registry.js';
 export { defineTool } from './tools/types.js';
+export { escalateTool } from './tools/escalate.js';
 export type { Tool, ToolContext, ToolResult } from './tools/types.js';
 
 export { PermissionGate } from './permissions/gate.js';
 export type { PermissionPrompt, PermissionRequest, PermissionChoice } from './permissions/gate.js';
 
 export { loadConfig } from './config/load.js';
-export type { ResolvedConfig, CliOverrides, Provider, Effort, AllowRules } from './config/load.js';
+export type { ResolvedConfig, CliOverrides, Provider, Effort, AllowRules, Routing, EscalateTarget } from './config/load.js';
 export { loadProjectContext } from './config/context.js';
 
 export { loadCommands, renderCommand } from './commands/loader.js';
diff --git a/src/providers/index.ts b/src/providers/index.ts
index 3ac08ec..b37a4f1 100644
--- a/src/providers/index.ts
+++ b/src/providers/index.ts
@@ -2,10 +2,12 @@ import type { ModelProvider } from './types.js';
 import type { ResolvedConfig } from '../config/load.js';
 import { AnthropicProvider } from './anthropic.js';
 import { GeminiProvider } from './gemini.js';
+import { OllamaProvider } from './ollama.js';
 
 export type { ModelProvider, ProviderEvent, SendRequest, ToolSchema, Usage } from './types.js';
 export { AnthropicProvider } from './anthropic.js';
 export { GeminiProvider } from './gemini.js';
+export { OllamaProvider } from './ollama.js';
 
 /** Construct the configured provider, validating that its API key is present. */
 export function createProvider(config: ResolvedConfig): ModelProvider {
@@ -22,6 +24,11 @@ export function createProvider(config: ResolvedConfig): ModelProvider {
     });
   }
 
+  if (config.provider === 'ollama') {
+    // No API key required — Ollama runs locally.
+    return new OllamaProvider({ baseUrl: config.ollamaBaseUrl, model: config.model });
+  }
+
   if (!config.geminiApiKey) {
     throw new Error('GEMINI_API_KEY is not set. Export it or switch providers with --provider anthropic.');
   }
diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
new file mode 100644
index 0000000..8e2091f
--- /dev/null
+++ b/src/providers/ollama.ts
@@ -0,0 +1,203 @@
+import type { Message } from '../agent/types.js';
+import type { ModelProvider, ProviderEvent, SendRequest, ToolSchema } from './types.js';
+
+export interface OllamaProviderOptions {
+  /** OpenAI-compatible base URL, e.g. "http://localhost:11434/v1". */
+  baseUrl: string;
+  model: string;
+  /** Ignored by Ollama but required by the OpenAI wire format; defaults to "ollama". */
+  apiKey?: string;
+}
+
+interface OpenAiMessage {
+  role: 'system' | 'user' | 'assistant' | 'tool';
+  content: string;
+  tool_calls?: { id: string; type: 'function'; function: { name: string; arguments: string } }[];
+  tool_call_id?: string;
+}
+
+/**
+ * Translate internal messages into OpenAI chat messages (the shape Ollama's
+ * `/v1/chat/completions` endpoint accepts). Unlike Gemini, OpenAI correlates
+ * tool results to calls by `tool_call_id`, and our Anthropic-style ids survive
+ * the round trip — so no id synthesis is needed.
+ */
+export function toOpenAiMessages(messages: Message[]): OpenAiMessage[] {
+  const out: OpenAiMessage[] = [];
+  for (const m of messages) {
+    if (m.role === 'user') {
+      // A user turn may carry plain text and/or tool results; emit each result
+      // as its own `tool` message and gather any text into one user message.
+      let text = '';
+      for (const b of m.content) {
+        if (b.type === 'text') text += b.text;
+        else if (b.type === 'tool_result') {
+          out.push({ role: 'tool', tool_call_id: b.toolUseId, content: b.content });
+        }
+      }
+      if (text.length > 0) out.push({ role: 'user', content: text });
+      continue;
+    }
+
+    // assistant: merge text + tool_use into a single message
+    let text = '';
+    const toolCalls: NonNullable<OpenAiMessage['tool_calls']> = [];
+    for (const b of m.content) {
+      if (b.type === 'text') text += b.text;
+      else if (b.type === 'tool_use') {
+        toolCalls.push({
+          id: b.id,
+          type: 'function',
+          function: { name: b.name, arguments: JSON.stringify(b.input ?? {}) },
+        });
+      }
+    }
+    const msg: OpenAiMessage = { role: 'assistant', content: text };
+    if (toolCalls.length > 0) msg.tool_calls = toolCalls;
+    out.push(msg);
+  }
+  return out;
+}
+
+/** Translate normalized tool schemas into OpenAI's `tools` array. */
+export function toOpenAiTools(tools: ToolSchema[]): unknown[] {
+  return tools.map((t) => ({
+    type: 'function',
+    function: { name: t.name, description: t.description, parameters: t.jsonSchema },
+  }));
+}
+
+interface StreamChoice {
+  delta?: {
+    content?: string | null;
+    tool_calls?: {
+      index: number;
+      id?: string;
+      function?: { name?: string; arguments?: string };
+    }[];
+  };
+  finish_reason?: string | null;
+}
+
+interface StreamChunk {
+  choices?: StreamChoice[];
+  usage?: { prompt_tokens?: number; completion_tokens?: number } | null;
+}
+
+export class OllamaProvider implements ModelProvider {
+  readonly name = 'ollama' as const;
+  readonly model: string;
+  private readonly baseUrl: string;
+  private readonly apiKey: string;
+
+  constructor(opts: OllamaProviderOptions) {
+    this.baseUrl = opts.baseUrl.replace(/\/$/, '');
+    this.model = opts.model;
+    this.apiKey = opts.apiKey ?? 'ollama';
+  }
+
+  async *send(req: SendRequest): AsyncIterable<ProviderEvent> {
+    const messages: OpenAiMessage[] = [
+      { role: 'system', content: req.system },
+      ...toOpenAiMessages(req.messages),
+    ];
+
+    const body = {
+      model: this.model,
+      messages,
+      tools: req.tools.length > 0 ? toOpenAiTools(req.tools) : undefined,
+      stream: true,
+      stream_options: { include_usage: true },
+    };
+
+    let res: Response;
+    try {
+      res = await fetch(`${this.baseUrl}/chat/completions`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` },
+        body: JSON.stringify(body),
+      });
+    } catch (err) {
+      throw new Error(
+        `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${(err as Error).message})`,
+      );
+    }
+
+    if (!res.ok || !res.body) {
+      const detail = await res.text().catch(() => '');
+      throw new Error(`Ollama request failed (${res.status}): ${detail.slice(0, 200)}`);
+    }
+
+    // Accumulate tool calls by their streamed index; arguments arrive in fragments.
+    const calls = new Map<number, { id: string; name: string; args: string }>();
+    let usage = { inputTokens: 0, outputTokens: 0 };
+    let finish = 'stop';
+
+    for await (const chunk of parseSse(res.body)) {
+      const choice = chunk.choices?.[0];
+      if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content };
+
+      for (const tc of choice?.delta?.tool_calls ?? []) {
+        const acc = calls.get(tc.index) ?? { id: '', name: '', args: '' };
+        if (tc.id) acc.id = tc.id;
+        if (tc.function?.name) acc.name = tc.function.name;
+        if (tc.function?.arguments) acc.args += tc.function.arguments;
+        calls.set(tc.index, acc);
+      }
+
+      if (choice?.finish_reason) finish = choice.finish_reason;
+      if (chunk.usage) {
+        usage = {
+          inputTokens: chunk.usage.prompt_tokens ?? 0,
+          outputTokens: chunk.usage.completion_tokens ?? 0,
+        };
+      }
+    }
+
+    for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) {
+      let input: unknown = {};
+      try {
+        input = c.args.trim() ? JSON.parse(c.args) : {};
+      } catch {
+        // Small models occasionally emit malformed JSON; degrade gracefully.
+        input = {};
+      }
+      yield { type: 'tool_call', id: c.id || `ollama-call-${index}`, name: c.name, input };
+    }
+
+    yield {
+      type: 'done',
+      usage,
+      stopReason: calls.size > 0 ? 'tool_use' : finish,
+    };
+  }
+}
+
+/** Parse an SSE byte stream into decoded JSON chunks, skipping the `[DONE]` sentinel. */
+async function* parseSse(body: ReadableStream<Uint8Array>): AsyncIterable<StreamChunk> {
+  const decoder = new TextDecoder();
+  let buffer = '';
+  const reader = body.getReader();
+  try {
+    for (;;) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      let nl: number;
+      while ((nl = buffer.indexOf('\n')) !== -1) {
+        const line = buffer.slice(0, nl).trim();
+        buffer = buffer.slice(nl + 1);
+        if (!line.startsWith('data:')) continue;
+        const payload = line.slice(5).trim();
+        if (payload === '[DONE]' || payload.length === 0) continue;
+        try {
+          yield JSON.parse(payload) as StreamChunk;
+        } catch {
+          // Ignore partial/non-JSON keep-alive lines.
+        }
+      }
+    }
+  } finally {
+    reader.releaseLock();
+  }
+}
diff --git a/src/providers/pricing.ts b/src/providers/pricing.ts
new file mode 100644
index 0000000..28994bb
--- /dev/null
+++ b/src/providers/pricing.ts
@@ -0,0 +1,56 @@
+import type { Usage } from './types.js';
+
+/** Per-model token rates, in USD per 1M tokens. */
+export interface ModelPricing {
+  inputPerMTok: number; // USD per 1M input tokens
+  outputPerMTok: number; // USD per 1M output tokens
+}
+
+/**
+ * Approximate cloud pricing (USD per 1M tokens). These drift over time — edit
+ * this table to match current vendor pricing. Local models (Ollama) are
+ * intentionally absent: they have no per-token API cost, so {@link estimateCost}
+ * returns `null` for them and the UI reports "local (no API cost)".
+ */
+export const PRICING: Record<string, ModelPricing> = {
+  'claude-opus-4-8': { inputPerMTok: 15, outputPerMTok: 75 },
+  'claude-sonnet-4-6': { inputPerMTok: 3, outputPerMTok: 15 },
+  'claude-haiku-4-5': { inputPerMTok: 1, outputPerMTok: 5 },
+  'gemini-2.5-pro': { inputPerMTok: 1.25, outputPerMTok: 10 },
+  'gemini-2.5-flash': { inputPerMTok: 0.3, outputPerMTok: 2.5 },
+};
+
+/** Look up pricing (own table keys only), tolerating versioned suffixes (e.g. "claude-opus-4-8-20260101"). */
+function lookup(model: string): ModelPricing | undefined {
+  if (Object.prototype.hasOwnProperty.call(PRICING, model)) return PRICING[model]; // skip inherited keys
+  // Longest known prefix wins, so "gemini-2.5-pro-preview" matches "gemini-2.5-pro".
+  let best: ModelPricing | undefined;
+  let bestLen = 0;
+  for (const [key, price] of Object.entries(PRICING)) {
+    if (key.length > bestLen && model.startsWith(key)) {
+      best = price;
+      bestLen = key.length;
+    }
+  }
+  return best;
+}
+
+/**
+ * Price one turn's token usage in USD. A `null` result means the model has no
+ * known price (local/unpriced models), which signals "no API cost".
+ */
+export function estimateCost(model: string, usage: Usage): number | null {
+  const rate = lookup(model);
+  if (!rate) return null;
+  // Rates are quoted per million tokens, hence the division.
+  const inputUsd = (usage.inputTokens / 1_000_000) * rate.inputPerMTok;
+  const outputUsd = (usage.outputTokens / 1_000_000) * rate.outputPerMTok;
+  return inputUsd + outputUsd;
+}
+
+/** Render a USD amount: zero as "$0.00", amounts under one cent with 4 decimals, otherwise 2. */
+export function formatUsd(amount: number): string {
+  if (amount === 0) return '$0.00';
+  const digits = amount < 0.01 ? 4 : 2;
+  return `$${amount.toFixed(digits)}`;
+}
diff --git a/src/providers/types.ts b/src/providers/types.ts
index 45262f3..c18443e 100644
--- a/src/providers/types.ts
+++ b/src/providers/types.ts
@@ -34,7 +34,7 @@ export interface SendRequest {
  * {@link ProviderEvent}.
  */
 export interface ModelProvider {
-  readonly name: 'anthropic' | 'gemini';
+  readonly name: 'anthropic' | 'gemini' | 'ollama';
   readonly model: string;
   send(req: SendRequest): AsyncIterable<ProviderEvent>;
 }
diff --git a/src/repl.ts b/src/repl.ts
index c5ce62f..1672fdc 100644
--- a/src/repl.ts
+++ b/src/repl.ts
@@ -1,21 +1,49 @@
 import * as readline from 'node:readline';
 import pc from 'picocolors';
 import { createTerminalUI } from './ui/render.js';
+import type { TerminalUI } from './ui/render.js';
 import { AgentLoop } from './agent/loop.js';
 import { PermissionGate } from './permissions/gate.js';
 import type { PermissionPrompt } from './permissions/gate.js';
-import { createRegistry } from './tools/registry.js';
+import { ALL_TOOLS, createRegistry } from './tools/registry.js';
+import { escalateTool } from './tools/escalate.js';
 import { createProvider } from './providers/index.js';
+import { classifyTurn } from './agent/router.js';
+import { formatUsd } from './providers/pricing.js';
+import { checkLocalModel } from './system/resources.js';
 import { loadConfig } from './config/load.js';
-import type { CliOverrides } from './config/load.js';
+import type { CliOverrides, ResolvedConfig } from './config/load.js';
 import { loadProjectContext } from './config/context.js';
 import { buildSystemPrompt } from './agent/systemPrompt.js';
 import { loadCommands, renderCommand } from './commands/loader.js';
 import type { Command } from './commands/types.js';
 
+const COST_TIPS = [
+  'Let the local model handle searches, listing, and small edits; save the frontier model for heavy lifting.',
+  'Keep requests focused — narrow context means fewer input tokens.',
+  'For big refactors or tricky bugs, let routing escalate rather than forcing the local model.',
+  'Use smaller models (e.g. gemma3:4b, qwen2.5-coder:7b) for boilerplate; reserve 12B+ for reasoning.',
+  'Lower the Anthropic `effort` setting for simple tasks to cut output tokens.',
+];
+
+/** Print session token totals, estimated spend, the active routing, and cost-cutting tips (`/costs`). */
+function printCosts(ui: TerminalUI, config: ResolvedConfig): void {
+  const { inputTokens, outputTokens, cost } = ui.getTotals();
+  const primary = `${config.provider}:${config.model}`;
+  const target = config.routing === 'local-first' ? config.escalateTo : undefined;
+  const routing = target ? `local-first · ${primary} → ${target.provider}:${target.model}` : primary;
+  console.log(pc.bold('\nSession usage:'));
+  console.log(`  Tokens   ${inputTokens} in / ${outputTokens} out`);
+  console.log(`  Est cost ${formatUsd(cost)} (cloud turns only; local models are free)`);
+  console.log(`  Routing  ${routing}`);
+  console.log(pc.bold('\nTips to cut cost:'));
+  COST_TIPS.forEach((tip) => console.log(`  • ${pc.dim(tip)}`));
+}
+
 function printHelp(commands: Map<string, Command>): void {
   console.log(pc.bold('\nBuilt-in:'));
   console.log('  /help            Show this help');
+  console.log('  /costs           Show token usage, est. cost, and cost-saving tips');
   console.log('  /exit, /quit     Leave the session');
   if (commands.size > 0) {
     console.log(pc.bold('\nCustom commands:'));
@@ -30,9 +58,26 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
   const cwd = process.cwd();
   const config = loadConfig(overrides, cwd);
   const provider = createProvider(config); // throws with a clear message if the API key is missing
-  const registry = createRegistry();
+
+  // Local-first routing: build the frontier provider and expose the `escalate` tool.
+  const localFirst = config.routing === 'local-first' && config.escalateTo !== undefined;
+  const escalationProvider = localFirst
+    ? createProvider({
+        ...config,
+        provider: config.escalateTo!.provider,
+        model: config.escalateTo!.model,
+        ollamaBaseUrl: config.escalateTo!.ollamaBaseUrl ?? config.ollamaBaseUrl,
+      })
+    : undefined;
+
+  const registry = createRegistry(localFirst ? [...ALL_TOOLS, escalateTool] : undefined);
   const projectContext = loadProjectContext(cwd);
-  const system = buildSystemPrompt({ cwd, projectContext, tools: registry.toSchemas() });
+  const system = buildSystemPrompt({
+    cwd,
+    projectContext,
+    tools: registry.toSchemas(),
+    escalation: localFirst,
+  });
   const commands = loadCommands(config.commandDirs);
 
   const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
@@ -49,7 +94,7 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     });
 
   const gate = new PermissionGate(config.allow, prompt);
-  const ui = createTerminalUI();
+  const ui = createTerminalUI({ model: provider.model, provider: provider.name });
   const agent = new AgentLoop({
     provider,
     registry,
@@ -58,15 +103,39 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     ui,
     cwd,
     maxIterations: config.maxIterations,
+    escalationProvider,
+    router: localFirst ? classifyTurn : undefined,
   });
 
+  const routeNote = localFirst
+    ? pc.dim(` → escalates to ${config.escalateTo!.provider}:${config.escalateTo!.model}`)
+    : '';
   console.log(
-    pc.bold('tiny-code') + pc.dim(` · ${provider.name}:${provider.model} · ${cwd}`),
+    pc.bold('tiny-code') + pc.dim(` · ${provider.name}:${provider.model} · ${cwd}`) + routeNote,
   );
+
+  // Compute-cost advisory for local models: does this machine have the RAM?
+  if (provider.name === 'ollama') {
+    const check = checkLocalModel(provider.model);
+    const ramLine = `~${check.needGb}GB needed · ${check.freeGb}GB free / ${check.totalGb}GB total`;
+    if (check.warn) {
+      console.log(
+        pc.yellow(`⚠ ${provider.model} may exceed available memory (${ramLine}). Expect slow or failed runs.`),
+      );
+    } else {
+      console.log(pc.dim(`Local model: ${ramLine}. No API cost.`));
+    }
+    if (check.toolCallRisk) {
+      console.log(
+        pc.yellow('⚠ Small models (≤3B) often tool-call unreliably; prefer gemma3:4b+ or qwen2.5-coder:7b for agentic work.'),
+      );
+    }
+  }
+
   if (projectContext.trim().length > 0) {
     console.log(pc.dim('Loaded project context.'));
   }
-  console.log(pc.dim('Type a request, /help for commands, /exit to quit.'));
+  console.log(pc.dim('Type a request, /help for commands, /costs for usage, /exit to quit.'));
 
   const handle = async (line: string): Promise<void> => {
     const input = line.trim();
@@ -83,6 +152,11 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
       ask();
       return;
     }
+    if (input === '/costs') {
+      printCosts(ui, config);
+      ask();
+      return;
+    }
 
     let userMessage = input;
     if (input.startsWith('/')) {
diff --git a/src/system/resources.ts b/src/system/resources.ts
new file mode 100644
index 0000000..f5030d4
--- /dev/null
+++ b/src/system/resources.ts
@@ -0,0 +1,71 @@
+import { totalmem, freemem } from 'node:os';
+
+/**
+ * Approximate memory needed to run common local models at ~Q4 quantization,
+ * in GB (weights + a modest KV cache / runtime overhead). These are guidelines
+ * for the startup advisory, not exact figures — long contexts need more.
+ */
+export const MODEL_RAM_GB: Record<string, number> = {
+  'gemma3:1b': 1,
+  'gemma3:4b': 3,
+  'gemma3:12b': 7,
+  'gemma3:27b': 16,
+  'qwen2.5-coder:1.5b': 2,
+  'qwen2.5-coder:7b': 5,
+  'qwen2.5-coder:14b': 9,
+  'qwen2.5-coder:32b': 18,
+  'llama3.2:3b': 3,
+  'llama3.1:8b': 6,
+};
+
+const GB = 1024 ** 3;
+
+/** Extract the parameter count, in billions, from a tag such as "gemma3:12b"; undefined if absent. */
+export function parseParamsB(model: string): number | undefined {
+  const size = /(\d+(?:\.\d+)?)\s*b\b/i.exec(model)?.[1];
+  return size === undefined ? undefined : Number(size);
+}
+
+/** Estimate RAM (GB) for a model: explicit table first, else a size-based guess (4 GB when no size parses). */
+export function estimateModelRamGb(model: string): number {
+  const known = MODEL_RAM_GB[model.toLowerCase()];
+  if (typeof known === 'number') return known; // rejects inherited members, e.g. "constructor"
+  const params = parseParamsB(model);
+  // ~0.6 GB per billion params at Q4, plus ~1.5 GB runtime/KV-cache overhead.
+  return params !== undefined ? Math.round(params * 0.6 + 1.5) : 4;
+}
+
+export interface LocalModelCheck {
+  needGb: number; // estimated model footprint, in GB
+  totalGb: number; // host total memory in GiB, rounded to one decimal
+  freeGb: number; // host free memory in GiB, rounded to one decimal
+  /** True when the model likely won't fit comfortably in free memory. */
+  warn: boolean;
+  /** True for small models (≤3B) that tool-call unreliably. */
+  toolCallRisk: boolean;
+}
+
+/**
+ * Size a local model against the host's memory. `mem` (in bytes) defaults to
+ * live `os` readings; pass explicit values to get a deterministic result in tests.
+ */
+export function checkLocalModel(
+  model: string,
+  mem: { total: number; free: number } = { total: totalmem(), free: freemem() },
+): LocalModelCheck {
+  const needGb = estimateModelRamGb(model);
+  const freeGb = mem.free / GB;
+  const params = parseParamsB(model);
+  const toolCallRisk = params !== undefined && params <= 3;
+  return {
+    needGb,
+    totalGb: round1(mem.total / GB),
+    freeGb: round1(freeGb),
+    warn: needGb > freeGb, // compared against the unrounded reading
+    toolCallRisk,
+  };
+}
+
+function round1(n: number): number { // round to one decimal place
+  return Math.round(n * 10) / 10;
+}
diff --git a/src/tools/escalate.ts b/src/tools/escalate.ts
new file mode 100644
index 0000000..2214364
--- /dev/null
+++ b/src/tools/escalate.ts
@@ -0,0 +1,26 @@
+import { z } from 'zod';
+import { defineTool } from './types.js';
+
+/**
+ * Handoff marker for local-first routing. The tool does no work of its own: it
+ * accepts a short `reason` and returns an acknowledgement string. The agent
+ * loop, not this tool, is what reacts to the call and continues the turn on
+ * the configured frontier model. Intended for cases where the cheap/local
+ * model faces a large or multi-file refactor, tricky debugging, or is stuck.
+ * Declared non-mutating.
+ */
+export const escalateTool = defineTool({
+  name: 'escalate',
+  description:
+    'Hand off the current task to a more capable model when it needs deep reasoning, a large or multi-file change, tricky debugging, or you are stuck. Prefer escalating early over guessing. Provide a brief reason.',
+  mutating: false,
+  schema: z.object({
+    reason: z.string().describe('A brief reason the task needs a more capable model.'),
+  }),
+  // Acknowledge only: the result echoes the caller's reason and
+  // carries a fixed 'escalating' summary.
+  async execute({ reason }) {
+    const output = `Escalation acknowledged (${reason}). A more capable model will continue this task.`;
+    return { output, summary: 'escalating' };
+  },
+});
diff --git a/src/ui/render.ts b/src/ui/render.ts
index 1e86808..c93980c 100644
--- a/src/ui/render.ts
+++ b/src/ui/render.ts
@@ -1,6 +1,8 @@
 import pc from 'picocolors';
 import type { AgentUI } from '../agent/loop.js';
 import type { ToolResult } from '../tools/types.js';
+import type { Usage } from '../providers/types.js';
+import { estimateCost, formatUsd } from '../providers/pricing.js';
 
 function preview(name: string, input: unknown): string {
   const obj = (input ?? {}) as Record<string, unknown>;
@@ -15,9 +17,36 @@ function truncate(s: string, n: number): string {
   return oneLine.length > n ? `${oneLine.slice(0, n)}…` : oneLine;
 }
 
+/** Abbreviate a token count: 1000 and above render as thousands with one decimal (1234 -> "1.2k"). */
+function fmtTokens(n: number): string {
+  return n >= 1000 ? `${(n / 1000).toFixed(1)}k` : `${n}`;
+}
+
+export interface SessionTotals {
+  inputTokens: number;
+  outputTokens: number;
+  /** Accumulated USD across priced (cloud) turns. */
+  cost: number;
+}
+
+export interface TerminalUI extends AgentUI {
+  /** Cumulative token + cost totals for the session (used by /costs). */
+  getTotals(): SessionTotals;
+}
+
+export interface TerminalUIOptions {
+  /** Default model id, used to price usage when the loop doesn't supply one. */
+  model?: string;
+  provider?: string;
+  /** Print the per-turn usage line. Default true; set false to stay silent. */
+  showUsage?: boolean;
+}
+
 /** Minimal streaming UI: assistant text inline, compact colored tool summaries. */
-export function createTerminalUI(): AgentUI {
+export function createTerminalUI(opts: TerminalUIOptions = {}): TerminalUI {
+  const showUsage = opts.showUsage ?? true;
   let atLineStart = true;
+  const totals: SessionTotals = { inputTokens: 0, outputTokens: 0, cost: 0 };
 
   const write = (s: string): void => {
     if (s.length === 0) return;
@@ -46,8 +75,24 @@ export function createTerminalUI(): AgentUI {
       ensureNewline();
       write(pc.yellow(`  ⊘ ${name} denied\n`));
     },
-    onUsage() {
-      // Token usage is available here; kept silent to reduce noise in the MVP.
+    onUsage(usage: Usage, model?: string) {
+      totals.inputTokens += usage.inputTokens;
+      totals.outputTokens += usage.outputTokens;
+      const cost = estimateCost(model ?? opts.model ?? '', usage);
+      if (cost !== null) totals.cost += cost;
+
+      if (!showUsage) return;
+      ensureNewline();
+      const tokens = `${fmtTokens(usage.inputTokens)} in / ${fmtTokens(usage.outputTokens)} out`;
+      const money =
+        cost !== null
+          ? `${formatUsd(cost)} turn · ${formatUsd(totals.cost)} session`
+          : 'local (no API cost)';
+      write(pc.dim(`· ${tokens} · ${money}\n`));
+    },
+    onRoute(provider, model, reason) {
+      ensureNewline();
+      write(pc.yellow(`↑ escalated to ${provider}:${model} (${reason})\n`));
     },
     onAssistantEnd() {
       ensureNewline();
@@ -56,5 +101,8 @@ export function createTerminalUI(): AgentUI {
       ensureNewline();
       write(pc.yellow('[Reached max iterations — stopping]\n'));
     },
+    getTotals() {
+      return { ...totals };
+    },
   };
 }
diff --git a/tests/agent/loop.test.ts b/tests/agent/loop.test.ts
index da380d5..63aa615 100644
--- a/tests/agent/loop.test.ts
+++ b/tests/agent/loop.test.ts
@@ -4,6 +4,7 @@ import { AgentLoop } from '../../src/agent/loop.js';
 import type { AgentUI } from '../../src/agent/loop.js';
 import { createRegistry } from '../../src/tools/registry.js';
 import { defineTool } from '../../src/tools/types.js';
+import { escalateTool } from '../../src/tools/escalate.js';
 import { PermissionGate } from '../../src/permissions/gate.js';
 import type { PermissionChoice } from '../../src/permissions/gate.js';
 import type { ModelProvider, ProviderEvent, SendRequest } from '../../src/providers/types.js';
@@ -15,11 +16,18 @@ const DONE: ProviderEvent = {
 };
 
 class ScriptedProvider implements ModelProvider {
-  readonly name = 'anthropic' as const;
-  readonly model = 'fake';
+  readonly name: 'anthropic' | 'gemini' | 'ollama';
+  readonly model: string;
   readonly sent: SendRequest[] = [];
 
-  constructor(private readonly turns: ProviderEvent[][]) {}
+  constructor(
+    private readonly turns: ProviderEvent[][],
+    model = 'fake',
+    name: 'anthropic' | 'gemini' | 'ollama' = 'anthropic',
+  ) {
+    this.model = model;
+    this.name = name;
+  }
 
   async *send(req: SendRequest): AsyncIterable<ProviderEvent> {
     this.sent.push(req);
@@ -36,6 +44,7 @@ function recordingUI(): { ui: AgentUI; events: string[] } {
     onToolResult: (n, r) => events.push(`result:${n}:${r.output}:${r.isError ?? false}`),
     onToolDenied: (n) => events.push(`denied:${n}`),
     onUsage: () => events.push('usage'),
+    onRoute: (p, m, r) => events.push(`route:${p}:${m}:${r}`),
     onAssistantEnd: () => events.push('assistantEnd'),
     onMaxIterations: () => events.push('maxIter'),
   };
@@ -165,6 +174,91 @@ describe('AgentLoop', () => {
     }
   });
 
+  it('routes a heavy turn to the escalation provider up front', async () => {
+    const local = new ScriptedProvider([[{ type: 'text', delta: 'local' }, DONE]], 'local');
+    const frontier = new ScriptedProvider(
+      [[{ type: 'text', delta: 'frontier' }, DONE]],
+      'big',
+      'anthropic',
+    );
+    const { ui, events } = recordingUI();
+    const loop = new AgentLoop({
+      provider: local,
+      registry,
+      gate: gateWith('yes'),
+      ui,
+      system: 'sys',
+      cwd: process.cwd(),
+      escalationProvider: frontier,
+      router: () => 'heavy',
+    });
+    await loop.run('refactor everything');
+
+    expect(frontier.sent).toHaveLength(1);
+    expect(local.sent).toHaveLength(0);
+    expect(events).toContain('route:anthropic:big:heavy task');
+  });
+
+  it('keeps a light turn on the local provider', async () => {
+    const local = new ScriptedProvider([[{ type: 'text', delta: 'local' }, DONE]], 'local');
+    const frontier = new ScriptedProvider([[DONE]], 'big');
+    const { ui, events } = recordingUI();
+    const loop = new AgentLoop({
+      provider: local,
+      registry,
+      gate: gateWith('yes'),
+      ui,
+      system: 'sys',
+      cwd: process.cwd(),
+      escalationProvider: frontier,
+      router: () => 'light',
+    });
+    await loop.run('list files');
+
+    expect(local.sent).toHaveLength(1);
+    expect(frontier.sent).toHaveLength(0);
+    expect(events).not.toContain('route:anthropic:big:heavy task');
+  });
+
+  it('escalates mid-turn when the local model calls the escalate tool', async () => {
+    const escalateRegistry = createRegistry([echoTool, escalateTool]);
+    const local = new ScriptedProvider(
+      [[{ type: 'tool_call', id: 'e1', name: 'escalate', input: { reason: 'too hard' } }, DONE]],
+      'local',
+    );
+    const frontier = new ScriptedProvider(
+      [[{ type: 'text', delta: 'handled' }, DONE]],
+      'big',
+      'anthropic',
+    );
+    const { ui, events } = recordingUI();
+    const loop = new AgentLoop({
+      provider: local,
+      registry: escalateRegistry,
+      gate: gateWith('yes'),
+      ui,
+      system: 'sys',
+      cwd: process.cwd(),
+      escalationProvider: frontier,
+      router: () => 'light',
+    });
+    await loop.run('start small then get stuck');
+
+    // First send on local, second (post-escalation) on frontier.
+    expect(local.sent).toHaveLength(1);
+    expect(frontier.sent).toHaveLength(1);
+    expect(events).toContain('route:anthropic:big:requested by model');
+    expect(events).toContain('text:handled');
+  });
+
+  it('behaves as a single provider when no escalation is configured', async () => {
+    const provider = new ScriptedProvider([[{ type: 'text', delta: 'hi' }, DONE]]);
+    const { ui, events } = recordingUI();
+    await makeLoop(provider, ui, gateWith('yes')).run('refactor the whole codebase');
+    expect(provider.sent).toHaveLength(1);
+    expect(events).not.toContain('route:anthropic:big:heavy task');
+  });
+
   it('stops at the iteration guard when tools never stop', async () => {
     const looping: ProviderEvent[][] = [];
     for (let i = 0; i < 10; i += 1) {
diff --git a/tests/agent/router.test.ts b/tests/agent/router.test.ts
new file mode 100644
index 0000000..a409b4c
--- /dev/null
+++ b/tests/agent/router.test.ts
@@ -0,0 +1,22 @@
+import { describe, it, expect } from 'vitest';
+import { classifyTurn } from '../../src/agent/router.js';
+
+describe('classifyTurn', () => {
+  it('treats simple lookups and small edits as light', () => {
+    expect(classifyTurn('list the files in src')).toBe('light');
+    expect(classifyTurn('what does this function return?')).toBe('light');
+    expect(classifyTurn('rename foo to bar in utils.ts')).toBe('light');
+  });
+
+  it('flags reasoning-heavy keywords as heavy', () => {
+    expect(classifyTurn('refactor the provider layer')).toBe('heavy');
+    expect(classifyTurn('debug why the stream hangs')).toBe('heavy');
+    expect(classifyTurn('design a caching architecture')).toBe('heavy');
+    expect(classifyTurn('implement retthrough retries')).toBe('heavy');
+  });
+
+  it('flags multi-file and long requests as heavy', () => {
+    expect(classifyTurn('update a.ts, b.ts, and c.ts to match')).toBe('heavy');
+    expect(classifyTurn('x'.repeat(700))).toBe('heavy');
+  });
+});
diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts
index f1a5829..fc077f7 100644
--- a/tests/config/load.test.ts
+++ b/tests/config/load.test.ts
@@ -11,6 +11,7 @@ const ENV_KEYS = [
   'TINY_CODE_MODEL',
   'TINY_CODE_MAX_TOKENS',
   'TINY_CODE_EFFORT',
+  'TINY_CODE_OLLAMA_URL',
   'HOME',
 ];
 
@@ -85,4 +86,38 @@ describe('loadConfig', () => {
     const cfg = loadConfig({}, cwd);
     expect(cfg.model).toBe('from-env');
   });
+
+  it('supports the ollama provider with its default model and base URL', () => {
+    const cfg = loadConfig({ provider: 'ollama' }, cwd);
+    expect(cfg.provider).toBe('ollama');
+    expect(cfg.model).toBe('qwen2.5-coder:7b');
+    expect(cfg.ollamaBaseUrl).toBe('http://localhost:11434/v1');
+  });
+
+  it('honors TINY_CODE_OLLAMA_URL over the default', () => {
+    process.env.TINY_CODE_OLLAMA_URL = 'http://gpu-box:11434/v1';
+    const cfg = loadConfig({ provider: 'ollama' }, cwd);
+    expect(cfg.ollamaBaseUrl).toBe('http://gpu-box:11434/v1');
+  });
+
+  it('defaults routing to local-first when an escalateTo target is configured', async () => {
+    await writeFile(
+      join(cwd, 'tiny-code.config.json'),
+      JSON.stringify({
+        provider: 'ollama',
+        model: 'gemma3:12b',
+        escalateTo: { provider: 'anthropic', model: 'claude-opus-4-8' },
+      }),
+    );
+    const cfg = loadConfig({}, cwd);
+    expect(cfg.routing).toBe('local-first');
+    expect(cfg.escalateTo).toEqual({ provider: 'anthropic', model: 'claude-opus-4-8' });
+  });
+
+  it('defaults routing to off with no escalateTo target', () => {
+    process.env.ANTHROPIC_API_KEY = 'sk-test';
+    const cfg = loadConfig({}, cwd);
+    expect(cfg.routing).toBe('off');
+    expect(cfg.escalateTo).toBeUndefined();
+  });
 });
diff --git a/tests/providers/ollamaSend.test.ts b/tests/providers/ollamaSend.test.ts
new file mode 100644
index 0000000..745354f
--- /dev/null
+++ b/tests/providers/ollamaSend.test.ts
@@ -0,0 +1,90 @@
+import { describe, it, expect, vi, afterEach } from 'vitest';
+import { OllamaProvider } from '../../src/providers/ollama.js';
+import type { ProviderEvent } from '../../src/providers/types.js';
+
+/** Build a fake SSE Response body from a list of OpenAI-style chunks. */
+function sseResponse(chunks: unknown[]): Response {
+  const lines = chunks.map((c) => `data: ${JSON.stringify(c)}\n\n`).concat('data: [DONE]\n\n');
+  const stream = new ReadableStream<Uint8Array>({
+    start(controller) {
+      const enc = new TextEncoder();
+      for (const line of lines) controller.enqueue(enc.encode(line));
+      controller.close();
+    },
+  });
+  return new Response(stream, { status: 200, headers: { 'Content-Type': 'text/event-stream' } });
+}
+
+afterEach(() => vi.restoreAllMocks());
+
+async function collect(provider: OllamaProvider): Promise<ProviderEvent[]> {
+  const events: ProviderEvent[] = [];
+  for await (const e of provider.send({
+    system: 's',
+    messages: [{ role: 'user', content: [{ type: 'text', text: 'go' }] }],
+    tools: [{ name: 'ls', description: 'list', jsonSchema: { type: 'object' } }],
+  })) {
+    events.push(e);
+  }
+  return events;
+}
+
+describe('OllamaProvider.send', () => {
+  it('maps streamed deltas into text, tool_call, and done events', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue(
+      sseResponse([
+        { choices: [{ delta: { content: 'Hel' } }] },
+        { choices: [{ delta: { content: 'lo' } }] },
+        {
+          choices: [
+            {
+              delta: { tool_calls: [{ index: 0, id: 'c1', function: { name: 'ls', arguments: '{"path":' } }] },
+            },
+          ],
+        },
+        {
+          choices: [{ delta: { tool_calls: [{ index: 0, function: { arguments: '"."}' } }] }, finish_reason: 'tool_calls' }],
+        },
+        { choices: [], usage: { prompt_tokens: 11, completion_tokens: 7 } },
+      ]),
+    );
+
+    const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'qwen2.5-coder:7b' });
+    const events = await collect(provider);
+
+    const text = events.filter((e) => e.type === 'text').map((e) => (e as { delta: string }).delta);
+    expect(text.join('')).toBe('Hello');
+
+    const call = events.find((e) => e.type === 'tool_call');
+    expect(call).toMatchObject({ type: 'tool_call', id: 'c1', name: 'ls', input: { path: '.' } });
+
+    const done = events.find((e) => e.type === 'done');
+    expect(done).toMatchObject({
+      type: 'done',
+      stopReason: 'tool_use',
+      usage: { inputTokens: 11, outputTokens: 7 },
+    });
+  });
+
+  it('degrades to empty input on malformed tool-call JSON', async () => {
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue(
+      sseResponse([
+        {
+          choices: [
+            { delta: { tool_calls: [{ index: 0, id: 'c1', function: { name: 'ls', arguments: '{bad' } }] } },
+          ],
+        },
+      ]),
+    );
+    const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });
+    const events = await collect(provider);
+    const call = events.find((e) => e.type === 'tool_call');
+    expect(call).toMatchObject({ name: 'ls', input: {} });
+  });
+
+  it('throws a helpful error when Ollama is unreachable', async () => {
+    vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNREFUSED'));
+    const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });
+    await expect(collect(provider)).rejects.toThrow(/Cannot reach Ollama/);
+  });
+});
diff --git a/tests/providers/pricing.test.ts b/tests/providers/pricing.test.ts
new file mode 100644
index 0000000..d4b3fac
--- /dev/null
+++ b/tests/providers/pricing.test.ts
@@ -0,0 +1,28 @@
+import { describe, it, expect } from 'vitest';
+import { estimateCost, formatUsd } from '../../src/providers/pricing.js';
+
+describe('estimateCost', () => {
+  it('prices a known cloud model from input/output tokens', () => {
+    // opus 4.8: $15/M in, $75/M out -> 1M in + 1M out = $90
+    const cost = estimateCost('claude-opus-4-8', { inputTokens: 1_000_000, outputTokens: 1_000_000 });
+    expect(cost).toBeCloseTo(90, 5);
+  });
+
+  it('matches versioned model ids by prefix', () => {
+    const cost = estimateCost('gemini-2.5-pro-preview', { inputTokens: 1_000_000, outputTokens: 0 });
+    expect(cost).toBeCloseTo(1.25, 5);
+  });
+
+  it('returns null for local/unpriced models', () => {
+    expect(estimateCost('gemma3:12b', { inputTokens: 1000, outputTokens: 1000 })).toBeNull();
+    expect(estimateCost('qwen2.5-coder:7b', { inputTokens: 1000, outputTokens: 1000 })).toBeNull();
+  });
+});
+
+describe('formatUsd', () => {
+  it('uses extra precision for tiny amounts', () => {
+    expect(formatUsd(0)).toBe('$0.00');
+    expect(formatUsd(0.0012)).toBe('$0.0012');
+    expect(formatUsd(1.234)).toBe('$1.23');
+  });
+});
diff --git a/tests/providers/translate.test.ts b/tests/providers/translate.test.ts
index 58fd9fc..0a55560 100644
--- a/tests/providers/translate.test.ts
+++ b/tests/providers/translate.test.ts
@@ -1,6 +1,7 @@
 import { describe, it, expect } from 'vitest';
 import { toAnthropicMessages } from '../../src/providers/anthropic.js';
 import { toGeminiContents } from '../../src/providers/gemini.js';
+import { toOpenAiMessages, toOpenAiTools } from '../../src/providers/ollama.js';
 import type { Message } from '../../src/agent/types.js';
 
 const conversation: Message[] = [
@@ -68,3 +69,25 @@ describe('toGeminiContents', () => {
     expect(out[0]!.parts).toHaveLength(0);
   });
 });
+
+describe('toOpenAiMessages', () => {
+  it('maps roles and correlates tool results by tool_call_id', () => {
+    const out = toOpenAiMessages(conversation);
+
+    const assistant = out.find((m) => m.role === 'assistant')!;
+    expect(assistant.tool_calls?.[0]).toMatchObject({
+      id: 'call-1',
+      type: 'function',
+      function: { name: 'ls' },
+    });
+    expect(assistant.tool_calls?.[0]!.function.arguments).toBe(JSON.stringify({ path: '.' }));
+
+    const toolMsg = out.find((m) => m.role === 'tool')!;
+    expect(toolMsg).toMatchObject({ role: 'tool', tool_call_id: 'call-1', content: 'a.txt' });
+  });
+
+  it('produces a function tool array', () => {
+    const tools = toOpenAiTools([{ name: 'ls', description: 'list', jsonSchema: { type: 'object' } }]);
+    expect(tools[0]).toMatchObject({ type: 'function', function: { name: 'ls', description: 'list' } });
+  });
+});
diff --git a/tests/system/resources.test.ts b/tests/system/resources.test.ts
new file mode 100644
index 0000000..af55481
--- /dev/null
+++ b/tests/system/resources.test.ts
@@ -0,0 +1,32 @@
+import { describe, it, expect } from 'vitest';
+import { checkLocalModel, estimateModelRamGb, parseParamsB } from '../../src/system/resources.js';
+
+const GB = 1024 ** 3;
+
+describe('parseParamsB / estimateModelRamGb', () => {
+  it('extracts billions of params from a tag', () => {
+    expect(parseParamsB('gemma3:12b')).toBe(12);
+    expect(parseParamsB('qwen2.5-coder:1.5b')).toBe(1.5);
+    expect(parseParamsB('mystery-model')).toBeUndefined();
+  });
+
+  it('uses the explicit table when available, else a size-based estimate', () => {
+    expect(estimateModelRamGb('gemma3:12b')).toBe(7);
+    // unknown 20b model -> 20*0.6 + 1.5 ~= 14 (rounded)
+    expect(estimateModelRamGb('something:20b')).toBe(Math.round(20 * 0.6 + 1.5));
+  });
+});
+
+describe('checkLocalModel', () => {
+  it('warns when the model needs more than the free memory', () => {
+    const check = checkLocalModel('gemma3:27b', { total: 8 * GB, free: 4 * GB }); // ~16GB
+    expect(check.warn).toBe(true);
+    expect(check.needGb).toBe(16);
+  });
+
+  it('does not warn when there is ample free memory and flags small-model tool risk', () => {
+    const check = checkLocalModel('gemma3:1b', { total: 64 * GB, free: 48 * GB });
+    expect(check.warn).toBe(false);
+    expect(check.toolCallRisk).toBe(true);
+  });
+});
diff --git a/tests/ui/render.test.ts b/tests/ui/render.test.ts
index 3160e9a..bc368bc 100644
--- a/tests/ui/render.test.ts
+++ b/tests/ui/render.test.ts
@@ -48,4 +48,42 @@ describe('createTerminalUI', () => {
     expect(out).toContain('src/x.ts');
     expect(out).toContain('**/*.ts');
   });
+
+  it('shows a cost line for cloud models and accumulates session totals', () => {
+    const out = capture(() => {
+      const ui = createTerminalUI({ model: 'claude-opus-4-8' });
+      ui.onUsage({ inputTokens: 1000, outputTokens: 1000 });
+      expect(ui.getTotals().inputTokens).toBe(1000);
+      expect(ui.getTotals().cost).toBeGreaterThan(0);
+    });
+    expect(out).toContain('1.0k in / 1.0k out');
+    expect(out).toContain('session');
+  });
+
+  it('labels local models as having no API cost', () => {
+    const out = capture(() => {
+      const ui = createTerminalUI({ model: 'qwen2.5-coder:7b', provider: 'ollama' });
+      ui.onUsage({ inputTokens: 500, outputTokens: 200 });
+      expect(ui.getTotals().cost).toBe(0);
+    });
+    expect(out).toContain('local (no API cost)');
+  });
+
+  it('stays silent when showUsage is false but still tracks totals', () => {
+    const out = capture(() => {
+      const ui = createTerminalUI({ model: 'claude-opus-4-8', showUsage: false });
+      ui.onUsage({ inputTokens: 100, outputTokens: 100 });
+      expect(ui.getTotals().inputTokens).toBe(100);
+    });
+    expect(out).toBe('');
+  });
+
+  it('renders an escalation route line', () => {
+    const out = capture(() => {
+      const ui = createTerminalUI();
+      ui.onRoute('anthropic', 'claude-opus-4-8', 'heavy task');
+    });
+    expect(out).toContain('escalated to anthropic:claude-opus-4-8');
+    expect(out).toContain('heavy task');
+  });
 });

From 79d2536a5bfc026e05ccc1554fdb2fee4fb6b40b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 9 Jun 2026 14:05:58 +0000
Subject: [PATCH 02/14] Abstract model-selection policy into a
 ModelDecisionEngine

Pull the scattered routing logic (classifyTurn, escalate-tool handoff,
stuck-detection) out of AgentLoop into a cohesive, injectable
ModelDecisionEngine interface. AgentLoop now keeps only mechanism and
delegates all "which model" decisions to the engine.

- Add src/agent/decision/{types,localFirst,index}.ts: the engine
  interface plus LocalFirstModelEngine, the default policy. It reproduces
  prior behavior and adds compute awareness (routes a local model that
  won't fit RAM to the frontier) and cost awareness (costNote reporting).
- AgentLoop takes an optional `engine` instead of `escalationProvider` +
  `router`; routing: 'off' injects no engine (single-provider path
  unchanged).
- Wire the engine in repl.ts; export the new types from the public API.
- Adapt loop tests and add unit tests for the engine.

https://claude.ai/code/session_01T4UTQD35m11g4ChB8Cjd1w
---
 src/agent/decision/index.ts             |   3 +
 src/agent/decision/localFirst.ts        |  87 ++++++++++++++
 src/agent/decision/types.ts             |  47 ++++++++
 src/agent/loop.ts                       |  76 ++++++------
 src/index.ts                            |   2 +
 src/repl.ts                             |  12 +-
 tests/agent/decision/localFirst.test.ts | 150 ++++++++++++++++++++++++
 tests/agent/loop.test.ts                |  36 +++++-
 8 files changed, 367 insertions(+), 46 deletions(-)
 create mode 100644 src/agent/decision/index.ts
 create mode 100644 src/agent/decision/localFirst.ts
 create mode 100644 src/agent/decision/types.ts
 create mode 100644 tests/agent/decision/localFirst.test.ts

diff --git a/src/agent/decision/index.ts b/src/agent/decision/index.ts
new file mode 100644
index 0000000..ea77556
--- /dev/null
+++ b/src/agent/decision/index.ts
@@ -0,0 +1,3 @@
+export type { ModelDecisionEngine, RouteDecision, TurnSignals } from './types.js';
+export { LocalFirstModelEngine } from './localFirst.js';
+export type { LocalFirstOptions } from './localFirst.js';
diff --git a/src/agent/decision/localFirst.ts b/src/agent/decision/localFirst.ts
new file mode 100644
index 0000000..4d60c2b
--- /dev/null
+++ b/src/agent/decision/localFirst.ts
@@ -0,0 +1,87 @@
+import type { ModelProvider } from '../../providers/types.js';
+import { classifyTurn, type TaskWeight } from '../router.js';
+import { checkLocalModel } from '../../system/resources.js';
+import { estimateCost } from '../../providers/pricing.js';
+import type { ModelDecisionEngine, RouteDecision, TurnSignals } from './types.js';
+
+export interface LocalFirstOptions {
+  /** The cheap/local model that handles turns by default. */
+  primary: ModelProvider;
+  /** The frontier model heavy/stuck turns escalate to. */
+  escalation: ModelProvider;
+  /** Task-weight classifier. Defaults to {@link classifyTurn}. */
+  classify?: (input: string) => TaskWeight;
+  /** Consecutive tool-error iterations before auto-escalating. Defaults to 3. */
+  stuckThreshold?: number;
+  /** RAM-fit check (injectable for tests). Defaults to {@link checkLocalModel}. */
+  ramCheck?: (model: string) => { warn: boolean };
+}
+
+/** Per-MTok probe used to compare relative model cost for the cost-aware note. */
+const COST_PROBE = { inputTokens: 1_000_000, outputTokens: 1_000_000 };
+
+/**
+ * The default local-first policy: handle each turn on the cheap/local model and
+ * escalate to the frontier model only when capability demands it — the task
+ * looks heavy up front, the model explicitly hands off via the `escalate` tool,
+ * it gets stuck on repeated tool errors, or (compute awareness) a local model
+ * won't fit in available RAM.
+ *
+ * Cost awareness is wired in but used defensively: the engine can *report* the
+ * cost implication of an escalation (see {@link costNote}) but never initiates
+ * a cost-driven route change — escalation is always a capability decision.
+ */
+export class LocalFirstModelEngine implements ModelDecisionEngine {
+  private readonly primary: ModelProvider;
+  private readonly escalation: ModelProvider;
+  private readonly classify: (input: string) => TaskWeight;
+  private readonly stuckThreshold: number;
+  private readonly ramCheck: (model: string) => { warn: boolean };
+
+  constructor(opts: LocalFirstOptions) {
+    this.primary = opts.primary;
+    this.escalation = opts.escalation;
+    this.classify = opts.classify ?? classifyTurn;
+    this.stuckThreshold = opts.stuckThreshold ?? 3;
+    this.ramCheck = opts.ramCheck ?? checkLocalModel;
+  }
+
+  selectInitial(userInput: string): RouteDecision {
+    // Compute awareness: a local primary that won't fit in RAM runs slowly or
+    // fails outright, so route it to the frontier up front — even for light work.
+    if (this.primary.name === 'ollama' && this.ramCheck(this.primary.model).warn) {
+      return { provider: this.escalation, reason: 'compute: local model exceeds RAM' };
+    }
+    if (this.classify(userInput) === 'heavy') {
+      return { provider: this.escalation, reason: 'heavy task' };
+    }
+    return { provider: this.primary };
+  }
+
+  considerEscalation(signals: TurnSignals): RouteDecision | undefined {
+    if (signals.alreadyEscalated) return undefined;
+    if (signals.escalateRequested) {
+      return { provider: this.escalation, reason: 'requested by model' };
+    }
+    if (signals.consecutiveErrors >= this.stuckThreshold) {
+      return { provider: this.escalation, reason: 'stuck — repeated tool errors' };
+    }
+    return undefined;
+  }
+
+  /**
+   * A short, human-readable summary of what escalating costs, relative to the
+   * primary. Pure reporting — does not influence routing. NOTE(review): a `null`
+   * escalation estimate is reported as "local" — confirm that is its only cause.
+   */
+  costNote(): string {
+    const from = estimateCost(this.primary.model, COST_PROBE);
+    const to = estimateCost(this.escalation.model, COST_PROBE);
+    if (to === null) return 'escalates to a local model (no API cost)';
+    if (from === null || from === 0) {
+      return 'escalates from a free/local model to a paid model (adds API cost)';
+    }
+    const multiplier = to / from;
+    return `escalation costs ~${multiplier.toFixed(1)}× the primary per token`;
+  }
+}
diff --git a/src/agent/decision/types.ts b/src/agent/decision/types.ts
new file mode 100644
index 0000000..c0cd976
--- /dev/null
+++ b/src/agent/decision/types.ts
@@ -0,0 +1,47 @@
+import type { ModelProvider } from '../../providers/types.js';
+
+/**
+ * A model-selection decision: the provider to use and an optional human-readable
+ * reason. An empty/absent `reason` means "no route note" — the loop only fires
+ * {@link AgentUI.onRoute} when a decision both changes the active provider and
+ * carries a reason.
+ */
+export interface RouteDecision {
+  provider: ModelProvider;
+  reason?: string;
+}
+
+/**
+ * Runtime signals the loop feeds the engine once per iteration so it can decide
+ * whether to switch providers mid-turn. These are mechanical bookkeeping the
+ * loop already tracks; the engine owns the policy that interprets them.
+ */
+export interface TurnSignals {
+  /** The model called the `escalate` tool during this iteration. */
+  escalateRequested: boolean;
+  /** Consecutive iterations with at least one tool error; a clean iteration resets it to 0. */
+  consecutiveErrors: number;
+  /** Whether the turn has already been escalated (keeps the engine stateless). */
+  alreadyEscalated: boolean;
+  /** The provider currently handling the turn. */
+  current: ModelProvider;
+  /** Iteration index (0-based). */
+  iteration: number;
+}
+
+/**
+ * Owns all model-selection policy. {@link AgentLoop} depends on this interface
+ * instead of inlining classification + escalation rules, so the loop keeps only
+ * mechanism (send → stream → run tools → repeat) and the policy lives in one
+ * cohesive, testable place. Implementations may reason about task weight, cost,
+ * and compute (RAM) fit; the loop never sees that reasoning.
+ */
+export interface ModelDecisionEngine {
+  /** Pick the provider that starts a turn, given the user's input. */
+  selectInitial(userInput: string): RouteDecision;
+  /**
+   * Decide whether to switch providers mid-turn. Returns `undefined` to stay on
+   * the current provider. Called after tool results, once per tool-using iteration.
+   */
+  considerEscalation(signals: TurnSignals): RouteDecision | undefined;
+}
diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index fe10bdd..d41bbd2 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -3,6 +3,7 @@ import type { ToolRegistry } from '../tools/registry.js';
 import type { PermissionGate } from '../permissions/gate.js';
 import type { ToolResult } from '../tools/types.js';
 import type { Message, ToolResultBlock, ToolUseBlock } from './types.js';
+import type { ModelDecisionEngine, RouteDecision } from './decision/index.js';
 
 /** Sink for everything the loop wants to surface. The REPL provides the real one. */
 export interface AgentUI {
@@ -26,15 +27,13 @@ export interface AgentLoopOptions {
   ui: AgentUI;
   cwd: string;
   maxIterations?: number;
-  /** Frontier model to escalate heavy/stuck turns to (enables local-first routing). */
-  escalationProvider?: ModelProvider | undefined;
-  /** Classifies a turn up front so heavy tasks start on the frontier model. */
-  router?: ((input: string) => 'light' | 'heavy') | undefined;
+  /**
+   * Owns model-selection policy (which model starts a turn, when to escalate).
+   * When omitted, the loop runs `provider` as a single provider with no routing.
+   */
+  engine?: ModelDecisionEngine | undefined;
 }
 
-/** Consecutive tool-error iterations before auto-escalating a stuck local model. */
-const STUCK_THRESHOLD = 3;
-
 /**
  * The provider-agnostic, UI-agnostic agentic loop: send → stream → run tools →
  * feed results back → repeat until the model stops requesting tools (or the
@@ -48,8 +47,7 @@ export class AgentLoop {
   private readonly ui: AgentUI;
   private readonly cwd: string;
   private readonly maxIterations: number;
-  private readonly escalationProvider: ModelProvider | undefined;
-  private readonly router: ((input: string) => 'light' | 'heavy') | undefined;
+  private readonly engine: ModelDecisionEngine | undefined;
   private readonly messages: Message[] = [];
 
   constructor(opts: AgentLoopOptions) {
@@ -60,8 +58,7 @@ export class AgentLoop {
     this.ui = opts.ui;
     this.cwd = opts.cwd;
     this.maxIterations = opts.maxIterations ?? 50;
-    this.escalationProvider = opts.escalationProvider;
-    this.router = opts.router;
+    this.engine = opts.engine;
   }
 
   /** Conversation history (for inspection / persistence). */
@@ -74,10 +71,16 @@ export class AgentLoop {
     this.messages.push({ role: 'user', content: [{ type: 'text', text: userInput }] });
     const tools = this.registry.toSchemas();
 
-    let active = this.selectInitialProvider(userInput);
-    let escalated = active === this.escalationProvider;
+    let active = this.provider;
+    let escalated = false;
     let consecutiveErrors = 0;
 
+    if (this.engine) {
+      const initial = this.engine.selectInitial(userInput);
+      active = this.applyRoute(active, initial);
+      escalated = active !== this.provider;
+    }
+
     for (let iteration = 0; iteration < this.maxIterations; iteration += 1) {
       let text = '';
       const toolCalls: ToolUseBlock[] = [];
@@ -106,11 +109,7 @@ export class AgentLoop {
 
       if (toolCalls.length === 0) return;
 
-      // The local model can explicitly hand off via the `escalate` tool.
-      if (!escalated && toolCalls.some((c) => c.name === 'escalate')) {
-        active = this.escalate('requested by model');
-        escalated = true;
-      }
+      const escalateRequested = toolCalls.some((c) => c.name === 'escalate');
 
       const results: ToolResultBlock[] = [];
       let anyError = false;
@@ -121,12 +120,20 @@ export class AgentLoop {
       }
       this.messages.push({ role: 'user', content: results });
 
-      // Auto-escalate a local model that appears stuck (repeated tool errors).
-      if (!escalated) {
-        consecutiveErrors = anyError ? consecutiveErrors + 1 : 0;
-        if (consecutiveErrors >= STUCK_THRESHOLD) {
-          active = this.escalate('stuck — repeated tool errors');
-          escalated = true;
+      // Hand the turn's runtime signals to the engine, which owns the policy
+      // for whether to switch providers mid-turn (explicit escalate, stuck, …).
+      consecutiveErrors = anyError ? consecutiveErrors + 1 : 0;
+      if (this.engine) {
+        const next = this.engine.considerEscalation({
+          escalateRequested,
+          consecutiveErrors,
+          alreadyEscalated: escalated,
+          current: active,
+          iteration,
+        });
+        if (next) {
+          active = this.applyRoute(active, next);
+          escalated = active !== this.provider;
         }
       }
     }
@@ -134,20 +141,15 @@ export class AgentLoop {
     this.ui.onMaxIterations();
   }
 
-  /** Pick the provider for a turn: heavy tasks start on the frontier model. */
-  private selectInitialProvider(input: string): ModelProvider {
-    if (this.escalationProvider && this.router && this.router(input) === 'heavy') {
-      this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, 'heavy task');
-      return this.escalationProvider;
+  /**
+   * Apply a routing decision: switch to its provider and surface an `onRoute`
+   * event when it actually changes the active provider and carries a reason.
+   */
+  private applyRoute(active: ModelProvider, decision: RouteDecision): ModelProvider {
+    if (decision.provider !== active && decision.reason) {
+      this.ui.onRoute(decision.provider.name, decision.provider.model, decision.reason);
     }
-    return this.provider;
-  }
-
-  /** Switch to the frontier provider mid-turn. Falls back to the primary if unset. */
-  private escalate(reason: string): ModelProvider {
-    if (!this.escalationProvider) return this.provider;
-    this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, reason);
-    return this.escalationProvider;
+    return decision.provider;
   }
 
   private async executeToolCall(call: ToolUseBlock): Promise<ToolResultBlock> {
diff --git a/src/index.ts b/src/index.ts
index 1a8ce7f..dd4558e 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -16,6 +16,8 @@ export type { ModelPricing } from './providers/pricing.js';
 
 export { classifyTurn } from './agent/router.js';
 export type { TaskWeight } from './agent/router.js';
+export { LocalFirstModelEngine } from './agent/decision/index.js';
+export type { ModelDecisionEngine, RouteDecision, TurnSignals, LocalFirstOptions } from './agent/decision/index.js';
 export { checkLocalModel, estimateModelRamGb, MODEL_RAM_GB } from './system/resources.js';
 export type { LocalModelCheck } from './system/resources.js';
 
diff --git a/src/repl.ts b/src/repl.ts
index 1672fdc..f1eda27 100644
--- a/src/repl.ts
+++ b/src/repl.ts
@@ -8,7 +8,8 @@ import type { PermissionPrompt } from './permissions/gate.js';
 import { ALL_TOOLS, createRegistry } from './tools/registry.js';
 import { escalateTool } from './tools/escalate.js';
 import { createProvider } from './providers/index.js';
-import { classifyTurn } from './agent/router.js';
+import { LocalFirstModelEngine } from './agent/decision/index.js';
+import type { ModelDecisionEngine } from './agent/decision/index.js';
 import { formatUsd } from './providers/pricing.js';
 import { checkLocalModel } from './system/resources.js';
 import { loadConfig } from './config/load.js';
@@ -70,6 +71,12 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
       })
     : undefined;
 
+  // The decision engine owns all model-selection policy; the loop only runs it.
+  const engine: ModelDecisionEngine | undefined =
+    localFirst && escalationProvider
+      ? new LocalFirstModelEngine({ primary: provider, escalation: escalationProvider })
+      : undefined;
+
   const registry = createRegistry(localFirst ? [...ALL_TOOLS, escalateTool] : undefined);
   const projectContext = loadProjectContext(cwd);
   const system = buildSystemPrompt({
@@ -103,8 +110,7 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     ui,
     cwd,
     maxIterations: config.maxIterations,
-    escalationProvider,
-    router: localFirst ? classifyTurn : undefined,
+    engine,
   });
 
   const routeNote = localFirst
diff --git a/tests/agent/decision/localFirst.test.ts b/tests/agent/decision/localFirst.test.ts
new file mode 100644
index 0000000..056bb29
--- /dev/null
+++ b/tests/agent/decision/localFirst.test.ts
@@ -0,0 +1,150 @@
+import { describe, it, expect } from 'vitest';
+import { LocalFirstModelEngine } from '../../../src/agent/decision/localFirst.js';
+import type { TurnSignals } from '../../../src/agent/decision/types.js';
+import type { ModelProvider, ProviderEvent, SendRequest } from '../../../src/providers/types.js';
+
+/** A provider stub with just the identity fields the engine reads. */
+function fakeProvider(model: string, name: ModelProvider['name']): ModelProvider {
+  return {
+    name,
+    model,
+    // eslint-disable-next-line require-yield
+    async *send(_req: SendRequest): AsyncIterable<ProviderEvent> {
+      return;
+    },
+  };
+}
+
+const local = fakeProvider('qwen2.5-coder:7b', 'ollama');
+const frontier = fakeProvider('claude-opus-4-8', 'anthropic');
+
+function signals(overrides: Partial<TurnSignals> = {}): TurnSignals {
+  return {
+    escalateRequested: false,
+    consecutiveErrors: 0,
+    alreadyEscalated: false,
+    current: local,
+    iteration: 0,
+    ...overrides,
+  };
+}
+
+describe('LocalFirstModelEngine', () => {
+  const fits = { warn: false };
+
+  it('routes a heavy turn to the frontier up front', () => {
+    const engine = new LocalFirstModelEngine({
+      primary: local,
+      escalation: frontier,
+      classify: () => 'heavy',
+      ramCheck: () => fits,
+    });
+    expect(engine.selectInitial('refactor everything')).toEqual({
+      provider: frontier,
+      reason: 'heavy task',
+    });
+  });
+
+  it('keeps a light turn on the primary (no reason)', () => {
+    const engine = new LocalFirstModelEngine({
+      primary: local,
+      escalation: frontier,
+      classify: () => 'light',
+      ramCheck: () => fits,
+    });
+    expect(engine.selectInitial('list files')).toEqual({ provider: local });
+  });
+
+  it('escalates a light turn when the local model exceeds RAM (compute awareness)', () => {
+    const engine = new LocalFirstModelEngine({
+      primary: local,
+      escalation: frontier,
+      classify: () => 'light',
+      ramCheck: () => ({ warn: true }),
+    });
+    expect(engine.selectInitial('list files')).toEqual({
+      provider: frontier,
+      reason: 'compute: local model exceeds RAM',
+    });
+  });
+
+  it('does not apply the RAM check to a non-local primary', () => {
+    const cloudPrimary = fakeProvider('gemini-2.5-flash', 'gemini');
+    let called = false;
+    const engine = new LocalFirstModelEngine({
+      primary: cloudPrimary,
+      escalation: frontier,
+      classify: () => 'light',
+      ramCheck: () => {
+        called = true;
+        return { warn: true };
+      },
+    });
+    expect(engine.selectInitial('list files')).toEqual({ provider: cloudPrimary });
+    expect(called).toBe(false);
+  });
+
+  describe('considerEscalation', () => {
+    const engine = new LocalFirstModelEngine({
+      primary: local,
+      escalation: frontier,
+      ramCheck: () => fits,
+    });
+
+    it('escalates when the model requests it', () => {
+      expect(engine.considerEscalation(signals({ escalateRequested: true }))).toEqual({
+        provider: frontier,
+        reason: 'requested by model',
+      });
+    });
+
+    it('escalates once consecutive errors hit the threshold', () => {
+      expect(engine.considerEscalation(signals({ consecutiveErrors: 3 }))).toEqual({
+        provider: frontier,
+        reason: 'stuck — repeated tool errors',
+      });
+    });
+
+    it('stays put below the threshold and without a request', () => {
+      expect(engine.considerEscalation(signals({ consecutiveErrors: 2 }))).toBeUndefined();
+    });
+
+    it('never re-routes once already escalated', () => {
+      expect(
+        engine.considerEscalation(
+          signals({ alreadyEscalated: true, escalateRequested: true, consecutiveErrors: 9 }),
+        ),
+      ).toBeUndefined();
+    });
+
+    it('respects a custom stuck threshold', () => {
+      const eager = new LocalFirstModelEngine({
+        primary: local,
+        escalation: frontier,
+        stuckThreshold: 1,
+        ramCheck: () => fits,
+      });
+      expect(eager.considerEscalation(signals({ consecutiveErrors: 1 }))?.reason).toBe(
+        'stuck — repeated tool errors',
+      );
+    });
+  });
+
+  describe('costNote (cost awareness)', () => {
+    it('reports added API cost when escalating from a free local model', () => {
+      const engine = new LocalFirstModelEngine({ primary: local, escalation: frontier });
+      expect(engine.costNote()).toContain('adds API cost');
+    });
+
+    it('reports a cost multiplier between two priced cloud models', () => {
+      const cheap = fakeProvider('gemini-2.5-flash', 'gemini');
+      const engine = new LocalFirstModelEngine({ primary: cheap, escalation: frontier });
+      expect(engine.costNote()).toMatch(/escalation costs ~\d+(\.\d+)?× the primary per token/);
+    });
+
+    it('reports no API cost when escalating to a local model', () => {
+      const engine = new LocalFirstModelEngine({ primary: frontier, escalation: local });
+      expect(engine.costNote()).toContain('no API cost');
+    });
+  });
+});
diff --git a/tests/agent/loop.test.ts b/tests/agent/loop.test.ts
index 63aa615..304176f 100644
--- a/tests/agent/loop.test.ts
+++ b/tests/agent/loop.test.ts
@@ -2,6 +2,7 @@ import { describe, it, expect } from 'vitest';
 import { z } from 'zod';
 import { AgentLoop } from '../../src/agent/loop.js';
 import type { AgentUI } from '../../src/agent/loop.js';
+import { LocalFirstModelEngine } from '../../src/agent/decision/index.js';
 import { createRegistry } from '../../src/tools/registry.js';
 import { defineTool } from '../../src/tools/types.js';
 import { escalateTool } from '../../src/tools/escalate.js';
@@ -189,8 +190,7 @@ describe('AgentLoop', () => {
       ui,
       system: 'sys',
       cwd: process.cwd(),
-      escalationProvider: frontier,
-      router: () => 'heavy',
+      engine: new LocalFirstModelEngine({ primary: local, escalation: frontier, classify: () => 'heavy' }),
     });
     await loop.run('refactor everything');
 
@@ -210,8 +210,7 @@ describe('AgentLoop', () => {
       ui,
       system: 'sys',
       cwd: process.cwd(),
-      escalationProvider: frontier,
-      router: () => 'light',
+      engine: new LocalFirstModelEngine({ primary: local, escalation: frontier, classify: () => 'light' }),
     });
     await loop.run('list files');
 
@@ -239,8 +238,7 @@ describe('AgentLoop', () => {
       ui,
       system: 'sys',
       cwd: process.cwd(),
-      escalationProvider: frontier,
-      router: () => 'light',
+      engine: new LocalFirstModelEngine({ primary: local, escalation: frontier, classify: () => 'light' }),
     });
     await loop.run('start small then get stuck');
 
@@ -251,6 +249,32 @@ describe('AgentLoop', () => {
     expect(events).toContain('text:handled');
   });
 
+  it('auto-escalates after the engine sees repeated tool errors (stuck)', async () => {
+    // Three iterations of a failing (unknown) tool trip the default stuck threshold.
+    const failing: ProviderEvent[][] = [];
+    for (let i = 0; i < 3; i += 1) {
+      failing.push([{ type: 'tool_call', id: `g${i}`, name: 'ghost', input: {} }, DONE]);
+    }
+    const local = new ScriptedProvider(failing, 'local');
+    const frontier = new ScriptedProvider([[{ type: 'text', delta: 'rescued' }, DONE]], 'big', 'anthropic');
+    const { ui, events } = recordingUI();
+    const loop = new AgentLoop({
+      provider: local,
+      registry,
+      gate: gateWith('yes'),
+      ui,
+      system: 'sys',
+      cwd: process.cwd(),
+      engine: new LocalFirstModelEngine({ primary: local, escalation: frontier, classify: () => 'light' }),
+    });
+    await loop.run('keep failing');
+
+    expect(local.sent).toHaveLength(3);
+    expect(frontier.sent).toHaveLength(1);
+    expect(events).toContain('route:anthropic:big:stuck — repeated tool errors');
+    expect(events).toContain('text:rescued');
+  });
+
   it('behaves as a single provider when no escalation is configured', async () => {
     const provider = new ScriptedProvider([[{ type: 'text', delta: 'hi' }, DONE]]);
     const { ui, events } = recordingUI();

From ef4fdb589d82b34bde959dce914151d25920508b Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:49:03 -0500
Subject: [PATCH 03/14] fix(ollama): make stream_options.include_usage
 best-effort

Older Ollama builds reject unknown body fields with a 400. Sending
stream_options unconditionally meant every local turn could break over
a token-reporting nicety. Retry once without it on a 400.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/providers/ollama.ts            | 20 ++++++++++++++------
 tests/providers/ollamaSend.test.ts | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
index 8e2091f..5aff2e6 100644
--- a/src/providers/ollama.ts
+++ b/src/providers/ollama.ts
@@ -107,16 +107,15 @@ export class OllamaProvider implements ModelProvider {
       messages,
       tools: req.tools.length > 0 ? toOpenAiTools(req.tools) : undefined,
       stream: true,
-      stream_options: { include_usage: true },
     };
 
     let res: Response;
     try {
-      res = await fetch(`${this.baseUrl}/chat/completions`, {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` },
-        body: JSON.stringify(body),
-      });
+      // `stream_options.include_usage` is best-effort: it gives us token counts,
+      // but older Ollama builds reject unknown body fields with a 400. Rather than
+      // breaking every local turn over a reporting nicety, retry once without it.
+      res = await this.post({ ...body, stream_options: { include_usage: true } });
+      if (res.status === 400) res = await this.post(body);
     } catch (err) {
       throw new Error(
         `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${(err as Error).message})`,
@@ -171,6 +170,15 @@ export class OllamaProvider implements ModelProvider {
       stopReason: calls.size > 0 ? 'tool_use' : finish,
     };
   }
+
+  /** POST a chat-completions request body to the Ollama server. */
+  private post(body: unknown): Promise<Response> {
+    return fetch(`${this.baseUrl}/chat/completions`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` },
+      body: JSON.stringify(body),
+    });
+  }
 }
 
 /** Parse an SSE byte stream into decoded JSON chunks, skipping the `[DONE]` sentinel. */
diff --git a/tests/providers/ollamaSend.test.ts b/tests/providers/ollamaSend.test.ts
index 745354f..c396d05 100644
--- a/tests/providers/ollamaSend.test.ts
+++ b/tests/providers/ollamaSend.test.ts
@@ -82,6 +82,23 @@ describe('OllamaProvider.send', () => {
     expect(call).toMatchObject({ name: 'ls', input: {} });
   });
 
+  it('retries without stream_options when the server rejects it with a 400', async () => {
+    const fetchMock = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValueOnce(new Response('unknown field "stream_options"', { status: 400 }))
+      .mockResolvedValueOnce(sseResponse([{ choices: [{ delta: { content: 'ok' } }] }]));
+
+    const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });
+    const events = await collect(provider);
+
+    expect(fetchMock).toHaveBeenCalledTimes(2);
+    const firstBody = JSON.parse((fetchMock.mock.calls[0]![1] as RequestInit).body as string);
+    const retryBody = JSON.parse((fetchMock.mock.calls[1]![1] as RequestInit).body as string);
+    expect(firstBody.stream_options).toEqual({ include_usage: true });
+    expect(retryBody.stream_options).toBeUndefined();
+    expect(events.filter((e) => e.type === 'text').map((e) => (e as { delta: string }).delta).join('')).toBe('ok');
+  });
+
   it('throws a helpful error when Ollama is unreachable', async () => {
     vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNREFUSED'));
     const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });

From 892d97139fd743aa6f178ecc672d28cde1480671 Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:50:09 -0500
Subject: [PATCH 04/14] fix(ollama): add an idle timeout so a hung model can't
 freeze the REPL

The raw fetch had no AbortSignal, so a stuck or RAM-starved local model
left the prompt frozen with no recovery. Add an AbortController with a
120s idle timeout (reset on every received chunk, so slow-but-progressing
generations still complete) and surface the timeout as a clear error.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/providers/ollama.ts            | 143 ++++++++++++++++++-----------
 tests/providers/ollamaSend.test.ts |  14 +++
 2 files changed, 105 insertions(+), 52 deletions(-)

diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
index 5aff2e6..b93e36f 100644
--- a/src/providers/ollama.ts
+++ b/src/providers/ollama.ts
@@ -7,6 +7,13 @@ export interface OllamaProviderOptions {
   model: string;
   /** Ignored by Ollama but required by the OpenAI wire format; defaults to "ollama". */
   apiKey?: string;
+  /**
+   * Abort the request if no bytes arrive for this long (ms). This is an *idle*
+   * timeout, reset on every received chunk — a slow-but-progressing model keeps
+   * going; a hung one (common when the machine is RAM-starved) is cut loose.
+   * Defaults to 120_000.
+   */
+  timeoutMs?: number;
 }
 
 interface OpenAiMessage {
@@ -89,11 +96,13 @@ export class OllamaProvider implements ModelProvider {
   readonly model: string;
   private readonly baseUrl: string;
   private readonly apiKey: string;
+  private readonly timeoutMs: number;
 
   constructor(opts: OllamaProviderOptions) {
     this.baseUrl = opts.baseUrl.replace(/\/$/, '');
     this.model = opts.model;
     this.apiKey = opts.apiKey ?? 'ollama';
+    this.timeoutMs = opts.timeoutMs ?? 120_000;
   }
 
   async *send(req: SendRequest): AsyncIterable<ProviderEvent> {
@@ -109,74 +118,104 @@ export class OllamaProvider implements ModelProvider {
       stream: true,
     };
 
-    let res: Response;
-    try {
-      // `stream_options.include_usage` is best-effort: it gives us token counts,
-      // but older Ollama builds reject unknown body fields with a 400. Rather than
-      // breaking every local turn over a reporting nicety, retry once without it.
-      res = await this.post({ ...body, stream_options: { include_usage: true } });
-      if (res.status === 400) res = await this.post(body);
-    } catch (err) {
-      throw new Error(
-        `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${(err as Error).message})`,
-      );
-    }
-
-    if (!res.ok || !res.body) {
-      const detail = await res.text().catch(() => '');
-      throw new Error(`Ollama request failed (${res.status}): ${detail.slice(0, 200)}`);
-    }
+    // Idle-timeout guard: abort if the server goes silent for `timeoutMs`. The
+    // raw fetch (unlike the cloud SDKs) has no built-in timeout, so without this
+    // a stuck local model would freeze the REPL with no way to recover.
+    const controller = new AbortController();
+    let timer: ReturnType<typeof setTimeout>;
+    const armTimer = (): void => {
+      clearTimeout(timer);
+      timer = setTimeout(() => controller.abort(), this.timeoutMs);
+    };
+    armTimer();
 
-    // Accumulate tool calls by their streamed index; arguments arrive in fragments.
-    const calls = new Map<number, { id: string; name: string; args: string }>();
-    let usage = { inputTokens: 0, outputTokens: 0 };
-    let finish = 'stop';
-
-    for await (const chunk of parseSse(res.body)) {
-      const choice = chunk.choices?.[0];
-      if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content };
-
-      for (const tc of choice?.delta?.tool_calls ?? []) {
-        const acc = calls.get(tc.index) ?? { id: '', name: '', args: '' };
-        if (tc.id) acc.id = tc.id;
-        if (tc.function?.name) acc.name = tc.function.name;
-        if (tc.function?.arguments) acc.args += tc.function.arguments;
-        calls.set(tc.index, acc);
+    try {
+      let res: Response;
+      try {
+        // `stream_options.include_usage` is best-effort: it gives us token counts,
+        // but older Ollama builds reject unknown body fields with a 400. Rather than
+        // breaking every local turn over a reporting nicety, retry once without it.
+        res = await this.post({ ...body, stream_options: { include_usage: true } }, controller.signal);
+        if (res.status === 400) res = await this.post(body, controller.signal);
+      } catch (err) {
+        if (controller.signal.aborted) throw this.timeoutError();
+        throw new Error(
+          `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${(err as Error).message})`,
+        );
       }
 
-      if (choice?.finish_reason) finish = choice.finish_reason;
-      if (chunk.usage) {
-        usage = {
-          inputTokens: chunk.usage.prompt_tokens ?? 0,
-          outputTokens: chunk.usage.completion_tokens ?? 0,
-        };
+      if (!res.ok || !res.body) {
+        const detail = await res.text().catch(() => '');
+        throw new Error(`Ollama request failed (${res.status}): ${detail.slice(0, 200)}`);
       }
-    }
 
-    for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) {
-      let input: unknown = {};
+      // Accumulate tool calls by their streamed index; arguments arrive in fragments.
+      const calls = new Map<number, { id: string; name: string; args: string }>();
+      let usage = { inputTokens: 0, outputTokens: 0 };
+      let finish = 'stop';
+
       try {
-        input = c.args.trim() ? JSON.parse(c.args) : {};
-      } catch {
-        // Small models occasionally emit malformed JSON; degrade gracefully.
-        input = {};
+        for await (const chunk of parseSse(res.body)) {
+          armTimer(); // progress: reset the idle clock
+          const choice = chunk.choices?.[0];
+          if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content };
+
+          for (const tc of choice?.delta?.tool_calls ?? []) {
+            const acc = calls.get(tc.index) ?? { id: '', name: '', args: '' };
+            if (tc.id) acc.id = tc.id;
+            if (tc.function?.name) acc.name = tc.function.name;
+            if (tc.function?.arguments) acc.args += tc.function.arguments;
+            calls.set(tc.index, acc);
+          }
+
+          if (choice?.finish_reason) finish = choice.finish_reason;
+          if (chunk.usage) {
+            usage = {
+              inputTokens: chunk.usage.prompt_tokens ?? 0,
+              outputTokens: chunk.usage.completion_tokens ?? 0,
+            };
+          }
+        }
+      } catch (err) {
+        if (controller.signal.aborted) throw this.timeoutError();
+        throw err;
       }
-      yield { type: 'tool_call', id: c.id || `ollama-call-${index}`, name: c.name, input };
+
+      for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) {
+        let input: unknown = {};
+        try {
+          input = c.args.trim() ? JSON.parse(c.args) : {};
+        } catch {
+          // Small models occasionally emit malformed JSON; degrade gracefully.
+          input = {};
+        }
+        yield { type: 'tool_call', id: c.id || `ollama-call-${index}`, name: c.name, input };
+      }
+
+      yield {
+        type: 'done',
+        usage,
+        stopReason: calls.size > 0 ? 'tool_use' : finish,
+      };
+    } finally {
+      clearTimeout(timer!);
     }
+  }
 
-    yield {
-      type: 'done',
-      usage,
-      stopReason: calls.size > 0 ? 'tool_use' : finish,
-    };
+  private timeoutError(): Error {
+    return new Error(
+      `Ollama at ${this.baseUrl} went silent for ${Math.round(this.timeoutMs / 1000)}s and was aborted. ` +
+        `The model '${this.model}' may be too large for this machine.`,
+    );
   }
 
   /** POST a chat-completions request body to the Ollama server. */
-  private post(body: unknown): Promise<Response> {
+  private post(body: unknown, signal: AbortSignal): Promise<Response> {
     return fetch(`${this.baseUrl}/chat/completions`, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` },
       body: JSON.stringify(body),
+      signal,
     });
   }
 }
diff --git a/tests/providers/ollamaSend.test.ts b/tests/providers/ollamaSend.test.ts
index c396d05..b99e785 100644
--- a/tests/providers/ollamaSend.test.ts
+++ b/tests/providers/ollamaSend.test.ts
@@ -104,4 +104,18 @@ describe('OllamaProvider.send', () => {
     const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });
     await expect(collect(provider)).rejects.toThrow(/Cannot reach Ollama/);
   });
+
+  it('aborts and reports a timeout when the server goes silent', async () => {
+    // Never resolves on its own — only the idle-timeout abort can end it.
+    vi.spyOn(globalThis, 'fetch').mockImplementation(
+      (_url, init) =>
+        new Promise((_resolve, reject) => {
+          (init as RequestInit).signal?.addEventListener('abort', () =>
+            reject(new DOMException('aborted', 'AbortError')),
+          );
+        }),
+    );
+    const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm', timeoutMs: 20 });
+    await expect(collect(provider)).rejects.toThrow(/went silent.*aborted/);
+  });
 });

From c4d741d817a327e206b7785c373cb93c5d025772 Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:50:53 -0500
Subject: [PATCH 05/14] fix(ollama): honor configured maxTokens

The Ollama body set no max_tokens, so a user who lowered maxTokens to
control length/cost saw no effect on local turns. Plumb it through as
max_tokens (omitted from the request when unset, matching prior behavior).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/providers/index.ts             |  6 +++++-
 src/providers/ollama.ts            |  5 +++++
 tests/providers/ollamaSend.test.ts | 15 +++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/providers/index.ts b/src/providers/index.ts
index b37a4f1..89c6b3f 100644
--- a/src/providers/index.ts
+++ b/src/providers/index.ts
@@ -26,7 +26,11 @@ export function createProvider(config: ResolvedConfig): ModelProvider {
 
   if (config.provider === 'ollama') {
     // No API key required — Ollama runs locally.
-    return new OllamaProvider({ baseUrl: config.ollamaBaseUrl, model: config.model });
+    return new OllamaProvider({
+      baseUrl: config.ollamaBaseUrl,
+      model: config.model,
+      maxTokens: config.maxTokens,
+    });
   }
 
   if (!config.geminiApiKey) {
diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
index b93e36f..0ac73e7 100644
--- a/src/providers/ollama.ts
+++ b/src/providers/ollama.ts
@@ -7,6 +7,8 @@ export interface OllamaProviderOptions {
   model: string;
   /** Ignored by Ollama but required by the OpenAI wire format; defaults to "ollama". */
   apiKey?: string;
+  /** Cap on tokens to generate per response. Omitted from the request if unset. */
+  maxTokens?: number;
   /**
    * Abort the request if no bytes arrive for this long (ms). This is an *idle*
    * timeout, reset on every received chunk — a slow-but-progressing model keeps
@@ -96,12 +98,14 @@ export class OllamaProvider implements ModelProvider {
   readonly model: string;
   private readonly baseUrl: string;
   private readonly apiKey: string;
+  private readonly maxTokens: number | undefined;
   private readonly timeoutMs: number;
 
   constructor(opts: OllamaProviderOptions) {
     this.baseUrl = opts.baseUrl.replace(/\/$/, '');
     this.model = opts.model;
     this.apiKey = opts.apiKey ?? 'ollama';
+    this.maxTokens = opts.maxTokens;
     this.timeoutMs = opts.timeoutMs ?? 120_000;
   }
 
@@ -116,6 +120,7 @@ export class OllamaProvider implements ModelProvider {
       messages,
       tools: req.tools.length > 0 ? toOpenAiTools(req.tools) : undefined,
       stream: true,
+      max_tokens: this.maxTokens,
     };
 
     // Idle-timeout guard: abort if the server goes silent for `timeoutMs`. The
diff --git a/tests/providers/ollamaSend.test.ts b/tests/providers/ollamaSend.test.ts
index b99e785..5612bb3 100644
--- a/tests/providers/ollamaSend.test.ts
+++ b/tests/providers/ollamaSend.test.ts
@@ -99,6 +99,21 @@ describe('OllamaProvider.send', () => {
     expect(events.filter((e) => e.type === 'text').map((e) => (e as { delta: string }).delta).join('')).toBe('ok');
   });
 
+  it('forwards maxTokens as max_tokens, and omits it when unset', async () => {
+    const fetchMock = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValue(sseResponse([{ choices: [{ delta: { content: 'ok' } }] }]));
+
+    await collect(new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm', maxTokens: 256 }));
+    const capped = JSON.parse((fetchMock.mock.calls[0]![1] as RequestInit).body as string);
+    expect(capped.max_tokens).toBe(256);
+
+    fetchMock.mockClear();
+    await collect(new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' }));
+    const uncapped = JSON.parse((fetchMock.mock.calls[0]![1] as RequestInit).body as string);
+    expect(uncapped).not.toHaveProperty('max_tokens');
+  });
+
   it('throws a helpful error when Ollama is unreachable', async () => {
     vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNREFUSED'));
     const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });

From 057de136dab27950678808b2413123ba3d4ac9cb Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:51:07 -0500
Subject: [PATCH 06/14] docs(ollama): note the tool/text ordering assumption in
 toOpenAiMessages

The translation assumes text and tool_results never interleave within a
single user turn. That holds given how the loop builds messages today, but
it is an implicit coupling worth flagging for future changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/providers/ollama.ts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
index 0ac73e7..3a8cd92 100644
--- a/src/providers/ollama.ts
+++ b/src/providers/ollama.ts
@@ -30,6 +30,12 @@ interface OpenAiMessage {
  * `/v1/chat/completions` endpoint accepts). Unlike Gemini, OpenAI correlates
  * tool results to calls by `tool_call_id`, and our Anthropic-style ids survive
  * the round trip — so no id synthesis is needed.
+ *
+ * Assumes the loop never mixes plain text and tool results in one user turn in a
+ * way that would interleave them: we emit all `tool` messages first, then any
+ * text as a trailing user message. OpenAI requires each `tool` message to follow
+ * the assistant `tool_calls` that produced it; today's loop builds messages so
+ * that holds. If a future change interleaves them, revisit this ordering.
  */
 export function toOpenAiMessages(messages: Message[]): OpenAiMessage[] {
   const out: OpenAiMessage[] = [];

From 04604babed16044eb41d64c2e82782d455f712ab Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:51:44 -0500
Subject: [PATCH 07/14] fix(ollama): emit a final SSE frame that lacks a
 trailing newline

parseSse only yielded complete newline-terminated lines, so a closing
usage frame sent without a trailing newline was silently dropped,
hurting token-count accuracy. Flush the remaining buffer at stream end.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/providers/ollama.ts            | 29 ++++++++++++++++++++---------
 tests/providers/ollamaSend.test.ts | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
index 3a8cd92..79f0ca8 100644
--- a/src/providers/ollama.ts
+++ b/src/providers/ollama.ts
@@ -231,6 +231,20 @@ export class OllamaProvider implements ModelProvider {
   }
 }
 
+/** Decode a single SSE line into a chunk, or `undefined` for non-data/keep-alive lines. */
+function parseSseLine(raw: string): StreamChunk | undefined {
+  const line = raw.trim();
+  if (!line.startsWith('data:')) return undefined;
+  const payload = line.slice(5).trim();
+  if (payload === '[DONE]' || payload.length === 0) return undefined;
+  try {
+    return JSON.parse(payload) as StreamChunk;
+  } catch {
+    // Ignore partial/non-JSON keep-alive lines.
+    return undefined;
+  }
+}
+
 /** Parse an SSE byte stream into decoded JSON chunks, skipping the `[DONE]` sentinel. */
 async function* parseSse(body: ReadableStream<Uint8Array>): AsyncIterable<StreamChunk> {
   const decoder = new TextDecoder();
@@ -243,18 +257,15 @@ async function* parseSse(body: ReadableStream<Uint8Array>): AsyncIterable<Stream
       buffer += decoder.decode(value, { stream: true });
       let nl: number;
       while ((nl = buffer.indexOf('\n')) !== -1) {
-        const line = buffer.slice(0, nl).trim();
+        const chunk = parseSseLine(buffer.slice(0, nl));
         buffer = buffer.slice(nl + 1);
-        if (!line.startsWith('data:')) continue;
-        const payload = line.slice(5).trim();
-        if (payload === '[DONE]' || payload.length === 0) continue;
-        try {
-          yield JSON.parse(payload) as StreamChunk;
-        } catch {
-          // Ignore partial/non-JSON keep-alive lines.
-        }
+        if (chunk) yield chunk;
       }
     }
+    // Emit a final line that arrived without a trailing newline (e.g. a closing
+    // usage frame); otherwise the last chunk's token counts would be dropped.
+    const tail = parseSseLine(buffer);
+    if (tail) yield tail;
   } finally {
     reader.releaseLock();
   }
diff --git a/tests/providers/ollamaSend.test.ts b/tests/providers/ollamaSend.test.ts
index 5612bb3..1fda23d 100644
--- a/tests/providers/ollamaSend.test.ts
+++ b/tests/providers/ollamaSend.test.ts
@@ -114,6 +114,25 @@ describe('OllamaProvider.send', () => {
     expect(uncapped).not.toHaveProperty('max_tokens');
   });
 
+  it('still parses a final usage frame that lacks a trailing newline', async () => {
+    const raw =
+      'data: {"choices":[{"delta":{"content":"hi"}}]}\n\n' +
+      'data: {"choices":[],"usage":{"prompt_tokens":3,"completion_tokens":4}}'; // no trailing \n
+    const stream = new ReadableStream<Uint8Array>({
+      start(controller) {
+        controller.enqueue(new TextEncoder().encode(raw));
+        controller.close();
+      },
+    });
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue(
+      new Response(stream, { status: 200, headers: { 'Content-Type': 'text/event-stream' } }),
+    );
+
+    const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });
+    const done = (await collect(provider)).find((e) => e.type === 'done');
+    expect(done).toMatchObject({ usage: { inputTokens: 3, outputTokens: 4 } });
+  });
+
   it('throws a helpful error when Ollama is unreachable', async () => {
     vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNREFUSED'));
     const provider = new OllamaProvider({ baseUrl: 'http://localhost:11434/v1', model: 'm' });

From 9dc6f7f440554991b1ec438f3352f8250b9c46c8 Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:52:25 -0500
Subject: [PATCH 08/14] fix(router): stop escalating routine uses of common
 coding verbs

implement/debug/optimize/design appear in everyday one-line requests
('implement a getter', 'debug this typo'), so flagging them as heavy sent
many routine turns straight to the frontier model and undercut the
local-first cost goal. Keep only strong, unambiguous signals as always-heavy;
the ambiguous verbs now escalate only alongside a scope/complexity cue. The
local model can still self-escalate via the escalate tool when it struggles.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/agent/router.ts        | 23 ++++++++++++++++++-----
 tests/agent/router.test.ts | 17 ++++++++++++++---
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/agent/router.ts b/src/agent/router.ts
index 1582f2f..b3b2871 100644
--- a/src/agent/router.ts
+++ b/src/agent/router.ts
@@ -8,22 +8,32 @@
  */
 export type TaskWeight = 'light' | 'heavy';
 
+/**
+ * Strong, unambiguous signals that a turn genuinely needs the frontier model.
+ * These rarely show up in routine one-line requests.
+ */
 const HEAVY_PATTERNS: RegExp[] = [
   /\brefactor(?:ing|ed)?\b/i,
   /\barchitect(?:ure|ural)?\b/i,
-  /\bdesign\b/i,
-  /\bdebug(?:ging|ged)?\b/i,
-  /\boptimi[sz]e\b/i,
   /\bmigrat(?:e|ion|ing)\b/i,
-  /\bimplement\b/i,
   /\bredesign\b/i,
   /\broot[- ]?cause\b/i,
-  /\bwhy (?:is|does|are|do|did)\b/i,
   /\bthink (?:hard|carefully|through|deeply)\b/i,
   /\bacross (?:the |multiple |several )?(?:files|modules|codebase)\b/i,
   /\bend[- ]to[- ]end\b/i,
 ];
 
+/**
+ * Verbs that signal a heavy task only when paired with a scope/complexity cue.
+ * On their own — "implement a getter", "debug this typo", "optimize the loop" —
+ * they're everyday coding and stay local; eagerly escalating them would blunt
+ * the local-first cost savings. The local model can still escalate itself via
+ * the `escalate` tool when it actually struggles.
+ */
+const AMBIGUOUS_VERBS = /\b(?:implement(?:s|ing|ed)?|debug(?:ging|ged)?|optimi[sz]e|design)\b/i;
+const SCOPE_CUES =
+  /\b(?:entire|whole|complete(?:ly)?|across|multiple|several|system|subsystem|pipeline|codebase|module|from scratch)\b/i;
+
 /** Number of file-path-looking tokens above which a turn is considered heavy. */
 const MULTI_FILE_THRESHOLD = 3;
 /** Character length above which a turn is considered heavy. */
@@ -38,5 +48,8 @@ export function classifyTurn(input: string): TaskWeight {
   const fileMentions = text.match(/[\w./-]+\.[a-z]{1,5}\b/gi) ?? [];
   if (fileMentions.length >= MULTI_FILE_THRESHOLD) return 'heavy';
 
+  // Ambiguous verbs escalate only alongside a scope/complexity cue.
+  if (AMBIGUOUS_VERBS.test(text) && SCOPE_CUES.test(text)) return 'heavy';
+
   return 'light';
 }
diff --git a/tests/agent/router.test.ts b/tests/agent/router.test.ts
index a409b4c..dbd6e77 100644
--- a/tests/agent/router.test.ts
+++ b/tests/agent/router.test.ts
@@ -8,11 +8,22 @@ describe('classifyTurn', () => {
     expect(classifyTurn('rename foo to bar in utils.ts')).toBe('light');
   });
 
-  it('flags reasoning-heavy keywords as heavy', () => {
+  it('flags strong reasoning-heavy keywords as heavy', () => {
     expect(classifyTurn('refactor the provider layer')).toBe('heavy');
-    expect(classifyTurn('debug why the stream hangs')).toBe('heavy');
+    expect(classifyTurn('migrate the build to esbuild')).toBe('heavy');
     expect(classifyTurn('design a caching architecture')).toBe('heavy');
-    expect(classifyTurn('implement retthrough retries')).toBe('heavy');
+    expect(classifyTurn('find the root cause of the hang')).toBe('heavy');
+  });
+
+  it('keeps routine uses of ambiguous verbs light', () => {
+    expect(classifyTurn('implement a getter for name')).toBe('light');
+    expect(classifyTurn('debug this typo')).toBe('light');
+    expect(classifyTurn('optimize the inner loop')).toBe('light');
+  });
+
+  it('escalates ambiguous verbs only when paired with a scope cue', () => {
+    expect(classifyTurn('implement the auth system from scratch')).toBe('heavy');
+    expect(classifyTurn('optimize rendering across the whole pipeline')).toBe('heavy');
   });
 
   it('flags multi-file and long requests as heavy', () => {

From bf21f196333d666e18574dbe2383e1b6f44aabfc Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:53:11 -0500
Subject: [PATCH 09/14] fix(resources): base the RAM warning on total capacity,
 not free memory

Linux keeps most RAM in reclaimable cache, so freemem() reads low and the
free-based check spuriously warned that a model 'may exceed available
memory' on machines that run it fine. Base the warning on total capacity
(leaving 20% headroom for the OS and other apps) instead; keep the
free-memory comparison only as an advisory freeTight hint.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/system/resources.ts        | 22 +++++++++++++++++++---
 tests/system/resources.test.ts | 11 +++++++++--
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/system/resources.ts b/src/system/resources.ts
index f5030d4..f9522da 100644
--- a/src/system/resources.ts
+++ b/src/system/resources.ts
@@ -35,18 +35,33 @@ export function estimateModelRamGb(model: string): number {
   return params !== undefined ? Math.round(params * 0.6 + 1.5) : 4;
 }
 
+/**
+ * Fraction of total RAM a model may need before we warn. Leaves headroom for the
+ * OS and other apps; a model that wants nearly all of physical RAM will thrash.
+ */
+const CAPACITY_HEADROOM = 0.8;
+
 export interface LocalModelCheck {
   needGb: number;
   totalGb: number;
   freeGb: number;
-  /** True when the model likely won't fit comfortably in free memory. */
+  /** True when the model likely won't fit in this machine's RAM (capacity-based). */
   warn: boolean;
+  /**
+   * Soft hint: the model exceeds *currently free* memory. On Linux `free` is
+   * misleadingly low (most RAM is reclaimable cache), so this is advisory only —
+   * never the basis for the hard {@link warn}.
+   */
+  freeTight: boolean;
   /** True for small models (≤3B) that tool-call unreliably. */
   toolCallRisk: boolean;
 }
 
 /**
- * Compare a local model's memory footprint against the host's available RAM.
+ * Compare a local model's memory footprint against the host's RAM. The hard
+ * warning is capacity-based (`totalmem`), since that is what actually determines
+ * feasibility — Linux reports little "free" memory because it caches aggressively,
+ * so a free-memory test would spuriously warn on machines that run the model fine.
  * `mem` defaults to the live host readings but can be injected for testing.
  */
 export function checkLocalModel(
@@ -61,7 +76,8 @@ export function checkLocalModel(
     needGb,
     totalGb: round1(totalGb),
     freeGb: round1(freeGb),
-    warn: needGb > freeGb,
+    warn: needGb > totalGb * CAPACITY_HEADROOM,
+    freeTight: needGb > freeGb,
     toolCallRisk: params !== undefined && params <= 3,
   };
 }
diff --git a/tests/system/resources.test.ts b/tests/system/resources.test.ts
index af55481..e3b7169 100644
--- a/tests/system/resources.test.ts
+++ b/tests/system/resources.test.ts
@@ -18,15 +18,22 @@ describe('parseParamsB / estimateModelRamGb', () => {
 });
 
 describe('checkLocalModel', () => {
-  it('warns when the model needs more than the free memory', () => {
+  it('warns when the model needs more than the machine can hold', () => {
     const check = checkLocalModel('gemma3:27b', { total: 8 * GB, free: 4 * GB }); // ~16GB
     expect(check.warn).toBe(true);
     expect(check.needGb).toBe(16);
   });
 
-  it('does not warn when there is ample free memory and flags small-model tool risk', () => {
+  it('does not warn when total capacity is ample and flags small-model tool risk', () => {
     const check = checkLocalModel('gemma3:1b', { total: 64 * GB, free: 48 * GB });
     expect(check.warn).toBe(false);
     expect(check.toolCallRisk).toBe(true);
   });
+
+  it('does not warn on low free memory when total capacity is sufficient (Linux cache case)', () => {
+    // 32GB box with only 2GB nominally free — gemma3:4b (~3GB) fits in capacity.
+    const check = checkLocalModel('gemma3:4b', { total: 32 * GB, free: 2 * GB });
+    expect(check.warn).toBe(false);
+    expect(check.freeTight).toBe(true); // soft hint still set
+  });
 });

From 68863cff2a7142b88fc22a8b59f1c552c017c97b Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:53:53 -0500
Subject: [PATCH 10/14] fix(ui): show 'cost unknown' for unpriced cloud models,
 not 'no API cost'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pricing catalog is exact-match, so a future cloud model id (e.g.
claude-opus-5) returns no pricing and the usage line printed
'local (no API cost)' for a paid frontier turn — actively misleading. Thread
the provider through onUsage and, when a cloud provider's pricing is unknown,
show 'cost unknown' instead.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/agent/loop.ts       | 10 +++++++---
 src/ui/render.ts        | 20 +++++++++++++++-----
 tests/ui/render.test.ts | 11 +++++++++++
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index c45fad6..e280a88 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -12,8 +12,12 @@ export interface AgentUI {
   onToolStart(name: string, input: unknown): void;
   onToolResult(name: string, result: ToolResult): void;
   onToolDenied(name: string): void;
-  /** `model` identifies which model produced the usage (for accurate pricing). */
-  onUsage(usage: Usage, model?: string): void;
+  /**
+   * `model` and `provider` identify what produced the usage, so the UI can
+   * price it accurately — and tell an unpriced *cloud* turn (cost unknown) apart
+   * from a *local* turn (no API cost).
+   */
+  onUsage(usage: Usage, model?: string, provider?: string): void;
   /** Fired when local-first routing escalates the turn to the frontier model. */
   onRoute(provider: string, model: string, reason: string): void;
   onAssistantEnd(): void;
@@ -109,7 +113,7 @@ export class AgentLoop {
         } else {
           this.sessionUsage.inputTokens += event.usage.inputTokens;
           this.sessionUsage.outputTokens += event.usage.outputTokens;
-          this.ui.onUsage(event.usage, active.model);
+          this.ui.onUsage(event.usage, active.model, active.name);
         }
       }
 
diff --git a/src/ui/render.ts b/src/ui/render.ts
index 19b9d8f..f31629d 100644
--- a/src/ui/render.ts
+++ b/src/ui/render.ts
@@ -22,6 +22,11 @@ function fmtTokens(n: number): string {
   return n >= 1000 ? `${(n / 1000).toFixed(1)}k` : String(n);
 }
 
+/** Paid (non-local) providers, where missing pricing means "unknown" not "free". */
+function isCloud(provider?: string): boolean {
+  return provider === 'anthropic' || provider === 'gemini';
+}
+
 export interface SessionTotals {
   inputTokens: number;
   outputTokens: number;
@@ -75,7 +80,7 @@ export function createTerminalUI(opts: TerminalUIOptions = {}): TerminalUI {
       ensureNewline();
       write(pc.yellow(`  ⊘ ${name} denied\n`));
     },
-    onUsage(usage: Usage, model?: string) {
+    onUsage(usage: Usage, model?: string, provider?: string) {
       totals.inputTokens += usage.inputTokens;
       totals.outputTokens += usage.outputTokens;
       const info = getModelInfo(model ?? opts.model ?? '');
@@ -85,10 +90,15 @@ export function createTerminalUI(opts: TerminalUIOptions = {}): TerminalUI {
       if (!showUsage) return;
       ensureNewline();
       const tokens = `${fmtTokens(usage.inputTokens)} in / ${fmtTokens(usage.outputTokens)} out`;
-      const money =
-        cost !== null
-          ? `${formatUsd(cost)} turn · ${formatUsd(totals.cost)} session`
-          : 'local (no API cost)';
+      let money: string;
+      if (cost !== null) {
+        money = `${formatUsd(cost)} turn · ${formatUsd(totals.cost)} session`;
+      } else if (isCloud(provider ?? opts.provider)) {
+        // A paid cloud model we don't have pricing for — don't imply it was free.
+        money = 'cost unknown';
+      } else {
+        money = 'local (no API cost)';
+      }
       write(pc.dim(`· ${tokens} · ${money}\n`));
     },
     onRoute(provider, model, reason) {
diff --git a/tests/ui/render.test.ts b/tests/ui/render.test.ts
index d72a3fc..a49b331 100644
--- a/tests/ui/render.test.ts
+++ b/tests/ui/render.test.ts
@@ -77,6 +77,17 @@ describe('createTerminalUI', () => {
     expect(out).toContain('local (no API cost)');
   });
 
+  it('shows "cost unknown" for an unpriced cloud model rather than implying it is free', () => {
+    const out = capture(() => {
+      const ui = createTerminalUI({ provider: 'anthropic' });
+      // A future/untracked cloud model id that the catalog has no pricing for.
+      ui.onUsage({ inputTokens: 100, outputTokens: 100 }, 'claude-opus-5');
+      expect(ui.getTotals().cost).toBe(0);
+    });
+    expect(out).toContain('cost unknown');
+    expect(out).not.toContain('no API cost');
+  });
+
   it('stays silent when showUsage is false but still tracks totals', () => {
     const out = capture(() => {
       const ui = createTerminalUI({ model: 'claude-opus-4-8', showUsage: false });

From db943f906f3d85249262441fe6d16e22dc7d3b22 Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:54:30 -0500
Subject: [PATCH 11/14] fix(ui): don't label up-front routing as 'escalated'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A heavy turn that started on the frontier model printed '↑ escalated to …'
even though nothing was escalated. Pass an 'initial' flag through onRoute and
word up-front routing as '▸ routed to …', reserving 'escalated' for mid-turn
hand-offs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/agent/loop.ts       | 11 ++++++++---
 src/ui/render.ts        |  5 +++--
 tests/ui/render.test.ts | 15 ++++++++++++---
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index e280a88..a246583 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -18,8 +18,13 @@ export interface AgentUI {
    * from a *local* turn (no API cost).
    */
   onUsage(usage: Usage, model?: string, provider?: string): void;
-  /** Fired when local-first routing escalates the turn to the frontier model. */
-  onRoute(provider: string, model: string, reason: string): void;
+  /**
+   * Fired when local-first routing sends a turn to the frontier model. `initial`
+   * is true when the turn *started* there (up-front classification), false when
+   * it was escalated mid-turn — so the UI doesn't claim "escalated" for a turn
+   * that never ran locally.
+   */
+  onRoute(provider: string, model: string, reason: string, initial?: boolean): void;
   onAssistantEnd(): void;
   onMaxIterations(): void;
 }
@@ -157,7 +162,7 @@ export class AgentLoop {
   /** Pick the provider for a turn: heavy tasks start on the frontier model. */
   private selectInitialProvider(input: string): ModelProvider {
     if (this.escalationProvider && this.router && this.router(input) === 'heavy') {
-      this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, 'heavy task');
+      this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, 'heavy task', true);
       return this.escalationProvider;
     }
     return this.provider;
diff --git a/src/ui/render.ts b/src/ui/render.ts
index f31629d..ea87e44 100644
--- a/src/ui/render.ts
+++ b/src/ui/render.ts
@@ -101,9 +101,10 @@ export function createTerminalUI(opts: TerminalUIOptions = {}): TerminalUI {
       }
       write(pc.dim(`· ${tokens} · ${money}\n`));
     },
-    onRoute(provider, model, reason) {
+    onRoute(provider, model, reason, initial) {
       ensureNewline();
-      write(pc.yellow(`↑ escalated to ${provider}:${model} (${reason})\n`));
+      const verb = initial ? '▸ routed to' : '↑ escalated to';
+      write(pc.yellow(`${verb} ${provider}:${model} (${reason})\n`));
     },
     onAssistantEnd() {
       ensureNewline();
diff --git a/tests/ui/render.test.ts b/tests/ui/render.test.ts
index a49b331..89acde2 100644
--- a/tests/ui/render.test.ts
+++ b/tests/ui/render.test.ts
@@ -97,12 +97,21 @@ describe('createTerminalUI', () => {
     expect(out).toBe('');
   });
 
-  it('renders an escalation route line', () => {
+  it('renders a mid-turn escalation route line', () => {
     const out = capture(() => {
       const ui = createTerminalUI();
-      ui.onRoute('anthropic', 'claude-opus-4-8', 'heavy task');
+      ui.onRoute('anthropic', 'claude-opus-4-8', 'requested by model');
     });
     expect(out).toContain('escalated to anthropic:claude-opus-4-8');
-    expect(out).toContain('heavy task');
+    expect(out).toContain('requested by model');
+  });
+
+  it('renders up-front routing as "routed to", not "escalated"', () => {
+    const out = capture(() => {
+      const ui = createTerminalUI();
+      ui.onRoute('anthropic', 'claude-opus-4-8', 'heavy task', true);
+    });
+    expect(out).toContain('routed to anthropic:claude-opus-4-8');
+    expect(out).not.toContain('escalated');
   });
 });

From dc58b226dedc277fb9b8976043bd85d180325faa Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 09:55:19 -0500
Subject: [PATCH 12/14] fix(loop): keep escalation sticky across follow-up
 turns

Escalation state was run()-scoped, so a multi-turn hard task that escalated
mid-turn restarted on the local model each follow-up and could ping-pong.
Persist the escalated state for the session once a turn hands off mid-flight
(model request or stuck); clearHistory() resets it so a fresh conversation
re-routes from scratch. Up-front heavy routing stays per-turn.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/agent/loop.ts        | 26 ++++++++++++++++++++++----
 tests/agent/loop.test.ts | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index a246583..fab59d8 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -63,6 +63,12 @@ export class AgentLoop {
   private readonly router: ((input: string) => 'light' | 'heavy') | undefined;
   private readonly messages: Message[] = [];
   private sessionUsage: Usage = { inputTokens: 0, outputTokens: 0 };
+  /**
+   * Set once a turn escalates mid-flight (model request or stuck). Subsequent
+   * turns then start on the frontier model so a multi-turn hard task doesn't
+   * ping-pong back to the local model on each follow-up. Reset by clearHistory().
+   */
+  private escalatedSession = false;
 
   constructor(opts: AgentLoopOptions) {
     this.provider = opts.provider;
@@ -82,9 +88,11 @@ export class AgentLoop {
   }
 
   /** Drop the conversation history so the next turn starts fresh. Cumulative
-   *  token usage is preserved, since it reflects the whole session's cost. */
+   *  token usage is preserved, since it reflects the whole session's cost.
+   *  Also clears sticky escalation: a fresh conversation re-routes from scratch. */
   clearHistory(): void {
     this.messages.length = 0;
+    this.escalatedSession = false;
   }
 
   /** Cumulative token usage across all turns in this session. */
@@ -97,8 +105,16 @@ export class AgentLoop {
     this.messages.push({ role: 'user', content: [{ type: 'text', text: userInput }] });
     const tools = this.registry.toSchemas();
 
-    let active = this.selectInitialProvider(userInput);
-    let escalated = active === this.escalationProvider;
+    let active: ModelProvider;
+    let escalated: boolean;
+    if (this.escalatedSession && this.escalationProvider) {
+      // A prior turn escalated; stay on the frontier model for follow-ups.
+      active = this.escalationProvider;
+      escalated = true;
+    } else {
+      active = this.selectInitialProvider(userInput);
+      escalated = active === this.escalationProvider;
+    }
     let consecutiveErrors = 0;
 
     for (let iteration = 0; iteration < this.maxIterations; iteration += 1) {
@@ -168,9 +184,11 @@ export class AgentLoop {
     return this.provider;
   }
 
-  /** Switch to the frontier provider mid-turn. Falls back to the primary if unset. */
+  /** Switch to the frontier provider mid-turn. Falls back to the primary if unset.
+   *  Marks the session as escalated so follow-up turns stay on the frontier model. */
   private escalate(reason: string): ModelProvider {
     if (!this.escalationProvider) return this.provider;
+    this.escalatedSession = true;
     this.ui.onRoute(this.escalationProvider.name, this.escalationProvider.model, reason);
     return this.escalationProvider;
   }
diff --git a/tests/agent/loop.test.ts b/tests/agent/loop.test.ts
index 4fc006b..56b811e 100644
--- a/tests/agent/loop.test.ts
+++ b/tests/agent/loop.test.ts
@@ -251,6 +251,45 @@ describe('AgentLoop', () => {
     expect(events).toContain('text:handled');
   });
 
+  it('stays on the frontier model for follow-up turns once escalated', async () => {
+    const escalateRegistry = createRegistry([echoTool, escalateTool]);
+    const local = new ScriptedProvider(
+      [[{ type: 'tool_call', id: 'e1', name: 'escalate', input: { reason: 'too hard' } }, DONE]],
+      'local',
+    );
+    const frontier = new ScriptedProvider(
+      [
+        [{ type: 'text', delta: 'handled' }, DONE], // finishes the escalated turn
+        [{ type: 'text', delta: 'follow-up' }, DONE], // next turn should land here too
+      ],
+      'big',
+      'anthropic',
+    );
+    const { ui } = recordingUI();
+    const loop = new AgentLoop({
+      provider: local,
+      registry: escalateRegistry,
+      gate: gateWith('yes'),
+      ui,
+      system: 'sys',
+      cwd: process.cwd(),
+      escalationProvider: frontier,
+      router: () => 'light',
+    });
+
+    await loop.run('start small then get stuck');
+    await loop.run('a routine follow-up');
+
+    // The follow-up turn never touched the local provider.
+    expect(local.sent).toHaveLength(1);
+    expect(frontier.sent).toHaveLength(2);
+
+    // clearHistory resets stickiness: the next light turn goes back to local.
+    loop.clearHistory();
+    await loop.run('another light request');
+    expect(local.sent).toHaveLength(2);
+  });
+
   it('behaves as a single provider when no escalation is configured', async () => {
     const provider = new ScriptedProvider([[{ type: 'text', delta: 'hi' }, DONE]]);
     const { ui, events } = recordingUI();

From 785b832ccde47c6f36e59d5f1e8390213d03c9c3 Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 12:50:50 -0500
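Subject: [PATCH 13/14] fix: resolve broken import and regressed UI labeling

- Import estimateCost in agent/decision/localFirst.ts from models/catalog.js
  instead of providers/pricing.js; add estimateCost(modelId, usage) to the
  catalog (returns null when the model has no catalog entry) and re-export it
  from src/index.ts.
- Pass `initial` through AgentLoop.applyRoute so up-front routing is reported
  as "routed to" rather than "escalated to"; update the loop tests to match.
- Add the changeset and document TINY_CODE_PRIORITY / TINY_CODE_EFFORT in
  .env.example.
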
---
 .changeset/local-models-cost-routing.md | 26 +++++++++++++++++++++++++
 .env.example                            |  2 ++
 src/agent/decision/localFirst.ts        |  2 +-
 src/agent/loop.ts                       | 13 ++++++++++---
 src/index.ts                            |  1 +
 src/models/catalog.ts                   | 10 ++++++++++
 tests/agent/loop.test.ts                |  8 ++++----
 7 files changed, 54 insertions(+), 8 deletions(-)
 create mode 100644 .changeset/local-models-cost-routing.md

diff --git a/.changeset/local-models-cost-routing.md b/.changeset/local-models-cost-routing.md
new file mode 100644
index 0000000..240da17
--- /dev/null
+++ b/.changeset/local-models-cost-routing.md
@@ -0,0 +1,26 @@
+---
+"@therr/tiny-code": minor
+---
+
+Add local models and cost-aware, local-first routing.
+
+- **Local (Ollama) provider.** Talk to a local Ollama server over its
+  OpenAI-compatible API (`--provider ollama`), with an idle timeout so a hung
+  model can't freeze the REPL, best-effort token-usage reporting, and configurable
+  `maxTokens`.
+- **Local-first routing.** Set `routing: "local-first"` with an `escalateTo`
+  target to run a cheap/local model by default and escalate heavy turns (or a
+  stuck local model, via the new `escalate` tool) to a frontier model — with full
+  conversation context preserved. Escalation is sticky across follow-up turns.
+- **Model-selection policy** is now owned by a pluggable `ModelDecisionEngine`
+  (`LocalFirstModelEngine`), keeping the agent loop pure mechanism.
+- **Compute awareness.** On startup with a local model, tiny-code estimates RAM
+  need vs. machine capacity and warns when a model likely won't fit or is too
+  small (≤3B) to tool-call reliably; an over-RAM local model is routed to the
+  frontier up front.
+- **Priority-driven model selection.** The `priority` setting (`performance`,
+  `cost`, or `balanced`; env var `TINY_CODE_PRIORITY`) auto-picks a catalog
+  model when none is pinned.
+- The `/costs` view reports session usage, estimated spend, and routing, and the
+  usage line distinguishes an unpriced *cloud* turn ("cost unknown") from a
+  *local* turn ("no API cost").
diff --git a/.env.example b/.env.example
index 57780e4..ab7215c 100644
--- a/.env.example
+++ b/.env.example
@@ -7,6 +7,8 @@ GEMINI_API_KEY=
 # TINY_CODE_PROVIDER=anthropic   # anthropic | gemini | ollama
 # TINY_CODE_MODEL=claude-opus-4-8
 # TINY_CODE_OLLAMA_URL=http://localhost:11434/v1   # Ollama OpenAI-compatible endpoint
+# TINY_CODE_PRIORITY=performance # performance | cost | balanced — auto-picks a model when none is pinned
+# TINY_CODE_EFFORT=high          # low | medium | high | xhigh | max — Anthropic thinking budget
 
 # Self-improvement: reflect on sessions and propose markdown-only improvement PRs.
 # On by default; set to 0 to disable. Requires the `gh` CLI installed + authed.
diff --git a/src/agent/decision/localFirst.ts b/src/agent/decision/localFirst.ts
index 4d60c2b..87c5d92 100644
--- a/src/agent/decision/localFirst.ts
+++ b/src/agent/decision/localFirst.ts
@@ -1,7 +1,7 @@
 import type { ModelProvider } from '../../providers/types.js';
 import { classifyTurn, type TaskWeight } from '../router.js';
 import { checkLocalModel } from '../../system/resources.js';
-import { estimateCost } from '../../providers/pricing.js';
+import { estimateCost } from '../../models/catalog.js';
 import type { ModelDecisionEngine, RouteDecision, TurnSignals } from './types.js';
 
 export interface LocalFirstOptions {
diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index 90b63fc..d57570b 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -117,7 +117,7 @@ export class AgentLoop {
         escalated = true;
       } else {
         const initial = this.engine.selectInitial(userInput);
-        active = this.applyRoute(active, initial);
+        active = this.applyRoute(active, initial, true);
         escalated = active !== this.provider;
       }
     }
@@ -192,10 +192,17 @@ export class AgentLoop {
   /**
    * Apply a routing decision: switch to its provider and surface an `onRoute`
    * event when it actually changes the active provider and carries a reason.
+   * `initial` distinguishes up-front routing ("routed to") from a mid-turn
+   * hand-off ("escalated to") so the UI never claims a turn was escalated when
+   * it started on the frontier model.
    */
-  private applyRoute(active: ModelProvider, decision: RouteDecision): ModelProvider {
+  private applyRoute(
+    active: ModelProvider,
+    decision: RouteDecision,
+    initial = false,
+  ): ModelProvider {
     if (decision.provider !== active && decision.reason) {
-      this.ui.onRoute(decision.provider.name, decision.provider.model, decision.reason);
+      this.ui.onRoute(decision.provider.name, decision.provider.model, decision.reason, initial);
     }
     return decision.provider;
   }
diff --git a/src/index.ts b/src/index.ts
index 1d205bb..d10bb65 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -36,6 +36,7 @@ export {
   CATALOG_AS_OF,
   getModelInfo,
   estimateCostUsd,
+  estimateCost,
   formatUsd,
   blendedCostPerMTok,
   recommendModel,
diff --git a/src/models/catalog.ts b/src/models/catalog.ts
index a5a9c98..b7f1a94 100644
--- a/src/models/catalog.ts
+++ b/src/models/catalog.ts
@@ -66,6 +66,16 @@ export function estimateCostUsd(usage: Usage, info: ModelInfo): number {
   );
 }
 
+/**
+ * Estimate the USD cost of a token usage for a model id, or `null` when the
+ * model isn't in the catalog — e.g. a local/Ollama model that has no API price.
+ * A `null` means "no known price", not "free"; callers decide how to present it.
+ */
+export function estimateCost(modelId: string, usage: Usage): number | null {
+  const info = getModelInfo(modelId);
+  return info ? estimateCostUsd(usage, info) : null;
+}
+
 /** Format a USD amount with precision that stays readable for tiny costs. */
 export function formatUsd(amount: number): string {
   return `$${amount.toFixed(amount < 1 ? 4 : 2)}`;
diff --git a/tests/agent/loop.test.ts b/tests/agent/loop.test.ts
index 496d97a..cec1e20 100644
--- a/tests/agent/loop.test.ts
+++ b/tests/agent/loop.test.ts
@@ -45,7 +45,7 @@ function recordingUI(): { ui: AgentUI; events: string[] } {
     onToolResult: (n, r) => events.push(`result:${n}:${r.output}:${r.isError ?? false}`),
     onToolDenied: (n) => events.push(`denied:${n}`),
     onUsage: () => events.push('usage'),
-    onRoute: (p, m, r) => events.push(`route:${p}:${m}:${r}`),
+    onRoute: (p, m, r, initial) => events.push(`route:${p}:${m}:${r}:${initial ? 'initial' : 'escalated'}`),
     onAssistantEnd: () => events.push('assistantEnd'),
     onMaxIterations: () => events.push('maxIter'),
   };
@@ -196,7 +196,7 @@ describe('AgentLoop', () => {
 
     expect(frontier.sent).toHaveLength(1);
     expect(local.sent).toHaveLength(0);
-    expect(events).toContain('route:anthropic:big:heavy task');
+    expect(events).toContain('route:anthropic:big:heavy task:initial');
   });
 
   it('keeps a light turn on the local provider', async () => {
@@ -245,7 +245,7 @@ describe('AgentLoop', () => {
     // First send on local, second (post-escalation) on frontier.
     expect(local.sent).toHaveLength(1);
     expect(frontier.sent).toHaveLength(1);
-    expect(events).toContain('route:anthropic:big:requested by model');
+    expect(events).toContain('route:anthropic:big:requested by model:escalated');
     expect(events).toContain('text:handled');
   });
 
@@ -271,7 +271,7 @@ describe('AgentLoop', () => {
 
     expect(local.sent).toHaveLength(3);
     expect(frontier.sent).toHaveLength(1);
-    expect(events).toContain('route:anthropic:big:stuck — repeated tool errors');
+    expect(events).toContain('route:anthropic:big:stuck — repeated tool errors:escalated');
     expect(events).toContain('text:rescued');
   });
 

From 1c7ee020fef48209cac360623ab653d659edaa06 Mon Sep 17 00:00:00 2001
From: Zack Anselm <zanselm5@gmail.com>
Date: Tue, 9 Jun 2026 12:51:07 -0500
Subject: [PATCH 14/14] release: v0.2.0

---
 package-lock.json | 4 ++--
 package.json      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index eb5c3b2..1685228 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@therr/tiny-code",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@therr/tiny-code",
-      "version": "0.1.0",
+      "version": "0.2.0",
       "license": "SEE LICENSE IN LICENSE",
       "dependencies": {
         "@anthropic-ai/sdk": "^0.69.0",
diff --git a/package.json b/package.json
index d3482f7..b035254 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@therr/tiny-code",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "A small, extensible CLI coding agent with interchangeable Anthropic and Gemini models.",
   "type": "module",
   "bin": {