From 52b179d6653fc85610af55b9f7a9f1634f50c453 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 10 Jun 2026 12:52:56 +0000
Subject: [PATCH 1/4] Add DeepSeek and Qwen Coder provider support

Extract the Ollama provider's OpenAI-compatible streaming core into a
reusable OpenAiCompatibleProvider base, then add hosted DeepSeek and Qwen
(DashScope) providers on top of it. Wire them through config (keys, base-URL
overrides, provider inference, defaults), the model catalog (pricing, context
window, and coding scores for deepseek-v4-{pro,flash} and
qwen3-coder-{plus,flash}), and the cost UI. Update CLI help, .env.example,
and README, and add tests for config loading, the catalog, and cloud sends.

https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp
---
 .changeset/qwen-deepseek-coder.md       |  19 ++
 .env.example                            |  11 +-
 README.md                               |  20 +-
 src/cli.ts                              |   7 +-
 src/config/load.ts                      |  36 ++-
 src/index.ts                            |  12 +-
 src/models/catalog.ts                   |  11 +-
 src/providers/deepseek.ts               |  33 +++
 src/providers/index.ts                  |  29 +++
 src/providers/ollama.ts                 | 269 ++--------------------
 src/providers/openai-compatible.ts      | 285 ++++++++++++++++++++++++
 src/providers/qwen.ts                   |  33 +++
 src/providers/types.ts                  |   2 +-
 src/ui/render.ts                        |   7 +-
 tests/config/load.test.ts               |  39 ++++
 tests/models/catalog.test.ts            |  12 +
 tests/providers/openaiCloudSend.test.ts |  81 +++++++
 17 files changed, 637 insertions(+), 269 deletions(-)
 create mode 100644 .changeset/qwen-deepseek-coder.md
 create mode 100644 src/providers/deepseek.ts
 create mode 100644 src/providers/openai-compatible.ts
 create mode 100644 src/providers/qwen.ts
 create mode 100644 tests/providers/openaiCloudSend.test.ts

diff --git a/.changeset/qwen-deepseek-coder.md b/.changeset/qwen-deepseek-coder.md
new file mode 100644
index 0000000..53d5185
--- /dev/null
+++ b/.changeset/qwen-deepseek-coder.md
@@ -0,0 +1,19 @@
+---
+"@therr/tiny-code": minor
+---
+
+Add DeepSeek and Qwen Coder model support.
+
+- **DeepSeek and Qwen providers.** Two new hosted, OpenAI-compatible providers
+  (`--provider deepseek` / `--provider qwen`), keyed by `DEEPSEEK_API_KEY` and
+  `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`). Endpoints are overridable via
+  `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` or `deepseekBaseUrl` /
+  `qwenBaseUrl` in config — e.g. to target the international DashScope host.
+- **Shared OpenAI-compatible core.** The streaming/tool-call adapter that backed
+  the Ollama provider is now a reusable `OpenAiCompatibleProvider` base; Ollama,
+  DeepSeek, and Qwen all extend it, differing only in endpoint, auth, and error
+  wording.
+- **Catalog entries** for `deepseek-v4-pro`, `deepseek-v4-flash`,
+  `qwen3-coder-plus`, and `qwen3-coder-flash`, so `/costs` estimates and
+  priority-based model selection work for the new providers. `/costs` treats both
+  as paid cloud providers.
diff --git a/.env.example b/.env.example
index ab7215c..bc84fe7 100644
--- a/.env.example
+++ b/.env.example
@@ -1,12 +1,17 @@
-# Provide at least one for cloud providers. If both are present, Anthropic is
-# the default. Ollama runs locally and needs no key.
+# Provide at least one key for a cloud provider. If several are present, the default
+# is the first available of: Anthropic, Gemini, DeepSeek, Qwen. Ollama runs locally and
+# needs no key. QWEN_API_KEY is an Alibaba DashScope key (DASHSCOPE_API_KEY also accepted).
 ANTHROPIC_API_KEY=
 GEMINI_API_KEY=
+DEEPSEEK_API_KEY=
+QWEN_API_KEY=
 
 # Optional overrides (also settable via config file / CLI flags)
-# TINY_CODE_PROVIDER=anthropic   # anthropic | gemini | ollama
+# TINY_CODE_PROVIDER=anthropic   # anthropic | gemini | ollama | deepseek | qwen
 # TINY_CODE_MODEL=claude-opus-4-8
 # TINY_CODE_OLLAMA_URL=http://localhost:11434/v1   # Ollama OpenAI-compatible endpoint
+# TINY_CODE_DEEPSEEK_URL=https://api.deepseek.com/v1
+# TINY_CODE_QWEN_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
 # TINY_CODE_PRIORITY=performance # performance | cost | balanced — auto-picks a model when none is pinned
 # TINY_CODE_EFFORT=high          # low | medium | high | xhigh | max — Anthropic thinking budget
 
diff --git a/README.md b/README.md
index 17b89cb..d55957b 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,9 @@
 A small, extensible CLI coding agent built around one constraint: **keep token
 usage low**. As coding-agent costs climb, tiny-code automates the savings so
 you don't have to. Interactive terminal REPL, interchangeable **Anthropic**,
-**Gemini**, and **local (Ollama)** models, and just the core features you
-actually use: read/write/edit files, run shell commands, search code, and a
-custom commands/skills system. No business logic baked in.
+**Gemini**, **DeepSeek**, **Qwen Coder**, and **local (Ollama)** models, and just
+the core features you actually use: read/write/edit files, run shell commands,
+search code, and a custom commands/skills system. No business logic baked in.
 
 Run cheap, open-weight models locally and **escalate heavy work to a frontier
 model only when needed** — see [Local models & cost-aware routing](#local-models--cost-aware-routing).
@@ -29,19 +29,28 @@ node dist/cli.js
 
 ## Setup
 
-Provide at least one API key. If both are set, Anthropic is used by default.
+Provide at least one API key. If several are set, the default is the first
+available in this order: Anthropic, Gemini, DeepSeek, Qwen.
 
 ```bash
 export ANTHROPIC_API_KEY=sk-ant-...
 export GEMINI_API_KEY=...
+export DEEPSEEK_API_KEY=sk-...
+export QWEN_API_KEY=sk-...        # Alibaba DashScope key (DASHSCOPE_API_KEY also works)
 ```
 
+DeepSeek and Qwen are hosted providers with OpenAI-compatible APIs. Override their
+endpoints with `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` (or `deepseekBaseUrl`
+/ `qwenBaseUrl` in config) — e.g. to point Qwen at the international DashScope host.
+
 ## Usage
 
 ```bash
 tiny-code                       # start the REPL (uses an available key)
 tiny-code --provider gemini     # force a provider
 tiny-code --model claude-opus-4-8
+tiny-code --provider deepseek --model deepseek-v4-pro     # DeepSeek's coding model
+tiny-code --provider qwen --model qwen3-coder-plus        # Qwen Coder
 tiny-code --provider ollama --model gemma3:12b   # run a local model (no API cost)
 ```
 
@@ -154,7 +163,8 @@ CLI flags.
 `routing: "local-first"` plus `escalateTo` enables cost-aware routing (see
 [above](#local-models--cost-aware-routing)); it defaults to `local-first`
 automatically whenever `escalateTo` is present. `ollamaBaseUrl` points at your
-Ollama server's OpenAI-compatible endpoint.
+Ollama server's OpenAI-compatible endpoint; `deepseekBaseUrl` / `qwenBaseUrl`
+override the DeepSeek and Qwen (DashScope) endpoints.
 
 Approximate cloud pricing used for the `/costs` estimate lives in the model
 catalog (`src/models/catalog.ts`) — edit it to match current vendor rates.
diff --git a/src/cli.ts b/src/cli.ts
index 19112bb..68ce55d 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -12,8 +12,9 @@ Usage:
   tiny-code [options]
 
 Options:
-  --provider <name>   anthropic | gemini | ollama (default: inferred from API keys)
-  --model <id>        Model id override (e.g. claude-opus-4-8, gemma3:12b)
+  --provider <name>   anthropic | gemini | ollama | deepseek | qwen
+                      (default: inferred from API keys)
+  --model <id>        Model id override (e.g. claude-opus-4-8, qwen3-coder-plus)
   --config <path>     Path to a config JSON file
   -v, --version       Print version
   -h, --help          Show this help
@@ -21,6 +22,8 @@ Options:
 Environment:
   ANTHROPIC_API_KEY    Required for the Anthropic provider
   GEMINI_API_KEY       Required for the Gemini provider
+  DEEPSEEK_API_KEY     Required for the DeepSeek provider
+  QWEN_API_KEY         Required for the Qwen provider (or DASHSCOPE_API_KEY)
   TINY_CODE_OLLAMA_URL Ollama OpenAI-compatible base URL (default http://localhost:11434/v1)
   TINY_CODE_PRIORITY   performance | cost | balanced — auto-picks a model when
                        none is pinned (default: performance)
diff --git a/src/config/load.ts b/src/config/load.ts
index 4d25096..9c84243 100644
--- a/src/config/load.ts
+++ b/src/config/load.ts
@@ -5,7 +5,7 @@ import { z } from 'zod';
 import type { Priority } from '../models/catalog.js';
 import { recommendModel } from '../models/catalog.js';
 
-export type Provider = 'anthropic' | 'gemini' | 'ollama';
+export type Provider = 'anthropic' | 'gemini' | 'ollama' | 'deepseek' | 'qwen';
 export type Effort = 'low' | 'medium' | 'high' | 'xhigh' | 'max';
 export type Routing = 'local-first' | 'off';
 export type { Priority } from '../models/catalog.js';
@@ -34,8 +34,14 @@ export interface ResolvedConfig {
   priority: Priority;
   anthropicApiKey: string | undefined;
   geminiApiKey: string | undefined;
+  deepseekApiKey: string | undefined;
+  qwenApiKey: string | undefined;
   /** OpenAI-compatible base URL for the Ollama provider. */
   ollamaBaseUrl: string;
+  /** Override for the DeepSeek API endpoint (defaults to DeepSeek's hosted URL). */
+  deepseekBaseUrl: string | undefined;
+  /** Override for the Qwen/DashScope API endpoint (defaults to DashScope's URL). */
+  qwenBaseUrl: string | undefined;
   maxTokens: number;
   thinking: boolean;
   effort: Effort;
@@ -69,21 +75,27 @@ const DEFAULT_MODELS: Record<Provider, string> = {
   anthropic: 'claude-opus-4-8',
   gemini: 'gemini-2.5-pro',
   ollama: 'qwen2.5-coder:7b',
+  deepseek: 'deepseek-v4-pro',
+  qwen: 'qwen3-coder-plus',
 };
 
 const DEFAULT_OLLAMA_URL = 'http://localhost:11434/v1';
 
+const PROVIDERS = ['anthropic', 'gemini', 'ollama', 'deepseek', 'qwen'] as const;
+
 const EscalateTargetSchema = z.object({
-  provider: z.enum(['anthropic', 'gemini', 'ollama']),
+  provider: z.enum(PROVIDERS),
   model: z.string(),
   ollamaBaseUrl: z.string().url().optional(),
 });
 
 const FileConfigSchema = z
   .object({
-    provider: z.enum(['anthropic', 'gemini', 'ollama']).optional(),
+    provider: z.enum(PROVIDERS).optional(),
     model: z.string().optional(),
     ollamaBaseUrl: z.string().url().optional(),
+    deepseekBaseUrl: z.string().url().optional(),
+    qwenBaseUrl: z.string().url().optional(),
     priority: z.enum(['performance', 'cost', 'balanced']).optional(),
     maxTokens: z.number().int().positive().optional(),
     thinking: z.boolean().optional(),
@@ -132,12 +144,22 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
   const env = process.env;
   const anthropicApiKey = env.ANTHROPIC_API_KEY || undefined;
   const geminiApiKey = env.GEMINI_API_KEY || undefined;
+  const deepseekApiKey = env.DEEPSEEK_API_KEY || undefined;
+  const qwenApiKey = env.QWEN_API_KEY || env.DASHSCOPE_API_KEY || undefined;
 
   const provider: Provider =
     overrides.provider ??
     (env.TINY_CODE_PROVIDER as Provider | undefined) ??
     file.provider ??
-    (anthropicApiKey ? 'anthropic' : geminiApiKey ? 'gemini' : 'anthropic');
+    (anthropicApiKey
+      ? 'anthropic'
+      : geminiApiKey
+        ? 'gemini'
+        : deepseekApiKey
+          ? 'deepseek'
+          : qwenApiKey
+            ? 'qwen'
+            : 'anthropic');
 
   const priority: Priority =
     (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'performance';
@@ -158,6 +180,8 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
   const effort = (env.TINY_CODE_EFFORT as Effort | undefined) ?? file.effort ?? 'high';
 
   const ollamaBaseUrl = env.TINY_CODE_OLLAMA_URL ?? file.ollamaBaseUrl ?? DEFAULT_OLLAMA_URL;
+  const deepseekBaseUrl = env.TINY_CODE_DEEPSEEK_URL ?? file.deepseekBaseUrl;
+  const qwenBaseUrl = env.TINY_CODE_QWEN_URL ?? file.qwenBaseUrl;
 
   const escalateTo = file.escalateTo;
   // Default to local-first whenever an escalation target is configured.
@@ -174,7 +198,11 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
     priority,
     anthropicApiKey,
     geminiApiKey,
+    deepseekApiKey,
+    qwenApiKey,
     ollamaBaseUrl,
+    deepseekBaseUrl,
+    qwenBaseUrl,
     maxTokens,
     thinking: file.thinking ?? true,
     effort,
diff --git a/src/index.ts b/src/index.ts
index d10bb65..ececf80 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -8,9 +8,17 @@ export type { AgentUI, AgentLoopOptions } from './agent/loop.js';
 export { buildSystemPrompt } from './agent/systemPrompt.js';
 export type { SystemPromptParams } from './agent/systemPrompt.js';
 
-export { createProvider, AnthropicProvider, GeminiProvider, OllamaProvider } from './providers/index.js';
+export {
+  createProvider,
+  AnthropicProvider,
+  GeminiProvider,
+  OllamaProvider,
+  DeepSeekProvider,
+  QwenProvider,
+  OpenAiCompatibleProvider,
+} from './providers/index.js';
 export type { ModelProvider, ProviderEvent, SendRequest, ToolSchema, Usage } from './providers/types.js';
-export { toOpenAiMessages, toOpenAiTools } from './providers/ollama.js';
+export { toOpenAiMessages, toOpenAiTools } from './providers/openai-compatible.js';
 
 export { classifyTurn } from './agent/router.js';
 export type { TaskWeight } from './agent/router.js';
diff --git a/src/models/catalog.ts b/src/models/catalog.ts
index b7f1a94..428a1bd 100644
--- a/src/models/catalog.ts
+++ b/src/models/catalog.ts
@@ -32,7 +32,7 @@ export interface ModelInfo {
  * from the bundled claude-api reference; Gemini figures from Google's published
  * API pricing.
  */
-export const CATALOG_AS_OF = '2026-06-08';
+export const CATALOG_AS_OF = '2026-06-10';
 
 /**
  * The known coding models, newest/most-capable first within each provider.
@@ -51,6 +51,15 @@ export const MODEL_CATALOG: ModelInfo[] = [
   { id: 'gemini-2.5-pro', provider: 'gemini', label: 'Gemini 2.5 Pro', inputPricePerMTok: 1.25, outputPricePerMTok: 10, contextWindow: 1_048_576, codingScore: 90 },
   { id: 'gemini-2.5-flash', provider: 'gemini', label: 'Gemini 2.5 Flash', inputPricePerMTok: 0.3, outputPricePerMTok: 2.5, contextWindow: 1_048_576, codingScore: 72 },
   { id: 'gemini-2.5-flash-lite', provider: 'gemini', label: 'Gemini 2.5 Flash-Lite', inputPricePerMTok: 0.1, outputPricePerMTok: 0.4, contextWindow: 1_048_576, codingScore: 55 },
+
+  // DeepSeek — DeepSeek API (cache-miss) pricing. The V4 family carries DeepSeek's
+  // coding capability; the legacy "deepseek-coder" model is retired.
+  { id: 'deepseek-v4-pro', provider: 'deepseek', label: 'DeepSeek V4 Pro', inputPricePerMTok: 1.74, outputPricePerMTok: 3.48, contextWindow: 1_048_576, codingScore: 91 },
+  { id: 'deepseek-v4-flash', provider: 'deepseek', label: 'DeepSeek V4 Flash', inputPricePerMTok: 0.14, outputPricePerMTok: 0.28, contextWindow: 1_048_576, codingScore: 80 },
+
+  // Qwen Coder — Alibaba DashScope pricing for the proprietary coder models.
+  { id: 'qwen3-coder-plus', provider: 'qwen', label: 'Qwen3 Coder Plus', inputPricePerMTok: 0.65, outputPricePerMTok: 3.25, contextWindow: 1_000_000, codingScore: 89 },
+  { id: 'qwen3-coder-flash', provider: 'qwen', label: 'Qwen3 Coder Flash', inputPricePerMTok: 0.195, outputPricePerMTok: 0.975, contextWindow: 1_000_000, codingScore: 78 },
 ];
 
 /** Look up catalog facts for a model id, or `undefined` if it's not tracked. */
diff --git a/src/providers/deepseek.ts b/src/providers/deepseek.ts
new file mode 100644
index 0000000..7618ac5
--- /dev/null
+++ b/src/providers/deepseek.ts
@@ -0,0 +1,33 @@
+import { OpenAiCompatibleProvider, type OpenAiCompatibleOptions } from './openai-compatible.js';
+
+/** DeepSeek's hosted OpenAI-compatible endpoint. */
+export const DEFAULT_DEEPSEEK_URL = 'https://api.deepseek.com/v1';
+
+export interface DeepSeekProviderOptions extends Omit<OpenAiCompatibleOptions, 'baseUrl'> {
+  apiKey: string;
+  /** Override the API endpoint (defaults to {@link DEFAULT_DEEPSEEK_URL}). */
+  baseUrl?: string | undefined;
+}
+
+/**
+ * DeepSeek's cloud models (the V4 family powers its coding capability) over the
+ * OpenAI-compatible Chat Completions API. Differs from the local Ollama
+ * provider only in endpoint, required API key, and error wording.
+ */
+export class DeepSeekProvider extends OpenAiCompatibleProvider {
+  readonly name = 'deepseek' as const;
+
+  constructor(opts: DeepSeekProviderOptions) {
+    super({ ...opts, baseUrl: opts.baseUrl ?? DEFAULT_DEEPSEEK_URL });
+  }
+
+  protected override label(): string {
+    return 'DeepSeek';
+  }
+
+  protected override unreachableError(err: Error): Error {
+    return new Error(
+      `Cannot reach DeepSeek at ${this.baseUrl}. Check your network and DEEPSEEK_API_KEY. (${err.message})`,
+    );
+  }
+}
diff --git a/src/providers/index.ts b/src/providers/index.ts
index 89c6b3f..7d620af 100644
--- a/src/providers/index.ts
+++ b/src/providers/index.ts
@@ -3,11 +3,16 @@ import type { ResolvedConfig } from '../config/load.js';
 import { AnthropicProvider } from './anthropic.js';
 import { GeminiProvider } from './gemini.js';
 import { OllamaProvider } from './ollama.js';
+import { DeepSeekProvider } from './deepseek.js';
+import { QwenProvider } from './qwen.js';
 
 export type { ModelProvider, ProviderEvent, SendRequest, ToolSchema, Usage } from './types.js';
 export { AnthropicProvider } from './anthropic.js';
 export { GeminiProvider } from './gemini.js';
 export { OllamaProvider } from './ollama.js';
+export { DeepSeekProvider } from './deepseek.js';
+export { QwenProvider } from './qwen.js';
+export { OpenAiCompatibleProvider } from './openai-compatible.js';
 
 /** Construct the configured provider, validating that its API key is present. */
 export function createProvider(config: ResolvedConfig): ModelProvider {
@@ -33,6 +38,30 @@ export function createProvider(config: ResolvedConfig): ModelProvider {
     });
   }
 
+  if (config.provider === 'deepseek') {
+    if (!config.deepseekApiKey) {
+      throw new Error('DEEPSEEK_API_KEY is not set. Export it or switch providers with --provider anthropic.');
+    }
+    return new DeepSeekProvider({
+      apiKey: config.deepseekApiKey,
+      baseUrl: config.deepseekBaseUrl,
+      model: config.model,
+      maxTokens: config.maxTokens,
+    });
+  }
+
+  if (config.provider === 'qwen') {
+    if (!config.qwenApiKey) {
+      throw new Error('QWEN_API_KEY is not set. Export it or switch providers with --provider anthropic.');
+    }
+    return new QwenProvider({
+      apiKey: config.qwenApiKey,
+      baseUrl: config.qwenBaseUrl,
+      model: config.model,
+      maxTokens: config.maxTokens,
+    });
+  }
+
   if (!config.geminiApiKey) {
     throw new Error('GEMINI_API_KEY is not set. Export it or switch providers with --provider anthropic.');
   }
diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts
index 79f0ca8..d59808e 100644
--- a/src/providers/ollama.ts
+++ b/src/providers/ollama.ts
@@ -1,272 +1,41 @@
-import type { Message } from '../agent/types.js';
-import type { ModelProvider, ProviderEvent, SendRequest, ToolSchema } from './types.js';
+import { OpenAiCompatibleProvider, type OpenAiCompatibleOptions } from './openai-compatible.js';
 
-export interface OllamaProviderOptions {
-  /** OpenAI-compatible base URL, e.g. "http://localhost:11434/v1". */
-  baseUrl: string;
-  model: string;
+// Re-exported so existing importers keep their `./ollama.js` entry point; the
+// translation helpers are shared by every OpenAI-compatible provider now.
+export { toOpenAiMessages, toOpenAiTools } from './openai-compatible.js';
+
+export interface OllamaProviderOptions extends Omit<OpenAiCompatibleOptions, 'apiKey'> {
   /** Ignored by Ollama but required by the OpenAI wire format; defaults to "ollama". */
   apiKey?: string;
-  /** Cap on tokens to generate per response. Omitted from the request if unset. */
-  maxTokens?: number;
-  /**
-   * Abort the request if no bytes arrive for this long (ms). This is an *idle*
-   * timeout, reset on every received chunk — a slow-but-progressing model keeps
-   * going; a hung one (common when the machine is RAM-starved) is cut loose.
-   * Defaults to 120_000.
-   */
-  timeoutMs?: number;
-}
-
-interface OpenAiMessage {
-  role: 'system' | 'user' | 'assistant' | 'tool';
-  content: string;
-  tool_calls?: { id: string; type: 'function'; function: { name: string; arguments: string } }[];
-  tool_call_id?: string;
 }
 
 /**
- * Translate internal messages into OpenAI chat messages (the shape Ollama's
- * `/v1/chat/completions` endpoint accepts). Unlike Gemini, OpenAI correlates
- * tool results to calls by `tool_call_id`, and our Anthropic-style ids survive
- * the round trip — so no id synthesis is needed.
- *
- * Assumes the loop never mixes plain text and tool results in one user turn in a
- * way that would interleave them: we emit all `tool` messages first, then any
- * text as a trailing user message. OpenAI requires each `tool` message to follow
- * the assistant `tool_calls` that produced it; today's loop builds messages so
- * that holds. If a future change interleaves them, revisit this ordering.
+ * Local Ollama server over its OpenAI-compatible endpoint. Same wire format as
+ * the cloud OpenAI-compatible providers (it also covers LM Studio and vLLM by
+ * pointing `baseUrl` at them); only the auth default and the error wording
+ * differ: errors point at a local `ollama serve` or a model too large to run.
  */
-export function toOpenAiMessages(messages: Message[]): OpenAiMessage[] {
-  const out: OpenAiMessage[] = [];
-  for (const m of messages) {
-    if (m.role === 'user') {
-      // A user turn may carry plain text and/or tool results; emit each result
-      // as its own `tool` message and gather any text into one user message.
-      let text = '';
-      for (const b of m.content) {
-        if (b.type === 'text') text += b.text;
-        else if (b.type === 'tool_result') {
-          out.push({ role: 'tool', tool_call_id: b.toolUseId, content: b.content });
-        }
-      }
-      if (text.length > 0) out.push({ role: 'user', content: text });
-      continue;
-    }
-
-    // assistant: merge text + tool_use into a single message
-    let text = '';
-    const toolCalls: NonNullable<OpenAiMessage['tool_calls']> = [];
-    for (const b of m.content) {
-      if (b.type === 'text') text += b.text;
-      else if (b.type === 'tool_use') {
-        toolCalls.push({
-          id: b.id,
-          type: 'function',
-          function: { name: b.name, arguments: JSON.stringify(b.input ?? {}) },
-        });
-      }
-    }
-    const msg: OpenAiMessage = { role: 'assistant', content: text };
-    if (toolCalls.length > 0) msg.tool_calls = toolCalls;
-    out.push(msg);
-  }
-  return out;
-}
-
-/** Translate normalized tool schemas into OpenAI's `tools` array. */
-export function toOpenAiTools(tools: ToolSchema[]): unknown[] {
-  return tools.map((t) => ({
-    type: 'function',
-    function: { name: t.name, description: t.description, parameters: t.jsonSchema },
-  }));
-}
-
-interface StreamChoice {
-  delta?: {
-    content?: string | null;
-    tool_calls?: {
-      index: number;
-      id?: string;
-      function?: { name?: string; arguments?: string };
-    }[];
-  };
-  finish_reason?: string | null;
-}
-
-interface StreamChunk {
-  choices?: StreamChoice[];
-  usage?: { prompt_tokens?: number; completion_tokens?: number } | null;
-}
-
-export class OllamaProvider implements ModelProvider {
+export class OllamaProvider extends OpenAiCompatibleProvider {
   readonly name = 'ollama' as const;
-  readonly model: string;
-  private readonly baseUrl: string;
-  private readonly apiKey: string;
-  private readonly maxTokens: number | undefined;
-  private readonly timeoutMs: number;
 
   constructor(opts: OllamaProviderOptions) {
-    this.baseUrl = opts.baseUrl.replace(/\/$/, '');
-    this.model = opts.model;
-    this.apiKey = opts.apiKey ?? 'ollama';
-    this.maxTokens = opts.maxTokens;
-    this.timeoutMs = opts.timeoutMs ?? 120_000;
+    super({ ...opts, apiKey: opts.apiKey ?? 'ollama' });
   }
 
-  async *send(req: SendRequest): AsyncIterable<ProviderEvent> {
-    const messages: OpenAiMessage[] = [
-      { role: 'system', content: req.system },
-      ...toOpenAiMessages(req.messages),
-    ];
-
-    const body = {
-      model: this.model,
-      messages,
-      tools: req.tools.length > 0 ? toOpenAiTools(req.tools) : undefined,
-      stream: true,
-      max_tokens: this.maxTokens,
-    };
-
-    // Idle-timeout guard: abort if the server goes silent for `timeoutMs`. The
-    // raw fetch (unlike the cloud SDKs) has no built-in timeout, so without this
-    // a stuck local model would freeze the REPL with no way to recover.
-    const controller = new AbortController();
-    let timer: ReturnType<typeof setTimeout>;
-    const armTimer = (): void => {
-      clearTimeout(timer);
-      timer = setTimeout(() => controller.abort(), this.timeoutMs);
-    };
-    armTimer();
-
-    try {
-      let res: Response;
-      try {
-        // `stream_options.include_usage` is best-effort: it gives us token counts,
-        // but older Ollama builds reject unknown body fields with a 400. Rather than
-        // breaking every local turn over a reporting nicety, retry once without it.
-        res = await this.post({ ...body, stream_options: { include_usage: true } }, controller.signal);
-        if (res.status === 400) res = await this.post(body, controller.signal);
-      } catch (err) {
-        if (controller.signal.aborted) throw this.timeoutError();
-        throw new Error(
-          `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${(err as Error).message})`,
-        );
-      }
-
-      if (!res.ok || !res.body) {
-        const detail = await res.text().catch(() => '');
-        throw new Error(`Ollama request failed (${res.status}): ${detail.slice(0, 200)}`);
-      }
-
-      // Accumulate tool calls by their streamed index; arguments arrive in fragments.
-      const calls = new Map<number, { id: string; name: string; args: string }>();
-      let usage = { inputTokens: 0, outputTokens: 0 };
-      let finish = 'stop';
-
-      try {
-        for await (const chunk of parseSse(res.body)) {
-          armTimer(); // progress: reset the idle clock
-          const choice = chunk.choices?.[0];
-          if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content };
-
-          for (const tc of choice?.delta?.tool_calls ?? []) {
-            const acc = calls.get(tc.index) ?? { id: '', name: '', args: '' };
-            if (tc.id) acc.id = tc.id;
-            if (tc.function?.name) acc.name = tc.function.name;
-            if (tc.function?.arguments) acc.args += tc.function.arguments;
-            calls.set(tc.index, acc);
-          }
-
-          if (choice?.finish_reason) finish = choice.finish_reason;
-          if (chunk.usage) {
-            usage = {
-              inputTokens: chunk.usage.prompt_tokens ?? 0,
-              outputTokens: chunk.usage.completion_tokens ?? 0,
-            };
-          }
-        }
-      } catch (err) {
-        if (controller.signal.aborted) throw this.timeoutError();
-        throw err;
-      }
-
-      for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) {
-        let input: unknown = {};
-        try {
-          input = c.args.trim() ? JSON.parse(c.args) : {};
-        } catch {
-          // Small models occasionally emit malformed JSON; degrade gracefully.
-          input = {};
-        }
-        yield { type: 'tool_call', id: c.id || `ollama-call-${index}`, name: c.name, input };
-      }
-
-      yield {
-        type: 'done',
-        usage,
-        stopReason: calls.size > 0 ? 'tool_use' : finish,
-      };
-    } finally {
-      clearTimeout(timer!);
-    }
+  protected override label(): string {
+    return 'Ollama';
   }
 
-  private timeoutError(): Error {
+  protected override timeoutError(): Error {
     return new Error(
       `Ollama at ${this.baseUrl} went silent for ${Math.round(this.timeoutMs / 1000)}s and was aborted. ` +
         `The model '${this.model}' may be too large for this machine.`,
     );
   }
 
-  /** POST a chat-completions request body to the Ollama server. */
-  private post(body: unknown, signal: AbortSignal): Promise<Response> {
-    return fetch(`${this.baseUrl}/chat/completions`, {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` },
-      body: JSON.stringify(body),
-      signal,
-    });
-  }
-}
-
-/** Decode a single SSE line into a chunk, or `undefined` for non-data/keep-alive lines. */
-function parseSseLine(raw: string): StreamChunk | undefined {
-  const line = raw.trim();
-  if (!line.startsWith('data:')) return undefined;
-  const payload = line.slice(5).trim();
-  if (payload === '[DONE]' || payload.length === 0) return undefined;
-  try {
-    return JSON.parse(payload) as StreamChunk;
-  } catch {
-    // Ignore partial/non-JSON keep-alive lines.
-    return undefined;
-  }
-}
-
-/** Parse an SSE byte stream into decoded JSON chunks, skipping the `[DONE]` sentinel. */
-async function* parseSse(body: ReadableStream<Uint8Array>): AsyncIterable<StreamChunk> {
-  const decoder = new TextDecoder();
-  let buffer = '';
-  const reader = body.getReader();
-  try {
-    for (;;) {
-      const { done, value } = await reader.read();
-      if (done) break;
-      buffer += decoder.decode(value, { stream: true });
-      let nl: number;
-      while ((nl = buffer.indexOf('\n')) !== -1) {
-        const chunk = parseSseLine(buffer.slice(0, nl));
-        buffer = buffer.slice(nl + 1);
-        if (chunk) yield chunk;
-      }
-    }
-    // Emit a final line that arrived without a trailing newline (e.g. a closing
-    // usage frame); otherwise the last chunk's token counts would be dropped.
-    const tail = parseSseLine(buffer);
-    if (tail) yield tail;
-  } finally {
-    reader.releaseLock();
+  protected override unreachableError(err: Error): Error {
+    return new Error(
+      `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${err.message})`,
+    );
   }
 }
diff --git a/src/providers/openai-compatible.ts b/src/providers/openai-compatible.ts
new file mode 100644
index 0000000..8492205
--- /dev/null
+++ b/src/providers/openai-compatible.ts
@@ -0,0 +1,285 @@
+import type { Message } from '../agent/types.js';
+import type { ModelProvider, ProviderEvent, SendRequest, ToolSchema } from './types.js';
+
+export interface OpenAiCompatibleOptions {
+  /** Base URL without the "/chat/completions" suffix, e.g. "https://api.deepseek.com/v1". */
+  baseUrl: string;
+  model: string;
+  /** Bearer token (defaults to ''). Local servers (Ollama) ignore it; cloud APIs require it. */
+  apiKey?: string;
+  /** Cap on tokens to generate per response. Omitted from the request if unset. */
+  maxTokens?: number;
+  /**
+   * Idle timeout in ms. The clock starts before the request is sent and is reset
+   * each time a decoded SSE data chunk is consumed — a slow-but-progressing model
+   * keeps going; a silent one is aborted. Defaults to 120_000.
+   */
+  timeoutMs?: number;
+}
+
+interface OpenAiMessage {
+  role: 'system' | 'user' | 'assistant' | 'tool';
+  content: string; // '' for an assistant turn that carries only tool calls
+  tool_calls?: { id: string; type: 'function'; function: { name: string; arguments: string } }[];
+  tool_call_id?: string; // set on 'tool' messages: the id of the call being answered
+}
+
+/**
+ * Convert the agent's internal transcript into OpenAI chat messages (the shape
+ * every `/v1/chat/completions` endpoint accepts). Tool results are matched to
+ * their calls via `tool_call_id`; the ids already on our blocks are passed
+ * through untouched, so nothing is synthesized here.
+ *
+ * Ordering caveat: within a user turn every `tool` message is pushed as it is
+ * met, and the turn's text is joined into a single `user` message placed after
+ * them. OpenAI requires each `tool` message to follow the assistant
+ * `tool_calls` that produced it, which the current loop's turns satisfy.
+ * Revisit this if a future change interleaves text and results in one turn.
+ */
+export function toOpenAiMessages(messages: Message[]): OpenAiMessage[] {
+  const converted: OpenAiMessage[] = [];
+  for (const message of messages) {
+    let combined = '';
+    if (message.role !== 'user') {
+      // Assistant turn: text and tool_use blocks collapse into one message.
+      const requested: NonNullable<OpenAiMessage['tool_calls']> = [];
+      for (const block of message.content) {
+        if (block.type === 'tool_use') {
+          requested.push({
+            id: block.id,
+            type: 'function',
+            function: { name: block.name, arguments: JSON.stringify(block.input ?? {}) },
+          });
+        } else if (block.type === 'text') {
+          combined += block.text;
+        }
+      }
+      const reply: OpenAiMessage = { role: 'assistant', content: combined };
+      if (requested.length > 0) reply.tool_calls = requested;
+      converted.push(reply);
+      continue;
+    }
+
+    // User turn: each tool result becomes its own `tool` message right away.
+    for (const block of message.content) {
+      if (block.type === 'tool_result') {
+        converted.push({ role: 'tool', tool_call_id: block.toolUseId, content: block.content });
+      } else if (block.type === 'text') {
+        combined += block.text;
+      }
+    }
+    if (combined.length > 0) converted.push({ role: 'user', content: combined });
+  }
+  return converted;
+}
+
+/** Wrap each normalized tool schema in OpenAI's `{ type: 'function' }` envelope. */
+export function toOpenAiTools(tools: ToolSchema[]): unknown[] {
+  return tools.map(({ name, description, jsonSchema }) => ({
+    type: 'function',
+    function: { name, description, parameters: jsonSchema },
+  }));
+}
+
+interface StreamChoice {
+  delta?: {
+    content?: string | null; // incremental assistant text
+    tool_calls?: {
+      index: number; // key used to stitch one call's fragments together
+      id?: string;
+      function?: { name?: string; arguments?: string }; // arguments: a JSON fragment
+    }[];
+  };
+  finish_reason?: string | null;
+}
+
+interface StreamChunk {
+  choices?: StreamChoice[]; // only the first choice is read
+  usage?: { prompt_tokens?: number; completion_tokens?: number } | null; // token counts
+}
+
+/**
+ * Base adapter for any OpenAI-compatible `/v1/chat/completions` server. Ollama,
+ * DeepSeek, and Qwen (DashScope) all speak this wire format, differing only in
+ * base URL, auth, and the wording of their connection errors. Subclasses set
+ * {@link name} and may override {@link unreachableError}/{@link timeoutError}.
+ */
+export abstract class OpenAiCompatibleProvider implements ModelProvider {
+  abstract readonly name: ModelProvider['name'];
+  readonly model: string;
+  protected readonly baseUrl: string;
+  protected readonly apiKey: string;
+  protected readonly maxTokens: number | undefined;
+  protected readonly timeoutMs: number;
+
+  constructor(opts: OpenAiCompatibleOptions) {
+    this.baseUrl = opts.baseUrl.replace(/\/+$/, ''); // drop every trailing slash, not just one
+    this.model = opts.model;
+    this.apiKey = opts.apiKey ?? '';
+    this.maxTokens = opts.maxTokens;
+    this.timeoutMs = opts.timeoutMs ?? 120_000;
+  }
+
+  /** Stream one completion: text deltas, then buffered tool calls, then a final `done`. */
+  async *send(req: SendRequest): AsyncIterable<ProviderEvent> {
+    const messages: OpenAiMessage[] = [
+      { role: 'system', content: req.system },
+      ...toOpenAiMessages(req.messages),
+    ];
+
+    const body = {
+      model: this.model,
+      messages,
+      tools: req.tools.length > 0 ? toOpenAiTools(req.tools) : undefined,
+      stream: true,
+      max_tokens: this.maxTokens,
+    };
+
+    // Idle-timeout guard: abort if the server goes silent for `timeoutMs`. The
+    // raw fetch (unlike the cloud SDKs) has no built-in timeout, so without this
+    // a stuck server would freeze the REPL with no way to recover.
+    const controller = new AbortController();
+    let timer: ReturnType<typeof setTimeout> | undefined;
+    const armTimer = (): void => {
+      clearTimeout(timer);
+      timer = setTimeout(() => controller.abort(), this.timeoutMs);
+    };
+    armTimer();
+
+    try {
+      let res: Response;
+      try {
+        // `stream_options.include_usage` is best-effort: it gives us token counts,
+        // but some servers reject unknown body fields with a 400. Rather than
+        // breaking every turn over a reporting nicety, retry once without it.
+        res = await this.post({ ...body, stream_options: { include_usage: true } }, controller.signal);
+        if (res.status === 400) res = await this.post(body, controller.signal);
+      } catch (err) {
+        if (controller.signal.aborted) throw this.timeoutError();
+        // A rejection is not guaranteed to be an Error; wrap anything else.
+        throw this.unreachableError(err instanceof Error ? err : new Error(String(err)));
+      }
+
+      if (!res.ok || !res.body) {
+        const detail = await res.text().catch(() => '');
+        throw new Error(`${this.label()} request failed (${res.status}): ${detail.slice(0, 200)}`);
+      }
+
+      // Accumulate tool calls by their streamed index; arguments arrive in fragments.
+      const calls = new Map<number, { id: string; name: string; args: string }>();
+      let usage = { inputTokens: 0, outputTokens: 0 };
+      let finish = 'stop';
+
+      try {
+        for await (const chunk of parseSse(res.body)) {
+          armTimer(); // progress: reset the idle clock
+          const choice = chunk.choices?.[0];
+          if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content };
+
+          for (const tc of choice?.delta?.tool_calls ?? []) {
+            const acc = calls.get(tc.index) ?? { id: '', name: '', args: '' };
+            if (tc.id) acc.id = tc.id;
+            if (tc.function?.name) acc.name = tc.function.name;
+            if (tc.function?.arguments) acc.args += tc.function.arguments;
+            calls.set(tc.index, acc);
+          }
+
+          if (choice?.finish_reason) finish = choice.finish_reason;
+          if (chunk.usage) {
+            usage = {
+              inputTokens: chunk.usage.prompt_tokens ?? 0,
+              outputTokens: chunk.usage.completion_tokens ?? 0,
+            };
+          }
+        }
+      } catch (err) {
+        if (controller.signal.aborted) throw this.timeoutError();
+        throw err;
+      }
+
+      for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) {
+        let input: unknown = {};
+        try {
+          input = c.args.trim() ? JSON.parse(c.args) : {};
+        } catch {
+          // Small models occasionally emit malformed JSON; keep the `{}` default.
+        }
+        yield { type: 'tool_call', id: c.id || `${this.name}-call-${index}`, name: c.name, input };
+      }
+
+      yield { type: 'done', usage, stopReason: calls.size > 0 ? 'tool_use' : finish };
+    } finally {
+      clearTimeout(timer);
+      // Tear the request down on every exit path. After a fully read response this
+      // has nothing left to stop; after an error or abandoned iteration it ends the transfer.
+      controller.abort();
+    }
+  }
+
+  /** Human-readable provider name used in error messages. */
+  protected label(): string {
+    return this.name;
+  }
+
+  /** Error raised when no usable response arrives before the idle timeout. */
+  protected timeoutError(): Error {
+    return new Error(
+      `${this.label()} at ${this.baseUrl} went silent for ${Math.round(this.timeoutMs / 1000)}s and was aborted.`,
+    );
+  }
+
+  /** Error raised when the server can't be reached at all. */
+  protected unreachableError(err: Error): Error {
+    return new Error(`Cannot reach ${this.label()} at ${this.baseUrl}. (${err.message})`);
+  }
+
+  /** POST a chat-completions request body to the server. */
+  protected post(body: unknown, signal: AbortSignal): Promise<Response> {
+    return fetch(`${this.baseUrl}/chat/completions`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` },
+      body: JSON.stringify(body),
+      signal,
+    });
+  }
+}
+
+/** Decode one SSE line; anything other than a JSON `data:` payload yields `undefined`. */
+function parseSseLine(raw: string): StreamChunk | undefined {
+  const trimmed = raw.trim();
+  const payload = trimmed.startsWith('data:') ? trimmed.slice('data:'.length).trim() : '';
+  // Non-data lines, empty payloads and the `[DONE]` sentinel carry no chunk.
+  if (payload === '' || payload === '[DONE]') return undefined;
+  try {
+    return JSON.parse(payload) as StreamChunk;
+  } catch {
+    // Partial or non-JSON payloads are skipped rather than failing the stream.
+    return undefined;
+  }
+}
+
+/** Parse an SSE byte stream into JSON chunks, skipping `[DONE]`; cancels the body on exit. */
+export async function* parseSse(body: ReadableStream<Uint8Array>): AsyncIterable<StreamChunk> {
+  const decoder = new TextDecoder();
+  let buffer = '';
+  const reader = body.getReader();
+  try {
+    for (;;) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      for (let nl = buffer.indexOf('\n'); nl !== -1; nl = buffer.indexOf('\n')) {
+        const chunk = parseSseLine(buffer.slice(0, nl));
+        buffer = buffer.slice(nl + 1);
+        if (chunk) yield chunk;
+      }
+    }
+    // Also emit a final line that has no trailing newline (e.g. a closing usage frame).
+    const tail = parseSseLine(buffer);
+    if (tail) yield tail;
+  } finally {
+    // Cancel before unlocking so an early exit does not leave the body streaming.
+    await reader.cancel().catch(() => undefined);
+    reader.releaseLock();
+  }
+}
diff --git a/src/providers/qwen.ts b/src/providers/qwen.ts
new file mode 100644
index 0000000..ed926de
--- /dev/null
+++ b/src/providers/qwen.ts
@@ -0,0 +1,33 @@
+import { OpenAiCompatibleProvider, type OpenAiCompatibleOptions } from './openai-compatible.js';
+
+/** Alibaba DashScope's OpenAI-compatible endpoint (hosts the Qwen models). */
+export const DEFAULT_QWEN_URL = 'https://dashscope.aliyuncs.com/compatible-mode/v1';
+
+export interface QwenProviderOptions extends Omit<OpenAiCompatibleOptions, 'baseUrl'> {
+  apiKey: string; // narrowed from optional to required for this provider
+  /** Override the API endpoint (defaults to {@link DEFAULT_QWEN_URL}). */
+  baseUrl?: string | undefined;
+}
+
+/**
+ * Provider for Alibaba's Qwen Coder models (e.g. qwen3-coder-plus), reached through
+ * DashScope's OpenAI-compatible Chat Completions API. Everything except the
+ * endpoint, the mandatory API key, and the error wording comes from the base class.
+ */
+export class QwenProvider extends OpenAiCompatibleProvider {
+  readonly name = 'qwen' as const;
+
+  constructor(opts: QwenProviderOptions) {
+    super({ ...opts, baseUrl: opts.baseUrl ?? DEFAULT_QWEN_URL });
+  }
+
+  protected override label(): string {
+    return 'Qwen';
+  }
+
+  protected override unreachableError(err: Error): Error {
+    const where = `Qwen (DashScope) at ${this.baseUrl}`;
+    const hint = 'Check your network and QWEN_API_KEY.';
+    return new Error(`Cannot reach ${where}. ${hint} (${err.message})`);
+  }
+}
diff --git a/src/providers/types.ts b/src/providers/types.ts
index c18443e..1d137f3 100644
--- a/src/providers/types.ts
+++ b/src/providers/types.ts
@@ -34,7 +34,7 @@ export interface SendRequest {
  * {@link ProviderEvent}.
  */
 export interface ModelProvider {
-  readonly name: 'anthropic' | 'gemini' | 'ollama';
+  readonly name: 'anthropic' | 'gemini' | 'ollama' | 'deepseek' | 'qwen';
   readonly model: string;
   send(req: SendRequest): AsyncIterable<ProviderEvent>;
 }
diff --git a/src/ui/render.ts b/src/ui/render.ts
index ea87e44..a0ecb10 100644
--- a/src/ui/render.ts
+++ b/src/ui/render.ts
@@ -24,7 +24,12 @@ function fmtTokens(n: number): string {
 
 /** Paid (non-local) providers, where missing pricing means "unknown" not "free". */
 function isCloud(provider?: string): boolean {
-  return provider === 'anthropic' || provider === 'gemini';
+  return (
+    provider === 'anthropic' ||
+    provider === 'gemini' ||
+    provider === 'deepseek' ||
+    provider === 'qwen'
+  );
 }
 
 export interface SessionTotals {
diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts
index 2da1595..6ec7878 100644
--- a/tests/config/load.test.ts
+++ b/tests/config/load.test.ts
@@ -7,12 +7,17 @@ import { loadConfig } from '../../src/config/load.js';
 const ENV_KEYS = [
   'ANTHROPIC_API_KEY',
   'GEMINI_API_KEY',
+  'DEEPSEEK_API_KEY',
+  'QWEN_API_KEY',
+  'DASHSCOPE_API_KEY',
   'TINY_CODE_PROVIDER',
   'TINY_CODE_MODEL',
   'TINY_CODE_PRIORITY',
   'TINY_CODE_MAX_TOKENS',
   'TINY_CODE_EFFORT',
   'TINY_CODE_OLLAMA_URL',
+  'TINY_CODE_DEEPSEEK_URL',
+  'TINY_CODE_QWEN_URL',
   'TINY_CODE_IMPROVE',
   'HOME',
 ];
@@ -161,6 +166,40 @@ describe('loadConfig', () => {
     expect(cfg.ollamaBaseUrl).toBe('http://gpu-box:11434/v1');
   });
 
+  it('infers deepseek when only DEEPSEEK_API_KEY is set, picking its flagship model', () => {
+    process.env.DEEPSEEK_API_KEY = 'sk-deep';
+    const cfg = loadConfig({}, cwd);
+    expect(cfg.provider).toBe('deepseek');
+    expect(cfg.model).toBe('deepseek-v4-pro');
+    expect(cfg.deepseekApiKey).toBe('sk-deep');
+  });
+
+  it('infers qwen from QWEN_API_KEY or DASHSCOPE_API_KEY', () => {
+    process.env.QWEN_API_KEY = 'sk-qwen';
+    expect(loadConfig({}, cwd).provider).toBe('qwen');
+    delete process.env.QWEN_API_KEY;
+    process.env.DASHSCOPE_API_KEY = 'sk-dash';
+    const cfg = loadConfig({}, cwd);
+    expect(cfg.provider).toBe('qwen');
+    expect(cfg.model).toBe('qwen3-coder-plus');
+    expect(cfg.qwenApiKey).toBe('sk-dash');
+  });
+
+  it('prefers anthropic over deepseek/qwen when several keys are present', () => {
+    process.env.ANTHROPIC_API_KEY = 'sk-a';
+    process.env.DEEPSEEK_API_KEY = 'sk-d';
+    process.env.QWEN_API_KEY = 'sk-q';
+    expect(loadConfig({}, cwd).provider).toBe('anthropic');
+  });
+
+  it('reads provider-specific base URL overrides', () => {
+    process.env.TINY_CODE_DEEPSEEK_URL = 'https://proxy/deepseek/v1';
+    process.env.TINY_CODE_QWEN_URL = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1';
+    const cfg = loadConfig({ provider: 'deepseek' }, cwd);
+    expect(cfg.deepseekBaseUrl).toBe('https://proxy/deepseek/v1');
+    expect(cfg.qwenBaseUrl).toBe('https://dashscope-intl.aliyuncs.com/compatible-mode/v1');
+  });
+
   it('defaults routing to local-first when an escalateTo target is configured', async () => {
     await writeFile(
       join(cwd, 'tiny-code.config.json'),
diff --git a/tests/models/catalog.test.ts b/tests/models/catalog.test.ts
index f5fdf7e..6cda7a5 100644
--- a/tests/models/catalog.test.ts
+++ b/tests/models/catalog.test.ts
@@ -58,6 +58,18 @@ describe('recommendModel', () => {
     expect(recommendModel({ provider: 'gemini', priority: 'cost' })?.id).toBe('gemini-2.5-flash');
   });
 
+  it('picks the flagship coder model for the DeepSeek and Qwen providers', () => {
+    expect(recommendModel({ provider: 'deepseek', priority: 'performance' })?.id).toBe(
+      'deepseek-v4-pro',
+    );
+    expect(recommendModel({ provider: 'qwen', priority: 'performance' })?.id).toBe(
+      'qwen3-coder-plus',
+    );
+    // Their cheaper variants win on cost.
+    expect(recommendModel({ provider: 'deepseek', priority: 'cost' })?.id).toBe('deepseek-v4-flash');
+    expect(recommendModel({ provider: 'qwen', priority: 'cost' })?.id).toBe('qwen3-coder-flash');
+  });
+
   it('balanced trades cost against capability without dropping to the weakest', () => {
     expect(recommendModel({ provider: 'anthropic', priority: 'balanced' })?.id).toBe(
       'claude-sonnet-4-6',
diff --git a/tests/providers/openaiCloudSend.test.ts b/tests/providers/openaiCloudSend.test.ts
new file mode 100644
index 0000000..3e53e02
--- /dev/null
+++ b/tests/providers/openaiCloudSend.test.ts
@@ -0,0 +1,81 @@
+import { describe, it, expect, vi, afterEach } from 'vitest';
+import { DeepSeekProvider } from '../../src/providers/deepseek.js';
+import { QwenProvider } from '../../src/providers/qwen.js';
+import type { ProviderEvent } from '../../src/providers/types.js';
+
+/** Wrap OpenAI-style chunks in a 200 `text/event-stream` Response ending with `[DONE]`. */
+function sseResponse(chunks: unknown[]): Response {
+  const encoder = new TextEncoder();
+  const frames = [...chunks.map((chunk) => JSON.stringify(chunk)), '[DONE]'];
+  const stream = new ReadableStream<Uint8Array>({
+    start(sink) {
+      frames.forEach((frame) => sink.enqueue(encoder.encode(`data: ${frame}\n\n`)));
+      sink.close();
+    },
+  });
+  return new Response(stream, { status: 200, headers: { 'Content-Type': 'text/event-stream' } });
+}
+
+afterEach(() => vi.restoreAllMocks());
+
+/** Drain one `send()` call (fixed prompt, single `ls` tool) into an array of events. */
+async function collect(provider: DeepSeekProvider | QwenProvider): Promise<ProviderEvent[]> {
+  const stream = provider.send({
+    system: 's',
+    messages: [{ role: 'user', content: [{ type: 'text', text: 'go' }] }],
+    tools: [{ name: 'ls', description: 'list', jsonSchema: { type: 'object' } }],
+  });
+  const seen: ProviderEvent[] = [];
+  for await (const event of stream) seen.push(event);
+  return seen;
+}
+
+describe('DeepSeekProvider.send', () => {
+  it('targets the DeepSeek endpoint with the API key and streams events', async () => {
+    const fetchMock = vi.spyOn(globalThis, 'fetch').mockResolvedValue(
+      sseResponse([
+        { choices: [{ delta: { content: 'hi' } }] },
+        { choices: [], usage: { prompt_tokens: 5, completion_tokens: 2 } },
+      ]),
+    );
+
+    const provider = new DeepSeekProvider({ apiKey: 'sk-deep', model: 'deepseek-v4-pro' });
+    expect(provider.name).toBe('deepseek');
+
+    const events = await collect(provider);
+    const [url, init] = fetchMock.mock.calls[0]!;
+    expect(url).toBe('https://api.deepseek.com/v1/chat/completions');
+    expect((init as RequestInit).headers).toMatchObject({ Authorization: 'Bearer sk-deep' });
+
+    const text = events.filter((e) => e.type === 'text').map((e) => (e as { delta: string }).delta);
+    expect(text.join('')).toBe('hi');
+    const done = events.find((e) => e.type === 'done');
+    expect(done).toMatchObject({ usage: { inputTokens: 5, outputTokens: 2 } });
+  });
+
+  it('reports a DeepSeek-specific error when the host is unreachable', async () => {
+    vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ENOTFOUND'));
+    const provider = new DeepSeekProvider({ apiKey: 'k', model: 'deepseek-v4-pro' });
+    await expect(collect(provider)).rejects.toThrow(/Cannot reach DeepSeek/);
+  });
+});
+
+describe('QwenProvider.send', () => {
+  it('targets the DashScope endpoint and respects a base URL override', async () => {
+    const fetchMock = vi
+      .spyOn(globalThis, 'fetch')
+      .mockResolvedValue(sseResponse([{ choices: [{ delta: { content: 'ok' } }] }]));
+
+    const provider = new QwenProvider({
+      apiKey: 'sk-qwen',
+      model: 'qwen3-coder-plus',
+      baseUrl: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
+    });
+    expect(provider.name).toBe('qwen');
+
+    await collect(provider);
+    const [url, init] = fetchMock.mock.calls[0]!;
+    expect(url).toBe('https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions');
+    expect((init as RequestInit).headers).toMatchObject({ Authorization: 'Bearer sk-qwen' });
+  });
+});

From 118faa080d2c984d70e63ceba438bd0f08a3a7ef Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 10 Jun 2026 13:14:39 +0000
Subject: [PATCH 2/4] Default model auto-selection to balanced priority

Flip the default priority from performance to balanced so the auto-picked
model is cost-aware out of the box (best capability-per-dollar behind a
quality floor) rather than most-capable-at-any-price. Update config/catalog
tests, CLI help, README, AGENTS.md, and .env.example to match.

https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp
---
 .changeset/balanced-default-priority.md | 14 ++++++++++++++
 .env.example                            |  2 +-
 AGENTS.md                               |  8 +++++---
 README.md                               | 19 ++++++++++++-------
 src/cli.ts                              |  2 +-
 src/config/load.ts                      |  2 +-
 src/models/catalog.ts                   |  3 ++-
 tests/config/load.test.ts               | 24 ++++++++++++++++++------
 8 files changed, 54 insertions(+), 20 deletions(-)
 create mode 100644 .changeset/balanced-default-priority.md

diff --git a/.changeset/balanced-default-priority.md b/.changeset/balanced-default-priority.md
new file mode 100644
index 0000000..b5ba201
--- /dev/null
+++ b/.changeset/balanced-default-priority.md
@@ -0,0 +1,14 @@
+---
+"@therr/tiny-code": minor
+---
+
+Default model selection to `balanced` priority.
+
+When no `model` is pinned, tiny-code now defaults to `priority: "balanced"`
+instead of `performance`, picking the best capability-per-dollar model
+(`codingScore / blendedCostPerMTok`, behind a quality floor) rather than the
+most capable regardless of price. In line with the project's token-minimalism
+goal, this makes the out-of-the-box pick cost-aware — e.g. Claude Sonnet rather
+than Opus for Anthropic. Set `priority: "performance"` (or
+`TINY_CODE_PRIORITY=performance`) to restore the previous most-capable defaults;
+pinning a `model` still overrides everything.
diff --git a/.env.example b/.env.example
index bc84fe7..90c3a29 100644
--- a/.env.example
+++ b/.env.example
@@ -12,7 +12,7 @@ QWEN_API_KEY=          # Alibaba DashScope key (DASHSCOPE_API_KEY also accepted)
 # TINY_CODE_OLLAMA_URL=http://localhost:11434/v1   # Ollama OpenAI-compatible endpoint
 # TINY_CODE_DEEPSEEK_URL=https://api.deepseek.com/v1
 # TINY_CODE_QWEN_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
-# TINY_CODE_PRIORITY=performance # performance | cost | balanced — auto-picks a model when none is pinned
+# TINY_CODE_PRIORITY=balanced   # performance | cost | balanced (default) — auto-picks a model when none is pinned
 # TINY_CODE_EFFORT=high          # low | medium | high | xhigh | max — Anthropic thinking budget
 
 # Self-improvement: reflect on sessions and propose markdown-only improvement PRs.
diff --git a/AGENTS.md b/AGENTS.md
index 7c267fd..4180d00 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -29,9 +29,11 @@ runaway costs.
 - Keep it current: when adding/repricing a model, update its entry **and**
   `CATALOG_AS_OF`. Anthropic pricing comes from the bundled claude-api reference;
   verify Gemini pricing against Google's published rates. Don't guess prices.
-- `priority` defaults to `performance`, which preserves the historical default
-  models (Opus for Anthropic, Gemini 2.5 Pro for Gemini). Don't change the
-  default without updating the config tests that assert those ids.
+- `priority` defaults to `balanced` (best capability-per-dollar behind a quality
+  floor), so the auto-picked model is cost-aware by default — e.g. Sonnet rather
+  than Opus for Anthropic. `performance` restores the historical most-capable
+  picks. Don't change the default without updating the config/catalog tests that
+  assert those ids.
 
 ## Boundaries
 - No business logic. This is a general-purpose tool.
diff --git a/README.md b/README.md
index d55957b..b1548a7 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ CLI flags.
   "provider": "anthropic",
   "model": "claude-opus-4-8",
   "ollamaBaseUrl": "http://localhost:11434/v1",
-  "priority": "performance",
+  "priority": "balanced",
   "maxTokens": 16000,
   "thinking": true,
   "effort": "high",
@@ -213,14 +213,19 @@ money and to pick a model that fits your cost/performance preference.
 - **Priority-driven selection.** When you don't pin a `model`, tiny-code picks
   one for you based on `priority`:
 
-  | `priority`      | Picks                                                        |
-  | --------------- | ----------------------------------------------------------- |
-  | `performance`   | The most capable model (the default — current behavior).    |
-  | `cost`          | The cheapest still-capable model.                           |
-  | `balanced`      | The best capability-per-dollar among capable models.        |
+  | `priority`      | Picks                                                            |
+  | --------------- | --------------------------------------------------------------- |
+  | `balanced`      | The best capability-per-dollar among capable models (default).  |
+  | `performance`   | The most capable model, ignoring price.                         |
+  | `cost`          | The cheapest still-capable model.                               |
+
+  `balanced` is the default: it ranks capable models by
+  `codingScore / blendedCostPerMTok` (a model's coding aptitude per blended
+  dollar, weighting input 80% / output 20%) behind a quality floor, so you get
+  strong-but-sensibly-priced models without opting in.
 
   ```json
-  { "priority": "balanced" }
+  { "priority": "performance" }
   ```
 
   Or per-session with `TINY_CODE_PRIORITY=cost`. Pinning `model` (config, env,
diff --git a/src/cli.ts b/src/cli.ts
index 68ce55d..e769d1a 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -26,7 +26,7 @@ Environment:
   QWEN_API_KEY         Required for the Qwen provider (or DASHSCOPE_API_KEY)
   TINY_CODE_OLLAMA_URL Ollama OpenAI-compatible base URL (default http://localhost:11434/v1)
   TINY_CODE_PRIORITY   performance | cost | balanced — auto-picks a model when
-                       none is pinned (default: performance)
+                       none is pinned (default: balanced)
 
 Cost-saving: set "routing": "local-first" with an "escalateTo" target in your
 config to run cheap/local models by default and escalate heavy tasks. Run /costs
diff --git a/src/config/load.ts b/src/config/load.ts
index 9c84243..3adaf90 100644
--- a/src/config/load.ts
+++ b/src/config/load.ts
@@ -162,7 +162,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
             : 'anthropic');
 
   const priority: Priority =
-    (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'performance';
+    (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'balanced';
 
   // When the user pins a model, honor it. Otherwise let the catalog pick the
   // best fit for the cost/performance priority, falling back to a static
diff --git a/src/models/catalog.ts b/src/models/catalog.ts
index 428a1bd..57d6c70 100644
--- a/src/models/catalog.ts
+++ b/src/models/catalog.ts
@@ -5,7 +5,8 @@ import type { Usage } from '../providers/types.js';
  * How to weigh cost vs. capability when auto-selecting a model.
  * - `performance`: most capable model (maximize quality, ignore price)
  * - `cost`: cheapest capable model (maximize savings)
- * - `balanced`: best capability-per-dollar among genuinely capable models
+ * - `balanced` (default): best capability-per-dollar among genuinely capable
+ *   models — `codingScore / blendedCostPerMTok`, gated by a quality floor
  */
 export type Priority = 'performance' | 'cost' | 'balanced';
 
diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts
index 6ec7878..158c26b 100644
--- a/tests/config/load.test.ts
+++ b/tests/config/load.test.ts
@@ -45,11 +45,12 @@ afterEach(async () => {
 });
 
 describe('loadConfig', () => {
-  it('infers anthropic when only ANTHROPIC_API_KEY is set', () => {
+  it('infers anthropic when only ANTHROPIC_API_KEY is set, picking the balanced model', () => {
     process.env.ANTHROPIC_API_KEY = 'sk-test';
     const cfg = loadConfig({}, cwd);
     expect(cfg.provider).toBe('anthropic');
-    expect(cfg.model).toBe('claude-opus-4-8');
+    // Balanced is the default priority, so it favors capability-per-dollar (Sonnet) over Opus.
+    expect(cfg.model).toBe('claude-sonnet-4-6');
     expect(cfg.anthropicApiKey).toBe('sk-test');
   });
 
@@ -112,8 +113,16 @@ describe('loadConfig', () => {
     expect(cfg.improve.onSessionEnd).toBe(false);
   });
 
-  it('defaults to performance priority and the most capable model', () => {
+  it('defaults to balanced priority and the best capability-per-dollar model', () => {
+    process.env.ANTHROPIC_API_KEY = 'sk-test';
+    const cfg = loadConfig({}, cwd);
+    expect(cfg.priority).toBe('balanced');
+    expect(cfg.model).toBe('claude-sonnet-4-6');
+  });
+
+  it('opts into the most capable model with performance priority', () => {
     process.env.ANTHROPIC_API_KEY = 'sk-test';
+    process.env.TINY_CODE_PRIORITY = 'performance';
     const cfg = loadConfig({}, cwd);
     expect(cfg.priority).toBe('performance');
     expect(cfg.model).toBe('claude-opus-4-8');
@@ -130,7 +139,7 @@ describe('loadConfig', () => {
   it('lets a pinned model win over the priority recommendation', () => {
     process.env.ANTHROPIC_API_KEY = 'sk-test';
     const cfg = loadConfig({ model: 'claude-opus-4-8' }, cwd);
-    expect(cfg.priority).toBe('performance');
+    expect(cfg.priority).toBe('balanced');
     expect(cfg.model).toBe('claude-opus-4-8');
   });
 
@@ -166,12 +175,15 @@ describe('loadConfig', () => {
     expect(cfg.ollamaBaseUrl).toBe('http://gpu-box:11434/v1');
   });
 
-  it('infers deepseek when only DEEPSEEK_API_KEY is set, picking its flagship model', () => {
+  it('infers deepseek when only DEEPSEEK_API_KEY is set', () => {
     process.env.DEEPSEEK_API_KEY = 'sk-deep';
     const cfg = loadConfig({}, cwd);
     expect(cfg.provider).toBe('deepseek');
-    expect(cfg.model).toBe('deepseek-v4-pro');
+    // Balanced default favors the cheaper flash; performance pins the pro flagship.
+    expect(cfg.model).toBe('deepseek-v4-flash');
     expect(cfg.deepseekApiKey).toBe('sk-deep');
+    process.env.TINY_CODE_PRIORITY = 'performance';
+    expect(loadConfig({}, cwd).model).toBe('deepseek-v4-pro');
   });
 
   it('infers qwen from QWEN_API_KEY or DASHSCOPE_API_KEY', () => {

From f5c383272fcf926a53f082253f33308ff3ef5e5f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 10 Jun 2026 13:24:21 +0000
Subject: [PATCH 3/4] Add /priority command and capture provider-switching TODO

Add a /priority slash command to view and switch the cost/performance
priority mid-session, re-picking the auto-selected model (unless pinned or
governed by local-first routing) via a new AgentLoop.setProvider and a
modelPinned config flag. Record on-the-fly provider switching as a follow-up
in TODO.md.

https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp
---
 .changeset/priority-command.md | 13 +++++++
 README.md                      |  7 ++--
 TODO.md                        | 14 ++++++++
 src/agent/loop.ts              | 12 ++++++-
 src/config/load.ts             |  4 +++
 src/repl.ts                    | 63 ++++++++++++++++++++++++++++++++--
 tests/agent/loop.test.ts       | 16 +++++++++
 tests/config/load.test.ts      |  6 ++++
 8 files changed, 130 insertions(+), 5 deletions(-)
 create mode 100644 .changeset/priority-command.md

diff --git a/.changeset/priority-command.md b/.changeset/priority-command.md
new file mode 100644
index 0000000..a9b8b6e
--- /dev/null
+++ b/.changeset/priority-command.md
@@ -0,0 +1,13 @@
+---
+"@therr/tiny-code": minor
+---
+
+Add a `/priority` command to switch cost/performance bias mid-session.
+
+`/priority` (no args) shows the current priority and the active model;
+`/priority performance | balanced | cost` switches it and re-picks the
+auto-selected model on the fly — e.g. jump to the most capable model when a task
+gets hard, then drop back to `balanced`. Pinned models and local-first routing
+keep governing the model themselves, so there the command just records the new
+priority. Backed by a new `AgentLoop.setProvider` for swapping the active
+provider mid-session, and a `modelPinned` flag on the resolved config.
diff --git a/README.md b/README.md
index b1548a7..b6144c3 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,7 @@ shell commands) prompt for approval unless pre-approved in config.
 - `/costs` — session token usage, estimated $ cost, and cost-saving tips
 - `/clear` — clear the conversation history and start fresh
 - `/models` — show known models, pricing, and the active one (see below)
+- `/priority [performance|balanced|cost]` — show or switch the cost/performance priority mid-session; re-picks the auto-selected model unless one is pinned (see below)
 - `/improve` — reflect on the session and propose an improvement PR (see below)
 - `/<name> [args]` — run a custom command (see below)
 - `/exit` — quit
@@ -228,8 +229,10 @@ money and to pick a model that fits your cost/performance preference.
   { "priority": "performance" }
   ```
 
-  Or per-session with `TINY_CODE_PRIORITY=cost`. Pinning `model` (config, env,
-  or `--model`) always overrides the recommendation.
+  Or per-session with `TINY_CODE_PRIORITY=cost`, or on the fly with the
+  `/priority` command (e.g. `/priority performance` to jump to the most capable
+  model when a task gets hard, then `/priority balanced` to drop back). Pinning
+  `model` (config, env, or `--model`) always overrides the recommendation.
 
 The catalog is curated and offline (tiny-code has no live model-discovery yet —
 see `TODO.md`), so its prices carry an "as of" date; keep it current as vendors
diff --git a/TODO.md b/TODO.md
index 10484b0..455868c 100644
--- a/TODO.md
+++ b/TODO.md
@@ -17,6 +17,20 @@ a single condensed block. For Anthropic use the compaction beta; for Gemini
 summarize via a lightweight call to a cheap model. Pair with conversation
 persistence so compacted sessions can be resumed.
 
+## On-the-fly provider switching
+The `/priority` command already swaps the active *model* within the current
+provider mid-session (`AgentLoop.setProvider`). Extend this to switch the
+*provider* too, so a session can move between Anthropic, Gemini, DeepSeek, Qwen,
+and Ollama without restarting. **Approach:** a `/provider <name> [model]`
+command that validates the target's API key (reuse `createProvider`'s checks),
+re-resolves the model (honoring `priority` and any pin), rebuilds the provider,
+and calls `agent.setProvider`. Decide how it interacts with local-first routing
+(switching the primary vs. the `escalateTo` target) and keep `/costs` accurate
+across providers — usage is already priced per-turn from the active model, so the
+running total stays correct; have the session-end summary reuse that total.
+Consider a single `/model <id>` shortcut that infers the provider from the
+catalog entry.
+
 ## Sub-agents
 Spawn isolated agent runs for parallel exploration/research (like a lightweight
 Explore/Plan agent). **Approach:** a `spawn_agent` tool whose `execute` constructs
diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index d57570b..9c683ba 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -51,7 +51,7 @@ export interface AgentLoopOptions {
  * iteration guard trips). Conversation state persists across `run` calls.
  */
 export class AgentLoop {
-  private readonly provider: ModelProvider;
+  private provider: ModelProvider;
   private readonly registry: ToolRegistry;
   private readonly gate: PermissionGate;
   private readonly system: string;
@@ -86,6 +86,16 @@ export class AgentLoop {
     return this.messages;
   }
 
+  /**
+   * Swap the base provider mid-session — e.g. when the user changes the active
+   * model via `/priority`. Only affects un-escalated turns; if the session has
+   * stuck to an escalated frontier provider, that takes precedence until
+   * `clearHistory()` resets routing.
+   */
+  setProvider(provider: ModelProvider): void {
+    this.provider = provider;
+  }
+
   /** Drop the conversation history so the next turn starts fresh. Cumulative
    *  token usage is preserved, since it reflects the whole session's cost.
    *  Also clears sticky escalation: a fresh conversation re-routes from scratch. */
diff --git a/src/config/load.ts b/src/config/load.ts
index 3adaf90..19e45d5 100644
--- a/src/config/load.ts
+++ b/src/config/load.ts
@@ -30,6 +30,9 @@ export interface AllowRules {
 export interface ResolvedConfig {
   provider: Provider;
   model: string;
+  /** True when `model` was explicitly pinned (CLI/env/config), so changing
+   *  `priority` shouldn't re-pick it. */
+  modelPinned: boolean;
   /** Cost/performance bias used to auto-pick a model when none is pinned. */
   priority: Priority;
   anthropicApiKey: string | undefined;
@@ -195,6 +198,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
   return {
     provider,
     model,
+    modelPinned: pinnedModel !== undefined,
     priority,
     anthropicApiKey,
     geminiApiKey,
diff --git a/src/repl.ts b/src/repl.ts
index e9409f2..bd887b5 100644
--- a/src/repl.ts
+++ b/src/repl.ts
@@ -13,7 +13,7 @@ import { LocalFirstModelEngine } from './agent/decision/index.js';
 import type { ModelDecisionEngine } from './agent/decision/index.js';
 import { checkLocalModel } from './system/resources.js';
 import { loadConfig } from './config/load.js';
-import type { CliOverrides, ResolvedConfig } from './config/load.js';
+import type { CliOverrides, ResolvedConfig, Priority } from './config/load.js';
 import { loadProjectContext } from './config/context.js';
 import { buildSystemPrompt } from './agent/systemPrompt.js';
 import { loadCommands, renderCommand } from './commands/loader.js';
@@ -26,6 +26,7 @@ import {
   estimateCostUsd,
   formatUsd,
   blendedCostPerMTok,
+  recommendModel,
 } from './models/catalog.js';
 import type { Usage } from './providers/types.js';
 import { getUpdateNotice, maybeRefreshUpdateCache, formatUpdateNotice } from './system/updateCheck.js';
@@ -61,6 +62,7 @@ function printHelp(commands: Map<string, Command>): void {
   console.log('  /costs           Show token usage, est. cost, and cost-saving tips');
   console.log('  /clear           Clear the conversation history and start fresh');
   console.log('  /models          Show known models, pricing, and the active one');
+  console.log('  /priority        Show or switch the cost/performance priority (e.g. /priority performance)');
   console.log('  /improve         Reflect on this session and propose an improvement PR');
   console.log('  /exit, /quit     Leave the session');
   if (commands.size > 0) {
@@ -154,7 +156,7 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     });
 
   const gate = new PermissionGate(config.allow, prompt);
-  const modelInfo = getModelInfo(config.model);
+  let modelInfo = getModelInfo(config.model);
   const ui = createTerminalUI({ model: provider.model, provider: provider.name });
   const agent = new AgentLoop({
     provider,
@@ -247,6 +249,51 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
   }
   console.log(pc.dim('Type a request, /help for commands, /costs for usage, /exit to quit.'));
 
+  const PRIORITIES: Priority[] = ['performance', 'balanced', 'cost'];
+
+  const printPriority = (): void => {
+    console.log(pc.bold('\nPriority: ') + config.priority + pc.dim(`  (active model: ${config.model})`));
+    console.log(pc.dim('  performance  most capable model, ignoring price'));
+    console.log(pc.dim('  balanced     best capability-per-dollar (default)'));
+    console.log(pc.dim('  cost         cheapest still-capable model'));
+    console.log(pc.dim('Switch with: /priority performance | balanced | cost'));
+  };
+
+  // Change the auto-selection priority mid-session and re-pick the model when
+  // appropriate. Pinned models and local-first routing govern the model
+  // themselves, so there we just record the new priority.
+  const setPriority = (priority: Priority): void => {
+    if (priority === config.priority) {
+      console.log(pc.dim(`Priority already ${priority}.`));
+      return;
+    }
+    config.priority = priority;
+
+    if (config.modelPinned) {
+      console.log(pc.dim(`Priority → ${priority}. Model ${config.model} is pinned, so it stays.`));
+      return;
+    }
+    if (localFirst) {
+      console.log(
+        pc.dim(`Priority → ${priority}. Local-first routing picks the model; this applies if routing is off.`),
+      );
+      return;
+    }
+    const picked = recommendModel({ provider: config.provider, priority });
+    if (!picked || picked.id === config.model) {
+      console.log(pc.dim(`Priority → ${priority}. Model unchanged (${config.model}).`));
+      return;
+    }
+    const prevModel = config.model;
+    config.model = picked.id;
+    modelInfo = getModelInfo(picked.id);
+    agent.setProvider(createProvider(config));
+    console.log(
+      pc.cyan(`Priority → ${priority}.`) +
+        pc.dim(` Model ${prevModel} → ${picked.id} ($${picked.inputPricePerMTok}/$${picked.outputPricePerMTok} per 1M in/out).`),
+    );
+  };
+
   const handle = async (line: string): Promise<void> => {
     const input = line.trim();
     if (input.length === 0) {
@@ -279,6 +326,18 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
       ask();
       return;
     }
+    if (input === '/priority' || input.startsWith('/priority ')) {
+      const arg = input.slice('/priority'.length).trim().toLowerCase();
+      if (arg.length === 0) {
+        printPriority();
+      } else if ((PRIORITIES as string[]).includes(arg)) {
+        setPriority(arg as Priority);
+      } else {
+        console.log(pc.red(`Unknown priority: ${arg} (use performance, balanced, or cost)`));
+      }
+      ask();
+      return;
+    }
     if (input === '/improve') {
       if (config.improve.enabled) {
         await improve();
diff --git a/tests/agent/loop.test.ts b/tests/agent/loop.test.ts
index cec1e20..e54c910 100644
--- a/tests/agent/loop.test.ts
+++ b/tests/agent/loop.test.ts
@@ -367,6 +367,22 @@ describe('AgentLoop', () => {
     expect(loop.getUsage()).toEqual({ inputTokens: 30, outputTokens: 13 });
   });
 
+  it('routes later turns to a provider swapped in via setProvider', async () => {
+    const first = new ScriptedProvider([[{ type: 'text', delta: 'a' }, DONE]], 'model-a');
+    const second = new ScriptedProvider([[{ type: 'text', delta: 'b' }, DONE]], 'model-b');
+    const { ui } = recordingUI();
+    const loop = makeLoop(first, ui, gateWith('yes'));
+
+    await loop.run('one');
+    expect(first.sent).toHaveLength(1);
+
+    loop.setProvider(second);
+    await loop.run('two');
+    // The swapped-in provider handles the new turn; the old one is untouched.
+    expect(first.sent).toHaveLength(1);
+    expect(second.sent).toHaveLength(1);
+  });
+
   it('stops at the iteration guard when tools never stop', async () => {
     const looping: ProviderEvent[][] = [];
     for (let i = 0; i < 10; i += 1) {
diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts
index 158c26b..e72196a 100644
--- a/tests/config/load.test.ts
+++ b/tests/config/load.test.ts
@@ -120,6 +120,12 @@ describe('loadConfig', () => {
     expect(cfg.model).toBe('claude-sonnet-4-6');
   });
 
+  it('flags whether the model was pinned', () => {
+    process.env.ANTHROPIC_API_KEY = 'sk-test';
+    expect(loadConfig({}, cwd).modelPinned).toBe(false);
+    expect(loadConfig({ model: 'claude-opus-4-8' }, cwd).modelPinned).toBe(true);
+  });
+
   it('opts into the most capable model with performance priority', () => {
     process.env.ANTHROPIC_API_KEY = 'sk-test';
     process.env.TINY_CODE_PRIORITY = 'performance';

From fc47dc3e6cc195482cc3aae1cca7d448a3bef833 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 10 Jun 2026 13:39:01 +0000
Subject: [PATCH 4/4] Fix review findings and prepare 0.3.0 release

- Price the session-end summary from the per-turn accumulated cost (matches
  /costs) instead of repricing all tokens at the final model's rate, which was
  wrong after a mid-session /priority model switch.
- Validate TINY_CODE_PROVIDER / TINY_CODE_PRIORITY env values; an unrecognized
  value is now ignored with a warning instead of being cast through and
  silently mis-picking a model.
- Add coverage for the provider-scoped synthetic tool-call id fallback.
- Run changeset version: bump to 0.3.0 and generate CHANGELOG.

https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp
---
 .changeset/balanced-default-priority.md | 14 ------
 .changeset/local-models-cost-routing.md | 26 ----------
 .changeset/priority-command.md          | 13 -----
 .changeset/qwen-deepseek-coder.md       | 19 --------
 CHANGELOG.md                            | 63 +++++++++++++++++++++++++
 package.json                            |  2 +-
 src/config/load.ts                      | 25 +++++++++-
 src/repl.ts                             |  9 ++--
 tests/config/load.test.ts               | 16 +++++++
 tests/providers/openaiCloudSend.test.ts | 17 +++++++
 10 files changed, 126 insertions(+), 78 deletions(-)
 delete mode 100644 .changeset/balanced-default-priority.md
 delete mode 100644 .changeset/local-models-cost-routing.md
 delete mode 100644 .changeset/priority-command.md
 delete mode 100644 .changeset/qwen-deepseek-coder.md
 create mode 100644 CHANGELOG.md

diff --git a/.changeset/balanced-default-priority.md b/.changeset/balanced-default-priority.md
deleted file mode 100644
index b5ba201..0000000
--- a/.changeset/balanced-default-priority.md
+++ /dev/null
@@ -1,14 +0,0 @@
----
-"@therr/tiny-code": minor
----
-
-Default model selection to `balanced` priority.
-
-When no `model` is pinned, tiny-code now defaults to `priority: "balanced"`
-instead of `performance`, picking the best capability-per-dollar model
-(`codingScore / blendedCostPerMTok`, behind a quality floor) rather than the
-most capable regardless of price. In line with the project's token-minimalism
-goal, this makes the out-of-the-box pick cost-aware — e.g. Claude Sonnet rather
-than Opus for Anthropic. Set `priority: "performance"` (or
-`TINY_CODE_PRIORITY=performance`) to restore the previous most-capable defaults;
-pinning a `model` still overrides everything.
diff --git a/.changeset/local-models-cost-routing.md b/.changeset/local-models-cost-routing.md
deleted file mode 100644
index 240da17..0000000
--- a/.changeset/local-models-cost-routing.md
+++ /dev/null
@@ -1,26 +0,0 @@
----
-"@therr/tiny-code": minor
----
-
-Add local models and cost-aware, local-first routing.
-
-- **Local (Ollama) provider.** Talk to a local Ollama server over its
-  OpenAI-compatible API (`--provider ollama`), with an idle timeout so a hung
-  model can't freeze the REPL, best-effort token-usage reporting, and configurable
-  `maxTokens`.
-- **Local-first routing.** Set `routing: "local-first"` with an `escalateTo`
-  target to run a cheap/local model by default and escalate heavy turns (or a
-  stuck local model, via the new `escalate` tool) to a frontier model — with full
-  conversation context preserved. Escalation is sticky across follow-up turns.
-- **Model-selection policy** is now owned by a pluggable `ModelDecisionEngine`
-  (`LocalFirstModelEngine`), keeping the agent loop pure mechanism.
-- **Compute awareness.** On startup with a local model, tiny-code estimates RAM
-  need vs. machine capacity and warns when a model likely won't fit or is too
-  small (≤3B) to tool-call reliably; an over-RAM local model is routed to the
-  frontier up front.
-- **Priority-driven model selection.** `priority` (`performance` / `cost` /
-  `balanced`, or `TINY_CODE_PRIORITY`) auto-picks a catalog model when none is
-  pinned.
-- The `/costs` view reports session usage, estimated spend, and routing, and the
-  usage line distinguishes an unpriced *cloud* turn ("cost unknown") from a
-  *local* turn ("no API cost").
diff --git a/.changeset/priority-command.md b/.changeset/priority-command.md
deleted file mode 100644
index a9b8b6e..0000000
--- a/.changeset/priority-command.md
+++ /dev/null
@@ -1,13 +0,0 @@
----
-"@therr/tiny-code": minor
----
-
-Add a `/priority` command to switch cost/performance bias mid-session.
-
-`/priority` (no args) shows the current priority and the active model;
-`/priority performance | balanced | cost` switches it and re-picks the
-auto-selected model on the fly — e.g. jump to the most capable model when a task
-gets hard, then drop back to `balanced`. Pinned models and local-first routing
-keep governing the model themselves, so there the command just records the new
-priority. Backed by a new `AgentLoop.setProvider` for swapping the active
-provider mid-session, and a `modelPinned` flag on the resolved config.
diff --git a/.changeset/qwen-deepseek-coder.md b/.changeset/qwen-deepseek-coder.md
deleted file mode 100644
index 53d5185..0000000
--- a/.changeset/qwen-deepseek-coder.md
+++ /dev/null
@@ -1,19 +0,0 @@
----
-"@therr/tiny-code": minor
----
-
-Add DeepSeek and Qwen Coder model support.
-
-- **DeepSeek and Qwen providers.** Two new hosted, OpenAI-compatible providers
-  (`--provider deepseek` / `--provider qwen`), keyed by `DEEPSEEK_API_KEY` and
-  `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`). Endpoints are overridable via
-  `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` or `deepseekBaseUrl` /
-  `qwenBaseUrl` in config — e.g. to target the international DashScope host.
-- **Shared OpenAI-compatible core.** The streaming/tool-call adapter that backed
-  the Ollama provider is now a reusable `OpenAiCompatibleProvider` base; Ollama,
-  DeepSeek, and Qwen all extend it, differing only in endpoint, auth, and error
-  wording.
-- **Catalog entries** for `deepseek-v4-pro`, `deepseek-v4-flash`,
-  `qwen3-coder-plus`, and `qwen3-coder-flash`, so `/costs` estimates and
-  priority-based model selection work for the new providers. `/costs` treats both
-  as paid cloud providers.
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..1e1383a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,63 @@
+# @therr/tiny-code
+
+## 0.3.0
+
+### Minor Changes
+
+- 118faa0: Default model selection to `balanced` priority.
+
+  When no `model` is pinned, tiny-code now defaults to `priority: "balanced"`
+  instead of `performance`, picking the best capability-per-dollar model
+  (`codingScore / blendedCostPerMTok`, behind a quality floor) rather than the
+  most capable regardless of price. In line with the project's token-minimalism
+  goal, this makes the out-of-the-box pick cost-aware — e.g. Claude Sonnet rather
+  than Opus for Anthropic. Set `priority: "performance"` (or
+  `TINY_CODE_PRIORITY=performance`) to restore the previous most-capable defaults;
+  pinning a `model` still overrides everything.
+
+- 785b832: Add local models and cost-aware, local-first routing.
+  - **Local (Ollama) provider.** Talk to a local Ollama server over its
+    OpenAI-compatible API (`--provider ollama`), with an idle timeout so a hung
+    model can't freeze the REPL, best-effort token-usage reporting, and configurable
+    `maxTokens`.
+  - **Local-first routing.** Set `routing: "local-first"` with an `escalateTo`
+    target to run a cheap/local model by default and escalate heavy turns (or a
+    stuck local model, via the new `escalate` tool) to a frontier model — with full
+    conversation context preserved. Escalation is sticky across follow-up turns.
+  - **Model-selection policy** is now owned by a pluggable `ModelDecisionEngine`
+    (`LocalFirstModelEngine`), keeping the agent loop pure mechanism.
+  - **Compute awareness.** On startup with a local model, tiny-code estimates RAM
+    need vs. machine capacity and warns when a model likely won't fit or is too
+    small (≤3B) to tool-call reliably; an over-RAM local model is routed to the
+    frontier up front.
+  - **Priority-driven model selection.** `priority` (`performance` / `cost` /
+    `balanced`, or `TINY_CODE_PRIORITY`) auto-picks a catalog model when none is
+    pinned.
+  - The `/costs` view reports session usage, estimated spend, and routing, and the
+    usage line distinguishes an unpriced _cloud_ turn ("cost unknown") from a
+    _local_ turn ("no API cost").
+
+- f5c3832: Add a `/priority` command to switch cost/performance bias mid-session.
+
+  `/priority` (no args) shows the current priority and the active model;
+  `/priority performance | balanced | cost` switches it and re-picks the
+  auto-selected model on the fly — e.g. jump to the most capable model when a task
+  gets hard, then drop back to `balanced`. Pinned models and local-first routing
+  keep governing the model themselves, so there the command just records the new
+  priority. Backed by a new `AgentLoop.setProvider` for swapping the active
+  provider mid-session, and a `modelPinned` flag on the resolved config.
+
+- 52b179d: Add DeepSeek and Qwen Coder model support.
+  - **DeepSeek and Qwen providers.** Two new hosted, OpenAI-compatible providers
+    (`--provider deepseek` / `--provider qwen`), keyed by `DEEPSEEK_API_KEY` and
+    `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`). Endpoints are overridable via
+    `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` or `deepseekBaseUrl` /
+    `qwenBaseUrl` in config — e.g. to target the international DashScope host.
+  - **Shared OpenAI-compatible core.** The streaming/tool-call adapter that backed
+    the Ollama provider is now a reusable `OpenAiCompatibleProvider` base; Ollama,
+    DeepSeek, and Qwen all extend it, differing only in endpoint, auth, and error
+    wording.
+  - **Catalog entries** for `deepseek-v4-pro`, `deepseek-v4-flash`,
+    `qwen3-coder-plus`, and `qwen3-coder-flash`, so `/costs` estimates and
+    priority-based model selection work for the new providers. `/costs` treats both
+    as paid cloud providers.
diff --git a/package.json b/package.json
index 1a2d61c..07004f4 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@therr/tiny-code",
-  "version": "0.2.3",
+  "version": "0.3.0",
   "description": "A small, extensible CLI coding agent with interchangeable Anthropic and Gemini models.",
   "type": "module",
   "bin": {
diff --git a/src/config/load.ts b/src/config/load.ts
index 19e45d5..f10514d 100644
--- a/src/config/load.ts
+++ b/src/config/load.ts
@@ -85,6 +85,27 @@ const DEFAULT_MODELS: Record<Provider, string> = {
 const DEFAULT_OLLAMA_URL = 'http://localhost:11434/v1';
 
 const PROVIDERS = ['anthropic', 'gemini', 'ollama', 'deepseek', 'qwen'] as const;
+const PRIORITIES = ['performance', 'cost', 'balanced'] as const;
+
+/**
+ * Read an env var constrained to a known set. An unrecognized value is ignored
+ * (with a warning) rather than cast through blindly: an unchecked cast lets a
+ * typo like `TINY_CODE_PRIORITY=performant` fall through `recommendModel` and
+ * silently pick an unintended model. Returns `undefined` so resolution falls
+ * back to the next source in precedence.
+ */
+function readEnvEnum<T extends string>(
+  name: string,
+  value: string | undefined,
+  allowed: readonly T[],
+): T | undefined {
+  if (value === undefined || value === '') return undefined;
+  if ((allowed as readonly string[]).includes(value)) return value as T;
+  process.stderr.write(
+    `tiny-code: ignoring ${name}="${value}" — expected one of: ${allowed.join(', ')}\n`,
+  );
+  return undefined;
+}
 
 const EscalateTargetSchema = z.object({
   provider: z.enum(PROVIDERS),
@@ -152,7 +173,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
 
   const provider: Provider =
     overrides.provider ??
-    (env.TINY_CODE_PROVIDER as Provider | undefined) ??
+    readEnvEnum('TINY_CODE_PROVIDER', env.TINY_CODE_PROVIDER, PROVIDERS) ??
     file.provider ??
     (anthropicApiKey
       ? 'anthropic'
@@ -165,7 +186,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c
             : 'anthropic');
 
   const priority: Priority =
-    (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'balanced';
+    readEnvEnum('TINY_CODE_PRIORITY', env.TINY_CODE_PRIORITY, PRIORITIES) ?? file.priority ?? 'balanced';
 
   // When the user pins a model, honor it. Otherwise let the catalog pick the
   // best fit for the cost/performance priority, falling back to a static
diff --git a/src/repl.ts b/src/repl.ts
index bd887b5..1bb7a00 100644
--- a/src/repl.ts
+++ b/src/repl.ts
@@ -156,7 +156,7 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     });
 
   const gate = new PermissionGate(config.allow, prompt);
-  let modelInfo = getModelInfo(config.model);
+  const modelInfo = getModelInfo(config.model);
   const ui = createTerminalUI({ model: provider.model, provider: provider.name });
   const agent = new AgentLoop({
     provider,
@@ -286,7 +286,6 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     }
     const prevModel = config.model;
     config.model = picked.id;
-    modelInfo = getModelInfo(picked.id);
     agent.setProvider(createProvider(config));
     console.log(
       pc.cyan(`Priority → ${priority}.`) +
@@ -378,7 +377,11 @@ export async function startRepl(overrides: CliOverrides): Promise<void> {
     const usage = agent.getUsage();
     if (usage.inputTokens > 0 || usage.outputTokens > 0) {
       const fmtN = (n: number) => n.toLocaleString('en-US');
-      const cost = modelInfo ? ` ≈ ${formatUsd(estimateCostUsd(usage, modelInfo))}` : '';
+      // Use the per-turn accumulated cost (matches /costs) rather than repricing
+      // the whole session at one model's rate — the active model can change
+      // mid-session via /priority, so a single-rate estimate would be wrong.
+      const sessionCost = ui.getTotals().cost;
+      const cost = sessionCost > 0 ? ` ≈ ${formatUsd(sessionCost)}` : '';
       console.log(
         pc.dim(
           `\nSession: ↑ ${fmtN(usage.inputTokens)}  ↓ ${fmtN(usage.outputTokens)} tokens total${cost}`,
diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts
index e72196a..056c515 100644
--- a/tests/config/load.test.ts
+++ b/tests/config/load.test.ts
@@ -126,6 +126,22 @@ describe('loadConfig', () => {
     expect(loadConfig({ model: 'claude-opus-4-8' }, cwd).modelPinned).toBe(true);
   });
 
+  it('ignores an invalid TINY_CODE_PRIORITY instead of silently mis-picking a model', () => {
+    process.env.ANTHROPIC_API_KEY = 'sk-test';
+    process.env.TINY_CODE_PRIORITY = 'performant'; // typo
+    const cfg = loadConfig({}, cwd);
+    // Falls back to the default priority + its model, not an arbitrary catalog entry.
+    expect(cfg.priority).toBe('balanced');
+    expect(cfg.model).toBe('claude-sonnet-4-6');
+  });
+
+  it('ignores an invalid TINY_CODE_PROVIDER and falls back to key inference', () => {
+    process.env.ANTHROPIC_API_KEY = 'sk-test';
+    process.env.TINY_CODE_PROVIDER = 'mistral'; // unsupported
+    const cfg = loadConfig({}, cwd);
+    expect(cfg.provider).toBe('anthropic');
+  });
+
   it('opts into the most capable model with performance priority', () => {
     process.env.ANTHROPIC_API_KEY = 'sk-test';
     process.env.TINY_CODE_PRIORITY = 'performance';
diff --git a/tests/providers/openaiCloudSend.test.ts b/tests/providers/openaiCloudSend.test.ts
index 3e53e02..7b7bfcb 100644
--- a/tests/providers/openaiCloudSend.test.ts
+++ b/tests/providers/openaiCloudSend.test.ts
@@ -53,6 +53,23 @@ describe('DeepSeekProvider.send', () => {
     expect(done).toMatchObject({ usage: { inputTokens: 5, outputTokens: 2 } });
   });
 
+  it('synthesizes a provider-scoped tool-call id when the server omits one', async () => {
+    // The OpenAI wire format normally supplies an id; some servers don't. The
+    // fallback must stay non-empty so the result can be correlated next turn.
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue(
+      sseResponse([
+        {
+          choices: [
+            { delta: { tool_calls: [{ index: 0, function: { name: 'ls', arguments: '{}' } }] } },
+          ],
+        },
+      ]),
+    );
+    const provider = new DeepSeekProvider({ apiKey: 'k', model: 'deepseek-v4-pro' });
+    const call = (await collect(provider)).find((e) => e.type === 'tool_call');
+    expect(call).toMatchObject({ name: 'ls', id: 'deepseek-call-0' });
+  });
+
   it('reports a DeepSeek-specific error when the host is unreachable', async () => {
     vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ENOTFOUND'));
     const provider = new DeepSeekProvider({ apiKey: 'k', model: 'deepseek-v4-pro' });