From 52b179d6653fc85610af55b9f7a9f1634f50c453 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Jun 2026 12:52:56 +0000 Subject: [PATCH 1/4] Add DeepSeek and Qwen Coder provider support Extract the Ollama provider's OpenAI-compatible streaming core into a reusable OpenAiCompatibleProvider base, then add hosted DeepSeek and Qwen (DashScope) providers on top of it. Wire them through config (keys, base-URL overrides, provider inference, defaults), the model catalog (pricing/context/ coding scores for deepseek-v4-{pro,flash} and qwen3-coder-{plus,flash}), and the cost UI. Update CLI help, .env.example, and README. https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp --- .changeset/qwen-deepseek-coder.md | 19 ++ .env.example | 11 +- README.md | 20 +- src/cli.ts | 7 +- src/config/load.ts | 36 ++- src/index.ts | 12 +- src/models/catalog.ts | 11 +- src/providers/deepseek.ts | 33 +++ src/providers/index.ts | 29 +++ src/providers/ollama.ts | 269 ++-------------------- src/providers/openai-compatible.ts | 285 ++++++++++++++++++++++++ src/providers/qwen.ts | 33 +++ src/providers/types.ts | 2 +- src/ui/render.ts | 7 +- tests/config/load.test.ts | 39 ++++ tests/models/catalog.test.ts | 12 + tests/providers/openaiCloudSend.test.ts | 81 +++++++ 17 files changed, 637 insertions(+), 269 deletions(-) create mode 100644 .changeset/qwen-deepseek-coder.md create mode 100644 src/providers/deepseek.ts create mode 100644 src/providers/openai-compatible.ts create mode 100644 src/providers/qwen.ts create mode 100644 tests/providers/openaiCloudSend.test.ts diff --git a/.changeset/qwen-deepseek-coder.md b/.changeset/qwen-deepseek-coder.md new file mode 100644 index 0000000..53d5185 --- /dev/null +++ b/.changeset/qwen-deepseek-coder.md @@ -0,0 +1,19 @@ +--- +"@therr/tiny-code": minor +--- + +Add DeepSeek and Qwen Coder model support. 
+ +- **DeepSeek and Qwen providers.** Two new hosted, OpenAI-compatible providers + (`--provider deepseek` / `--provider qwen`), keyed by `DEEPSEEK_API_KEY` and + `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`). Endpoints are overridable via + `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` or `deepseekBaseUrl` / + `qwenBaseUrl` in config — e.g. to target the international DashScope host. +- **Shared OpenAI-compatible core.** The streaming/tool-call adapter that backed + the Ollama provider is now a reusable `OpenAiCompatibleProvider` base; Ollama, + DeepSeek, and Qwen all extend it, differing only in endpoint, auth, and error + wording. +- **Catalog entries** for `deepseek-v4-pro`, `deepseek-v4-flash`, + `qwen3-coder-plus`, and `qwen3-coder-flash`, so `/costs` estimates and + priority-based model selection work for the new providers. `/costs` treats both + as paid cloud providers. diff --git a/.env.example b/.env.example index ab7215c..bc84fe7 100644 --- a/.env.example +++ b/.env.example @@ -1,12 +1,17 @@ -# Provide at least one for cloud providers. If both are present, Anthropic is -# the default. Ollama runs locally and needs no key. +# Provide at least one for cloud providers. If several are present, the default +# is the first available in this order: Anthropic, Gemini, DeepSeek, Qwen. +# Ollama runs locally and needs no key. 
ANTHROPIC_API_KEY= GEMINI_API_KEY= +DEEPSEEK_API_KEY= +QWEN_API_KEY= # Alibaba DashScope key (DASHSCOPE_API_KEY also accepted) # Optional overrides (also settable via config file / CLI flags) -# TINY_CODE_PROVIDER=anthropic # anthropic | gemini | ollama +# TINY_CODE_PROVIDER=anthropic # anthropic | gemini | ollama | deepseek | qwen # TINY_CODE_MODEL=claude-opus-4-8 # TINY_CODE_OLLAMA_URL=http://localhost:11434/v1 # Ollama OpenAI-compatible endpoint +# TINY_CODE_DEEPSEEK_URL=https://api.deepseek.com/v1 +# TINY_CODE_QWEN_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 # TINY_CODE_PRIORITY=performance # performance | cost | balanced — auto-picks a model when none is pinned # TINY_CODE_EFFORT=high # low | medium | high | xhigh | max — Anthropic thinking budget diff --git a/README.md b/README.md index 17b89cb..d55957b 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ A small, extensible CLI coding agent built around one constraint: **keep token usage low**. As coding-agent costs climb, tiny-code automates the savings so you don't have to. Interactive terminal REPL, interchangeable **Anthropic**, -**Gemini**, and **local (Ollama)** models, and just the core features you -actually use: read/write/edit files, run shell commands, search code, and a -custom commands/skills system. No business logic baked in. +**Gemini**, **DeepSeek**, **Qwen Coder**, and **local (Ollama)** models, and just +the core features you actually use: read/write/edit files, run shell commands, +search code, and a custom commands/skills system. No business logic baked in. Run cheap, open-weight models locally and **escalate heavy work to a frontier model only when needed** — see [Local models & cost-aware routing](#local-models--cost-aware-routing). @@ -29,19 +29,28 @@ node dist/cli.js ## Setup -Provide at least one API key. If both are set, Anthropic is used by default. +Provide at least one API key. 
If several are set, the default is the first +available in this order: Anthropic, Gemini, DeepSeek, Qwen. ```bash export ANTHROPIC_API_KEY=sk-ant-... export GEMINI_API_KEY=... +export DEEPSEEK_API_KEY=sk-... +export QWEN_API_KEY=sk-... # Alibaba DashScope key (DASHSCOPE_API_KEY also works) ``` +DeepSeek and Qwen are hosted, OpenAI-compatible coding models. Override their +endpoints with `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` (or `deepseekBaseUrl` +/ `qwenBaseUrl` in config) — e.g. to point Qwen at the international DashScope host. + ## Usage ```bash tiny-code # start the REPL (uses an available key) tiny-code --provider gemini # force a provider tiny-code --model claude-opus-4-8 +tiny-code --provider deepseek --model deepseek-v4-pro # DeepSeek's coding model +tiny-code --provider qwen --model qwen3-coder-plus # Qwen Coder tiny-code --provider ollama --model gemma3:12b # run a local model (no API cost) ``` @@ -154,7 +163,8 @@ CLI flags. `routing: "local-first"` plus `escalateTo` enables cost-aware routing (see [above](#local-models--cost-aware-routing)); it defaults to `local-first` automatically whenever `escalateTo` is present. `ollamaBaseUrl` points at your -Ollama server's OpenAI-compatible endpoint. +Ollama server's OpenAI-compatible endpoint; `deepseekBaseUrl` / `qwenBaseUrl` +override the DeepSeek and Qwen (DashScope) endpoints. Approximate cloud pricing used for the `/costs` estimate lives in the model catalog (`src/models/catalog.ts`) — edit it to match current vendor rates. diff --git a/src/cli.ts b/src/cli.ts index 19112bb..68ce55d 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -12,8 +12,9 @@ Usage: tiny-code [options] Options: - --provider anthropic | gemini | ollama (default: inferred from API keys) - --model Model id override (e.g. claude-opus-4-8, gemma3:12b) + --provider anthropic | gemini | ollama | deepseek | qwen + (default: inferred from API keys) + --model Model id override (e.g. 
claude-opus-4-8, qwen3-coder-plus) --config Path to a config JSON file -v, --version Print version -h, --help Show this help @@ -21,6 +22,8 @@ Options: Environment: ANTHROPIC_API_KEY Required for the Anthropic provider GEMINI_API_KEY Required for the Gemini provider + DEEPSEEK_API_KEY Required for the DeepSeek provider + QWEN_API_KEY Required for the Qwen provider (or DASHSCOPE_API_KEY) TINY_CODE_OLLAMA_URL Ollama OpenAI-compatible base URL (default http://localhost:11434/v1) TINY_CODE_PRIORITY performance | cost | balanced — auto-picks a model when none is pinned (default: performance) diff --git a/src/config/load.ts b/src/config/load.ts index 4d25096..9c84243 100644 --- a/src/config/load.ts +++ b/src/config/load.ts @@ -5,7 +5,7 @@ import { z } from 'zod'; import type { Priority } from '../models/catalog.js'; import { recommendModel } from '../models/catalog.js'; -export type Provider = 'anthropic' | 'gemini' | 'ollama'; +export type Provider = 'anthropic' | 'gemini' | 'ollama' | 'deepseek' | 'qwen'; export type Effort = 'low' | 'medium' | 'high' | 'xhigh' | 'max'; export type Routing = 'local-first' | 'off'; export type { Priority } from '../models/catalog.js'; @@ -34,8 +34,14 @@ export interface ResolvedConfig { priority: Priority; anthropicApiKey: string | undefined; geminiApiKey: string | undefined; + deepseekApiKey: string | undefined; + qwenApiKey: string | undefined; /** OpenAI-compatible base URL for the Ollama provider. */ ollamaBaseUrl: string; + /** Override for the DeepSeek API endpoint (defaults to DeepSeek's hosted URL). */ + deepseekBaseUrl: string | undefined; + /** Override for the Qwen/DashScope API endpoint (defaults to DashScope's URL). 
*/ + qwenBaseUrl: string | undefined; maxTokens: number; thinking: boolean; effort: Effort; @@ -69,21 +75,27 @@ const DEFAULT_MODELS: Record = { anthropic: 'claude-opus-4-8', gemini: 'gemini-2.5-pro', ollama: 'qwen2.5-coder:7b', + deepseek: 'deepseek-v4-pro', + qwen: 'qwen3-coder-plus', }; const DEFAULT_OLLAMA_URL = 'http://localhost:11434/v1'; +const PROVIDERS = ['anthropic', 'gemini', 'ollama', 'deepseek', 'qwen'] as const; + const EscalateTargetSchema = z.object({ - provider: z.enum(['anthropic', 'gemini', 'ollama']), + provider: z.enum(PROVIDERS), model: z.string(), ollamaBaseUrl: z.string().url().optional(), }); const FileConfigSchema = z .object({ - provider: z.enum(['anthropic', 'gemini', 'ollama']).optional(), + provider: z.enum(PROVIDERS).optional(), model: z.string().optional(), ollamaBaseUrl: z.string().url().optional(), + deepseekBaseUrl: z.string().url().optional(), + qwenBaseUrl: z.string().url().optional(), priority: z.enum(['performance', 'cost', 'balanced']).optional(), maxTokens: z.number().int().positive().optional(), thinking: z.boolean().optional(), @@ -132,12 +144,22 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c const env = process.env; const anthropicApiKey = env.ANTHROPIC_API_KEY || undefined; const geminiApiKey = env.GEMINI_API_KEY || undefined; + const deepseekApiKey = env.DEEPSEEK_API_KEY || undefined; + const qwenApiKey = env.QWEN_API_KEY || env.DASHSCOPE_API_KEY || undefined; const provider: Provider = overrides.provider ?? (env.TINY_CODE_PROVIDER as Provider | undefined) ?? file.provider ?? - (anthropicApiKey ? 'anthropic' : geminiApiKey ? 'gemini' : 'anthropic'); + (anthropicApiKey + ? 'anthropic' + : geminiApiKey + ? 'gemini' + : deepseekApiKey + ? 'deepseek' + : qwenApiKey + ? 'qwen' + : 'anthropic'); const priority: Priority = (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 
'performance'; @@ -158,6 +180,8 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c const effort = (env.TINY_CODE_EFFORT as Effort | undefined) ?? file.effort ?? 'high'; const ollamaBaseUrl = env.TINY_CODE_OLLAMA_URL ?? file.ollamaBaseUrl ?? DEFAULT_OLLAMA_URL; + const deepseekBaseUrl = env.TINY_CODE_DEEPSEEK_URL ?? file.deepseekBaseUrl; + const qwenBaseUrl = env.TINY_CODE_QWEN_URL ?? file.qwenBaseUrl; const escalateTo = file.escalateTo; // Default to local-first whenever an escalation target is configured. @@ -174,7 +198,11 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c priority, anthropicApiKey, geminiApiKey, + deepseekApiKey, + qwenApiKey, ollamaBaseUrl, + deepseekBaseUrl, + qwenBaseUrl, maxTokens, thinking: file.thinking ?? true, effort, diff --git a/src/index.ts b/src/index.ts index d10bb65..ececf80 100644 --- a/src/index.ts +++ b/src/index.ts @@ -8,9 +8,17 @@ export type { AgentUI, AgentLoopOptions } from './agent/loop.js'; export { buildSystemPrompt } from './agent/systemPrompt.js'; export type { SystemPromptParams } from './agent/systemPrompt.js'; -export { createProvider, AnthropicProvider, GeminiProvider, OllamaProvider } from './providers/index.js'; +export { + createProvider, + AnthropicProvider, + GeminiProvider, + OllamaProvider, + DeepSeekProvider, + QwenProvider, + OpenAiCompatibleProvider, +} from './providers/index.js'; export type { ModelProvider, ProviderEvent, SendRequest, ToolSchema, Usage } from './providers/types.js'; -export { toOpenAiMessages, toOpenAiTools } from './providers/ollama.js'; +export { toOpenAiMessages, toOpenAiTools } from './providers/openai-compatible.js'; export { classifyTurn } from './agent/router.js'; export type { TaskWeight } from './agent/router.js'; diff --git a/src/models/catalog.ts b/src/models/catalog.ts index b7f1a94..428a1bd 100644 --- a/src/models/catalog.ts +++ b/src/models/catalog.ts @@ -32,7 +32,7 @@ export interface ModelInfo { * from the 
bundled claude-api reference; Gemini figures from Google's published * API pricing. */ -export const CATALOG_AS_OF = '2026-06-08'; +export const CATALOG_AS_OF = '2026-06-10'; /** * The known coding models, newest/most-capable first within each provider. @@ -51,6 +51,15 @@ export const MODEL_CATALOG: ModelInfo[] = [ { id: 'gemini-2.5-pro', provider: 'gemini', label: 'Gemini 2.5 Pro', inputPricePerMTok: 1.25, outputPricePerMTok: 10, contextWindow: 1_048_576, codingScore: 90 }, { id: 'gemini-2.5-flash', provider: 'gemini', label: 'Gemini 2.5 Flash', inputPricePerMTok: 0.3, outputPricePerMTok: 2.5, contextWindow: 1_048_576, codingScore: 72 }, { id: 'gemini-2.5-flash-lite', provider: 'gemini', label: 'Gemini 2.5 Flash-Lite', inputPricePerMTok: 0.1, outputPricePerMTok: 0.4, contextWindow: 1_048_576, codingScore: 55 }, + + // DeepSeek — DeepSeek API (cache-miss) pricing. The V4 family carries DeepSeek's + // coding capability; the legacy "deepseek-coder" model is retired. + { id: 'deepseek-v4-pro', provider: 'deepseek', label: 'DeepSeek V4 Pro', inputPricePerMTok: 1.74, outputPricePerMTok: 3.48, contextWindow: 1_048_576, codingScore: 91 }, + { id: 'deepseek-v4-flash', provider: 'deepseek', label: 'DeepSeek V4 Flash', inputPricePerMTok: 0.14, outputPricePerMTok: 0.28, contextWindow: 1_048_576, codingScore: 80 }, + + // Qwen Coder — Alibaba DashScope pricing for the proprietary coder models. + { id: 'qwen3-coder-plus', provider: 'qwen', label: 'Qwen3 Coder Plus', inputPricePerMTok: 0.65, outputPricePerMTok: 3.25, contextWindow: 1_000_000, codingScore: 89 }, + { id: 'qwen3-coder-flash', provider: 'qwen', label: 'Qwen3 Coder Flash', inputPricePerMTok: 0.195, outputPricePerMTok: 0.975, contextWindow: 1_000_000, codingScore: 78 }, ]; /** Look up catalog facts for a model id, or `undefined` if it's not tracked. 
*/ diff --git a/src/providers/deepseek.ts b/src/providers/deepseek.ts new file mode 100644 index 0000000..7618ac5 --- /dev/null +++ b/src/providers/deepseek.ts @@ -0,0 +1,33 @@ +import { OpenAiCompatibleProvider, type OpenAiCompatibleOptions } from './openai-compatible.js'; + +/** DeepSeek's hosted OpenAI-compatible endpoint. */ +export const DEFAULT_DEEPSEEK_URL = 'https://api.deepseek.com/v1'; + +export interface DeepSeekProviderOptions extends Omit { + apiKey: string; + /** Override the API endpoint (defaults to {@link DEFAULT_DEEPSEEK_URL}). */ + baseUrl?: string | undefined; +} + +/** + * DeepSeek's cloud models (the V4 family powers its coding capability) over the + * OpenAI-compatible Chat Completions API. Differs from the local Ollama + * provider only in endpoint, required API key, and error wording. + */ +export class DeepSeekProvider extends OpenAiCompatibleProvider { + readonly name = 'deepseek' as const; + + constructor(opts: DeepSeekProviderOptions) { + super({ ...opts, baseUrl: opts.baseUrl ?? DEFAULT_DEEPSEEK_URL }); + } + + protected override label(): string { + return 'DeepSeek'; + } + + protected override unreachableError(err: Error): Error { + return new Error( + `Cannot reach DeepSeek at ${this.baseUrl}. Check your network and DEEPSEEK_API_KEY. 
(${err.message})`, + ); + } +} diff --git a/src/providers/index.ts b/src/providers/index.ts index 89c6b3f..7d620af 100644 --- a/src/providers/index.ts +++ b/src/providers/index.ts @@ -3,11 +3,16 @@ import type { ResolvedConfig } from '../config/load.js'; import { AnthropicProvider } from './anthropic.js'; import { GeminiProvider } from './gemini.js'; import { OllamaProvider } from './ollama.js'; +import { DeepSeekProvider } from './deepseek.js'; +import { QwenProvider } from './qwen.js'; export type { ModelProvider, ProviderEvent, SendRequest, ToolSchema, Usage } from './types.js'; export { AnthropicProvider } from './anthropic.js'; export { GeminiProvider } from './gemini.js'; export { OllamaProvider } from './ollama.js'; +export { DeepSeekProvider } from './deepseek.js'; +export { QwenProvider } from './qwen.js'; +export { OpenAiCompatibleProvider } from './openai-compatible.js'; /** Construct the configured provider, validating that its API key is present. */ export function createProvider(config: ResolvedConfig): ModelProvider { @@ -33,6 +38,30 @@ export function createProvider(config: ResolvedConfig): ModelProvider { }); } + if (config.provider === 'deepseek') { + if (!config.deepseekApiKey) { + throw new Error('DEEPSEEK_API_KEY is not set. Export it or switch providers with --provider anthropic.'); + } + return new DeepSeekProvider({ + apiKey: config.deepseekApiKey, + baseUrl: config.deepseekBaseUrl, + model: config.model, + maxTokens: config.maxTokens, + }); + } + + if (config.provider === 'qwen') { + if (!config.qwenApiKey) { + throw new Error('QWEN_API_KEY is not set. Export it or switch providers with --provider anthropic.'); + } + return new QwenProvider({ + apiKey: config.qwenApiKey, + baseUrl: config.qwenBaseUrl, + model: config.model, + maxTokens: config.maxTokens, + }); + } + if (!config.geminiApiKey) { throw new Error('GEMINI_API_KEY is not set. 
Export it or switch providers with --provider anthropic.'); } diff --git a/src/providers/ollama.ts b/src/providers/ollama.ts index 79f0ca8..d59808e 100644 --- a/src/providers/ollama.ts +++ b/src/providers/ollama.ts @@ -1,272 +1,41 @@ -import type { Message } from '../agent/types.js'; -import type { ModelProvider, ProviderEvent, SendRequest, ToolSchema } from './types.js'; +import { OpenAiCompatibleProvider, type OpenAiCompatibleOptions } from './openai-compatible.js'; -export interface OllamaProviderOptions { - /** OpenAI-compatible base URL, e.g. "http://localhost:11434/v1". */ - baseUrl: string; - model: string; +// Re-exported so existing importers keep their `./ollama.js` entry point; the +// translation helpers are shared by every OpenAI-compatible provider now. +export { toOpenAiMessages, toOpenAiTools } from './openai-compatible.js'; + +export interface OllamaProviderOptions extends Omit { /** Ignored by Ollama but required by the OpenAI wire format; defaults to "ollama". */ apiKey?: string; - /** Cap on tokens to generate per response. Omitted from the request if unset. */ - maxTokens?: number; - /** - * Abort the request if no bytes arrive for this long (ms). This is an *idle* - * timeout, reset on every received chunk — a slow-but-progressing model keeps - * going; a hung one (common when the machine is RAM-starved) is cut loose. - * Defaults to 120_000. - */ - timeoutMs?: number; -} - -interface OpenAiMessage { - role: 'system' | 'user' | 'assistant' | 'tool'; - content: string; - tool_calls?: { id: string; type: 'function'; function: { name: string; arguments: string } }[]; - tool_call_id?: string; } /** - * Translate internal messages into OpenAI chat messages (the shape Ollama's - * `/v1/chat/completions` endpoint accepts). Unlike Gemini, OpenAI correlates - * tool results to calls by `tool_call_id`, and our Anthropic-style ids survive - * the round trip — so no id synthesis is needed. 
- * - * Assumes the loop never mixes plain text and tool results in one user turn in a - * way that would interleave them: we emit all `tool` messages first, then any - * text as a trailing user message. OpenAI requires each `tool` message to follow - * the assistant `tool_calls` that produced it; today's loop builds messages so - * that holds. If a future change interleaves them, revisit this ordering. + * Local Ollama server over its OpenAI-compatible endpoint. Same wire format as + * the cloud OpenAI-compatible providers (it also covers LM Studio and vLLM by + * pointing `baseUrl` at them); only the auth default and the connection-error + * wording — which name a local `ollama serve` and RAM pressure — differ. */ -export function toOpenAiMessages(messages: Message[]): OpenAiMessage[] { - const out: OpenAiMessage[] = []; - for (const m of messages) { - if (m.role === 'user') { - // A user turn may carry plain text and/or tool results; emit each result - // as its own `tool` message and gather any text into one user message. - let text = ''; - for (const b of m.content) { - if (b.type === 'text') text += b.text; - else if (b.type === 'tool_result') { - out.push({ role: 'tool', tool_call_id: b.toolUseId, content: b.content }); - } - } - if (text.length > 0) out.push({ role: 'user', content: text }); - continue; - } - - // assistant: merge text + tool_use into a single message - let text = ''; - const toolCalls: NonNullable = []; - for (const b of m.content) { - if (b.type === 'text') text += b.text; - else if (b.type === 'tool_use') { - toolCalls.push({ - id: b.id, - type: 'function', - function: { name: b.name, arguments: JSON.stringify(b.input ?? {}) }, - }); - } - } - const msg: OpenAiMessage = { role: 'assistant', content: text }; - if (toolCalls.length > 0) msg.tool_calls = toolCalls; - out.push(msg); - } - return out; -} - -/** Translate normalized tool schemas into OpenAI's `tools` array. 
*/ -export function toOpenAiTools(tools: ToolSchema[]): unknown[] { - return tools.map((t) => ({ - type: 'function', - function: { name: t.name, description: t.description, parameters: t.jsonSchema }, - })); -} - -interface StreamChoice { - delta?: { - content?: string | null; - tool_calls?: { - index: number; - id?: string; - function?: { name?: string; arguments?: string }; - }[]; - }; - finish_reason?: string | null; -} - -interface StreamChunk { - choices?: StreamChoice[]; - usage?: { prompt_tokens?: number; completion_tokens?: number } | null; -} - -export class OllamaProvider implements ModelProvider { +export class OllamaProvider extends OpenAiCompatibleProvider { readonly name = 'ollama' as const; - readonly model: string; - private readonly baseUrl: string; - private readonly apiKey: string; - private readonly maxTokens: number | undefined; - private readonly timeoutMs: number; constructor(opts: OllamaProviderOptions) { - this.baseUrl = opts.baseUrl.replace(/\/$/, ''); - this.model = opts.model; - this.apiKey = opts.apiKey ?? 'ollama'; - this.maxTokens = opts.maxTokens; - this.timeoutMs = opts.timeoutMs ?? 120_000; + super({ ...opts, apiKey: opts.apiKey ?? 'ollama' }); } - async *send(req: SendRequest): AsyncIterable { - const messages: OpenAiMessage[] = [ - { role: 'system', content: req.system }, - ...toOpenAiMessages(req.messages), - ]; - - const body = { - model: this.model, - messages, - tools: req.tools.length > 0 ? toOpenAiTools(req.tools) : undefined, - stream: true, - max_tokens: this.maxTokens, - }; - - // Idle-timeout guard: abort if the server goes silent for `timeoutMs`. The - // raw fetch (unlike the cloud SDKs) has no built-in timeout, so without this - // a stuck local model would freeze the REPL with no way to recover. 
- const controller = new AbortController(); - let timer: ReturnType; - const armTimer = (): void => { - clearTimeout(timer); - timer = setTimeout(() => controller.abort(), this.timeoutMs); - }; - armTimer(); - - try { - let res: Response; - try { - // `stream_options.include_usage` is best-effort: it gives us token counts, - // but older Ollama builds reject unknown body fields with a 400. Rather than - // breaking every local turn over a reporting nicety, retry once without it. - res = await this.post({ ...body, stream_options: { include_usage: true } }, controller.signal); - if (res.status === 400) res = await this.post(body, controller.signal); - } catch (err) { - if (controller.signal.aborted) throw this.timeoutError(); - throw new Error( - `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${(err as Error).message})`, - ); - } - - if (!res.ok || !res.body) { - const detail = await res.text().catch(() => ''); - throw new Error(`Ollama request failed (${res.status}): ${detail.slice(0, 200)}`); - } - - // Accumulate tool calls by their streamed index; arguments arrive in fragments. - const calls = new Map(); - let usage = { inputTokens: 0, outputTokens: 0 }; - let finish = 'stop'; - - try { - for await (const chunk of parseSse(res.body)) { - armTimer(); // progress: reset the idle clock - const choice = chunk.choices?.[0]; - if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content }; - - for (const tc of choice?.delta?.tool_calls ?? []) { - const acc = calls.get(tc.index) ?? { id: '', name: '', args: '' }; - if (tc.id) acc.id = tc.id; - if (tc.function?.name) acc.name = tc.function.name; - if (tc.function?.arguments) acc.args += tc.function.arguments; - calls.set(tc.index, acc); - } - - if (choice?.finish_reason) finish = choice.finish_reason; - if (chunk.usage) { - usage = { - inputTokens: chunk.usage.prompt_tokens ?? 0, - outputTokens: chunk.usage.completion_tokens ?? 
0, - }; - } - } - } catch (err) { - if (controller.signal.aborted) throw this.timeoutError(); - throw err; - } - - for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) { - let input: unknown = {}; - try { - input = c.args.trim() ? JSON.parse(c.args) : {}; - } catch { - // Small models occasionally emit malformed JSON; degrade gracefully. - input = {}; - } - yield { type: 'tool_call', id: c.id || `ollama-call-${index}`, name: c.name, input }; - } - - yield { - type: 'done', - usage, - stopReason: calls.size > 0 ? 'tool_use' : finish, - }; - } finally { - clearTimeout(timer!); - } + protected override label(): string { + return 'Ollama'; } - private timeoutError(): Error { + protected override timeoutError(): Error { return new Error( `Ollama at ${this.baseUrl} went silent for ${Math.round(this.timeoutMs / 1000)}s and was aborted. ` + `The model '${this.model}' may be too large for this machine.`, ); } - /** POST a chat-completions request body to the Ollama server. */ - private post(body: unknown, signal: AbortSignal): Promise { - return fetch(`${this.baseUrl}/chat/completions`, { - method: 'POST', - headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` }, - body: JSON.stringify(body), - signal, - }); - } -} - -/** Decode a single SSE line into a chunk, or `undefined` for non-data/keep-alive lines. */ -function parseSseLine(raw: string): StreamChunk | undefined { - const line = raw.trim(); - if (!line.startsWith('data:')) return undefined; - const payload = line.slice(5).trim(); - if (payload === '[DONE]' || payload.length === 0) return undefined; - try { - return JSON.parse(payload) as StreamChunk; - } catch { - // Ignore partial/non-JSON keep-alive lines. - return undefined; - } -} - -/** Parse an SSE byte stream into decoded JSON chunks, skipping the `[DONE]` sentinel. 
*/ -async function* parseSse(body: ReadableStream): AsyncIterable { - const decoder = new TextDecoder(); - let buffer = ''; - const reader = body.getReader(); - try { - for (;;) { - const { done, value } = await reader.read(); - if (done) break; - buffer += decoder.decode(value, { stream: true }); - let nl: number; - while ((nl = buffer.indexOf('\n')) !== -1) { - const chunk = parseSseLine(buffer.slice(0, nl)); - buffer = buffer.slice(nl + 1); - if (chunk) yield chunk; - } - } - // Emit a final line that arrived without a trailing newline (e.g. a closing - // usage frame); otherwise the last chunk's token counts would be dropped. - const tail = parseSseLine(buffer); - if (tail) yield tail; - } finally { - reader.releaseLock(); + protected override unreachableError(err: Error): Error { + return new Error( + `Cannot reach Ollama at ${this.baseUrl}. Is 'ollama serve' running? (${err.message})`, + ); } } diff --git a/src/providers/openai-compatible.ts b/src/providers/openai-compatible.ts new file mode 100644 index 0000000..8492205 --- /dev/null +++ b/src/providers/openai-compatible.ts @@ -0,0 +1,285 @@ +import type { Message } from '../agent/types.js'; +import type { ModelProvider, ProviderEvent, SendRequest, ToolSchema } from './types.js'; + +export interface OpenAiCompatibleOptions { + /** OpenAI-compatible base URL, e.g. "https://api.deepseek.com/v1". */ + baseUrl: string; + model: string; + /** Bearer token. Local servers (Ollama) ignore it; cloud APIs require it. */ + apiKey?: string; + /** Cap on tokens to generate per response. Omitted from the request if unset. */ + maxTokens?: number; + /** + * Abort the request if no bytes arrive for this long (ms). This is an *idle* + * timeout, reset on every received chunk — a slow-but-progressing model keeps + * going; a hung one is cut loose. Defaults to 120_000. 
+ */ + timeoutMs?: number; +} + +interface OpenAiMessage { + role: 'system' | 'user' | 'assistant' | 'tool'; + content: string; + tool_calls?: { id: string; type: 'function'; function: { name: string; arguments: string } }[]; + tool_call_id?: string; +} + +/** + * Translate internal messages into OpenAI chat messages (the shape every + * `/v1/chat/completions` endpoint accepts). Unlike Gemini, OpenAI correlates + * tool results to calls by `tool_call_id`, and our Anthropic-style ids survive + * the round trip — so no id synthesis is needed. + * + * Assumes the loop never mixes plain text and tool results in one user turn in a + * way that would interleave them: we emit all `tool` messages first, then any + * text as a trailing user message. OpenAI requires each `tool` message to follow + * the assistant `tool_calls` that produced it; today's loop builds messages so + * that holds. If a future change interleaves them, revisit this ordering. + */ +export function toOpenAiMessages(messages: Message[]): OpenAiMessage[] { + const out: OpenAiMessage[] = []; + for (const m of messages) { + if (m.role === 'user') { + // A user turn may carry plain text and/or tool results; emit each result + // as its own `tool` message and gather any text into one user message. + let text = ''; + for (const b of m.content) { + if (b.type === 'text') text += b.text; + else if (b.type === 'tool_result') { + out.push({ role: 'tool', tool_call_id: b.toolUseId, content: b.content }); + } + } + if (text.length > 0) out.push({ role: 'user', content: text }); + continue; + } + + // assistant: merge text + tool_use into a single message + let text = ''; + const toolCalls: NonNullable = []; + for (const b of m.content) { + if (b.type === 'text') text += b.text; + else if (b.type === 'tool_use') { + toolCalls.push({ + id: b.id, + type: 'function', + function: { name: b.name, arguments: JSON.stringify(b.input ?? 
{}) }, + }); + } + } + const msg: OpenAiMessage = { role: 'assistant', content: text }; + if (toolCalls.length > 0) msg.tool_calls = toolCalls; + out.push(msg); + } + return out; +} + +/** Translate normalized tool schemas into OpenAI's `tools` array. */ +export function toOpenAiTools(tools: ToolSchema[]): unknown[] { + return tools.map((t) => ({ + type: 'function', + function: { name: t.name, description: t.description, parameters: t.jsonSchema }, + })); +} + +interface StreamChoice { + delta?: { + content?: string | null; + tool_calls?: { + index: number; + id?: string; + function?: { name?: string; arguments?: string }; + }[]; + }; + finish_reason?: string | null; +} + +interface StreamChunk { + choices?: StreamChoice[]; + usage?: { prompt_tokens?: number; completion_tokens?: number } | null; +} + +/** + * Base adapter for any OpenAI-compatible `/v1/chat/completions` server. Ollama, + * DeepSeek, and Qwen (DashScope) all speak this wire format, differing only in + * base URL, auth, and the wording of their connection errors. Subclasses set + * {@link name} and may override {@link unreachableError}/{@link timeoutError}. + */ +export abstract class OpenAiCompatibleProvider implements ModelProvider { + abstract readonly name: ModelProvider['name']; + readonly model: string; + protected readonly baseUrl: string; + protected readonly apiKey: string; + protected readonly maxTokens: number | undefined; + protected readonly timeoutMs: number; + + constructor(opts: OpenAiCompatibleOptions) { + this.baseUrl = opts.baseUrl.replace(/\/$/, ''); + this.model = opts.model; + this.apiKey = opts.apiKey ?? ''; + this.maxTokens = opts.maxTokens; + this.timeoutMs = opts.timeoutMs ?? 120_000; + } + + async *send(req: SendRequest): AsyncIterable { + const messages: OpenAiMessage[] = [ + { role: 'system', content: req.system }, + ...toOpenAiMessages(req.messages), + ]; + + const body = { + model: this.model, + messages, + tools: req.tools.length > 0 ? 
toOpenAiTools(req.tools) : undefined, + stream: true, + max_tokens: this.maxTokens, + }; + + // Idle-timeout guard: abort if the server goes silent for `timeoutMs`. The + // raw fetch (unlike the cloud SDKs) has no built-in timeout, so without this + // a stuck server would freeze the REPL with no way to recover. + const controller = new AbortController(); + let timer: ReturnType; + const armTimer = (): void => { + clearTimeout(timer); + timer = setTimeout(() => controller.abort(), this.timeoutMs); + }; + armTimer(); + + try { + let res: Response; + try { + // `stream_options.include_usage` is best-effort: it gives us token counts, + // but some servers reject unknown body fields with a 400. Rather than + // breaking every turn over a reporting nicety, retry once without it. + res = await this.post({ ...body, stream_options: { include_usage: true } }, controller.signal); + if (res.status === 400) res = await this.post(body, controller.signal); + } catch (err) { + if (controller.signal.aborted) throw this.timeoutError(); + throw this.unreachableError(err as Error); + } + + if (!res.ok || !res.body) { + const detail = await res.text().catch(() => ''); + throw new Error(`${this.label()} request failed (${res.status}): ${detail.slice(0, 200)}`); + } + + // Accumulate tool calls by their streamed index; arguments arrive in fragments. + const calls = new Map(); + let usage = { inputTokens: 0, outputTokens: 0 }; + let finish = 'stop'; + + try { + for await (const chunk of parseSse(res.body)) { + armTimer(); // progress: reset the idle clock + const choice = chunk.choices?.[0]; + if (choice?.delta?.content) yield { type: 'text', delta: choice.delta.content }; + + for (const tc of choice?.delta?.tool_calls ?? []) { + const acc = calls.get(tc.index) ?? 
{ id: '', name: '', args: '' }; + if (tc.id) acc.id = tc.id; + if (tc.function?.name) acc.name = tc.function.name; + if (tc.function?.arguments) acc.args += tc.function.arguments; + calls.set(tc.index, acc); + } + + if (choice?.finish_reason) finish = choice.finish_reason; + if (chunk.usage) { + usage = { + inputTokens: chunk.usage.prompt_tokens ?? 0, + outputTokens: chunk.usage.completion_tokens ?? 0, + }; + } + } + } catch (err) { + if (controller.signal.aborted) throw this.timeoutError(); + throw err; + } + + for (const [index, c] of [...calls.entries()].sort((a, b) => a[0] - b[0])) { + let input: unknown = {}; + try { + input = c.args.trim() ? JSON.parse(c.args) : {}; + } catch { + // Small models occasionally emit malformed JSON; degrade gracefully. + input = {}; + } + yield { type: 'tool_call', id: c.id || `${this.name}-call-${index}`, name: c.name, input }; + } + + yield { + type: 'done', + usage, + stopReason: calls.size > 0 ? 'tool_use' : finish, + }; + } finally { + clearTimeout(timer!); + } + } + + /** Human-readable provider name used in error messages. */ + protected label(): string { + return this.name; + } + + /** Error raised when no usable response arrives before the idle timeout. */ + protected timeoutError(): Error { + return new Error( + `${this.label()} at ${this.baseUrl} went silent for ${Math.round(this.timeoutMs / 1000)}s and was aborted.`, + ); + } + + /** Error raised when the server can't be reached at all. */ + protected unreachableError(err: Error): Error { + return new Error(`Cannot reach ${this.label()} at ${this.baseUrl}. (${err.message})`); + } + + /** POST a chat-completions request body to the server. 
*/ + protected post(body: unknown, signal: AbortSignal): Promise { + return fetch(`${this.baseUrl}/chat/completions`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.apiKey}` }, + body: JSON.stringify(body), + signal, + }); + } +} + +/** Decode a single SSE line into a chunk, or `undefined` for non-data/keep-alive lines. */ +function parseSseLine(raw: string): StreamChunk | undefined { + const line = raw.trim(); + if (!line.startsWith('data:')) return undefined; + const payload = line.slice(5).trim(); + if (payload === '[DONE]' || payload.length === 0) return undefined; + try { + return JSON.parse(payload) as StreamChunk; + } catch { + // Ignore partial/non-JSON keep-alive lines. + return undefined; + } +} + +/** Parse an SSE byte stream into decoded JSON chunks, skipping the `[DONE]` sentinel. */ +export async function* parseSse(body: ReadableStream): AsyncIterable { + const decoder = new TextDecoder(); + let buffer = ''; + const reader = body.getReader(); + try { + for (;;) { + const { done, value } = await reader.read(); + if (done) break; + buffer += decoder.decode(value, { stream: true }); + let nl: number; + while ((nl = buffer.indexOf('\n')) !== -1) { + const chunk = parseSseLine(buffer.slice(0, nl)); + buffer = buffer.slice(nl + 1); + if (chunk) yield chunk; + } + } + // Emit a final line that arrived without a trailing newline (e.g. a closing + // usage frame); otherwise the last chunk's token counts would be dropped. + const tail = parseSseLine(buffer); + if (tail) yield tail; + } finally { + reader.releaseLock(); + } +} diff --git a/src/providers/qwen.ts b/src/providers/qwen.ts new file mode 100644 index 0000000..ed926de --- /dev/null +++ b/src/providers/qwen.ts @@ -0,0 +1,33 @@ +import { OpenAiCompatibleProvider, type OpenAiCompatibleOptions } from './openai-compatible.js'; + +/** Alibaba DashScope's OpenAI-compatible endpoint (hosts the Qwen models). 
*/ +export const DEFAULT_QWEN_URL = 'https://dashscope.aliyuncs.com/compatible-mode/v1'; + +export interface QwenProviderOptions extends Omit { + apiKey: string; + /** Override the API endpoint (defaults to {@link DEFAULT_QWEN_URL}). */ + baseUrl?: string | undefined; +} + +/** + * Alibaba's Qwen Coder models (e.g. qwen3-coder-plus) served via DashScope's + * OpenAI-compatible Chat Completions API. Differs from the local Ollama provider + * only in endpoint, required API key, and error wording. + */ +export class QwenProvider extends OpenAiCompatibleProvider { + readonly name = 'qwen' as const; + + constructor(opts: QwenProviderOptions) { + super({ ...opts, baseUrl: opts.baseUrl ?? DEFAULT_QWEN_URL }); + } + + protected override label(): string { + return 'Qwen'; + } + + protected override unreachableError(err: Error): Error { + return new Error( + `Cannot reach Qwen (DashScope) at ${this.baseUrl}. Check your network and QWEN_API_KEY. (${err.message})`, + ); + } +} diff --git a/src/providers/types.ts b/src/providers/types.ts index c18443e..1d137f3 100644 --- a/src/providers/types.ts +++ b/src/providers/types.ts @@ -34,7 +34,7 @@ export interface SendRequest { * {@link ProviderEvent}. */ export interface ModelProvider { - readonly name: 'anthropic' | 'gemini' | 'ollama'; + readonly name: 'anthropic' | 'gemini' | 'ollama' | 'deepseek' | 'qwen'; readonly model: string; send(req: SendRequest): AsyncIterable; } diff --git a/src/ui/render.ts b/src/ui/render.ts index ea87e44..a0ecb10 100644 --- a/src/ui/render.ts +++ b/src/ui/render.ts @@ -24,7 +24,12 @@ function fmtTokens(n: number): string { /** Paid (non-local) providers, where missing pricing means "unknown" not "free". 
*/ function isCloud(provider?: string): boolean { - return provider === 'anthropic' || provider === 'gemini'; + return ( + provider === 'anthropic' || + provider === 'gemini' || + provider === 'deepseek' || + provider === 'qwen' + ); } export interface SessionTotals { diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts index 2da1595..6ec7878 100644 --- a/tests/config/load.test.ts +++ b/tests/config/load.test.ts @@ -7,12 +7,17 @@ import { loadConfig } from '../../src/config/load.js'; const ENV_KEYS = [ 'ANTHROPIC_API_KEY', 'GEMINI_API_KEY', + 'DEEPSEEK_API_KEY', + 'QWEN_API_KEY', + 'DASHSCOPE_API_KEY', 'TINY_CODE_PROVIDER', 'TINY_CODE_MODEL', 'TINY_CODE_PRIORITY', 'TINY_CODE_MAX_TOKENS', 'TINY_CODE_EFFORT', 'TINY_CODE_OLLAMA_URL', + 'TINY_CODE_DEEPSEEK_URL', + 'TINY_CODE_QWEN_URL', 'TINY_CODE_IMPROVE', 'HOME', ]; @@ -161,6 +166,40 @@ describe('loadConfig', () => { expect(cfg.ollamaBaseUrl).toBe('http://gpu-box:11434/v1'); }); + it('infers deepseek when only DEEPSEEK_API_KEY is set, picking its flagship model', () => { + process.env.DEEPSEEK_API_KEY = 'sk-deep'; + const cfg = loadConfig({}, cwd); + expect(cfg.provider).toBe('deepseek'); + expect(cfg.model).toBe('deepseek-v4-pro'); + expect(cfg.deepseekApiKey).toBe('sk-deep'); + }); + + it('infers qwen from QWEN_API_KEY or DASHSCOPE_API_KEY', () => { + process.env.QWEN_API_KEY = 'sk-qwen'; + expect(loadConfig({}, cwd).provider).toBe('qwen'); + delete process.env.QWEN_API_KEY; + process.env.DASHSCOPE_API_KEY = 'sk-dash'; + const cfg = loadConfig({}, cwd); + expect(cfg.provider).toBe('qwen'); + expect(cfg.model).toBe('qwen3-coder-plus'); + expect(cfg.qwenApiKey).toBe('sk-dash'); + }); + + it('prefers anthropic over deepseek/qwen when several keys are present', () => { + process.env.ANTHROPIC_API_KEY = 'sk-a'; + process.env.DEEPSEEK_API_KEY = 'sk-d'; + process.env.QWEN_API_KEY = 'sk-q'; + expect(loadConfig({}, cwd).provider).toBe('anthropic'); + }); + + it('reads provider-specific base URL overrides', () 
=> { + process.env.TINY_CODE_DEEPSEEK_URL = 'https://proxy/deepseek/v1'; + process.env.TINY_CODE_QWEN_URL = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'; + const cfg = loadConfig({ provider: 'deepseek' }, cwd); + expect(cfg.deepseekBaseUrl).toBe('https://proxy/deepseek/v1'); + expect(cfg.qwenBaseUrl).toBe('https://dashscope-intl.aliyuncs.com/compatible-mode/v1'); + }); + it('defaults routing to local-first when an escalateTo target is configured', async () => { await writeFile( join(cwd, 'tiny-code.config.json'), diff --git a/tests/models/catalog.test.ts b/tests/models/catalog.test.ts index f5fdf7e..6cda7a5 100644 --- a/tests/models/catalog.test.ts +++ b/tests/models/catalog.test.ts @@ -58,6 +58,18 @@ describe('recommendModel', () => { expect(recommendModel({ provider: 'gemini', priority: 'cost' })?.id).toBe('gemini-2.5-flash'); }); + it('picks the flagship coder model for the DeepSeek and Qwen providers', () => { + expect(recommendModel({ provider: 'deepseek', priority: 'performance' })?.id).toBe( + 'deepseek-v4-pro', + ); + expect(recommendModel({ provider: 'qwen', priority: 'performance' })?.id).toBe( + 'qwen3-coder-plus', + ); + // Their cheaper variants win on cost. 
+ expect(recommendModel({ provider: 'deepseek', priority: 'cost' })?.id).toBe('deepseek-v4-flash'); + expect(recommendModel({ provider: 'qwen', priority: 'cost' })?.id).toBe('qwen3-coder-flash'); + }); + it('balanced trades cost against capability without dropping to the weakest', () => { expect(recommendModel({ provider: 'anthropic', priority: 'balanced' })?.id).toBe( 'claude-sonnet-4-6', diff --git a/tests/providers/openaiCloudSend.test.ts b/tests/providers/openaiCloudSend.test.ts new file mode 100644 index 0000000..3e53e02 --- /dev/null +++ b/tests/providers/openaiCloudSend.test.ts @@ -0,0 +1,81 @@ +import { describe, it, expect, vi, afterEach } from 'vitest'; +import { DeepSeekProvider } from '../../src/providers/deepseek.js'; +import { QwenProvider } from '../../src/providers/qwen.js'; +import type { ProviderEvent } from '../../src/providers/types.js'; + +/** Build a fake SSE Response body from a list of OpenAI-style chunks. */ +function sseResponse(chunks: unknown[]): Response { + const lines = chunks.map((c) => `data: ${JSON.stringify(c)}\n\n`).concat('data: [DONE]\n\n'); + const stream = new ReadableStream({ + start(controller) { + const enc = new TextEncoder(); + for (const line of lines) controller.enqueue(enc.encode(line)); + controller.close(); + }, + }); + return new Response(stream, { status: 200, headers: { 'Content-Type': 'text/event-stream' } }); +} + +afterEach(() => vi.restoreAllMocks()); + +async function collect(provider: DeepSeekProvider | QwenProvider): Promise { + const events: ProviderEvent[] = []; + for await (const e of provider.send({ + system: 's', + messages: [{ role: 'user', content: [{ type: 'text', text: 'go' }] }], + tools: [{ name: 'ls', description: 'list', jsonSchema: { type: 'object' } }], + })) { + events.push(e); + } + return events; +} + +describe('DeepSeekProvider.send', () => { + it('targets the DeepSeek endpoint with the API key and streams events', async () => { + const fetchMock = vi.spyOn(globalThis, 
'fetch').mockResolvedValue( + sseResponse([ + { choices: [{ delta: { content: 'hi' } }] }, + { choices: [], usage: { prompt_tokens: 5, completion_tokens: 2 } }, + ]), + ); + + const provider = new DeepSeekProvider({ apiKey: 'sk-deep', model: 'deepseek-v4-pro' }); + expect(provider.name).toBe('deepseek'); + + const events = await collect(provider); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe('https://api.deepseek.com/v1/chat/completions'); + expect((init as RequestInit).headers).toMatchObject({ Authorization: 'Bearer sk-deep' }); + + const text = events.filter((e) => e.type === 'text').map((e) => (e as { delta: string }).delta); + expect(text.join('')).toBe('hi'); + const done = events.find((e) => e.type === 'done'); + expect(done).toMatchObject({ usage: { inputTokens: 5, outputTokens: 2 } }); + }); + + it('reports a DeepSeek-specific error when the host is unreachable', async () => { + vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ENOTFOUND')); + const provider = new DeepSeekProvider({ apiKey: 'k', model: 'deepseek-v4-pro' }); + await expect(collect(provider)).rejects.toThrow(/Cannot reach DeepSeek/); + }); +}); + +describe('QwenProvider.send', () => { + it('targets the DashScope endpoint and respects a base URL override', async () => { + const fetchMock = vi + .spyOn(globalThis, 'fetch') + .mockResolvedValue(sseResponse([{ choices: [{ delta: { content: 'ok' } }] }])); + + const provider = new QwenProvider({ + apiKey: 'sk-qwen', + model: 'qwen3-coder-plus', + baseUrl: 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1', + }); + expect(provider.name).toBe('qwen'); + + await collect(provider); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe('https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions'); + expect((init as RequestInit).headers).toMatchObject({ Authorization: 'Bearer sk-qwen' }); + }); +}); From 118faa080d2c984d70e63ceba438bd0f08a3a7ef Mon Sep 17 00:00:00 2001 From: Claude Date: 
Wed, 10 Jun 2026 13:14:39 +0000 Subject: [PATCH 2/4] Default model auto-selection to balanced priority Flip the default priority from performance to balanced so the auto-picked model is cost-aware out of the box (best capability-per-dollar behind a quality floor) rather than most-capable-at-any-price. Update config/catalog tests, CLI help, README, AGENTS.md, and .env.example to match. https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp --- .changeset/balanced-default-priority.md | 14 ++++++++++++++ .env.example | 2 +- AGENTS.md | 8 +++++--- README.md | 19 ++++++++++++------- src/cli.ts | 2 +- src/config/load.ts | 2 +- src/models/catalog.ts | 3 ++- tests/config/load.test.ts | 24 ++++++++++++++++++------ 8 files changed, 54 insertions(+), 20 deletions(-) create mode 100644 .changeset/balanced-default-priority.md diff --git a/.changeset/balanced-default-priority.md b/.changeset/balanced-default-priority.md new file mode 100644 index 0000000..b5ba201 --- /dev/null +++ b/.changeset/balanced-default-priority.md @@ -0,0 +1,14 @@ +--- +"@therr/tiny-code": minor +--- + +Default model selection to `balanced` priority. + +When no `model` is pinned, tiny-code now defaults to `priority: "balanced"` +instead of `performance`, picking the best capability-per-dollar model +(`codingScore / blendedCostPerMTok`, behind a quality floor) rather than the +most capable regardless of price. In line with the project's token-minimalism +goal, this makes the out-of-the-box pick cost-aware — e.g. Claude Sonnet rather +than Opus for Anthropic. Set `priority: "performance"` (or +`TINY_CODE_PRIORITY=performance`) to restore the previous most-capable defaults; +pinning a `model` still overrides everything. 
diff --git a/.env.example b/.env.example index bc84fe7..90c3a29 100644 --- a/.env.example +++ b/.env.example @@ -12,7 +12,7 @@ QWEN_API_KEY= # Alibaba DashScope key (DASHSCOPE_API_KEY also accepted) # TINY_CODE_OLLAMA_URL=http://localhost:11434/v1 # Ollama OpenAI-compatible endpoint # TINY_CODE_DEEPSEEK_URL=https://api.deepseek.com/v1 # TINY_CODE_QWEN_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 -# TINY_CODE_PRIORITY=performance # performance | cost | balanced — auto-picks a model when none is pinned +# TINY_CODE_PRIORITY=balanced # performance | cost | balanced (default) — auto-picks a model when none is pinned # TINY_CODE_EFFORT=high # low | medium | high | xhigh | max — Anthropic thinking budget # Self-improvement: reflect on sessions and propose markdown-only improvement PRs. diff --git a/AGENTS.md b/AGENTS.md index 7c267fd..4180d00 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -29,9 +29,11 @@ runaway costs. - Keep it current: when adding/repricing a model, update its entry **and** `CATALOG_AS_OF`. Anthropic pricing comes from the bundled claude-api reference; verify Gemini pricing against Google's published rates. Don't guess prices. -- `priority` defaults to `performance`, which preserves the historical default - models (Opus for Anthropic, Gemini 2.5 Pro for Gemini). Don't change the - default without updating the config tests that assert those ids. +- `priority` defaults to `balanced` (best capability-per-dollar behind a quality + floor), so the auto-picked model is cost-aware by default — e.g. Sonnet rather + than Opus for Anthropic. `performance` restores the historical most-capable + picks. Don't change the default without updating the config/catalog tests that + assert those ids. ## Boundaries - No business logic. This is a general-purpose tool. diff --git a/README.md b/README.md index d55957b..b1548a7 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ CLI flags. 
"provider": "anthropic", "model": "claude-opus-4-8", "ollamaBaseUrl": "http://localhost:11434/v1", - "priority": "performance", + "priority": "balanced", "maxTokens": 16000, "thinking": true, "effort": "high", @@ -213,14 +213,19 @@ money and to pick a model that fits your cost/performance preference. - **Priority-driven selection.** When you don't pin a `model`, tiny-code picks one for you based on `priority`: - | `priority` | Picks | - | --------------- | ----------------------------------------------------------- | - | `performance` | The most capable model (the default — current behavior). | - | `cost` | The cheapest still-capable model. | - | `balanced` | The best capability-per-dollar among capable models. | + | `priority` | Picks | + | --------------- | --------------------------------------------------------------- | + | `balanced` | The best capability-per-dollar among capable models (default). | + | `performance` | The most capable model, ignoring price. | + | `cost` | The cheapest still-capable model. | + + `balanced` is the default: it ranks capable models by + `codingScore / blendedCostPerMTok` (a model's coding aptitude per blended + dollar, weighting input 80% / output 20%) behind a quality floor, so you get + strong-but-sensibly-priced models without opting in. ```json - { "priority": "balanced" } + { "priority": "performance" } ``` Or per-session with `TINY_CODE_PRIORITY=cost`. 
Pinning `model` (config, env, diff --git a/src/cli.ts b/src/cli.ts index 68ce55d..e769d1a 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -26,7 +26,7 @@ Environment: QWEN_API_KEY Required for the Qwen provider (or DASHSCOPE_API_KEY) TINY_CODE_OLLAMA_URL Ollama OpenAI-compatible base URL (default http://localhost:11434/v1) TINY_CODE_PRIORITY performance | cost | balanced — auto-picks a model when - none is pinned (default: performance) + none is pinned (default: balanced) Cost-saving: set "routing": "local-first" with an "escalateTo" target in your config to run cheap/local models by default and escalate heavy tasks. Run /costs diff --git a/src/config/load.ts b/src/config/load.ts index 9c84243..3adaf90 100644 --- a/src/config/load.ts +++ b/src/config/load.ts @@ -162,7 +162,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c : 'anthropic'); const priority: Priority = - (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'performance'; + (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'balanced'; // When the user pins a model, honor it. Otherwise let the catalog pick the // best fit for the cost/performance priority, falling back to a static diff --git a/src/models/catalog.ts b/src/models/catalog.ts index 428a1bd..57d6c70 100644 --- a/src/models/catalog.ts +++ b/src/models/catalog.ts @@ -5,7 +5,8 @@ import type { Usage } from '../providers/types.js'; * How to weigh cost vs. capability when auto-selecting a model. 
* - `performance`: most capable model (maximize quality, ignore price) * - `cost`: cheapest capable model (maximize savings) - * - `balanced`: best capability-per-dollar among genuinely capable models + * - `balanced` (default): best capability-per-dollar among genuinely capable + * models — `codingScore / blendedCostPerMTok`, gated by a quality floor */ export type Priority = 'performance' | 'cost' | 'balanced'; diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts index 6ec7878..158c26b 100644 --- a/tests/config/load.test.ts +++ b/tests/config/load.test.ts @@ -45,11 +45,12 @@ afterEach(async () => { }); describe('loadConfig', () => { - it('infers anthropic when only ANTHROPIC_API_KEY is set', () => { + it('infers anthropic when only ANTHROPIC_API_KEY is set, picking the balanced model', () => { process.env.ANTHROPIC_API_KEY = 'sk-test'; const cfg = loadConfig({}, cwd); expect(cfg.provider).toBe('anthropic'); - expect(cfg.model).toBe('claude-opus-4-8'); + // Balanced is the default priority, so it favors capability-per-dollar (Sonnet) over Opus. 
+ expect(cfg.model).toBe('claude-sonnet-4-6'); expect(cfg.anthropicApiKey).toBe('sk-test'); }); @@ -112,8 +113,16 @@ describe('loadConfig', () => { expect(cfg.improve.onSessionEnd).toBe(false); }); - it('defaults to performance priority and the most capable model', () => { + it('defaults to balanced priority and the best capability-per-dollar model', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + const cfg = loadConfig({}, cwd); + expect(cfg.priority).toBe('balanced'); + expect(cfg.model).toBe('claude-sonnet-4-6'); + }); + + it('opts into the most capable model with performance priority', () => { process.env.ANTHROPIC_API_KEY = 'sk-test'; + process.env.TINY_CODE_PRIORITY = 'performance'; const cfg = loadConfig({}, cwd); expect(cfg.priority).toBe('performance'); expect(cfg.model).toBe('claude-opus-4-8'); @@ -130,7 +139,7 @@ describe('loadConfig', () => { it('lets a pinned model win over the priority recommendation', () => { process.env.ANTHROPIC_API_KEY = 'sk-test'; const cfg = loadConfig({ model: 'claude-opus-4-8' }, cwd); - expect(cfg.priority).toBe('performance'); + expect(cfg.priority).toBe('balanced'); expect(cfg.model).toBe('claude-opus-4-8'); }); @@ -166,12 +175,15 @@ describe('loadConfig', () => { expect(cfg.ollamaBaseUrl).toBe('http://gpu-box:11434/v1'); }); - it('infers deepseek when only DEEPSEEK_API_KEY is set, picking its flagship model', () => { + it('infers deepseek when only DEEPSEEK_API_KEY is set', () => { process.env.DEEPSEEK_API_KEY = 'sk-deep'; const cfg = loadConfig({}, cwd); expect(cfg.provider).toBe('deepseek'); - expect(cfg.model).toBe('deepseek-v4-pro'); + // Balanced default favors the cheaper flash; performance pins the pro flagship. 
+ expect(cfg.model).toBe('deepseek-v4-flash'); expect(cfg.deepseekApiKey).toBe('sk-deep'); + process.env.TINY_CODE_PRIORITY = 'performance'; + expect(loadConfig({}, cwd).model).toBe('deepseek-v4-pro'); }); it('infers qwen from QWEN_API_KEY or DASHSCOPE_API_KEY', () => { From f5c383272fcf926a53f082253f33308ff3ef5e5f Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Jun 2026 13:24:21 +0000 Subject: [PATCH 3/4] Add /priority command and capture provider-switching TODO Add a /priority slash command to view and switch the cost/performance priority mid-session, re-picking the auto-selected model (unless pinned or governed by local-first routing) via a new AgentLoop.setProvider and a modelPinned config flag. Record on-the-fly provider switching as a follow-up in TODO.md. https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp --- .changeset/priority-command.md | 13 +++++++ README.md | 7 ++-- TODO.md | 14 ++++++++ src/agent/loop.ts | 12 ++++++- src/config/load.ts | 4 +++ src/repl.ts | 63 ++++++++++++++++++++++++++++++++-- tests/agent/loop.test.ts | 16 +++++++++ tests/config/load.test.ts | 6 ++++ 8 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 .changeset/priority-command.md diff --git a/.changeset/priority-command.md b/.changeset/priority-command.md new file mode 100644 index 0000000..a9b8b6e --- /dev/null +++ b/.changeset/priority-command.md @@ -0,0 +1,13 @@ +--- +"@therr/tiny-code": minor +--- + +Add a `/priority` command to switch cost/performance bias mid-session. + +`/priority` (no args) shows the current priority and the active model; +`/priority performance | balanced | cost` switches it and re-picks the +auto-selected model on the fly — e.g. jump to the most capable model when a task +gets hard, then drop back to `balanced`. Pinned models and local-first routing +keep governing the model themselves, so there the command just records the new +priority. 
Backed by a new `AgentLoop.setProvider` for swapping the active +provider mid-session, and a `modelPinned` flag on the resolved config. diff --git a/README.md b/README.md index b1548a7..b6144c3 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ shell commands) prompt for approval unless pre-approved in config. - `/costs` — session token usage, estimated $ cost, and cost-saving tips - `/clear` — clear the conversation history and start fresh - `/models` — show known models, pricing, and the active one (see below) +- `/priority [performance|balanced|cost]` — show or switch the cost/performance priority mid-session; re-picks the auto-selected model unless one is pinned (see below) - `/improve` — reflect on the session and propose an improvement PR (see below) - `/ [args]` — run a custom command (see below) - `/exit` — quit @@ -228,8 +229,10 @@ money and to pick a model that fits your cost/performance preference. { "priority": "performance" } ``` - Or per-session with `TINY_CODE_PRIORITY=cost`. Pinning `model` (config, env, - or `--model`) always overrides the recommendation. + Or per-session with `TINY_CODE_PRIORITY=cost`, or on the fly with the + `/priority` command (e.g. `/priority performance` to jump to the most capable + model when a task gets hard, then `/priority balanced` to drop back). Pinning + `model` (config, env, or `--model`) always overrides the recommendation. The catalog is curated and offline (tiny-code has no live model-discovery yet — see `TODO.md`), so its prices carry an "as of" date; keep it current as vendors diff --git a/TODO.md b/TODO.md index 10484b0..455868c 100644 --- a/TODO.md +++ b/TODO.md @@ -17,6 +17,20 @@ a single condensed block. For Anthropic use the compaction beta; for Gemini summarize via a lightweight call to a cheap model. Pair with conversation persistence so compacted sessions can be resumed. 
+## On-the-fly provider switching +The `/priority` command already swaps the active *model* within the current +provider mid-session (`AgentLoop.setProvider`). Extend this to switch the +*provider* too, so a session can move between Anthropic, Gemini, DeepSeek, Qwen, +and Ollama without restarting. **Approach:** a `/provider <name> [model]` +command that validates the target's API key (reuse `createProvider`'s checks), +re-resolves the model (honoring `priority` and any pin), rebuilds the provider, +and calls `agent.setProvider`. Decide how it interacts with local-first routing +(switching the primary vs. the `escalateTo` target) and keep `/costs` accurate +across providers — usage is already priced per-turn from the active model, so the +running total stays correct; just refresh the session-end summary's model. +Consider a single `/model <id>` shortcut that infers the provider from the +catalog entry. + ## Sub-agents Spawn isolated agent runs for parallel exploration/research (like a lightweight Explore/Plan agent). **Approach:** a `spawn_agent` tool whose `execute` constructs diff --git a/src/agent/loop.ts b/src/agent/loop.ts index d57570b..9c683ba 100644 --- a/src/agent/loop.ts +++ b/src/agent/loop.ts @@ -51,7 +51,7 @@ export interface AgentLoopOptions { * iteration guard trips). Conversation state persists across `run` calls. */ export class AgentLoop { - private readonly provider: ModelProvider; + private provider: ModelProvider; private readonly registry: ToolRegistry; private readonly gate: PermissionGate; private readonly system: string; @@ -86,6 +86,16 @@ export class AgentLoop { return this.messages; } + /** + * Swap the base provider mid-session — e.g. when the user changes the active + * model via `/priority`. Only affects un-escalated turns; if the session has + * stuck to an escalated frontier provider, that takes precedence until + * `clearHistory()` resets routing. 
+ */ + setProvider(provider: ModelProvider): void { + this.provider = provider; + } + /** Drop the conversation history so the next turn starts fresh. Cumulative * token usage is preserved, since it reflects the whole session's cost. * Also clears sticky escalation: a fresh conversation re-routes from scratch. */ diff --git a/src/config/load.ts b/src/config/load.ts index 3adaf90..19e45d5 100644 --- a/src/config/load.ts +++ b/src/config/load.ts @@ -30,6 +30,9 @@ export interface AllowRules { export interface ResolvedConfig { provider: Provider; model: string; + /** True when `model` was explicitly pinned (CLI/env/config), so changing + * `priority` shouldn't re-pick it. */ + modelPinned: boolean; /** Cost/performance bias used to auto-pick a model when none is pinned. */ priority: Priority; anthropicApiKey: string | undefined; @@ -195,6 +198,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c return { provider, model, + modelPinned: pinnedModel !== undefined, priority, anthropicApiKey, geminiApiKey, diff --git a/src/repl.ts b/src/repl.ts index e9409f2..bd887b5 100644 --- a/src/repl.ts +++ b/src/repl.ts @@ -13,7 +13,7 @@ import { LocalFirstModelEngine } from './agent/decision/index.js'; import type { ModelDecisionEngine } from './agent/decision/index.js'; import { checkLocalModel } from './system/resources.js'; import { loadConfig } from './config/load.js'; -import type { CliOverrides, ResolvedConfig } from './config/load.js'; +import type { CliOverrides, ResolvedConfig, Priority } from './config/load.js'; import { loadProjectContext } from './config/context.js'; import { buildSystemPrompt } from './agent/systemPrompt.js'; import { loadCommands, renderCommand } from './commands/loader.js'; @@ -26,6 +26,7 @@ import { estimateCostUsd, formatUsd, blendedCostPerMTok, + recommendModel, } from './models/catalog.js'; import type { Usage } from './providers/types.js'; import { getUpdateNotice, maybeRefreshUpdateCache, formatUpdateNotice } 
from './system/updateCheck.js'; @@ -61,6 +62,7 @@ function printHelp(commands: Map): void { console.log(' /costs Show token usage, est. cost, and cost-saving tips'); console.log(' /clear Clear the conversation history and start fresh'); console.log(' /models Show known models, pricing, and the active one'); + console.log(' /priority Show or switch the cost/performance priority (e.g. /priority performance)'); console.log(' /improve Reflect on this session and propose an improvement PR'); console.log(' /exit, /quit Leave the session'); if (commands.size > 0) { @@ -154,7 +156,7 @@ export async function startRepl(overrides: CliOverrides): Promise { }); const gate = new PermissionGate(config.allow, prompt); - const modelInfo = getModelInfo(config.model); + let modelInfo = getModelInfo(config.model); const ui = createTerminalUI({ model: provider.model, provider: provider.name }); const agent = new AgentLoop({ provider, @@ -247,6 +249,51 @@ export async function startRepl(overrides: CliOverrides): Promise { } console.log(pc.dim('Type a request, /help for commands, /costs for usage, /exit to quit.')); + const PRIORITIES: Priority[] = ['performance', 'balanced', 'cost']; + + const printPriority = (): void => { + console.log(pc.bold('\nPriority: ') + config.priority + pc.dim(` (active model: ${config.model})`)); + console.log(pc.dim(' performance most capable model, ignoring price')); + console.log(pc.dim(' balanced best capability-per-dollar (default)')); + console.log(pc.dim(' cost cheapest still-capable model')); + console.log(pc.dim('Switch with: /priority performance | balanced | cost')); + }; + + // Change the auto-selection priority mid-session and re-pick the model when + // appropriate. Pinned models and local-first routing govern the model + // themselves, so there we just record the new priority. 
+ const setPriority = (priority: Priority): void => { + if (priority === config.priority) { + console.log(pc.dim(`Priority already ${priority}.`)); + return; + } + config.priority = priority; + + if (config.modelPinned) { + console.log(pc.dim(`Priority → ${priority}. Model ${config.model} is pinned, so it stays.`)); + return; + } + if (localFirst) { + console.log( + pc.dim(`Priority → ${priority}. Local-first routing picks the model; this applies if routing is off.`), + ); + return; + } + const picked = recommendModel({ provider: config.provider, priority }); + if (!picked || picked.id === config.model) { + console.log(pc.dim(`Priority → ${priority}. Model unchanged (${config.model}).`)); + return; + } + const prevModel = config.model; + config.model = picked.id; + modelInfo = getModelInfo(picked.id); + agent.setProvider(createProvider(config)); + console.log( + pc.cyan(`Priority → ${priority}.`) + + pc.dim(` Model ${prevModel} → ${picked.id} ($${picked.inputPricePerMTok}/$${picked.outputPricePerMTok} per 1M in/out).`), + ); + }; + const handle = async (line: string): Promise => { const input = line.trim(); if (input.length === 0) { @@ -279,6 +326,18 @@ export async function startRepl(overrides: CliOverrides): Promise { ask(); return; } + if (input === '/priority' || input.startsWith('/priority ')) { + const arg = input.slice('/priority'.length).trim().toLowerCase(); + if (arg.length === 0) { + printPriority(); + } else if ((PRIORITIES as string[]).includes(arg)) { + setPriority(arg as Priority); + } else { + console.log(pc.red(`Unknown priority: ${arg} (use performance, balanced, or cost)`)); + } + ask(); + return; + } if (input === '/improve') { if (config.improve.enabled) { await improve(); diff --git a/tests/agent/loop.test.ts b/tests/agent/loop.test.ts index cec1e20..e54c910 100644 --- a/tests/agent/loop.test.ts +++ b/tests/agent/loop.test.ts @@ -367,6 +367,22 @@ describe('AgentLoop', () => { expect(loop.getUsage()).toEqual({ inputTokens: 30, outputTokens: 13 
}); }); + it('routes later turns to a provider swapped in via setProvider', async () => { + const first = new ScriptedProvider([[{ type: 'text', delta: 'a' }, DONE]], 'model-a'); + const second = new ScriptedProvider([[{ type: 'text', delta: 'b' }, DONE]], 'model-b'); + const { ui } = recordingUI(); + const loop = makeLoop(first, ui, gateWith('yes')); + + await loop.run('one'); + expect(first.sent).toHaveLength(1); + + loop.setProvider(second); + await loop.run('two'); + // The swapped-in provider handles the new turn; the old one is untouched. + expect(first.sent).toHaveLength(1); + expect(second.sent).toHaveLength(1); + }); + it('stops at the iteration guard when tools never stop', async () => { const looping: ProviderEvent[][] = []; for (let i = 0; i < 10; i += 1) { diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts index 158c26b..e72196a 100644 --- a/tests/config/load.test.ts +++ b/tests/config/load.test.ts @@ -120,6 +120,12 @@ describe('loadConfig', () => { expect(cfg.model).toBe('claude-sonnet-4-6'); }); + it('flags whether the model was pinned', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + expect(loadConfig({}, cwd).modelPinned).toBe(false); + expect(loadConfig({ model: 'claude-opus-4-8' }, cwd).modelPinned).toBe(true); + }); + it('opts into the most capable model with performance priority', () => { process.env.ANTHROPIC_API_KEY = 'sk-test'; process.env.TINY_CODE_PRIORITY = 'performance'; From fc47dc3e6cc195482cc3aae1cca7d448a3bef833 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Jun 2026 13:39:01 +0000 Subject: [PATCH 4/4] Fix review findings and prepare 0.3.0 release - Price the session-end summary from the per-turn accumulated cost (matches /costs) instead of repricing all tokens at the final model's rate, which was wrong after a mid-session /priority model switch. 
- Validate TINY_CODE_PROVIDER / TINY_CODE_PRIORITY env values; an unrecognized value is now ignored with a warning instead of being cast through and silently mis-picking a model. - Add coverage for the provider-scoped synthetic tool-call id fallback. - Run changeset version: bump to 0.3.0 and generate CHANGELOG. https://claude.ai/code/session_01GHiv4kP53a96EWsFEAAqWp --- .changeset/balanced-default-priority.md | 14 ------ .changeset/local-models-cost-routing.md | 26 ---------- .changeset/priority-command.md | 13 ----- .changeset/qwen-deepseek-coder.md | 19 -------- CHANGELOG.md | 63 +++++++++++++++++++++++++ package.json | 2 +- src/config/load.ts | 25 +++++++++- src/repl.ts | 9 ++-- tests/config/load.test.ts | 16 +++++++ tests/providers/openaiCloudSend.test.ts | 17 +++++++ 10 files changed, 126 insertions(+), 78 deletions(-) delete mode 100644 .changeset/balanced-default-priority.md delete mode 100644 .changeset/local-models-cost-routing.md delete mode 100644 .changeset/priority-command.md delete mode 100644 .changeset/qwen-deepseek-coder.md create mode 100644 CHANGELOG.md diff --git a/.changeset/balanced-default-priority.md b/.changeset/balanced-default-priority.md deleted file mode 100644 index b5ba201..0000000 --- a/.changeset/balanced-default-priority.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -"@therr/tiny-code": minor ---- - -Default model selection to `balanced` priority. - -When no `model` is pinned, tiny-code now defaults to `priority: "balanced"` -instead of `performance`, picking the best capability-per-dollar model -(`codingScore / blendedCostPerMTok`, behind a quality floor) rather than the -most capable regardless of price. In line with the project's token-minimalism -goal, this makes the out-of-the-box pick cost-aware — e.g. Claude Sonnet rather -than Opus for Anthropic. Set `priority: "performance"` (or -`TINY_CODE_PRIORITY=performance`) to restore the previous most-capable defaults; -pinning a `model` still overrides everything. 
diff --git a/.changeset/local-models-cost-routing.md b/.changeset/local-models-cost-routing.md deleted file mode 100644 index 240da17..0000000 --- a/.changeset/local-models-cost-routing.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -"@therr/tiny-code": minor ---- - -Add local models and cost-aware, local-first routing. - -- **Local (Ollama) provider.** Talk to a local Ollama server over its - OpenAI-compatible API (`--provider ollama`), with an idle timeout so a hung - model can't freeze the REPL, best-effort token-usage reporting, and configurable - `maxTokens`. -- **Local-first routing.** Set `routing: "local-first"` with an `escalateTo` - target to run a cheap/local model by default and escalate heavy turns (or a - stuck local model, via the new `escalate` tool) to a frontier model — with full - conversation context preserved. Escalation is sticky across follow-up turns. -- **Model-selection policy** is now owned by a pluggable `ModelDecisionEngine` - (`LocalFirstModelEngine`), keeping the agent loop pure mechanism. -- **Compute awareness.** On startup with a local model, tiny-code estimates RAM - need vs. machine capacity and warns when a model likely won't fit or is too - small (≤3B) to tool-call reliably; an over-RAM local model is routed to the - frontier up front. -- **Priority-driven model selection.** `priority` (`performance` / `cost` / - `balanced`, or `TINY_CODE_PRIORITY`) auto-picks a catalog model when none is - pinned. -- The `/costs` view reports session usage, estimated spend, and routing, and the - usage line distinguishes an unpriced *cloud* turn ("cost unknown") from a - *local* turn ("no API cost"). diff --git a/.changeset/priority-command.md b/.changeset/priority-command.md deleted file mode 100644 index a9b8b6e..0000000 --- a/.changeset/priority-command.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -"@therr/tiny-code": minor ---- - -Add a `/priority` command to switch cost/performance bias mid-session. 
- -`/priority` (no args) shows the current priority and the active model; -`/priority performance | balanced | cost` switches it and re-picks the -auto-selected model on the fly — e.g. jump to the most capable model when a task -gets hard, then drop back to `balanced`. Pinned models and local-first routing -keep governing the model themselves, so there the command just records the new -priority. Backed by a new `AgentLoop.setProvider` for swapping the active -provider mid-session, and a `modelPinned` flag on the resolved config. diff --git a/.changeset/qwen-deepseek-coder.md b/.changeset/qwen-deepseek-coder.md deleted file mode 100644 index 53d5185..0000000 --- a/.changeset/qwen-deepseek-coder.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -"@therr/tiny-code": minor ---- - -Add DeepSeek and Qwen Coder model support. - -- **DeepSeek and Qwen providers.** Two new hosted, OpenAI-compatible providers - (`--provider deepseek` / `--provider qwen`), keyed by `DEEPSEEK_API_KEY` and - `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`). Endpoints are overridable via - `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` or `deepseekBaseUrl` / - `qwenBaseUrl` in config — e.g. to target the international DashScope host. -- **Shared OpenAI-compatible core.** The streaming/tool-call adapter that backed - the Ollama provider is now a reusable `OpenAiCompatibleProvider` base; Ollama, - DeepSeek, and Qwen all extend it, differing only in endpoint, auth, and error - wording. -- **Catalog entries** for `deepseek-v4-pro`, `deepseek-v4-flash`, - `qwen3-coder-plus`, and `qwen3-coder-flash`, so `/costs` estimates and - priority-based model selection work for the new providers. `/costs` treats both - as paid cloud providers. diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1e1383a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,63 @@ +# @therr/tiny-code + +## 0.3.0 + +### Minor Changes + +- 118faa0: Default model selection to `balanced` priority. 
+ + When no `model` is pinned, tiny-code now defaults to `priority: "balanced"` + instead of `performance`, picking the best capability-per-dollar model + (`codingScore / blendedCostPerMTok`, behind a quality floor) rather than the + most capable regardless of price. In line with the project's token-minimalism + goal, this makes the out-of-the-box pick cost-aware — e.g. Claude Sonnet rather + than Opus for Anthropic. Set `priority: "performance"` (or + `TINY_CODE_PRIORITY=performance`) to restore the previous most-capable defaults; + pinning a `model` still overrides everything. + +- 785b832: Add local models and cost-aware, local-first routing. + - **Local (Ollama) provider.** Talk to a local Ollama server over its + OpenAI-compatible API (`--provider ollama`), with an idle timeout so a hung + model can't freeze the REPL, best-effort token-usage reporting, and configurable + `maxTokens`. + - **Local-first routing.** Set `routing: "local-first"` with an `escalateTo` + target to run a cheap/local model by default and escalate heavy turns (or a + stuck local model, via the new `escalate` tool) to a frontier model — with full + conversation context preserved. Escalation is sticky across follow-up turns. + - **Model-selection policy** is now owned by a pluggable `ModelDecisionEngine` + (`LocalFirstModelEngine`), keeping the agent loop pure mechanism. + - **Compute awareness.** On startup with a local model, tiny-code estimates RAM + need vs. machine capacity and warns when a model likely won't fit or is too + small (≤3B) to tool-call reliably; an over-RAM local model is routed to the + frontier up front. + - **Priority-driven model selection.** `priority` (`performance` / `cost` / + `balanced`, or `TINY_CODE_PRIORITY`) auto-picks a catalog model when none is + pinned. + - The `/costs` view reports session usage, estimated spend, and routing, and the + usage line distinguishes an unpriced _cloud_ turn ("cost unknown") from a + _local_ turn ("no API cost"). 
+ +- f5c3832: Add a `/priority` command to switch cost/performance bias mid-session. + + `/priority` (no args) shows the current priority and the active model; + `/priority performance | balanced | cost` switches it and re-picks the + auto-selected model on the fly — e.g. jump to the most capable model when a task + gets hard, then drop back to `balanced`. Pinned models and local-first routing + keep governing the model themselves, so there the command just records the new + priority. Backed by a new `AgentLoop.setProvider` for swapping the active + provider mid-session, and a `modelPinned` flag on the resolved config. + +- 52b179d: Add DeepSeek and Qwen Coder model support. + - **DeepSeek and Qwen providers.** Two new hosted, OpenAI-compatible providers + (`--provider deepseek` / `--provider qwen`), keyed by `DEEPSEEK_API_KEY` and + `QWEN_API_KEY` (or `DASHSCOPE_API_KEY`). Endpoints are overridable via + `TINY_CODE_DEEPSEEK_URL` / `TINY_CODE_QWEN_URL` or `deepseekBaseUrl` / + `qwenBaseUrl` in config — e.g. to target the international DashScope host. + - **Shared OpenAI-compatible core.** The streaming/tool-call adapter that backed + the Ollama provider is now a reusable `OpenAiCompatibleProvider` base; Ollama, + DeepSeek, and Qwen all extend it, differing only in endpoint, auth, and error + wording. + - **Catalog entries** for `deepseek-v4-pro`, `deepseek-v4-flash`, + `qwen3-coder-plus`, and `qwen3-coder-flash`, so `/costs` estimates and + priority-based model selection work for the new providers. `/costs` treats both + as paid cloud providers. 
diff --git a/package.json b/package.json index 1a2d61c..07004f4 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@therr/tiny-code", - "version": "0.2.3", + "version": "0.3.0", "description": "A small, extensible CLI coding agent with interchangeable Anthropic and Gemini models.", "type": "module", "bin": { diff --git a/src/config/load.ts b/src/config/load.ts index 19e45d5..f10514d 100644 --- a/src/config/load.ts +++ b/src/config/load.ts @@ -85,6 +85,27 @@ const DEFAULT_MODELS: Record = { const DEFAULT_OLLAMA_URL = 'http://localhost:11434/v1'; const PROVIDERS = ['anthropic', 'gemini', 'ollama', 'deepseek', 'qwen'] as const; +const PRIORITIES = ['performance', 'cost', 'balanced'] as const; + +/** + * Read an env var constrained to a known set. An unrecognized value is ignored + * (with a warning) rather than cast through blindly: an unchecked cast lets a + * typo like `TINY_CODE_PRIORITY=performant` fall through `recommendModel` and + * silently pick an unintended model. Returns `undefined` so resolution falls + * back to the next source in precedence. + */ +function readEnvEnum<T extends string>( + name: string, + value: string | undefined, + allowed: readonly T[], +): T | undefined { + if (value === undefined || value === '') return undefined; + if ((allowed as readonly string[]).includes(value)) return value as T; + process.stderr.write( + `tiny-code: ignoring ${name}="${value}" — expected one of: ${allowed.join(', ')}\n`, + ); + return undefined; +} const EscalateTargetSchema = z.object({ provider: z.enum(PROVIDERS), @@ -152,7 +173,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c const provider: Provider = overrides.provider ?? - (env.TINY_CODE_PROVIDER as Provider | undefined) ?? + readEnvEnum('TINY_CODE_PROVIDER', env.TINY_CODE_PROVIDER, PROVIDERS) ?? file.provider ?? (anthropicApiKey ?
'anthropic' @@ -165,7 +186,7 @@ export function loadConfig(overrides: CliOverrides = {}, cwd: string = process.c : 'anthropic'); const priority: Priority = - (env.TINY_CODE_PRIORITY as Priority | undefined) ?? file.priority ?? 'balanced'; + readEnvEnum('TINY_CODE_PRIORITY', env.TINY_CODE_PRIORITY, PRIORITIES) ?? file.priority ?? 'balanced'; // When the user pins a model, honor it. Otherwise let the catalog pick the // best fit for the cost/performance priority, falling back to a static diff --git a/src/repl.ts b/src/repl.ts index bd887b5..1bb7a00 100644 --- a/src/repl.ts +++ b/src/repl.ts @@ -156,7 +156,7 @@ export async function startRepl(overrides: CliOverrides): Promise { }); const gate = new PermissionGate(config.allow, prompt); - let modelInfo = getModelInfo(config.model); + const modelInfo = getModelInfo(config.model); const ui = createTerminalUI({ model: provider.model, provider: provider.name }); const agent = new AgentLoop({ provider, @@ -286,7 +286,6 @@ export async function startRepl(overrides: CliOverrides): Promise { } const prevModel = config.model; config.model = picked.id; - modelInfo = getModelInfo(picked.id); agent.setProvider(createProvider(config)); console.log( pc.cyan(`Priority → ${priority}.`) + @@ -378,7 +377,11 @@ export async function startRepl(overrides: CliOverrides): Promise { const usage = agent.getUsage(); if (usage.inputTokens > 0 || usage.outputTokens > 0) { const fmtN = (n: number) => n.toLocaleString('en-US'); - const cost = modelInfo ? ` ≈ ${formatUsd(estimateCostUsd(usage, modelInfo))}` : ''; + // Use the per-turn accumulated cost (matches /costs) rather than repricing + // the whole session at one model's rate — the active model can change + // mid-session via /priority, so a single-rate estimate would be wrong. + const sessionCost = ui.getTotals().cost; + const cost = sessionCost > 0 ? 
` ≈ ${formatUsd(sessionCost)}` : ''; console.log( pc.dim( `\nSession: ↑ ${fmtN(usage.inputTokens)} ↓ ${fmtN(usage.outputTokens)} tokens total${cost}`, diff --git a/tests/config/load.test.ts b/tests/config/load.test.ts index e72196a..056c515 100644 --- a/tests/config/load.test.ts +++ b/tests/config/load.test.ts @@ -126,6 +126,22 @@ describe('loadConfig', () => { expect(loadConfig({ model: 'claude-opus-4-8' }, cwd).modelPinned).toBe(true); }); + it('ignores an invalid TINY_CODE_PRIORITY instead of silently mis-picking a model', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + process.env.TINY_CODE_PRIORITY = 'performant'; // typo + const cfg = loadConfig({}, cwd); + // Falls back to the default priority + its model, not an arbitrary catalog entry. + expect(cfg.priority).toBe('balanced'); + expect(cfg.model).toBe('claude-sonnet-4-6'); + }); + + it('ignores an invalid TINY_CODE_PROVIDER and falls back to key inference', () => { + process.env.ANTHROPIC_API_KEY = 'sk-test'; + process.env.TINY_CODE_PROVIDER = 'mistral'; // unsupported + const cfg = loadConfig({}, cwd); + expect(cfg.provider).toBe('anthropic'); + }); + it('opts into the most capable model with performance priority', () => { process.env.ANTHROPIC_API_KEY = 'sk-test'; process.env.TINY_CODE_PRIORITY = 'performance'; diff --git a/tests/providers/openaiCloudSend.test.ts b/tests/providers/openaiCloudSend.test.ts index 3e53e02..7b7bfcb 100644 --- a/tests/providers/openaiCloudSend.test.ts +++ b/tests/providers/openaiCloudSend.test.ts @@ -53,6 +53,23 @@ describe('DeepSeekProvider.send', () => { expect(done).toMatchObject({ usage: { inputTokens: 5, outputTokens: 2 } }); }); + it('synthesizes a provider-scoped tool-call id when the server omits one', async () => { + // The OpenAI wire format normally supplies an id; some servers don't. The + // fallback must stay non-empty so the result can be correlated next turn. 
+ vi.spyOn(globalThis, 'fetch').mockResolvedValue( + sseResponse([ + { + choices: [ + { delta: { tool_calls: [{ index: 0, function: { name: 'ls', arguments: '{}' } }] } }, + ], + }, + ]), + ); + const provider = new DeepSeekProvider({ apiKey: 'k', model: 'deepseek-v4-pro' }); + const call = (await collect(provider)).find((e) => e.type === 'tool_call'); + expect(call).toMatchObject({ name: 'ls', id: 'deepseek-call-0' }); + }); + it('reports a DeepSeek-specific error when the host is unreachable', async () => { vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ENOTFOUND')); const provider = new DeepSeekProvider({ apiKey: 'k', model: 'deepseek-v4-pro' });