From 96753ad58946e674a5448be76bf5ba87c9833437 Mon Sep 17 00:00:00 2001 From: Jinwoo-H Date: Fri, 5 Jun 2026 10:49:43 -0700 Subject: [PATCH 1/4] Add OpenAI cloud speech transcription Co-authored-by: Orca --- src/main/ipc/speech.test.ts | 5 + src/main/ipc/speech.ts | 19 ++ src/main/speech/model-catalog.ts | 32 +++ src/main/speech/model-manager.test.ts | 33 ++- src/main/speech/model-manager.ts | 26 ++- src/main/speech/openai-api-key-store.ts | 85 +++++++ .../openai-transcription-client.test.ts | 20 ++ .../speech/openai-transcription-client.ts | 137 +++++++++++ src/main/speech/stt-service.test.ts | 90 +++++++- src/main/speech/stt-service.ts | 74 +++++- src/preload/api-types.ts | 3 + src/preload/index.ts | 6 + .../dictation/DictationController.tsx | 7 +- .../settings/OpenAiTranscriptionKeyDialog.tsx | 78 +++++++ .../OpenAiTranscriptionSettingsRow.tsx | 58 +++++ .../components/settings/VoicePane.test.tsx | 3 + .../src/components/settings/VoicePane.tsx | 214 +++++++++++------- .../settings/voice-dictation-toggle.ts | 62 +++++ .../components/settings/voice-pane-search.ts | 11 +- src/shared/constants.ts | 3 +- src/shared/speech-types.ts | 15 +- 21 files changed, 870 insertions(+), 111 deletions(-) create mode 100644 src/main/speech/openai-api-key-store.ts create mode 100644 src/main/speech/openai-transcription-client.test.ts create mode 100644 src/main/speech/openai-transcription-client.ts create mode 100644 src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx create mode 100644 src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx create mode 100644 src/renderer/src/components/settings/voice-dictation-toggle.ts diff --git a/src/main/ipc/speech.test.ts b/src/main/ipc/speech.test.ts index 310103fb2a..05d6dbb5e2 100644 --- a/src/main/ipc/speech.test.ts +++ b/src/main/ipc/speech.test.ts @@ -12,6 +12,11 @@ vi.mock('electron', () => ({ app: { getPath: vi.fn(() => '/tmp/orca-speech-test') }, BrowserWindow: { fromWebContents: fromWebContentsMock }, ipcMain: { handle: handleMock }, + safeStorage: { + decryptString: vi.fn(), + encryptString: vi.fn(() => Buffer.from('encrypted')), + isEncryptionAvailable: vi.fn(() => true) + }, systemPreferences: { getMediaAccessStatus: vi.fn(() => 'granted'), askForMediaAccess: vi.fn(() => Promise.resolve(true)) diff --git a/src/main/ipc/speech.ts b/src/main/ipc/speech.ts index 36a69b6409..053a0038d8 100644 --- a/src/main/ipc/speech.ts +++ b/src/main/ipc/speech.ts @@ -4,6 +4,11 @@ import { writeFile, unlink } from 'fs/promises' import { createHash } from 'crypto' import { SPEECH_MODEL_CATALOG, getCatalogModel } from '../speech/model-catalog' import { getSpeechModelManager, getSpeechSttService } from '../speech/speech-runtime-service' +import { + clearOpenAiSpeechApiKey, + hasOpenAiSpeechApiKey, + saveOpenAiSpeechApiKey +} from '../speech/openai-api-key-store' import type { Store } from '../persistence' export function registerSpeechHandlers(store: Store): void { @@ -15,6 +20,20 @@ export function registerSpeechHandlers(store: Store): void { return getSpeechModelManager(store).getModelStates() }) + ipcMain.handle('speech:getOpenAiApiKeyStatus', async () => { + return { configured: hasOpenAiSpeechApiKey() } + }) + + ipcMain.handle('speech:saveOpenAiApiKey', async (_event, apiKey: string) => { + saveOpenAiSpeechApiKey(apiKey) + return { configured: true } + }) + + ipcMain.handle('speech:clearOpenAiApiKey', async () => { + clearOpenAiSpeechApiKey() + return { configured: false } + }) + ipcMain.handle('speech:downloadModel', async (event, modelId: string) => { const manager = getSpeechModelManager(store) const window = BrowserWindow.fromWebContents(event.sender) diff --git a/src/main/speech/model-catalog.ts b/src/main/speech/model-catalog.ts index 8a595128a7..266442eb78 100644 --- a/src/main/speech/model-catalog.ts +++ b/src/main/speech/model-catalog.ts @@ -7,6 +7,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ description: 'Highest accuracy for 25 European languages. Punctuation, capitalization, and word-level timestamps.', type: 'transducer', + provider: 'local', language: 'multilingual', sizeBytes: 180_000_000, downloadUrl: @@ -25,6 +26,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ description: 'English only. Faster than v3 with similar accuracy. Punctuation and capitalization.', type: 'transducer', + provider: 'local', language: 'en', sizeBytes: 170_000_000, downloadUrl: @@ -41,6 +43,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ label: 'Zipformer Bilingual', description: 'Chinese + English with code-switching. Low-latency real-time streaming.', type: 'transducer', + provider: 'local', language: 'zh-en', sizeBytes: 130_000_000, downloadUrl: @@ -63,6 +66,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ description: 'Chinese (Mandarin + dialects) + English. Strong on accented and regional Chinese.', type: 'paraformer', + provider: 'local', language: 'zh-en', sizeBytes: 115_000_000, downloadUrl: @@ -78,6 +82,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ label: 'Zipformer Streaming EN', description: 'English only. Lightweight 20M-param model, good balance of speed and size.', type: 'transducer', + provider: 'local', language: 'en', sizeBytes: 128_000_000, downloadUrl: @@ -99,6 +104,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ label: 'Zipformer Streaming ZH', description: 'Chinese only. Ultra-lightweight 14M-param model, ideal for low-resource devices.', type: 'transducer', + provider: 'local', language: 'zh', sizeBytes: 74_000_000, downloadUrl: @@ -120,6 +126,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ label: 'Whisper Tiny', description: '90+ languages. Lower accuracy than Parakeet but broadest language coverage.', type: 'whisper', + provider: 'local', language: 'multilingual', sizeBytes: 116_000_000, downloadUrl: @@ -129,9 +136,34 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [ files: ['tiny-encoder.onnx', 'tiny-decoder.onnx', 'tiny-tokens.txt'], sampleRate: 16000, streaming: false + }, + { + id: 'openai-gpt-4o-mini-transcribe', + label: 'GPT-4o mini Transcribe', + description: + 'Cloud transcription with strong accuracy and low cost. Requires an OpenAI API key.', + type: 'openai', + provider: 'openai', + language: 'multilingual', + sampleRate: 16000, + streaming: false + }, + { + id: 'openai-gpt-4o-transcribe', + label: 'GPT-4o Transcribe', + description: 'Cloud transcription with higher accuracy. Requires an OpenAI API key.', + type: 'openai', + provider: 'openai', + language: 'multilingual', + sampleRate: 16000, + streaming: false } ] export function getCatalogModel(id: string): SpeechModelManifest | undefined { return SPEECH_MODEL_CATALOG.find((m) => m.id === id) } + +export function isLocalSpeechModel(manifest: SpeechModelManifest): boolean { + return manifest.provider === 'local' +} diff --git a/src/main/speech/model-manager.test.ts b/src/main/speech/model-manager.test.ts index 4e6cb5ee75..7879d86d86 100644 --- a/src/main/speech/model-manager.test.ts +++ b/src/main/speech/model-manager.test.ts @@ -6,7 +6,8 @@ import { beforeEach, describe, expect, it, vi } from 'vitest' import { SPEECH_MODEL_CATALOG } from './model-catalog' import { ModelManager } from './model-manager' -const { httpsGetMock, spawnMock } = vi.hoisted(() => ({ +const { hasOpenAiSpeechApiKeyMock, httpsGetMock, spawnMock } = vi.hoisted(() => ({ + hasOpenAiSpeechApiKeyMock: vi.fn(), httpsGetMock: vi.fn(), spawnMock: vi.fn() })) @@ -27,6 +28,10 @@ vi.mock('https', async () => { return { ...(actual as Record), get: httpsGetMock } }) +vi.mock('./openai-api-key-store', () => ({ + hasOpenAiSpeechApiKey: hasOpenAiSpeechApiKeyMock +})) + type ModelManagerInternals = { verifyArchiveSha256: (archivePath: string, expectedSha256: string) => Promise downloadFile: ( @@ -48,11 +53,16 @@ type ModelManagerInternals = { describe('ModelManager', () => { beforeEach(() => { httpsGetMock.mockReset() + hasOpenAiSpeechApiKeyMock.mockReset() + hasOpenAiSpeechApiKeyMock.mockReturnValue(false) spawnMock.mockReset() }) it('requires pinned SHA-256 hashes for every catalog archive', () => { for (const manifest of SPEECH_MODEL_CATALOG) { + if (manifest.provider !== 'local') { + continue + } expect(manifest.archiveSha256).toMatch(/^[a-f0-9]{64}$/) } }) @@ -93,6 +103,27 @@ describe('ModelManager', () => { } }) + it('marks OpenAI transcription models ready only when an API key is configured', async () => { + const dir = mkdtempSync(join(tmpdir(), 'orca-model-manager-')) + try { + const manager = new ModelManager(dir) + + await expect(manager.getModelState('openai-gpt-4o-mini-transcribe')).resolves.toEqual({ + id: 'openai-gpt-4o-mini-transcribe', + status: 'not-downloaded' + }) + + hasOpenAiSpeechApiKeyMock.mockReturnValue(true) + + await expect(manager.getModelState('openai-gpt-4o-mini-transcribe')).resolves.toEqual({ + id: 'openai-gpt-4o-mini-transcribe', + status: 'ready' + }) + } finally { + rmSync(dir, { recursive: true, force: true }) + } + }) + it('aborts an in-flight model download request when cancelled', async () => { const dir = mkdtempSync(join(tmpdir(), 'orca-model-manager-')) try { diff --git a/src/main/speech/model-manager.ts b/src/main/speech/model-manager.ts index bc51de602a..7f4f5b4804 100644 --- a/src/main/speech/model-manager.ts +++ b/src/main/speech/model-manager.ts @@ -13,7 +13,8 @@ import type { SpeechModelState, SpeechModelStatus } from '../../shared/speech-types' -import { SPEECH_MODEL_CATALOG, getCatalogModel } from './model-catalog' +import { SPEECH_MODEL_CATALOG, getCatalogModel, isLocalSpeechModel } from './model-catalog' +import { hasOpenAiSpeechApiKey } from './openai-api-key-store' import { resolveTarExecutable } from './tar-executable' type DownloadHandle = { @@ -68,6 +69,13 @@ export class ModelManager { return { id: modelId, status: 'error', error: 'Unknown model' } } + if (manifest.provider === 'openai') { + return { + id: modelId, + status: hasOpenAiSpeechApiKey() ? 'ready' : 'not-downloaded' + } + } + const modelDir = this.getModelDir(modelId) if (existsSync(modelDir) && this.validateModelFiles(manifest, modelDir)) { const state: SpeechModelState = { id: modelId, status: 'ready' } @@ -97,6 +105,9 @@ export class ModelManager { } private validateModelFiles(manifest: SpeechModelManifest, modelDir: string): boolean { + if (!manifest.files) { + return false + } return manifest.files.every((f) => existsSync(join(modelDir, f))) } @@ -109,6 +120,12 @@ export class ModelManager { if (!manifest) { throw new Error(`Unknown model: ${modelId}`) } + if (!isLocalSpeechModel(manifest)) { + throw new Error(`Model does not support downloads: ${modelId}`) + } + if (!manifest.downloadUrl || !manifest.archiveSha256 || !manifest.sizeBytes) { + throw new Error(`Model download metadata missing: ${modelId}`) + } const modelDir = this.getModelDir(modelId) if (existsSync(modelDir) && this.validateModelFiles(manifest, modelDir)) { @@ -213,6 +230,10 @@ export class ModelManager { if (!getCatalogModel(modelId)) { throw new Error(`Unknown model: ${modelId}`) } + const manifest = getCatalogModel(modelId) + if (!manifest || !isLocalSpeechModel(manifest)) { + throw new Error(`Model does not support deletion: ${modelId}`) + } this.cancelDownload(modelId) const modelDir = this.getModelDir(modelId) if (existsSync(modelDir)) { @@ -542,6 +563,9 @@ export class ModelManager { } private async flattenNestedDir(modelDir: string, manifest: SpeechModelManifest): Promise { + if (!manifest.files) { + return + } const entries = await readdir(modelDir, { withFileTypes: true }) for (const entry of entries) { if (entry.isDirectory()) { diff --git a/src/main/speech/openai-api-key-store.ts b/src/main/speech/openai-api-key-store.ts new file mode 100644 index 0000000000..23e661fb71 --- /dev/null +++ b/src/main/speech/openai-api-key-store.ts @@ -0,0 +1,85 @@ +import { safeStorage } from 'electron' +import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'fs' +import { homedir } from 'os' +import { join } from 'path' + +type StoredOpenAiKey = { + encryptedKeyBase64: string +} + +const OPENAI_SPEECH_TOKEN_FILE = 'openai-speech-token.enc' + +function getOrcaDir(): string { + return join(homedir(), '.orca') +} + +function ensureOrcaDir(): void { + const dir = getOrcaDir() + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }) + } +} + +function getOpenAiKeyPath(): string { + return join(getOrcaDir(), OPENAI_SPEECH_TOKEN_FILE) +} + +function readLegacyJsonStoredOpenAiKey(): StoredOpenAiKey | null { + const keyPath = getOpenAiKeyPath() + if (!existsSync(keyPath)) { + return null + } + try { + const parsed = JSON.parse(readFileSync(keyPath, 'utf8')) as Partial + if (typeof parsed.encryptedKeyBase64 !== 'string' || parsed.encryptedKeyBase64 === '') { + return null + } + return { encryptedKeyBase64: parsed.encryptedKeyBase64 } + } catch { + return null + } +} + +export function hasOpenAiSpeechApiKey(): boolean { + return existsSync(getOpenAiKeyPath()) +} + +export function saveOpenAiSpeechApiKey(apiKey: string): void { + const trimmed = apiKey.trim() + if (!trimmed) { + throw new Error('OpenAI API key is required') + } + ensureOrcaDir() + if (safeStorage.isEncryptionAvailable()) { + writeFileSync(getOpenAiKeyPath(), safeStorage.encryptString(trimmed), { mode: 0o600 }) + return + } + + console.warn( + '[speech] safeStorage encryption unavailable — storing OpenAI speech key in plaintext' + ) + writeFileSync(getOpenAiKeyPath(), trimmed, { encoding: 'utf8', mode: 0o600 }) +} + +export function readOpenAiSpeechApiKey(): string { + const keyPath = getOpenAiKeyPath() + if (!existsSync(keyPath)) { + throw new Error('OpenAI API key is not configured') + } + try { + const raw = readFileSync(keyPath) + const legacyJson = readLegacyJsonStoredOpenAiKey() + if (legacyJson) { + return safeStorage.decryptString(Buffer.from(legacyJson.encryptedKeyBase64, 'base64')) + } + return safeStorage.isEncryptionAvailable() + ? safeStorage.decryptString(raw) + : raw.toString('utf8') + } catch { + throw new Error('OpenAI API key could not be decrypted') + } +} + +export function clearOpenAiSpeechApiKey(): void { + rmSync(getOpenAiKeyPath(), { force: true }) +} diff --git a/src/main/speech/openai-transcription-client.test.ts b/src/main/speech/openai-transcription-client.test.ts new file mode 100644 index 0000000000..8e20f593b0 --- /dev/null +++ b/src/main/speech/openai-transcription-client.test.ts @@ -0,0 +1,20 @@ +import { describe, expect, it } from 'vitest' +import { sanitizeOpenAiTranscriptionErrorMessage } from './openai-transcription-client' + +describe('sanitizeOpenAiTranscriptionErrorMessage', () => { + it('does not expose the invalid OpenAI API key echoed by the provider', () => { + expect( + sanitizeOpenAiTranscriptionErrorMessage( + 'Incorrect API key provided: fsdfdsfsdf. You can find your API key at https://platform.openai.com/account/api-keys.' + ) + ).toBe('Incorrect OpenAI API key provided.') + }) + + it('redacts API keys and bearer tokens from other provider errors', () => { + expect( + sanitizeOpenAiTranscriptionErrorMessage( + 'Request failed for sk-testSecret123 with Authorization: Bearer token-value_123' + ) + ).toBe('Request failed for [redacted] with Authorization: Bearer [redacted]') + }) +}) diff --git a/src/main/speech/openai-transcription-client.ts b/src/main/speech/openai-transcription-client.ts new file mode 100644 index 0000000000..c61dc940f1 --- /dev/null +++ b/src/main/speech/openai-transcription-client.ts @@ -0,0 +1,137 @@ +import { resampleToRate } from './stt-audio-resample' + +export const OPENAI_TRANSCRIPTION_MODEL_BY_ID: Record = { + 'openai-gpt-4o-mini-transcribe': 'gpt-4o-mini-transcribe', + 'openai-gpt-4o-transcribe': 'gpt-4o-transcribe' +} + +const OPENAI_TRANSCRIPTION_URL = 'https://api.openai.com/v1/audio/transcriptions' +const CLOUD_TRANSCRIPTION_SAMPLE_RATE = 16000 +const MAX_CLOUD_AUDIO_SECONDS = 10 * 60 + +type OpenAiTranscriptionResponse = { + text?: unknown + error?: { + message?: unknown + } +} + +export function sanitizeOpenAiTranscriptionErrorMessage(message: string): string { + if (/incorrect api key provided:/i.test(message)) { + return 'Incorrect OpenAI API key provided.' + } + + const sanitized = message + .replace(/\bsk-[A-Za-z0-9_-]+/g, '[redacted]') + .replace(/\bBearer\s+[A-Za-z0-9._~+/=-]+/gi, 'Bearer [redacted]') + .trim() + + return sanitized || 'OpenAI transcription request failed' +} + +function encodePcm16Wav(samples: Float32Array, sampleRate: number): Buffer { + const dataBytes = samples.length * 2 + const buffer = Buffer.alloc(44 + dataBytes) + + buffer.write('RIFF', 0) + buffer.writeUInt32LE(36 + dataBytes, 4) + buffer.write('WAVE', 8) + buffer.write('fmt ', 12) + buffer.writeUInt32LE(16, 16) + buffer.writeUInt16LE(1, 20) + buffer.writeUInt16LE(1, 22) + buffer.writeUInt32LE(sampleRate, 24) + buffer.writeUInt32LE(sampleRate * 2, 28) + buffer.writeUInt16LE(2, 32) + buffer.writeUInt16LE(16, 34) + buffer.write('data', 36) + buffer.writeUInt32LE(dataBytes, 40) + + for (let i = 0; i < samples.length; i += 1) { + const clamped = Math.max(-1, Math.min(1, samples[i])) + const value = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff + buffer.writeInt16LE(Math.round(value), 44 + i * 2) + } + + return buffer +} + +function combineChunks(chunks: Float32Array[]): Float32Array { + const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0) + const combined = new Float32Array(totalLength) + let offset = 0 + for (const chunk of chunks) { + combined.set(chunk, offset) + offset += chunk.length + } + return combined +} + +function parseOpenAiTranscriptionResponse(data: OpenAiTranscriptionResponse): string { + if (typeof data.text === 'string') { + return data.text.trim() + } + if (typeof data.error?.message === 'string') { + throw new Error(sanitizeOpenAiTranscriptionErrorMessage(data.error.message)) + } + throw new Error('OpenAI transcription response did not include text') +} + +export class OpenAiTranscriptionSession { + private chunks: Float32Array[] = [] + private audioSeconds = 0 + + constructor( + private readonly modelId: string, + private readonly readApiKey: () => string + ) {} + + feedAudio(samples: Float32Array, sampleRate: number): void { + const normalized = resampleToRate(samples, sampleRate, CLOUD_TRANSCRIPTION_SAMPLE_RATE) + this.audioSeconds += normalized.length / CLOUD_TRANSCRIPTION_SAMPLE_RATE + if (this.audioSeconds > MAX_CLOUD_AUDIO_SECONDS) { + throw new Error('Cloud transcription is limited to 10 minutes per dictation') + } + this.chunks.push(new Float32Array(normalized)) + } + + async finish(): Promise { + if (this.chunks.length === 0) { + return '' + } + + const apiModel = OPENAI_TRANSCRIPTION_MODEL_BY_ID[this.modelId] + if (!apiModel) { + throw new Error(`Unknown OpenAI transcription model: ${this.modelId}`) + } + + const audio = combineChunks(this.chunks) + this.chunks = [] + const wav = encodePcm16Wav(audio, CLOUD_TRANSCRIPTION_SAMPLE_RATE) + const form = new FormData() + form.append('model', apiModel) + form.append('response_format', 'json') + // Why: OpenAI's transcription endpoint expects a multipart file object; + // a named WAV blob avoids filesystem temp files and works in packaged apps. + form.append('file', new Blob([new Uint8Array(wav)], { type: 'audio/wav' }), 'dictation.wav') + + const response = await fetch(OPENAI_TRANSCRIPTION_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${this.readApiKey()}` + }, + body: form + }) + + const data = (await response.json().catch(() => ({}))) as OpenAiTranscriptionResponse + if (!response.ok) { + const message = + typeof data.error?.message === 'string' + ? sanitizeOpenAiTranscriptionErrorMessage(data.error.message) + : response.statusText + throw new Error(`OpenAI transcription failed: ${message}`) + } + + return parseOpenAiTranscriptionResponse(data) + } +} diff --git a/src/main/speech/stt-service.test.ts b/src/main/speech/stt-service.test.ts index e83b07c7a1..e3aa511453 100644 --- a/src/main/speech/stt-service.test.ts +++ b/src/main/speech/stt-service.test.ts @@ -1,6 +1,14 @@ import { beforeEach, describe, expect, it, vi } from 'vitest' -const { MockWorker, getCreatedWorkerCount, getLastWorker, resetWorkers } = vi.hoisted(() => { +const { + MockOpenAiTranscriptionSession, + MockWorker, + getCloudSessions, + getCreatedWorkerCount, + getLastWorker, + resetCloudSessions, + resetWorkers +} = vi.hoisted(() => { class HoistedMockWorker extends EventTarget { static created = 0 static instances: HoistedMockWorker[] = [] @@ -64,10 +72,35 @@ const { MockWorker, getCreatedWorkerCount, getLastWorker, resetWorkers } = vi.ho } } + class HoistedMockOpenAiTranscriptionSession { + static instances: HoistedMockOpenAiTranscriptionSession[] = [] + feedCalls: { samples: Float32Array; sampleRate: number }[] = [] + + constructor( + readonly modelId: string, + readonly readApiKey: () => string + ) { + HoistedMockOpenAiTranscriptionSession.instances.push(this) + } + + feedAudio(samples: Float32Array, sampleRate: number): void { + this.feedCalls.push({ samples, sampleRate }) + } + + finish(): Promise { + return Promise.resolve(`${this.modelId}:${this.readApiKey()}`) + } + } + return { + MockOpenAiTranscriptionSession: HoistedMockOpenAiTranscriptionSession, MockWorker: HoistedMockWorker, + getCloudSessions: () => HoistedMockOpenAiTranscriptionSession.instances, getCreatedWorkerCount: () => HoistedMockWorker.created, getLastWorker: () => HoistedMockWorker.instances.at(-1), + resetCloudSessions: () => { + HoistedMockOpenAiTranscriptionSession.instances = [] + }, resetWorkers: () => { HoistedMockWorker.created = 0 HoistedMockWorker.instances = [] @@ -89,19 +122,38 @@ vi.mock('worker_threads', () => ({ })) vi.mock('./model-catalog', () => ({ - getCatalogModel: () => ({ - id: 'model-a', - type: 'transducer', - streaming: true, - sampleRate: 16000, - files: ['encoder.onnx', 'decoder.onnx', 'joiner.onnx', 'tokens.txt'] - }) + getCatalogModel: (id: string) => + id === 'openai-model' + ? { + id, + type: 'openai', + provider: 'openai', + streaming: false, + sampleRate: 16000 + } + : { + id: 'model-a', + type: 'transducer', + provider: 'local', + streaming: true, + sampleRate: 16000, + files: ['encoder.onnx', 'decoder.onnx', 'joiner.onnx', 'tokens.txt'] + } +})) + +vi.mock('./openai-api-key-store', () => ({ + readOpenAiSpeechApiKey: () => 'test-openai-key' +})) + +vi.mock('./openai-transcription-client', () => ({ + OpenAiTranscriptionSession: MockOpenAiTranscriptionSession })) import { IDLE_WORKER_TEARDOWN_MS, START_DICTATION_TIMEOUT_MS, SttService } from './stt-service' describe('SttService', () => { beforeEach(() => { + resetCloudSessions() resetWorkers() }) @@ -184,6 +236,28 @@ describe('SttService', () => { expect(worker!.messages.filter((message) => message.type === 'feed')).toHaveLength(0) }) + it('uses the OpenAI transcription session without creating a worker', async () => { + const sink = vi.fn() + const service = new SttService({ + getModelState: vi.fn().mockResolvedValue({ id: 'openai-model', status: 'ready' }), + getModelDir: vi.fn().mockReturnValue('/tmp/model-a') + } as never) + + await service.startDictation('openai-model', sink, undefined, 'desktop') + service.feedAudio(new Float32Array([0.25, -0.25]), 48000, 'desktop') + await service.stopDictation('desktop') + + expect(getCreatedWorkerCount()).toBe(0) + expect(getCloudSessions()).toHaveLength(1) + expect(getCloudSessions()[0].feedCalls).toHaveLength(1) + expect(sink).toHaveBeenCalledWith({ type: 'ready' }) + expect(sink).toHaveBeenCalledWith({ + type: 'final', + text: 'openai-model:test-openai-key' + }) + expect(sink).toHaveBeenCalledWith({ type: 'stopped' }) + }) + it('keeps startup cancellation tombstoned after the worker has been created', async () => { const service = new SttService({ getModelState: vi.fn().mockResolvedValue({ id: 'model-a', status: 'ready' }), diff --git a/src/main/speech/stt-service.ts b/src/main/speech/stt-service.ts index 81506a9a9b..f31a3b90e7 100644 --- a/src/main/speech/stt-service.ts +++ b/src/main/speech/stt-service.ts @@ -6,6 +6,8 @@ import { join } from 'path' import { app } from 'electron' import { getCatalogModel } from './model-catalog' import type { ModelManager } from './model-manager' +import { OpenAiTranscriptionSession } from './openai-transcription-client' +import { readOpenAiSpeechApiKey } from './openai-api-key-store' export const START_DICTATION_TIMEOUT_MS = 60_000 const STOP_DICTATION_TIMEOUT_MS = 60_000 @@ -22,6 +24,7 @@ export type SttEventSink = (event: SttEvent) => void export class SttService { private worker: Worker | null = null + private cloudSession: OpenAiTranscriptionSession | null = null private modelManager: ModelManager private activeModelId: string | null = null private activeHotwordsFilePath: string | undefined @@ -51,7 +54,7 @@ export class SttService { } return } - if (this.worker && this.activeOwner && this.activeOwner !== owner) { + if ((this.worker || this.cloudSession) && this.activeOwner && this.activeOwner !== owner) { throw new Error('dictation_already_active') } this.starting = true @@ -78,6 +81,34 @@ export class SttService { hotwordsFilePath?: string, owner = 'desktop' ): Promise { + const manifest = getCatalogModel(modelId) + if (!manifest) { + throw new Error(`Unknown model: ${modelId}`) + } + + if (manifest.provider === 'openai') { + if (this.worker) { + await this.stopDictation(owner, { cancelStarting: false }) + await this.teardownIdleWorker() + } + + const modelState = await this.modelManager.getModelState(modelId) + if (modelState.status !== 'ready') { + throw new Error(`Model not ready: ${modelState.status}`) + } + + this.cloudSession = new OpenAiTranscriptionSession(modelId, readOpenAiSpeechApiKey) + this.activeModelId = modelId + this.activeHotwordsFilePath = undefined + this.eventSink = sink + sink({ type: 'ready' }) + return + } + + if (this.cloudSession) { + await this.stopDictation(owner, { cancelStarting: false }) + } + if ( this.worker && this.activeModelId === modelId && @@ -93,11 +124,6 @@ export class SttService { await this.teardownIdleWorker() } - const manifest = getCatalogModel(modelId) - if (!manifest) { - throw new Error(`Unknown model: ${modelId}`) - } - const modelState = await this.modelManager.getModelState(modelId) if (modelState.status !== 'ready') { throw new Error(`Model not ready: ${modelState.status}`) @@ -207,7 +233,7 @@ export class SttService { modelType: manifest.type, streaming: manifest.streaming, sampleRate: manifest.sampleRate, - files: manifest.files, + files: manifest.files ?? [], hotwordsFilePath, modelingUnit: manifest.modelingUnit }) @@ -237,6 +263,10 @@ export class SttService { if (currentOwner !== owner) { throw new Error('dictation_owner_mismatch') } + if (this.cloudSession) { + this.cloudSession.feedAudio(samples, sampleRate) + return + } this.worker?.postMessage({ type: 'feed', samples, sampleRate }, [samples.buffer as ArrayBuffer]) } @@ -247,7 +277,7 @@ export class SttService { if (options.cancelStarting !== false && this.startingOwner === owner) { this.canceledOwners.add(owner) } - if (!this.worker) { + if (!this.worker && !this.cloudSession) { return } const currentOwner = this.activeOwner ?? this.startingOwner @@ -255,7 +285,33 @@ export class SttService { throw new Error('dictation_owner_mismatch') } + if (this.cloudSession) { + const session = this.cloudSession + this.cloudSession = null + try { + const text = await session.finish() + if (text) { + this.eventSink?.({ type: 'final', text }) + } + } catch (error) { + this.eventSink?.({ + type: 'error', + error: error instanceof Error ? error.message : String(error) + }) + } finally { + this.eventSink?.({ type: 'stopped' }) + this.activeModelId = null + this.activeHotwordsFilePath = undefined + this.activeOwner = null + this.eventSink = null + } + return + } + const worker = this.worker + if (!worker) { + return + } worker.postMessage({ type: 'stop' }) let forcedTeardown = false @@ -319,7 +375,7 @@ export class SttService { } isActive(): boolean { - return this.worker !== null + return this.worker !== null || this.cloudSession !== null } getActiveModelId(): string | null { diff --git a/src/preload/api-types.ts b/src/preload/api-types.ts index 90fb1f3a19..612f63053a 100644 --- a/src/preload/api-types.ts +++ b/src/preload/api-types.ts @@ -2433,6 +2433,9 @@ export type PreloadApi = { speech: { getCatalog: () => Promise getModelStates: () => Promise + getOpenAiApiKeyStatus: () => Promise<{ configured: boolean }> + saveOpenAiApiKey: (apiKey: string) => Promise<{ configured: boolean }> + clearOpenAiApiKey: () => Promise<{ configured: boolean }> downloadModel: (modelId: string) => Promise cancelDownload: (modelId: string) => Promise deleteModel: (modelId: string) => Promise diff --git a/src/preload/index.ts b/src/preload/index.ts index 16fd77622a..b7d8c55013 100644 --- a/src/preload/index.ts +++ b/src/preload/index.ts @@ -3479,6 +3479,12 @@ const api = { speech: { getCatalog: (): Promise => ipcRenderer.invoke('speech:getCatalog'), getModelStates: (): Promise => ipcRenderer.invoke('speech:getModelStates'), + getOpenAiApiKeyStatus: (): Promise<{ configured: boolean }> => + ipcRenderer.invoke('speech:getOpenAiApiKeyStatus'), + saveOpenAiApiKey: (apiKey: string): Promise<{ configured: boolean }> => + ipcRenderer.invoke('speech:saveOpenAiApiKey', apiKey), + clearOpenAiApiKey: (): Promise<{ configured: boolean }> => + ipcRenderer.invoke('speech:clearOpenAiApiKey'), downloadModel: (modelId: string): Promise => ipcRenderer.invoke('speech:downloadModel', modelId), cancelDownload: (modelId: string): Promise => diff --git a/src/renderer/src/components/dictation/DictationController.tsx b/src/renderer/src/components/dictation/DictationController.tsx index ef52d3851f..f580d52782 100644 --- a/src/renderer/src/components/dictation/DictationController.tsx +++ b/src/renderer/src/components/dictation/DictationController.tsx @@ -38,6 +38,7 @@ export function DictationController() { const stoppedResolversRef = useRef(new Map void>()) const stopRequestedDuringStartRef = useRef(false) const finalTranscriptReceivedRef = useRef(false) + const erroredSessionIdsRef = useRef(new Set()) const intentionalTargetCancellationRef = useRef(false) const insertedFinalTranscriptRef = useRef('') @@ -59,7 +60,8 @@ export function DictationController() { // transcript delivery is renderer IPC. Wait for this session's stopped // event so old finals cannot be mistaken for the next dictation run. await waitForStoppedSession(sessionId, stoppedSessionIdsRef, stoppedResolversRef) - if (!finalTranscriptReceivedRef.current && getCapturedChunkCount() > 0) { + const sessionErrored = erroredSessionIdsRef.current.delete(sessionId) + if (!sessionErrored && !finalTranscriptReceivedRef.current && getCapturedChunkCount() > 0) { toast.message('No speech detected.') } insertionTargetRef.current = null @@ -108,6 +110,7 @@ export function DictationController() { insertionTargetRef.current = captureInsertionTarget() stopRequestedDuringStartRef.current = false finalTranscriptReceivedRef.current = false + erroredSessionIdsRef.current.clear() insertedFinalTranscriptRef.current = '' intentionalTargetCancellationRef.current = false dictationStateRef.current = 'starting' @@ -172,6 +175,7 @@ export function DictationController() { intentionalTargetCancellationRef.current = false stopRequestedDuringStartRef.current = false finalTranscriptReceivedRef.current = false + erroredSessionIdsRef.current.clear() insertedFinalTranscriptRef.current = '' activeSessionIdRef.current = null setPartialTranscript('') @@ -381,6 +385,7 @@ export function DictationController() { return } const sessionId = data.sessionId + erroredSessionIdsRef.current.add(sessionId) dictationRunRef.current += 1 activeSessionIdRef.current = null toast.error(`Speech error: ${data.error}`) diff --git a/src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx b/src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx new file mode 100644 index 0000000000..e6f84068b1 --- /dev/null +++ b/src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx @@ -0,0 +1,78 @@ +import { Loader2, Lock } from 'lucide-react' +import { Button } from '../ui/button' +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle +} from '../ui/dialog' +import { Input } from '../ui/input' +import { Label } from '../ui/label' + +type OpenAiTranscriptionKeyDialogProps = { + open: boolean + configured: boolean + apiKeyDraft: string + pending: boolean + onOpenChange: (open: boolean) => void + onApiKeyDraftChange: (value: string) => void + onSave: () => void + onClear: () => void +} + +export function OpenAiTranscriptionKeyDialog({ + open, + configured, + apiKeyDraft, + pending, + onOpenChange, + onApiKeyDraftChange, + onSave, + onClear +}: OpenAiTranscriptionKeyDialogProps): React.JSX.Element { + return ( + + + + OpenAI Transcription + + Audio is sent to OpenAI only when an OpenAI speech model is selected. + + +
+ + onApiKeyDraftChange(event.target.value)} + onKeyDown={(event) => { + if (event.key === 'Enter' && apiKeyDraft.trim()) { + onSave() + } + }} + /> +
+

+ + Local runtime keys are stored in ~/.orca using Electron encrypted storage when available. +

+ + {configured && ( + + )} + + +
+
+ ) +} diff --git a/src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx b/src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx new file mode 100644 index 0000000000..2123d89057 --- /dev/null +++ b/src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx @@ -0,0 +1,58 @@ +import { CheckCircle2, Cloud, Unlink } from 'lucide-react' +import { Button } from '../ui/button' +import { Label } from '../ui/label' + +type OpenAiTranscriptionSettingsRowProps = { + configured: boolean + disabled: boolean + onConfigure: () => void + onClear: () => void +} + +export function OpenAiTranscriptionSettingsRow({ + configured, + disabled, + onConfigure, + onClear +}: OpenAiTranscriptionSettingsRowProps): React.JSX.Element { + return ( +
+
+
+ + + {configured && ( + + + Connected + + )} +
+

+ {configured + ? 'API key configured for cloud speech-to-text models.' + : 'Add an OpenAI API key before selecting cloud speech-to-text models.'} +

+
+ {configured ? ( +
+ + +
+ ) : ( + + )} +
+ ) +} diff --git a/src/renderer/src/components/settings/VoicePane.test.tsx b/src/renderer/src/components/settings/VoicePane.test.tsx index 7ee676e22b..2777d05cda 100644 --- a/src/renderer/src/components/settings/VoicePane.test.tsx +++ b/src/renderer/src/components/settings/VoicePane.test.tsx @@ -52,6 +52,9 @@ function installWindowApi( }, speech: { getCatalog: vi.fn(async () => []), + getOpenAiApiKeyStatus: vi.fn(async () => ({ configured: false })), + saveOpenAiApiKey: vi.fn(async () => ({ configured: true })), + clearOpenAiApiKey: vi.fn(async () => ({ configured: false })), onDownloadProgress: vi.fn(() => () => {}), downloadModel: vi.fn() } diff --git a/src/renderer/src/components/settings/VoicePane.tsx b/src/renderer/src/components/settings/VoicePane.tsx index a6c25b6323..9facfb4d68 100644 --- a/src/renderer/src/components/settings/VoicePane.tsx +++ b/src/renderer/src/components/settings/VoicePane.tsx @@ -1,8 +1,6 @@ import { useCallback, useEffect, useRef, useState } from 'react' import type { GlobalSettings } from '../../../../shared/types' import { getDefaultVoiceSettings } from '../../../../shared/constants' -import type { DeveloperPermissionRequestResult } from '../../../../shared/developer-permissions-types' -import type { FeatureTipId } from '../../../../shared/feature-tips' import type { SpeechModelManifest, SpeechModelState, @@ -17,75 +15,23 @@ import { DropdownMenuItem, DropdownMenuTrigger } from '../ui/dropdown-menu' -import { Download, Trash2, Loader2, ChevronDown, Check } from 'lucide-react' +import { Cloud, Download, Trash2, Loader2, ChevronDown, Check } from 'lucide-react' import { toast } from 'sonner' import { useAppStore } from '@/store' import { useShortcutLabel } from '@/hooks/useShortcutLabel' +import { OpenAiTranscriptionKeyDialog } from './OpenAiTranscriptionKeyDialog' +import { OpenAiTranscriptionSettingsRow } from './OpenAiTranscriptionSettingsRow' +import { handleVoiceDictationToggle } from './voice-dictation-toggle' +import { matchesSettingsSearch } from './settings-search' +import { OPENAI_TRANSCRIPTION_SEARCH_ENTRY } from './voice-pane-search' + +export { handleVoiceDictationToggle } type VoicePaneProps = { settings: GlobalSettings updateSettings: (updates: Partial) => void } -type VoiceDictationToggleOptions = { - voiceEnabled: boolean - markFeatureTipsSeen: (ids: FeatureTipId[]) => void - updateVoiceSettings: (updates: Partial) => void - requestMicrophonePermission: () => Promise - setPermissionPending?: (pending: boolean) => void - isMounted?: () => boolean - notifyPermissionGranted?: () => void - notifyPermissionOpenedSystemSettings?: () => void - notifyPermissionRequired?: () => void - notifyPermissionRequestFailed?: () => void -} - -export async function handleVoiceDictationToggle({ - voiceEnabled, - markFeatureTipsSeen, - updateVoiceSettings, - requestMicrophonePermission, - setPermissionPending, - isMounted, - notifyPermissionGranted, - notifyPermissionOpenedSystemSettings, - notifyPermissionRequired, - notifyPermissionRequestFailed -}: VoiceDictationToggleOptions): Promise { - // Why: changing the Voice Dictation switch proves the user discovered the - // feature; disabling it later should not make the discovery modal eligible. - markFeatureTipsSeen(['voice-dictation']) - - if (voiceEnabled) { - updateVoiceSettings({ enabled: false }) - return - } - - setPermissionPending?.(true) - try { - // Why: enabling dictation is the point where users expect the macOS - // microphone prompt, not after their first attempted recording fails. - const result = await requestMicrophonePermission() - if (result.status === 'granted' || result.status === 'unsupported') { - updateVoiceSettings({ enabled: true }) - } - - if (result.status === 'granted') { - notifyPermissionGranted?.() - } else if (result.openedSystemSettings) { - notifyPermissionOpenedSystemSettings?.() - } else if (result.status !== 'unsupported') { - notifyPermissionRequired?.() - } - } catch { - notifyPermissionRequestFailed?.() - } finally { - if (isMounted?.() ?? true) { - setPermissionPending?.(false) - } - } -} - export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.JSX.Element { // Why: voice was made optional on GlobalSettings to keep older test fixtures // and pre-voice profiles type-compatible. Persistence merges defaults at @@ -95,9 +41,14 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J const modelStates = useAppStore((s) => s.modelStates) const refreshModelStates = useAppStore((s) => s.refreshModelStates) const markFeatureTipsSeen = useAppStore((s) => s.markFeatureTipsSeen) + const settingsSearchQuery = useAppStore((s) => s.settingsSearchQuery ?? '') const shortcutLabel = useShortcutLabel('voice.dictation') const [catalog, setCatalog] = useState([]) const [permissionPending, setPermissionPending] = useState(false) + const [openAiDialogOpen, setOpenAiDialogOpen] = useState(false) + const [openAiApiKeyDraft, setOpenAiApiKeyDraft] = useState('') + const [openAiKeyPending, setOpenAiKeyPending] = useState(false) + const [pendingCloudModelId, setPendingCloudModelId] = useState(null) const mountedRef = useRef(true) const handlePaneRef = useCallback((node: HTMLDivElement | null): void => { @@ -106,6 +57,18 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J mountedRef.current = node !== null }, []) + const updateVoiceSettings = useCallback( + (updates: Partial): void => { + updateSettings({ + voice: { + ...voiceSettings, + ...updates + } + }) + }, + [updateSettings, voiceSettings] + ) + useEffect(() => { let cancelled = false refreshModelStates() @@ -117,10 +80,19 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J } }) .catch(() => {}) + void window.api.speech + .getOpenAiApiKeyStatus() + .then((status) => { + if (!cancelled && status.configured !== voiceSettings.openAiApiKeyConfigured) { + updateVoiceSettings({ openAiApiKeyConfigured: status.configured }) + refreshModelStates() + } + }) + .catch(() => {}) return () => { cancelled = true } - }, [refreshModelStates]) + }, [refreshModelStates, updateVoiceSettings, voiceSettings.openAiApiKeyConfigured]) useEffect(() => { const cleanup = window.api.speech.onDownloadProgress(() => { @@ -129,15 +101,6 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J return cleanup }, [refreshModelStates]) - const updateVoiceSettings = (updates: Partial): void => { - updateSettings({ - voice: { - ...voiceSettings, - ...updates - } - }) - } - const toggleVoiceDictation = async (): Promise => { await handleVoiceDictationToggle({ voiceEnabled: voiceSettings.enabled, @@ -167,6 +130,61 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J ? getModelState(voiceSettings.sttModel) : undefined const selectedIsReady = selectedModelState?.status === 'ready' + const showOpenAiSettingsRow = + voiceSettings.openAiApiKeyConfigured || + selectedModel?.provider === 'openai' || + (settingsSearchQuery.trim() !== '' && + matchesSettingsSearch(settingsSearchQuery, OPENAI_TRANSCRIPTION_SEARCH_ENTRY)) + + const openOpenAiDialog = (modelId: string | null = null): void => { + setPendingCloudModelId(modelId) + setOpenAiApiKeyDraft('') + setOpenAiDialogOpen(true) + } + + const saveOpenAiApiKey = async (): Promise => { + setOpenAiKeyPending(true) + try { + await window.api.speech.saveOpenAiApiKey(openAiApiKeyDraft) + updateVoiceSettings({ + openAiApiKeyConfigured: true, + sttModel: pendingCloudModelId ?? voiceSettings.sttModel + }) + await refreshModelStates() + setOpenAiDialogOpen(false) + setOpenAiApiKeyDraft('') + setPendingCloudModelId(null) + toast.success('OpenAI API key saved') + } catch (err) { + toast.error(err instanceof Error ? err.message : 'Failed to save OpenAI API key') + } finally { + if (mountedRef.current) { + setOpenAiKeyPending(false) + } + } + } + + const clearOpenAiApiKey = async (): Promise => { + setOpenAiKeyPending(true) + try { + await window.api.speech.clearOpenAiApiKey() + updateVoiceSettings({ + openAiApiKeyConfigured: false, + sttModel: selectedModel?.provider === 'openai' ? '' : voiceSettings.sttModel + }) + await refreshModelStates() + setOpenAiDialogOpen(false) + setOpenAiApiKeyDraft('') + setPendingCloudModelId(null) + toast.success('OpenAI API key cleared') + } catch (err) { + toast.error(err instanceof Error ? err.message : 'Failed to clear OpenAI API key') + } finally { + if (mountedRef.current) { + setOpenAiKeyPending(false) + } + } + } return (
@@ -232,7 +250,7 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J

{selectedModel && selectedIsReady ? `${selectedModel.label} — ${selectedModel.description}` - : 'Select and download a model to enable dictation.'} + : 'Select a speech model. Local models run offline; cloud models require an API key.'}

@@ -254,7 +272,8 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J const isDownloading = mState?.status === 'downloading' || mState?.status === 'extracting' const isActive = voiceSettings.sttModel === manifest.id - const sizeMb = Math.round(manifest.sizeBytes / 1_000_000) + const isCloud = manifest.provider === 'openai' + const sizeMb = manifest.sizeBytes ? Math.round(manifest.sizeBytes / 1_000_000) : null return ( { if (isReady) { updateVoiceSettings({ sttModel: manifest.id }) + } else if (isCloud) { + openOpenAiDialog(manifest.id) } else if (!isDownloading) { void window.api.speech .downloadModel(manifest.id) @@ -270,7 +291,7 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J } }} className={`group flex items-center gap-2.5 py-2.5 ${ - !isReady && !isDownloading ? 'opacity-50' : '' + !isCloud && !isReady && !isDownloading ? 'opacity-50' : '' }`} > @@ -278,16 +299,20 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J ) : isDownloading ? ( + ) : isCloud ? ( + ) : null}
{manifest.label} - - {manifest.streaming ? 'streaming' : 'offline'} - + {!isCloud && ( + + {manifest.streaming ? 'streaming' : 'offline'} + + )} {manifest.recommended && ( - + recommended )} @@ -296,14 +321,16 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J ? mState.status === 'extracting' ? 'Extracting...' : `${Math.round(mState.progress * 100)}%` - : `${sizeMb} MB`} + : isCloud + ? null + : `${sizeMb} MB`}

{manifest.description}

- {isReady && !isActive ? ( + {!isCloud && isReady && !isActive ? ( - ) : !isReady && !isDownloading ? ( + ) : !isCloud && !isReady && !isDownloading ? ( @@ -327,6 +354,29 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
+ + {showOpenAiSettingsRow && ( + <> + + openOpenAiDialog(null)} + onClear={() => void clearOpenAiApiKey()} + /> + + )} + + void saveOpenAiApiKey()} + onClear={() => void clearOpenAiApiKey()} + /> ) } diff --git a/src/renderer/src/components/settings/voice-dictation-toggle.ts b/src/renderer/src/components/settings/voice-dictation-toggle.ts new file mode 100644 index 0000000000..a54356c8d3 --- /dev/null +++ b/src/renderer/src/components/settings/voice-dictation-toggle.ts @@ -0,0 +1,62 @@ +import type { DeveloperPermissionRequestResult } from '../../../../shared/developer-permissions-types' +import type { FeatureTipId } from '../../../../shared/feature-tips' +import type { VoiceSettings } from '../../../../shared/speech-types' + +type VoiceDictationToggleOptions = { + voiceEnabled: boolean + markFeatureTipsSeen: (ids: FeatureTipId[]) => void + updateVoiceSettings: (updates: Partial) => void + requestMicrophonePermission: () => Promise + setPermissionPending?: (pending: boolean) => void + isMounted?: () => boolean + notifyPermissionGranted?: () => void + notifyPermissionOpenedSystemSettings?: () => void + notifyPermissionRequired?: () => void + notifyPermissionRequestFailed?: () => void +} + +export async function handleVoiceDictationToggle({ + voiceEnabled, + markFeatureTipsSeen, + updateVoiceSettings, + requestMicrophonePermission, + setPermissionPending, + isMounted, + notifyPermissionGranted, + notifyPermissionOpenedSystemSettings, + notifyPermissionRequired, + notifyPermissionRequestFailed +}: VoiceDictationToggleOptions): Promise { + // Why: changing the Voice Dictation switch proves the user discovered the + // feature; disabling it later should not make the discovery modal eligible. + markFeatureTipsSeen(['voice-dictation']) + + if (voiceEnabled) { + updateVoiceSettings({ enabled: false }) + return + } + + setPermissionPending?.(true) + try { + // Why: enabling dictation is the point where users expect the macOS + // microphone prompt, not after their first attempted recording fails. + const result = await requestMicrophonePermission() + if (result.status === 'granted' || result.status === 'unsupported') { + updateVoiceSettings({ enabled: true }) + } + + if (result.status === 'granted') { + notifyPermissionGranted?.() + } else if (result.openedSystemSettings) { + notifyPermissionOpenedSystemSettings?.() + } else if (result.status !== 'unsupported') { + notifyPermissionRequired?.() + } + } catch { + notifyPermissionRequestFailed?.() + } finally { + if (isMounted?.() ?? true) { + setPermissionPending?.(false) + } + } +} diff --git a/src/renderer/src/components/settings/voice-pane-search.ts b/src/renderer/src/components/settings/voice-pane-search.ts index 93600bbc00..da299bcda8 100644 --- a/src/renderer/src/components/settings/voice-pane-search.ts +++ b/src/renderer/src/components/settings/voice-pane-search.ts @@ -1,5 +1,11 @@ import type { SettingsSearchEntry } from './settings-search' +export const OPENAI_TRANSCRIPTION_SEARCH_ENTRY: SettingsSearchEntry = { + title: 'OpenAI Transcription', + description: 'Configure the OpenAI API key used for cloud speech-to-text models.', + keywords: ['voice', 'speech', 'stt', 'openai', 'api key', 'cloud', 'transcription'] +} + export const VOICE_PANE_SEARCH_ENTRIES: SettingsSearchEntry[] = [ { title: 'Enable Voice Dictation', @@ -11,9 +17,10 @@ export const VOICE_PANE_SEARCH_ENTRIES: SettingsSearchEntry[] = [ description: 'Toggle or hold-to-talk dictation behavior.', keywords: ['voice', 'dictation', 'mode', 'toggle', 'hold', 'push to talk'] }, + OPENAI_TRANSCRIPTION_SEARCH_ENTRY, { title: 'Speech Model', - description: 'Select which speech-to-text model to use for dictation.', - keywords: ['voice', 'model', 'speech', 'stt', 'download'] + description: 'Select a local or cloud speech-to-text model to use for dictation.', + keywords: ['voice', 'model', 'speech', 'stt', 'download', 'openai', 'api key', 'cloud'] } ] diff --git a/src/shared/constants.ts b/src/shared/constants.ts index ffe00b54e2..d7db656c07 100644 --- a/src/shared/constants.ts +++ b/src/shared/constants.ts @@ -346,7 +346,8 @@ export function getDefaultVoiceSettings(): VoiceSettings { language: 'en', dictationMode: 'toggle' as const, terminalConfirmBeforeInsert: false, - userModels: [] + userModels: [], + openAiApiKeyConfigured: false } } diff --git a/src/shared/speech-types.ts b/src/shared/speech-types.ts index 008ab1b976..9881e9b4e9 100644 --- a/src/shared/speech-types.ts +++ b/src/shared/speech-types.ts @@ -1,4 +1,5 @@ -export type SpeechModelType = 'transducer' | 'paraformer' | 'whisper' +export type SpeechModelType = 'transducer' | 'paraformer' | 'whisper' | 'openai' +export type SpeechModelProvider = 'local' | 'openai' export type ModelingUnit = 'bpe' | 'cjkchar' | 'cjkchar+bpe' @@ -7,12 +8,13 @@ export type SpeechModelManifest = { label: string description: string type: SpeechModelType + provider: SpeechModelProvider language: string - sizeBytes: number - downloadUrl: string - archiveSha256: string - archiveFormat: 'tar.bz2' - files: string[] + sizeBytes?: number + downloadUrl?: string + archiveSha256?: string + archiveFormat?: 'tar.bz2' + files?: string[] sampleRate: number streaming: boolean modelingUnit?: ModelingUnit @@ -61,4 +63,5 @@ export type VoiceSettings = { dictationMode: DictationMode terminalConfirmBeforeInsert: boolean userModels: UserModelConfig[] + openAiApiKeyConfigured: boolean } From e136dfd6c950a970e4274cfc6a2157b28ad04c5e Mon Sep 17 00:00:00 2001 From: Jinwoo-H Date: Fri, 5 Jun 2026 11:19:59 -0700 Subject: [PATCH 2/4] Avoid keychain prompts for OpenAI speech status Co-authored-by: Orca --- src/main/speech/openai-api-key-store.test.ts | 70 ++++++++++++++++++++ src/main/speech/openai-api-key-store.ts | 2 + 2 files changed, 72 insertions(+) create mode 100644 src/main/speech/openai-api-key-store.test.ts diff --git a/src/main/speech/openai-api-key-store.test.ts b/src/main/speech/openai-api-key-store.test.ts new file mode 100644 index 0000000000..1049cb2977 --- /dev/null +++ b/src/main/speech/openai-api-key-store.test.ts @@ -0,0 +1,70 @@ +import { existsSync, mkdirSync, mkdtempSync, writeFileSync } from 'fs' +import { tmpdir } from 'os' +import type * as Os from 'os' +import { join } from 'path' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const safeStorageMock = vi.hoisted(() => ({ + decryptString: vi.fn((value: Buffer) => value.toString('utf8')), + encryptString: vi.fn((value: string) => Buffer.from(value)), + isEncryptionAvailable: vi.fn(() => true) +})) + +let tempHome = '' + +async function loadStoreModule() { + vi.resetModules() + vi.doMock('electron', () => ({ + safeStorage: safeStorageMock + })) + vi.doMock('os', async () => { + const actual = await vi.importActual('os') + return { ...actual, homedir: () => tempHome } + }) + return import('./openai-api-key-store') +} + +beforeEach(() => { + tempHome = mkdtempLike('orca-openai-key-store-') + safeStorageMock.decryptString.mockClear() + safeStorageMock.encryptString.mockClear() + safeStorageMock.isEncryptionAvailable.mockClear() + safeStorageMock.isEncryptionAvailable.mockReturnValue(true) +}) + +function mkdtempLike(prefix: string): string { + return mkdtempSync(join(tmpdir(), prefix)) +} + +function writeStoredOpenAiKey(value: string): void { + const orcaDir = join(tempHome, '.orca') + mkdirSync(orcaDir, { recursive: true }) + writeFileSync(join(orcaDir, 'openai-speech-token.enc'), value) +} + +describe('OpenAI speech API key store', () => { + it('checks configured status without decrypting or touching safeStorage', async () => { + writeStoredOpenAiKey('encrypted-key') + const store = await loadStoreModule() + + expect(store.hasOpenAiSpeechApiKey()).toBe(true) + expect(safeStorageMock.isEncryptionAvailable).not.toHaveBeenCalled() + expect(safeStorageMock.decryptString).not.toHaveBeenCalled() + }) + + it('decrypts only when the key is read for an API request', async () => { + writeStoredOpenAiKey('encrypted-key') + const store = await loadStoreModule() + + expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key') + expect(safeStorageMock.decryptString).toHaveBeenCalledOnce() + }) + + it('reports missing status without creating storage files', async () => { + const store = await loadStoreModule() + + expect(store.hasOpenAiSpeechApiKey()).toBe(false) + expect(existsSync(join(tempHome, '.orca'))).toBe(false) + expect(safeStorageMock.decryptString).not.toHaveBeenCalled() + }) +}) diff --git a/src/main/speech/openai-api-key-store.ts b/src/main/speech/openai-api-key-store.ts index 23e661fb71..0944105313 100644 --- a/src/main/speech/openai-api-key-store.ts +++ b/src/main/speech/openai-api-key-store.ts @@ -41,6 +41,8 @@ function readLegacyJsonStoredOpenAiKey(): StoredOpenAiKey | null { } export function hasOpenAiSpeechApiKey(): boolean { + // Why: Settings and model-state refresh call this on startup; checking file + // existence avoids decrypting safeStorage and triggering macOS keychain prompts. return existsSync(getOpenAiKeyPath()) } From f63eb0865e319f0df0a70b8668ba3d68a21bc212 Mon Sep 17 00:00:00 2001 From: Jinwoo-H Date: Fri, 5 Jun 2026 11:24:02 -0700 Subject: [PATCH 3/4] Cache OpenAI speech key after first read Co-authored-by: Orca --- src/main/speech/openai-api-key-store.test.ts | 18 ++++++++++++++++++ src/main/speech/openai-api-key-store.ts | 16 ++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/main/speech/openai-api-key-store.test.ts b/src/main/speech/openai-api-key-store.test.ts index 1049cb2977..d38e013eee 100644 --- a/src/main/speech/openai-api-key-store.test.ts +++ b/src/main/speech/openai-api-key-store.test.ts @@ -60,6 +60,24 @@ describe('OpenAI speech API key store', () => { expect(safeStorageMock.decryptString).toHaveBeenCalledOnce() }) + it('caches the decrypted key so repeated dictations do not repeatedly touch safeStorage', async () => { + writeStoredOpenAiKey('encrypted-key') + const store = await loadStoreModule() + + expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key') + expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key') + expect(safeStorageMock.decryptString).toHaveBeenCalledOnce() + }) + + it('uses the in-memory key after save without decrypting from safeStorage', async () => { + const store = await loadStoreModule() + + store.saveOpenAiSpeechApiKey('saved-key') + + expect(store.readOpenAiSpeechApiKey()).toBe('saved-key') + expect(safeStorageMock.decryptString).not.toHaveBeenCalled() + }) + it('reports missing status without creating storage files', async () => { const store = await loadStoreModule() diff --git a/src/main/speech/openai-api-key-store.ts b/src/main/speech/openai-api-key-store.ts index 0944105313..74bffee19b 100644 --- a/src/main/speech/openai-api-key-store.ts +++ b/src/main/speech/openai-api-key-store.ts @@ -8,6 +8,7 @@ type StoredOpenAiKey = { } const OPENAI_SPEECH_TOKEN_FILE = 'openai-speech-token.enc' +let cachedOpenAiSpeechApiKey: string | null = null function getOrcaDir(): string { return join(homedir(), '.orca') @@ -54,6 +55,7 @@ export function saveOpenAiSpeechApiKey(apiKey: string): void { ensureOrcaDir() if (safeStorage.isEncryptionAvailable()) { writeFileSync(getOpenAiKeyPath(), safeStorage.encryptString(trimmed), { mode: 0o600 }) + cachedOpenAiSpeechApiKey = trimmed return } @@ -61,9 +63,14 @@ export function saveOpenAiSpeechApiKey(apiKey: string): void { '[speech] safeStorage encryption unavailable — storing OpenAI speech key in plaintext' ) writeFileSync(getOpenAiKeyPath(), trimmed, { encoding: 'utf8', mode: 0o600 }) + cachedOpenAiSpeechApiKey = trimmed } export function readOpenAiSpeechApiKey(): string { + if (cachedOpenAiSpeechApiKey !== null) { + return cachedOpenAiSpeechApiKey + } + const keyPath = getOpenAiKeyPath() if (!existsSync(keyPath)) { throw new Error('OpenAI API key is not configured') @@ -72,16 +79,21 @@ export function readOpenAiSpeechApiKey(): string { const raw = readFileSync(keyPath) const legacyJson = readLegacyJsonStoredOpenAiKey() if (legacyJson) { - return safeStorage.decryptString(Buffer.from(legacyJson.encryptedKeyBase64, 'base64')) + cachedOpenAiSpeechApiKey = safeStorage.decryptString( + Buffer.from(legacyJson.encryptedKeyBase64, 'base64') + ) + return cachedOpenAiSpeechApiKey } - return safeStorage.isEncryptionAvailable() + cachedOpenAiSpeechApiKey = safeStorage.isEncryptionAvailable() ? safeStorage.decryptString(raw) : raw.toString('utf8') + return cachedOpenAiSpeechApiKey } catch { throw new Error('OpenAI API key could not be decrypted') } } export function clearOpenAiSpeechApiKey(): void { + cachedOpenAiSpeechApiKey = null rmSync(getOpenAiKeyPath(), { force: true }) } From 9f4f00c81f91fd75a5a723ce2dd015cd980cb95d Mon Sep 17 00:00:00 2001 From: Jinwoo-H Date: Fri, 5 Jun 2026 11:29:27 -0700 Subject: [PATCH 4/4] Assert OpenAI key read waits for transcription upload Co-authored-by: Orca --- src/main/speech/stt-service.test.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/main/speech/stt-service.test.ts b/src/main/speech/stt-service.test.ts index e3aa511453..ce0bbc77d8 100644 --- a/src/main/speech/stt-service.test.ts +++ b/src/main/speech/stt-service.test.ts @@ -6,6 +6,7 @@ const { getCloudSessions, getCreatedWorkerCount, getLastWorker, + readOpenAiSpeechApiKeyMock, resetCloudSessions, resetWorkers } = vi.hoisted(() => { @@ -98,6 +99,7 @@ const { getCloudSessions: () => HoistedMockOpenAiTranscriptionSession.instances, getCreatedWorkerCount: () => HoistedMockWorker.created, getLastWorker: () => HoistedMockWorker.instances.at(-1), + readOpenAiSpeechApiKeyMock: vi.fn(() => 'test-openai-key'), resetCloudSessions: () => { HoistedMockOpenAiTranscriptionSession.instances = [] }, @@ -142,7 +144,7 @@ vi.mock('./model-catalog', () => ({ })) vi.mock('./openai-api-key-store', () => ({ - readOpenAiSpeechApiKey: () => 'test-openai-key' + readOpenAiSpeechApiKey: readOpenAiSpeechApiKeyMock })) vi.mock('./openai-transcription-client', () => ({ @@ -155,6 +157,7 @@ describe('SttService', () => { beforeEach(() => { resetCloudSessions() resetWorkers() + readOpenAiSpeechApiKeyMock.mockClear() }) it('reuses an idle warm worker for a second dictation with the same owner', async () => { @@ -258,6 +261,22 @@ describe('SttService', () => { expect(sink).toHaveBeenCalledWith({ type: 'stopped' }) }) + it('reads the OpenAI key only when finishing cloud dictation', async () => { + const service = new SttService({ + getModelState: vi.fn().mockResolvedValue({ id: 'openai-model', status: 'ready' }), + getModelDir: vi.fn().mockReturnValue('/tmp/model-a') + } as never) + + await service.startDictation('openai-model', vi.fn(), undefined, 'desktop') + service.feedAudio(new Float32Array([0.25]), 16000, 'desktop') + + expect(readOpenAiSpeechApiKeyMock).not.toHaveBeenCalled() + + await service.stopDictation('desktop') + + expect(readOpenAiSpeechApiKeyMock).toHaveBeenCalledOnce() + }) + it('keeps startup cancellation tombstoned after the worker has been created', async () => { const service = new SttService({ getModelState: vi.fn().mockResolvedValue({ id: 'model-a', status: 'ready' }),