From 96753ad58946e674a5448be76bf5ba87c9833437 Mon Sep 17 00:00:00 2001
From: Jinwoo-H <jinwoo0825@gmail.com>
Date: Fri, 5 Jun 2026 10:49:43 -0700
Subject: [PATCH 1/4] Add OpenAI cloud speech transcription

Co-authored-by: Orca <help@stably.ai>
---
 src/main/ipc/speech.test.ts                   |   5 +
 src/main/ipc/speech.ts                        |  19 ++
 src/main/speech/model-catalog.ts              |  32 +++
 src/main/speech/model-manager.test.ts         |  33 ++-
 src/main/speech/model-manager.ts              |  26 ++-
 src/main/speech/openai-api-key-store.ts       |  85 +++++++
 .../openai-transcription-client.test.ts       |  20 ++
 .../speech/openai-transcription-client.ts     | 137 +++++++++++
 src/main/speech/stt-service.test.ts           |  90 +++++++-
 src/main/speech/stt-service.ts                |  74 +++++-
 src/preload/api-types.ts                      |   3 +
 src/preload/index.ts                          |   6 +
 .../dictation/DictationController.tsx         |   7 +-
 .../settings/OpenAiTranscriptionKeyDialog.tsx |  78 +++++++
 .../OpenAiTranscriptionSettingsRow.tsx        |  58 +++++
 .../components/settings/VoicePane.test.tsx    |   3 +
 .../src/components/settings/VoicePane.tsx     | 214 +++++++++++-------
 .../settings/voice-dictation-toggle.ts        |  62 +++++
 .../components/settings/voice-pane-search.ts  |  11 +-
 src/shared/constants.ts                       |   3 +-
 src/shared/speech-types.ts                    |  15 +-
 21 files changed, 870 insertions(+), 111 deletions(-)
 create mode 100644 src/main/speech/openai-api-key-store.ts
 create mode 100644 src/main/speech/openai-transcription-client.test.ts
 create mode 100644 src/main/speech/openai-transcription-client.ts
 create mode 100644 src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx
 create mode 100644 src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx
 create mode 100644 src/renderer/src/components/settings/voice-dictation-toggle.ts

diff --git a/src/main/ipc/speech.test.ts b/src/main/ipc/speech.test.ts
index 310103fb2a..05d6dbb5e2 100644
--- a/src/main/ipc/speech.test.ts
+++ b/src/main/ipc/speech.test.ts
@@ -12,6 +12,11 @@ vi.mock('electron', () => ({
   app: { getPath: vi.fn(() => '/tmp/orca-speech-test') },
   BrowserWindow: { fromWebContents: fromWebContentsMock },
   ipcMain: { handle: handleMock },
+  safeStorage: {
+    decryptString: vi.fn(),
+    encryptString: vi.fn(() => Buffer.from('encrypted')),
+    isEncryptionAvailable: vi.fn(() => true)
+  },
   systemPreferences: {
     getMediaAccessStatus: vi.fn(() => 'granted'),
     askForMediaAccess: vi.fn(() => Promise.resolve(true))
diff --git a/src/main/ipc/speech.ts b/src/main/ipc/speech.ts
index 36a69b6409..053a0038d8 100644
--- a/src/main/ipc/speech.ts
+++ b/src/main/ipc/speech.ts
@@ -4,6 +4,11 @@ import { writeFile, unlink } from 'fs/promises'
 import { createHash } from 'crypto'
 import { SPEECH_MODEL_CATALOG, getCatalogModel } from '../speech/model-catalog'
 import { getSpeechModelManager, getSpeechSttService } from '../speech/speech-runtime-service'
+import {
+  clearOpenAiSpeechApiKey,
+  hasOpenAiSpeechApiKey,
+  saveOpenAiSpeechApiKey
+} from '../speech/openai-api-key-store'
 import type { Store } from '../persistence'
 
 export function registerSpeechHandlers(store: Store): void {
@@ -15,6 +20,20 @@ export function registerSpeechHandlers(store: Store): void {
     return getSpeechModelManager(store).getModelStates()
   })
 
+  ipcMain.handle('speech:getOpenAiApiKeyStatus', async () => {
+    return { configured: hasOpenAiSpeechApiKey() }
+  })
+
+  ipcMain.handle('speech:saveOpenAiApiKey', async (_event, apiKey: string) => {
+    saveOpenAiSpeechApiKey(apiKey)
+    return { configured: true }
+  })
+
+  ipcMain.handle('speech:clearOpenAiApiKey', async () => {
+    clearOpenAiSpeechApiKey()
+    return { configured: false }
+  })
+
   ipcMain.handle('speech:downloadModel', async (event, modelId: string) => {
     const manager = getSpeechModelManager(store)
     const window = BrowserWindow.fromWebContents(event.sender)
diff --git a/src/main/speech/model-catalog.ts b/src/main/speech/model-catalog.ts
index 8a595128a7..266442eb78 100644
--- a/src/main/speech/model-catalog.ts
+++ b/src/main/speech/model-catalog.ts
@@ -7,6 +7,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     description:
       'Highest accuracy for 25 European languages. Punctuation, capitalization, and word-level timestamps.',
     type: 'transducer',
+    provider: 'local',
     language: 'multilingual',
     sizeBytes: 180_000_000,
     downloadUrl:
@@ -25,6 +26,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     description:
       'English only. Faster than v3 with similar accuracy. Punctuation and capitalization.',
     type: 'transducer',
+    provider: 'local',
     language: 'en',
     sizeBytes: 170_000_000,
     downloadUrl:
@@ -41,6 +43,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Zipformer Bilingual',
     description: 'Chinese + English with code-switching. Low-latency real-time streaming.',
     type: 'transducer',
+    provider: 'local',
     language: 'zh-en',
     sizeBytes: 130_000_000,
     downloadUrl:
@@ -63,6 +66,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     description:
       'Chinese (Mandarin + dialects) + English. Strong on accented and regional Chinese.',
     type: 'paraformer',
+    provider: 'local',
     language: 'zh-en',
     sizeBytes: 115_000_000,
     downloadUrl:
@@ -78,6 +82,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Zipformer Streaming EN',
     description: 'English only. Lightweight 20M-param model, good balance of speed and size.',
     type: 'transducer',
+    provider: 'local',
     language: 'en',
     sizeBytes: 128_000_000,
     downloadUrl:
@@ -99,6 +104,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Zipformer Streaming ZH',
     description: 'Chinese only. Ultra-lightweight 14M-param model, ideal for low-resource devices.',
     type: 'transducer',
+    provider: 'local',
     language: 'zh',
     sizeBytes: 74_000_000,
     downloadUrl:
@@ -120,6 +126,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Whisper Tiny',
     description: '90+ languages. Lower accuracy than Parakeet but broadest language coverage.',
     type: 'whisper',
+    provider: 'local',
     language: 'multilingual',
     sizeBytes: 116_000_000,
     downloadUrl:
@@ -129,9 +136,34 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     files: ['tiny-encoder.onnx', 'tiny-decoder.onnx', 'tiny-tokens.txt'],
     sampleRate: 16000,
     streaming: false
+  },
+  {
+    id: 'openai-gpt-4o-mini-transcribe',
+    label: 'GPT-4o mini Transcribe',
+    description:
+      'Cloud transcription with strong accuracy and low cost. Requires an OpenAI API key.',
+    type: 'openai',
+    provider: 'openai',
+    language: 'multilingual',
+    sampleRate: 16000,
+    streaming: false
+  },
+  {
+    id: 'openai-gpt-4o-transcribe',
+    label: 'GPT-4o Transcribe',
+    description: 'Cloud transcription with higher accuracy. Requires an OpenAI API key.',
+    type: 'openai',
+    provider: 'openai',
+    language: 'multilingual',
+    sampleRate: 16000,
+    streaming: false
   }
 ]
 
 export function getCatalogModel(id: string): SpeechModelManifest | undefined {
   return SPEECH_MODEL_CATALOG.find((m) => m.id === id)
 }
+
+export function isLocalSpeechModel(manifest: SpeechModelManifest): boolean {
+  return manifest.provider === 'local' // false for cloud providers such as 'openai'
+}
diff --git a/src/main/speech/model-manager.test.ts b/src/main/speech/model-manager.test.ts
index 4e6cb5ee75..7879d86d86 100644
--- a/src/main/speech/model-manager.test.ts
+++ b/src/main/speech/model-manager.test.ts
@@ -6,7 +6,8 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'
 import { SPEECH_MODEL_CATALOG } from './model-catalog'
 import { ModelManager } from './model-manager'
 
-const { httpsGetMock, spawnMock } = vi.hoisted(() => ({
+const { hasOpenAiSpeechApiKeyMock, httpsGetMock, spawnMock } = vi.hoisted(() => ({
+  hasOpenAiSpeechApiKeyMock: vi.fn(),
   httpsGetMock: vi.fn(),
   spawnMock: vi.fn()
 }))
@@ -27,6 +28,10 @@ vi.mock('https', async () => {
   return { ...(actual as Record<string, unknown>), get: httpsGetMock }
 })
 
+vi.mock('./openai-api-key-store', () => ({
+  hasOpenAiSpeechApiKey: hasOpenAiSpeechApiKeyMock
+}))
+
 type ModelManagerInternals = {
   verifyArchiveSha256: (archivePath: string, expectedSha256: string) => Promise<void>
   downloadFile: (
@@ -48,11 +53,16 @@ type ModelManagerInternals = {
 describe('ModelManager', () => {
   beforeEach(() => {
     httpsGetMock.mockReset()
+    hasOpenAiSpeechApiKeyMock.mockReset()
+    hasOpenAiSpeechApiKeyMock.mockReturnValue(false)
     spawnMock.mockReset()
   })
 
   it('requires pinned SHA-256 hashes for every catalog archive', () => {
     for (const manifest of SPEECH_MODEL_CATALOG) {
+      if (manifest.provider !== 'local') {
+        continue
+      }
       expect(manifest.archiveSha256).toMatch(/^[a-f0-9]{64}$/)
     }
   })
@@ -93,6 +103,27 @@ describe('ModelManager', () => {
     }
   })
 
+  it('marks OpenAI transcription models ready only when an API key is configured', async () => {
+    const dir = mkdtempSync(join(tmpdir(), 'orca-model-manager-'))
+    try {
+      const manager = new ModelManager(dir)
+
+      await expect(manager.getModelState('openai-gpt-4o-mini-transcribe')).resolves.toEqual({
+        id: 'openai-gpt-4o-mini-transcribe',
+        status: 'not-downloaded'
+      })
+
+      hasOpenAiSpeechApiKeyMock.mockReturnValue(true)
+
+      await expect(manager.getModelState('openai-gpt-4o-mini-transcribe')).resolves.toEqual({
+        id: 'openai-gpt-4o-mini-transcribe',
+        status: 'ready'
+      })
+    } finally {
+      rmSync(dir, { recursive: true, force: true })
+    }
+  })
+
   it('aborts an in-flight model download request when cancelled', async () => {
     const dir = mkdtempSync(join(tmpdir(), 'orca-model-manager-'))
     try {
diff --git a/src/main/speech/model-manager.ts b/src/main/speech/model-manager.ts
index bc51de602a..7f4f5b4804 100644
--- a/src/main/speech/model-manager.ts
+++ b/src/main/speech/model-manager.ts
@@ -13,7 +13,8 @@ import type {
   SpeechModelState,
   SpeechModelStatus
 } from '../../shared/speech-types'
-import { SPEECH_MODEL_CATALOG, getCatalogModel } from './model-catalog'
+import { SPEECH_MODEL_CATALOG, getCatalogModel, isLocalSpeechModel } from './model-catalog'
+import { hasOpenAiSpeechApiKey } from './openai-api-key-store'
 import { resolveTarExecutable } from './tar-executable'
 
 type DownloadHandle = {
@@ -68,6 +69,13 @@ export class ModelManager {
       return { id: modelId, status: 'error', error: 'Unknown model' }
     }
 
+    if (manifest.provider === 'openai') {
+      return {
+        id: modelId,
+        status: hasOpenAiSpeechApiKey() ? 'ready' : 'not-downloaded'
+      }
+    }
+
     const modelDir = this.getModelDir(modelId)
     if (existsSync(modelDir) && this.validateModelFiles(manifest, modelDir)) {
       const state: SpeechModelState = { id: modelId, status: 'ready' }
@@ -97,6 +105,9 @@ export class ModelManager {
   }
 
   private validateModelFiles(manifest: SpeechModelManifest, modelDir: string): boolean {
+    if (!manifest.files) {
+      return false
+    }
     return manifest.files.every((f) => existsSync(join(modelDir, f)))
   }
 
@@ -109,6 +120,12 @@ export class ModelManager {
     if (!manifest) {
       throw new Error(`Unknown model: ${modelId}`)
     }
+    if (!isLocalSpeechModel(manifest)) {
+      throw new Error(`Model does not support downloads: ${modelId}`)
+    }
+    if (!manifest.downloadUrl || !manifest.archiveSha256 || !manifest.sizeBytes) {
+      throw new Error(`Model download metadata missing: ${modelId}`)
+    }
 
     const modelDir = this.getModelDir(modelId)
     if (existsSync(modelDir) && this.validateModelFiles(manifest, modelDir)) {
@@ -213,6 +230,10 @@ export class ModelManager {
     if (!getCatalogModel(modelId)) {
       throw new Error(`Unknown model: ${modelId}`)
     }
+    const manifest = getCatalogModel(modelId)
+    if (!manifest || !isLocalSpeechModel(manifest)) {
+      throw new Error(`Model does not support deletion: ${modelId}`)
+    }
     this.cancelDownload(modelId)
     const modelDir = this.getModelDir(modelId)
     if (existsSync(modelDir)) {
@@ -542,6 +563,9 @@ export class ModelManager {
   }
 
   private async flattenNestedDir(modelDir: string, manifest: SpeechModelManifest): Promise<void> {
+    if (!manifest.files) {
+      return
+    }
     const entries = await readdir(modelDir, { withFileTypes: true })
     for (const entry of entries) {
       if (entry.isDirectory()) {
diff --git a/src/main/speech/openai-api-key-store.ts b/src/main/speech/openai-api-key-store.ts
new file mode 100644
index 0000000000..23e661fb71
--- /dev/null
+++ b/src/main/speech/openai-api-key-store.ts
@@ -0,0 +1,125 @@
+import { safeStorage } from 'electron'
+import { existsSync, mkdirSync, readFileSync, rmSync, writeFileSync } from 'fs'
+import { homedir } from 'os'
+import { join } from 'path'
+
+/**
+ * JSON wrapper shape for a stored key. This module only reads this shape (via
+ * the legacy parser below); saves write the raw key bytes instead.
+ */
+type StoredOpenAiKey = {
+  encryptedKeyBase64: string
+}
+
+const OPENAI_SPEECH_TOKEN_FILE = 'openai-speech-token.enc'
+
+/** Returns the per-user Orca data directory (`~/.orca`). */
+function getOrcaDir(): string {
+  return join(homedir(), '.orca')
+}
+
+/** Creates the Orca data directory when it does not exist yet. */
+function ensureOrcaDir(): void {
+  const dir = getOrcaDir()
+  if (!existsSync(dir)) {
+    mkdirSync(dir, { recursive: true })
+  }
+}
+
+/** Returns the path of the file that holds the stored OpenAI speech key. */
+function getOpenAiKeyPath(): string {
+  return join(getOrcaDir(), OPENAI_SPEECH_TOKEN_FILE)
+}
+
+/**
+ * Interprets key-file bytes as the legacy JSON wrapper.
+ *
+ * @param raw - Bytes already read from the key file.
+ * @returns The wrapper when `raw` is JSON carrying a non-empty string
+ *   `encryptedKeyBase64`; `null` for any other content.
+ */
+function parseLegacyJsonStoredOpenAiKey(raw: Buffer): StoredOpenAiKey | null {
+  try {
+    const parsed: unknown = JSON.parse(raw.toString('utf8'))
+    if (typeof parsed !== 'object' || parsed === null) {
+      return null
+    }
+    const { encryptedKeyBase64 } = parsed as Partial<Record<keyof StoredOpenAiKey, unknown>>
+    if (typeof encryptedKeyBase64 !== 'string' || encryptedKeyBase64 === '') {
+      return null
+    }
+    return { encryptedKeyBase64 }
+  } catch {
+    // Content that is not JSON (e.g. a raw key or encrypted bytes) is not legacy.
+    return null
+  }
+}
+
+/**
+ * Reports whether a key file exists.
+ *
+ * Only the file's presence is checked; its contents are not read or decrypted.
+ */
+export function hasOpenAiSpeechApiKey(): boolean {
+  return existsSync(getOpenAiKeyPath())
+}
+
+/**
+ * Persists the OpenAI API key used for speech transcription.
+ *
+ * The key is encrypted with Electron `safeStorage` when encryption is
+ * available; otherwise it is written as plaintext after a console warning.
+ * Both writes request file mode `0o600`.
+ *
+ * @param apiKey - Key to store; surrounding whitespace is trimmed.
+ * @throws Error when the key is empty after trimming.
+ */
+export function saveOpenAiSpeechApiKey(apiKey: string): void {
+  const trimmed = apiKey.trim()
+  if (!trimmed) {
+    throw new Error('OpenAI API key is required')
+  }
+  ensureOrcaDir()
+  if (safeStorage.isEncryptionAvailable()) {
+    writeFileSync(getOpenAiKeyPath(), safeStorage.encryptString(trimmed), { mode: 0o600 })
+    return
+  }
+
+  console.warn(
+    '[speech] safeStorage encryption unavailable — storing OpenAI speech key in plaintext'
+  )
+  writeFileSync(getOpenAiKeyPath(), trimmed, { encoding: 'utf8', mode: 0o600 })
+}
+
+/**
+ * Loads the stored OpenAI API key.
+ *
+ * @returns The decrypted key, or the raw file text when `safeStorage`
+ *   encryption is unavailable.
+ * @throws Error when no key file exists, or when reading or decrypting it fails.
+ */
+export function readOpenAiSpeechApiKey(): string {
+  const keyPath = getOpenAiKeyPath()
+  if (!existsSync(keyPath)) {
+    throw new Error('OpenAI API key is not configured')
+  }
+  try {
+    // Why: read the file once and derive both interpretations from the same
+    // bytes, so a concurrent save or clear cannot make them disagree.
+    const raw = readFileSync(keyPath)
+    const legacyJson = parseLegacyJsonStoredOpenAiKey(raw)
+    if (legacyJson) {
+      return safeStorage.decryptString(Buffer.from(legacyJson.encryptedKeyBase64, 'base64'))
+    }
+    return safeStorage.isEncryptionAvailable()
+      ? safeStorage.decryptString(raw)
+      : raw.toString('utf8')
+  } catch {
+    throw new Error('OpenAI API key could not be decrypted')
+  }
+}
+
+/** Deletes the stored key file; a missing file is not an error. */
+export function clearOpenAiSpeechApiKey(): void {
+  rmSync(getOpenAiKeyPath(), { force: true })
+}
diff --git a/src/main/speech/openai-transcription-client.test.ts b/src/main/speech/openai-transcription-client.test.ts
new file mode 100644
index 0000000000..8e20f593b0
--- /dev/null
+++ b/src/main/speech/openai-transcription-client.test.ts
@@ -0,0 +1,26 @@
+import { describe, expect, it } from 'vitest'
+import { sanitizeOpenAiTranscriptionErrorMessage } from './openai-transcription-client'
+
+describe('sanitizeOpenAiTranscriptionErrorMessage', () => {
+  it('does not expose the invalid OpenAI API key echoed by the provider', () => {
+    expect(
+      sanitizeOpenAiTranscriptionErrorMessage(
+        'Incorrect API key provided: fsdfdsfsdf. You can find your API key at https://platform.openai.com/account/api-keys.'
+      )
+    ).toBe('Incorrect OpenAI API key provided.')
+  })
+
+  it('redacts API keys and bearer tokens from other provider errors', () => {
+    expect(
+      sanitizeOpenAiTranscriptionErrorMessage(
+        'Request failed for sk-testSecret123 with Authorization: Bearer token-value_123'
+      )
+    ).toBe('Request failed for [redacted] with Authorization: Bearer [redacted]')
+  })
+
+  it('falls back to a generic message when the provider message is blank', () => {
+    expect(sanitizeOpenAiTranscriptionErrorMessage('   ')).toBe(
+      'OpenAI transcription request failed'
+    )
+  })
+})
diff --git a/src/main/speech/openai-transcription-client.ts b/src/main/speech/openai-transcription-client.ts
new file mode 100644
index 0000000000..c61dc940f1
--- /dev/null
+++ b/src/main/speech/openai-transcription-client.ts
@@ -0,0 +1,193 @@
+import { resampleToRate } from './stt-audio-resample'
+
+/** Maps Orca catalog model ids to the model names sent to the OpenAI API. */
+export const OPENAI_TRANSCRIPTION_MODEL_BY_ID: Record<string, string> = {
+  'openai-gpt-4o-mini-transcribe': 'gpt-4o-mini-transcribe',
+  'openai-gpt-4o-transcribe': 'gpt-4o-transcribe'
+}
+
+const OPENAI_TRANSCRIPTION_URL = 'https://api.openai.com/v1/audio/transcriptions'
+const CLOUD_TRANSCRIPTION_SAMPLE_RATE = 16000
+const MAX_CLOUD_AUDIO_SECONDS = 10 * 60
+
+/** The parts of the endpoint's JSON response body that this client reads. */
+type OpenAiTranscriptionResponse = {
+  text?: unknown
+  error?: {
+    message?: unknown
+  }
+}
+
+/**
+ * Removes credentials from a provider error message before it is surfaced.
+ *
+ * @param message - Error text returned by the provider.
+ * @returns A fixed message for "incorrect API key" errors; otherwise `message`
+ *   with `sk-` keys and bearer tokens redacted, or a generic fallback when the
+ *   redacted text is empty.
+ */
+export function sanitizeOpenAiTranscriptionErrorMessage(message: string): string {
+  // Why: this provider error echoes the rejected key, so replace it wholesale.
+  if (/incorrect api key provided:/i.test(message)) {
+    return 'Incorrect OpenAI API key provided.'
+  }
+
+  const sanitized = message
+    .replace(/\bsk-[A-Za-z0-9_-]+/g, '[redacted]')
+    .replace(/\bBearer\s+[A-Za-z0-9._~+/=-]+/gi, 'Bearer [redacted]')
+    .trim()
+
+  return sanitized || 'OpenAI transcription request failed'
+}
+
+/**
+ * Encodes mono float samples as a 16-bit PCM WAV file.
+ *
+ * @param samples - Audio samples; values outside [-1, 1] are clamped.
+ * @param sampleRate - Sample rate written to the WAV header, in Hz.
+ * @returns The 44-byte RIFF/WAVE header followed by the encoded samples.
+ */
+function encodePcm16Wav(samples: Float32Array, sampleRate: number): Buffer {
+  const dataBytes = samples.length * 2
+  const buffer = Buffer.alloc(44 + dataBytes)
+
+  buffer.write('RIFF', 0)
+  buffer.writeUInt32LE(36 + dataBytes, 4) // RIFF chunk size: total size minus 8
+  buffer.write('WAVE', 8)
+  buffer.write('fmt ', 12)
+  buffer.writeUInt32LE(16, 16) // fmt chunk size
+  buffer.writeUInt16LE(1, 20) // audio format: 1 = PCM
+  buffer.writeUInt16LE(1, 22) // channel count: mono
+  buffer.writeUInt32LE(sampleRate, 24)
+  buffer.writeUInt32LE(sampleRate * 2, 28) // byte rate: 2 bytes per mono frame
+  buffer.writeUInt16LE(2, 32) // block align: bytes per frame
+  buffer.writeUInt16LE(16, 34) // bits per sample
+  buffer.write('data', 36)
+  buffer.writeUInt32LE(dataBytes, 40)
+
+  for (let i = 0; i < samples.length; i += 1) {
+    const clamped = Math.max(-1, Math.min(1, samples[i]))
+    // Negative and positive values scale differently because int16 spans
+    // -32768..32767.
+    const value = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff
+    buffer.writeInt16LE(Math.round(value), 44 + i * 2)
+  }
+
+  return buffer
+}
+
+/** Concatenates the buffered chunks into one contiguous sample array. */
+function combineChunks(chunks: Float32Array[]): Float32Array {
+  const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0)
+  const combined = new Float32Array(totalLength)
+  let offset = 0
+  for (const chunk of chunks) {
+    combined.set(chunk, offset)
+    offset += chunk.length
+  }
+  return combined
+}
+
+/**
+ * Extracts the transcript from a response body.
+ *
+ * @returns The trimmed `text` field.
+ * @throws Error with the sanitized provider message when the body carries
+ *   `error.message` instead of `text`, or a generic error when it has neither.
+ */
+function parseOpenAiTranscriptionResponse(data: OpenAiTranscriptionResponse): string {
+  if (typeof data.text === 'string') {
+    return data.text.trim()
+  }
+  if (typeof data.error?.message === 'string') {
+    throw new Error(sanitizeOpenAiTranscriptionErrorMessage(data.error.message))
+  }
+  throw new Error('OpenAI transcription response did not include text')
+}
+
+/**
+ * Buffers dictation audio in memory and transcribes it with one request to the
+ * OpenAI transcription endpoint when the dictation finishes.
+ */
+export class OpenAiTranscriptionSession {
+  private chunks: Float32Array[] = []
+  private audioSeconds = 0
+
+  /**
+   * @param modelId - Catalog id; must be a key of `OPENAI_TRANSCRIPTION_MODEL_BY_ID`.
+   * @param readApiKey - Invoked when the request is built to obtain the API key.
+   */
+  constructor(
+    private readonly modelId: string,
+    private readonly readApiKey: () => string
+  ) {}
+
+  /**
+   * Buffers one chunk of audio after passing it through `resampleToRate`
+   * (target rate: 16 kHz).
+   *
+   * @throws Error once the accumulated audio exceeds the 10-minute limit. The
+   *   counter is not rolled back, so every later chunk is rejected as well.
+   */
+  feedAudio(samples: Float32Array, sampleRate: number): void {
+    const normalized = resampleToRate(samples, sampleRate, CLOUD_TRANSCRIPTION_SAMPLE_RATE)
+    this.audioSeconds += normalized.length / CLOUD_TRANSCRIPTION_SAMPLE_RATE
+    if (this.audioSeconds > MAX_CLOUD_AUDIO_SECONDS) {
+      throw new Error('Cloud transcription is limited to 10 minutes per dictation')
+    }
+    // NOTE(review): the copy presumably guards against `resampleToRate`
+    // returning the caller's array unchanged — confirm.
+    this.chunks.push(new Float32Array(normalized))
+  }
+
+  /**
+   * Uploads the buffered audio as a WAV file and returns the transcript.
+   *
+   * @returns The transcript, or `''` when no audio was buffered.
+   * @throws Error when the model id is unknown, reading the API key or the
+   *   request fails, or the response does not contain text.
+   */
+  async finish(): Promise<string> {
+    if (this.chunks.length === 0) {
+      return ''
+    }
+
+    const apiModel = OPENAI_TRANSCRIPTION_MODEL_BY_ID[this.modelId]
+    if (!apiModel) {
+      throw new Error(`Unknown OpenAI transcription model: ${this.modelId}`)
+    }
+
+    const audio = combineChunks(this.chunks)
+    this.chunks = []
+    const wav = encodePcm16Wav(audio, CLOUD_TRANSCRIPTION_SAMPLE_RATE)
+    const form = new FormData()
+    form.append('model', apiModel)
+    form.append('response_format', 'json')
+    // Why: OpenAI's transcription endpoint expects a multipart file object;
+    // a named WAV blob avoids filesystem temp files and works in packaged apps.
+    form.append('file', new Blob([new Uint8Array(wav)], { type: 'audio/wav' }), 'dictation.wav')
+
+    const response = await fetch(OPENAI_TRANSCRIPTION_URL, {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${this.readApiKey()}`
+      },
+      body: form
+    })
+
+    // A body that is not JSON is treated as empty so the checks below still
+    // produce a readable error.
+    const data = (await response.json().catch(() => ({}))) as OpenAiTranscriptionResponse
+    if (!response.ok) {
+      // Why: statusText can be empty (for example over HTTP/2); fall back to
+      // the status code so the message never ends in a dangling "failed: ".
+      const message =
+        typeof data.error?.message === 'string'
+          ? sanitizeOpenAiTranscriptionErrorMessage(data.error.message)
+          : response.statusText || `HTTP ${response.status}`
+      throw new Error(`OpenAI transcription failed: ${message}`)
+    }
+
+    return parseOpenAiTranscriptionResponse(data)
+  }
+}
diff --git a/src/main/speech/stt-service.test.ts b/src/main/speech/stt-service.test.ts
index e83b07c7a1..e3aa511453 100644
--- a/src/main/speech/stt-service.test.ts
+++ b/src/main/speech/stt-service.test.ts
@@ -1,6 +1,14 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest'
 
-const { MockWorker, getCreatedWorkerCount, getLastWorker, resetWorkers } = vi.hoisted(() => {
+const {
+  MockOpenAiTranscriptionSession,
+  MockWorker,
+  getCloudSessions,
+  getCreatedWorkerCount,
+  getLastWorker,
+  resetCloudSessions,
+  resetWorkers
+} = vi.hoisted(() => {
   class HoistedMockWorker extends EventTarget {
     static created = 0
     static instances: HoistedMockWorker[] = []
@@ -64,10 +72,35 @@ const { MockWorker, getCreatedWorkerCount, getLastWorker, resetWorkers } = vi.ho
     }
   }
 
+  class HoistedMockOpenAiTranscriptionSession {
+    static instances: HoistedMockOpenAiTranscriptionSession[] = []
+    feedCalls: { samples: Float32Array; sampleRate: number }[] = []
+
+    constructor(
+      readonly modelId: string,
+      readonly readApiKey: () => string
+    ) {
+      HoistedMockOpenAiTranscriptionSession.instances.push(this)
+    }
+
+    feedAudio(samples: Float32Array, sampleRate: number): void {
+      this.feedCalls.push({ samples, sampleRate })
+    }
+
+    finish(): Promise<string> {
+      return Promise.resolve(`${this.modelId}:${this.readApiKey()}`)
+    }
+  }
+
   return {
+    MockOpenAiTranscriptionSession: HoistedMockOpenAiTranscriptionSession,
     MockWorker: HoistedMockWorker,
+    getCloudSessions: () => HoistedMockOpenAiTranscriptionSession.instances,
     getCreatedWorkerCount: () => HoistedMockWorker.created,
     getLastWorker: () => HoistedMockWorker.instances.at(-1),
+    resetCloudSessions: () => {
+      HoistedMockOpenAiTranscriptionSession.instances = []
+    },
     resetWorkers: () => {
       HoistedMockWorker.created = 0
       HoistedMockWorker.instances = []
@@ -89,19 +122,38 @@ vi.mock('worker_threads', () => ({
 }))
 
 vi.mock('./model-catalog', () => ({
-  getCatalogModel: () => ({
-    id: 'model-a',
-    type: 'transducer',
-    streaming: true,
-    sampleRate: 16000,
-    files: ['encoder.onnx', 'decoder.onnx', 'joiner.onnx', 'tokens.txt']
-  })
+  getCatalogModel: (id: string) =>
+    id === 'openai-model'
+      ? {
+          id,
+          type: 'openai',
+          provider: 'openai',
+          streaming: false,
+          sampleRate: 16000
+        }
+      : {
+          id: 'model-a',
+          type: 'transducer',
+          provider: 'local',
+          streaming: true,
+          sampleRate: 16000,
+          files: ['encoder.onnx', 'decoder.onnx', 'joiner.onnx', 'tokens.txt']
+        }
+}))
+
+vi.mock('./openai-api-key-store', () => ({
+  readOpenAiSpeechApiKey: () => 'test-openai-key'
+}))
+
+vi.mock('./openai-transcription-client', () => ({
+  OpenAiTranscriptionSession: MockOpenAiTranscriptionSession
 }))
 
 import { IDLE_WORKER_TEARDOWN_MS, START_DICTATION_TIMEOUT_MS, SttService } from './stt-service'
 
 describe('SttService', () => {
   beforeEach(() => {
+    resetCloudSessions()
     resetWorkers()
   })
 
@@ -184,6 +236,28 @@ describe('SttService', () => {
     expect(worker!.messages.filter((message) => message.type === 'feed')).toHaveLength(0)
   })
 
+  it('uses the OpenAI transcription session without creating a worker', async () => {
+    const sink = vi.fn()
+    const service = new SttService({
+      getModelState: vi.fn().mockResolvedValue({ id: 'openai-model', status: 'ready' }),
+      getModelDir: vi.fn().mockReturnValue('/tmp/model-a')
+    } as never)
+
+    await service.startDictation('openai-model', sink, undefined, 'desktop')
+    service.feedAudio(new Float32Array([0.25, -0.25]), 48000, 'desktop')
+    await service.stopDictation('desktop')
+
+    expect(getCreatedWorkerCount()).toBe(0)
+    expect(getCloudSessions()).toHaveLength(1)
+    expect(getCloudSessions()[0].feedCalls).toHaveLength(1)
+    expect(sink).toHaveBeenCalledWith({ type: 'ready' })
+    expect(sink).toHaveBeenCalledWith({
+      type: 'final',
+      text: 'openai-model:test-openai-key'
+    })
+    expect(sink).toHaveBeenCalledWith({ type: 'stopped' })
+  })
+
   it('keeps startup cancellation tombstoned after the worker has been created', async () => {
     const service = new SttService({
       getModelState: vi.fn().mockResolvedValue({ id: 'model-a', status: 'ready' }),
diff --git a/src/main/speech/stt-service.ts b/src/main/speech/stt-service.ts
index 81506a9a9b..f31a3b90e7 100644
--- a/src/main/speech/stt-service.ts
+++ b/src/main/speech/stt-service.ts
@@ -6,6 +6,8 @@ import { join } from 'path'
 import { app } from 'electron'
 import { getCatalogModel } from './model-catalog'
 import type { ModelManager } from './model-manager'
+import { OpenAiTranscriptionSession } from './openai-transcription-client'
+import { readOpenAiSpeechApiKey } from './openai-api-key-store'
 
 export const START_DICTATION_TIMEOUT_MS = 60_000
 const STOP_DICTATION_TIMEOUT_MS = 60_000
@@ -22,6 +24,7 @@ export type SttEventSink = (event: SttEvent) => void
 
 export class SttService {
   private worker: Worker | null = null
+  private cloudSession: OpenAiTranscriptionSession | null = null
   private modelManager: ModelManager
   private activeModelId: string | null = null
   private activeHotwordsFilePath: string | undefined
@@ -51,7 +54,7 @@ export class SttService {
       }
       return
     }
-    if (this.worker && this.activeOwner && this.activeOwner !== owner) {
+    if ((this.worker || this.cloudSession) && this.activeOwner && this.activeOwner !== owner) {
       throw new Error('dictation_already_active')
     }
     this.starting = true
@@ -78,6 +81,34 @@ export class SttService {
     hotwordsFilePath?: string,
     owner = 'desktop'
   ): Promise<void> {
+    const manifest = getCatalogModel(modelId)
+    if (!manifest) {
+      throw new Error(`Unknown model: ${modelId}`)
+    }
+
+    if (manifest.provider === 'openai') {
+      if (this.worker) {
+        await this.stopDictation(owner, { cancelStarting: false })
+        await this.teardownIdleWorker()
+      }
+
+      const modelState = await this.modelManager.getModelState(modelId)
+      if (modelState.status !== 'ready') {
+        throw new Error(`Model not ready: ${modelState.status}`)
+      }
+
+      this.cloudSession = new OpenAiTranscriptionSession(modelId, readOpenAiSpeechApiKey)
+      this.activeModelId = modelId
+      this.activeHotwordsFilePath = undefined
+      this.eventSink = sink
+      sink({ type: 'ready' })
+      return
+    }
+
+    if (this.cloudSession) {
+      await this.stopDictation(owner, { cancelStarting: false })
+    }
+
     if (
       this.worker &&
       this.activeModelId === modelId &&
@@ -93,11 +124,6 @@ export class SttService {
       await this.teardownIdleWorker()
     }
 
-    const manifest = getCatalogModel(modelId)
-    if (!manifest) {
-      throw new Error(`Unknown model: ${modelId}`)
-    }
-
     const modelState = await this.modelManager.getModelState(modelId)
     if (modelState.status !== 'ready') {
       throw new Error(`Model not ready: ${modelState.status}`)
@@ -207,7 +233,7 @@ export class SttService {
       modelType: manifest.type,
       streaming: manifest.streaming,
       sampleRate: manifest.sampleRate,
-      files: manifest.files,
+      files: manifest.files ?? [],
       hotwordsFilePath,
       modelingUnit: manifest.modelingUnit
     })
@@ -237,6 +263,10 @@ export class SttService {
     if (currentOwner !== owner) {
       throw new Error('dictation_owner_mismatch')
     }
+    if (this.cloudSession) {
+      this.cloudSession.feedAudio(samples, sampleRate)
+      return
+    }
     this.worker?.postMessage({ type: 'feed', samples, sampleRate }, [samples.buffer as ArrayBuffer])
   }
 
@@ -247,7 +277,7 @@ export class SttService {
     if (options.cancelStarting !== false && this.startingOwner === owner) {
       this.canceledOwners.add(owner)
     }
-    if (!this.worker) {
+    if (!this.worker && !this.cloudSession) {
       return
     }
     const currentOwner = this.activeOwner ?? this.startingOwner
@@ -255,7 +285,33 @@ export class SttService {
       throw new Error('dictation_owner_mismatch')
     }
 
+    if (this.cloudSession) {
+      const session = this.cloudSession
+      this.cloudSession = null
+      try {
+        const text = await session.finish()
+        if (text) {
+          this.eventSink?.({ type: 'final', text })
+        }
+      } catch (error) {
+        this.eventSink?.({
+          type: 'error',
+          error: error instanceof Error ? error.message : String(error)
+        })
+      } finally {
+        this.eventSink?.({ type: 'stopped' })
+        this.activeModelId = null
+        this.activeHotwordsFilePath = undefined
+        this.activeOwner = null
+        this.eventSink = null
+      }
+      return
+    }
+
     const worker = this.worker
+    if (!worker) {
+      return
+    }
     worker.postMessage({ type: 'stop' })
 
     let forcedTeardown = false
@@ -319,7 +375,7 @@ export class SttService {
   }
 
   isActive(): boolean {
-    return this.worker !== null
+    return this.worker !== null || this.cloudSession !== null
   }
 
   getActiveModelId(): string | null {
diff --git a/src/preload/api-types.ts b/src/preload/api-types.ts
index 90fb1f3a19..612f63053a 100644
--- a/src/preload/api-types.ts
+++ b/src/preload/api-types.ts
@@ -2433,6 +2433,9 @@ export type PreloadApi = {
   speech: {
     getCatalog: () => Promise<SpeechModelManifest[]>
     getModelStates: () => Promise<SpeechModelState[]>
+    getOpenAiApiKeyStatus: () => Promise<{ configured: boolean }>
+    saveOpenAiApiKey: (apiKey: string) => Promise<{ configured: boolean }>
+    clearOpenAiApiKey: () => Promise<{ configured: boolean }>
     downloadModel: (modelId: string) => Promise<void>
     cancelDownload: (modelId: string) => Promise<void>
     deleteModel: (modelId: string) => Promise<void>
diff --git a/src/preload/index.ts b/src/preload/index.ts
index 16fd77622a..b7d8c55013 100644
--- a/src/preload/index.ts
+++ b/src/preload/index.ts
@@ -3479,6 +3479,12 @@ const api = {
   speech: {
     getCatalog: (): Promise<SpeechModelManifest[]> => ipcRenderer.invoke('speech:getCatalog'),
     getModelStates: (): Promise<SpeechModelState[]> => ipcRenderer.invoke('speech:getModelStates'),
+    getOpenAiApiKeyStatus: (): Promise<{ configured: boolean }> =>
+      ipcRenderer.invoke('speech:getOpenAiApiKeyStatus'),
+    saveOpenAiApiKey: (apiKey: string): Promise<{ configured: boolean }> =>
+      ipcRenderer.invoke('speech:saveOpenAiApiKey', apiKey),
+    clearOpenAiApiKey: (): Promise<{ configured: boolean }> =>
+      ipcRenderer.invoke('speech:clearOpenAiApiKey'),
     downloadModel: (modelId: string): Promise<void> =>
       ipcRenderer.invoke('speech:downloadModel', modelId),
     cancelDownload: (modelId: string): Promise<void> =>
diff --git a/src/renderer/src/components/dictation/DictationController.tsx b/src/renderer/src/components/dictation/DictationController.tsx
index ef52d3851f..f580d52782 100644
--- a/src/renderer/src/components/dictation/DictationController.tsx
+++ b/src/renderer/src/components/dictation/DictationController.tsx
@@ -38,6 +38,7 @@ export function DictationController() {
   const stoppedResolversRef = useRef(new Map<string, () => void>())
   const stopRequestedDuringStartRef = useRef(false)
   const finalTranscriptReceivedRef = useRef(false)
+  const erroredSessionIdsRef = useRef(new Set<string>())
   const intentionalTargetCancellationRef = useRef(false)
   const insertedFinalTranscriptRef = useRef('')
 
@@ -59,7 +60,8 @@ export function DictationController() {
       // transcript delivery is renderer IPC. Wait for this session's stopped
       // event so old finals cannot be mistaken for the next dictation run.
       await waitForStoppedSession(sessionId, stoppedSessionIdsRef, stoppedResolversRef)
-      if (!finalTranscriptReceivedRef.current && getCapturedChunkCount() > 0) {
+      const sessionErrored = erroredSessionIdsRef.current.delete(sessionId)
+      if (!sessionErrored && !finalTranscriptReceivedRef.current && getCapturedChunkCount() > 0) {
         toast.message('No speech detected.')
       }
       insertionTargetRef.current = null
@@ -108,6 +110,7 @@ export function DictationController() {
     insertionTargetRef.current = captureInsertionTarget()
     stopRequestedDuringStartRef.current = false
     finalTranscriptReceivedRef.current = false
+    erroredSessionIdsRef.current.clear()
     insertedFinalTranscriptRef.current = ''
     intentionalTargetCancellationRef.current = false
     dictationStateRef.current = 'starting'
@@ -172,6 +175,7 @@ export function DictationController() {
       intentionalTargetCancellationRef.current = false
       stopRequestedDuringStartRef.current = false
       finalTranscriptReceivedRef.current = false
+      erroredSessionIdsRef.current.clear()
       insertedFinalTranscriptRef.current = ''
       activeSessionIdRef.current = null
       setPartialTranscript('')
@@ -381,6 +385,7 @@ export function DictationController() {
         return
       }
       const sessionId = data.sessionId
+      erroredSessionIdsRef.current.add(sessionId)
       dictationRunRef.current += 1
       activeSessionIdRef.current = null
       toast.error(`Speech error: ${data.error}`)
diff --git a/src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx b/src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx
new file mode 100644
index 0000000000..e6f84068b1
--- /dev/null
+++ b/src/renderer/src/components/settings/OpenAiTranscriptionKeyDialog.tsx
@@ -0,0 +1,84 @@
+import { Loader2, Lock } from 'lucide-react'
+import { Button } from '../ui/button'
+import {
+  Dialog,
+  DialogContent,
+  DialogDescription,
+  DialogFooter,
+  DialogHeader,
+  DialogTitle
+} from '../ui/dialog'
+import { Input } from '../ui/input'
+import { Label } from '../ui/label'
+
+/** Props for the controlled key dialog; the parent owns every piece of state. */
+type OpenAiTranscriptionKeyDialogProps = {
+  open: boolean
+  configured: boolean
+  apiKeyDraft: string
+  pending: boolean
+  onOpenChange: (open: boolean) => void
+  onApiKeyDraftChange: (value: string) => void
+  onSave: () => void
+  onClear: () => void
+}
+
+/**
+ * Dialog for entering or clearing the OpenAI API key used by cloud speech models.
+ *
+ * Fully controlled: the draft text, pending flag, and all actions come from props.
+ * Save (button or Enter) is only offered while the trimmed draft is non-empty.
+ */
+export function OpenAiTranscriptionKeyDialog(
+  props: OpenAiTranscriptionKeyDialogProps
+): React.JSX.Element {
+  const { open, configured, apiKeyDraft, pending } = props
+  const { onOpenChange, onApiKeyDraftChange, onSave, onClear } = props
+  const hasDraft = apiKeyDraft.trim() !== ''
+
+  const submitOnEnter = (event: React.KeyboardEvent<HTMLInputElement>): void => {
+    if (event.key === 'Enter' && hasDraft) {
+      onSave()
+    }
+  }
+
+  return (
+    <Dialog open={open} onOpenChange={onOpenChange}>
+      <DialogContent>
+        <DialogHeader>
+          <DialogTitle>OpenAI Transcription</DialogTitle>
+          <DialogDescription>
+            Audio is sent to OpenAI only when an OpenAI speech model is selected.
+          </DialogDescription>
+        </DialogHeader>
+        <div className="space-y-2">
+          <Label htmlFor="openai-speech-api-key">API Key</Label>
+          <Input
+            id="openai-speech-api-key"
+            type="password"
+            value={apiKeyDraft}
+            placeholder={configured ? 'API key configured' : 'sk-...'}
+            disabled={pending}
+            onChange={(event) => onApiKeyDraftChange(event.target.value)}
+            onKeyDown={submitOnEnter}
+          />
+        </div>
+        <p className="flex items-center gap-1.5 text-[11px] text-muted-foreground/70">
+          <Lock className="size-3 shrink-0" />
+          Local runtime keys are stored in ~/.orca using Electron encrypted storage when available.
+        </p>
+        <DialogFooter>
+          {configured ? (
+            <Button variant="outline" disabled={pending} onClick={onClear}>
+              Clear Key
+            </Button>
+          ) : null}
+          <Button disabled={pending || !hasDraft} onClick={onSave}>
+            {pending && <Loader2 className="size-4 animate-spin" />}
+            Save Key
+          </Button>
+        </DialogFooter>
+      </DialogContent>
+    </Dialog>
+  )
+}
diff --git a/src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx b/src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx
new file mode 100644
index 0000000000..2123d89057
--- /dev/null
+++ b/src/renderer/src/components/settings/OpenAiTranscriptionSettingsRow.tsx
@@ -0,0 +1,67 @@
+import { CheckCircle2, Cloud, Unlink } from 'lucide-react'
+import { Button } from '../ui/button'
+import { Label } from '../ui/label'
+
+/** Props for the settings row; the parent owns key status and both actions. */
+type OpenAiTranscriptionSettingsRowProps = {
+  configured: boolean
+  disabled: boolean
+  onConfigure: () => void
+  onClear: () => void
+}
+
+/**
+ * Settings row summarizing whether an OpenAI API key is configured.
+ *
+ * Offers "Add API key" when no key is configured, and "Replace key" plus a
+ * disconnect icon button when one is. `disabled` disables every action.
+ */
+export function OpenAiTranscriptionSettingsRow({
+  configured,
+  disabled,
+  onConfigure,
+  onClear
+}: OpenAiTranscriptionSettingsRowProps): React.JSX.Element {
+  return (
+    <div className="flex items-center justify-between gap-4 py-2">
+      <div className="min-w-0 space-y-0.5">
+        <div className="flex items-center gap-2">
+          <Cloud className="size-4 shrink-0 text-muted-foreground" />
+          <Label>OpenAI Transcription</Label>
+          {configured && (
+            <span className="flex items-center gap-1 text-xs text-muted-foreground">
+              <CheckCircle2 className="size-3.5" />
+              Connected
+            </span>
+          )}
+        </div>
+        <p className="text-xs text-muted-foreground">
+          {configured
+            ? 'API key configured for cloud speech-to-text models.'
+            : 'Add an OpenAI API key before selecting cloud speech-to-text models.'}
+        </p>
+      </div>
+      {configured ? (
+        <div className="flex shrink-0 items-center gap-1.5">
+          <Button variant="outline" size="sm" disabled={disabled} onClick={onConfigure}>
+            Replace key
+          </Button>
+          {/* Why: a bare button defaults to type="submit"; be explicit so it never submits a form. */}
+          <button
+            type="button"
+            onClick={onClear}
+            aria-label="Disconnect OpenAI API key"
+            disabled={disabled}
+            className="rounded-md p-1 text-muted-foreground/50 transition-colors hover:text-destructive disabled:cursor-not-allowed disabled:opacity-50"
+          >
+            <Unlink className="size-3.5" />
+          </button>
+        </div>
+      ) : (
+        <Button variant="outline" size="sm" disabled={disabled} onClick={onConfigure}>
+          Add API key
+        </Button>
+      )}
+    </div>
+  )
+}
diff --git a/src/renderer/src/components/settings/VoicePane.test.tsx b/src/renderer/src/components/settings/VoicePane.test.tsx
index 7ee676e22b..2777d05cda 100644
--- a/src/renderer/src/components/settings/VoicePane.test.tsx
+++ b/src/renderer/src/components/settings/VoicePane.test.tsx
@@ -52,6 +52,9 @@ function installWindowApi(
       },
       speech: {
         getCatalog: vi.fn(async () => []),
+        getOpenAiApiKeyStatus: vi.fn(async () => ({ configured: false })),
+        saveOpenAiApiKey: vi.fn(async () => ({ configured: true })),
+        clearOpenAiApiKey: vi.fn(async () => ({ configured: false })),
         onDownloadProgress: vi.fn(() => () => {}),
         downloadModel: vi.fn()
       }
diff --git a/src/renderer/src/components/settings/VoicePane.tsx b/src/renderer/src/components/settings/VoicePane.tsx
index a6c25b6323..9facfb4d68 100644
--- a/src/renderer/src/components/settings/VoicePane.tsx
+++ b/src/renderer/src/components/settings/VoicePane.tsx
@@ -1,8 +1,6 @@
 import { useCallback, useEffect, useRef, useState } from 'react'
 import type { GlobalSettings } from '../../../../shared/types'
 import { getDefaultVoiceSettings } from '../../../../shared/constants'
-import type { DeveloperPermissionRequestResult } from '../../../../shared/developer-permissions-types'
-import type { FeatureTipId } from '../../../../shared/feature-tips'
 import type {
   SpeechModelManifest,
   SpeechModelState,
@@ -17,75 +15,23 @@ import {
   DropdownMenuItem,
   DropdownMenuTrigger
 } from '../ui/dropdown-menu'
-import { Download, Trash2, Loader2, ChevronDown, Check } from 'lucide-react'
+import { Cloud, Download, Trash2, Loader2, ChevronDown, Check } from 'lucide-react'
 import { toast } from 'sonner'
 import { useAppStore } from '@/store'
 import { useShortcutLabel } from '@/hooks/useShortcutLabel'
+import { OpenAiTranscriptionKeyDialog } from './OpenAiTranscriptionKeyDialog'
+import { OpenAiTranscriptionSettingsRow } from './OpenAiTranscriptionSettingsRow'
+import { handleVoiceDictationToggle } from './voice-dictation-toggle'
+import { matchesSettingsSearch } from './settings-search'
+import { OPENAI_TRANSCRIPTION_SEARCH_ENTRY } from './voice-pane-search'
+
+export { handleVoiceDictationToggle }
 
 type VoicePaneProps = {
   settings: GlobalSettings
   updateSettings: (updates: Partial<GlobalSettings>) => void
 }
 
-type VoiceDictationToggleOptions = {
-  voiceEnabled: boolean
-  markFeatureTipsSeen: (ids: FeatureTipId[]) => void
-  updateVoiceSettings: (updates: Partial<VoiceSettings>) => void
-  requestMicrophonePermission: () => Promise<DeveloperPermissionRequestResult>
-  setPermissionPending?: (pending: boolean) => void
-  isMounted?: () => boolean
-  notifyPermissionGranted?: () => void
-  notifyPermissionOpenedSystemSettings?: () => void
-  notifyPermissionRequired?: () => void
-  notifyPermissionRequestFailed?: () => void
-}
-
-export async function handleVoiceDictationToggle({
-  voiceEnabled,
-  markFeatureTipsSeen,
-  updateVoiceSettings,
-  requestMicrophonePermission,
-  setPermissionPending,
-  isMounted,
-  notifyPermissionGranted,
-  notifyPermissionOpenedSystemSettings,
-  notifyPermissionRequired,
-  notifyPermissionRequestFailed
-}: VoiceDictationToggleOptions): Promise<void> {
-  // Why: changing the Voice Dictation switch proves the user discovered the
-  // feature; disabling it later should not make the discovery modal eligible.
-  markFeatureTipsSeen(['voice-dictation'])
-
-  if (voiceEnabled) {
-    updateVoiceSettings({ enabled: false })
-    return
-  }
-
-  setPermissionPending?.(true)
-  try {
-    // Why: enabling dictation is the point where users expect the macOS
-    // microphone prompt, not after their first attempted recording fails.
-    const result = await requestMicrophonePermission()
-    if (result.status === 'granted' || result.status === 'unsupported') {
-      updateVoiceSettings({ enabled: true })
-    }
-
-    if (result.status === 'granted') {
-      notifyPermissionGranted?.()
-    } else if (result.openedSystemSettings) {
-      notifyPermissionOpenedSystemSettings?.()
-    } else if (result.status !== 'unsupported') {
-      notifyPermissionRequired?.()
-    }
-  } catch {
-    notifyPermissionRequestFailed?.()
-  } finally {
-    if (isMounted?.() ?? true) {
-      setPermissionPending?.(false)
-    }
-  }
-}
-
 export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.JSX.Element {
   // Why: voice was made optional on GlobalSettings to keep older test fixtures
   // and pre-voice profiles type-compatible. Persistence merges defaults at
@@ -95,9 +41,14 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
   const modelStates = useAppStore((s) => s.modelStates)
   const refreshModelStates = useAppStore((s) => s.refreshModelStates)
   const markFeatureTipsSeen = useAppStore((s) => s.markFeatureTipsSeen)
+  const settingsSearchQuery = useAppStore((s) => s.settingsSearchQuery ?? '')
   const shortcutLabel = useShortcutLabel('voice.dictation')
   const [catalog, setCatalog] = useState<SpeechModelManifest[]>([])
   const [permissionPending, setPermissionPending] = useState(false)
+  const [openAiDialogOpen, setOpenAiDialogOpen] = useState(false)
+  const [openAiApiKeyDraft, setOpenAiApiKeyDraft] = useState('')
+  const [openAiKeyPending, setOpenAiKeyPending] = useState(false)
+  const [pendingCloudModelId, setPendingCloudModelId] = useState<string | null>(null)
   const mountedRef = useRef(true)
 
   const handlePaneRef = useCallback((node: HTMLDivElement | null): void => {
@@ -106,6 +57,18 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
     mountedRef.current = node !== null
   }, [])
 
+  // Merges a partial update over the current voice settings and hands the
+  // combined object to updateSettings.
+  // Why: memoized because an effect below lists it as a dependency; its
+  // identity changes only when updateSettings or voiceSettings change.
+  const updateVoiceSettings = useCallback(
+    (updates: Partial<VoiceSettings>): void => {
+      const voice = { ...voiceSettings, ...updates }
+      updateSettings({ voice })
+    },
+    [updateSettings, voiceSettings]
+  )
+
   useEffect(() => {
     let cancelled = false
     refreshModelStates()
@@ -117,10 +80,19 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
         }
       })
       .catch(() => {})
+    void window.api.speech
+      .getOpenAiApiKeyStatus()
+      .then((status) => {
+        if (!cancelled && status.configured !== voiceSettings.openAiApiKeyConfigured) {
+          updateVoiceSettings({ openAiApiKeyConfigured: status.configured })
+          refreshModelStates()
+        }
+      })
+      .catch(() => {})
     return () => {
       cancelled = true
     }
-  }, [refreshModelStates])
+  }, [refreshModelStates, updateVoiceSettings, voiceSettings.openAiApiKeyConfigured])
 
   useEffect(() => {
     const cleanup = window.api.speech.onDownloadProgress(() => {
@@ -129,15 +101,6 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
     return cleanup
   }, [refreshModelStates])
 
-  const updateVoiceSettings = (updates: Partial<VoiceSettings>): void => {
-    updateSettings({
-      voice: {
-        ...voiceSettings,
-        ...updates
-      }
-    })
-  }
-
   const toggleVoiceDictation = async (): Promise<void> => {
     await handleVoiceDictationToggle({
       voiceEnabled: voiceSettings.enabled,
@@ -167,6 +130,61 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
     ? getModelState(voiceSettings.sttModel)
     : undefined
   const selectedIsReady = selectedModelState?.status === 'ready'
+  const showOpenAiSettingsRow =
+    voiceSettings.openAiApiKeyConfigured ||
+    selectedModel?.provider === 'openai' ||
+    (settingsSearchQuery.trim() !== '' &&
+      matchesSettingsSearch(settingsSearchQuery, OPENAI_TRANSCRIPTION_SEARCH_ENTRY))
+
+  const openOpenAiDialog = (modelId: string | null = null): void => {
+    setPendingCloudModelId(modelId)
+    setOpenAiApiKeyDraft('')
+    setOpenAiDialogOpen(true)
+  }
+
+  // Why: the dialog enables Save based on the trimmed draft, so send that same
+  // trimmed value; a pasted key may carry surrounding whitespace or a newline.
+  const saveOpenAiApiKey = async (): Promise<void> => {
+    setOpenAiKeyPending(true)
+    try {
+      await window.api.speech.saveOpenAiApiKey(openAiApiKeyDraft.trim())
+      const sttModel = pendingCloudModelId ?? voiceSettings.sttModel
+      updateVoiceSettings({ openAiApiKeyConfigured: true, sttModel })
+      await refreshModelStates()
+      setOpenAiDialogOpen(false)
+      setOpenAiApiKeyDraft('')
+      setPendingCloudModelId(null)
+      toast.success('OpenAI API key saved')
+    } catch (err) {
+      toast.error(err instanceof Error ? err.message : 'Failed to save OpenAI API key')
+    } finally {
+      if (mountedRef.current) {
+        setOpenAiKeyPending(false)
+      }
+    }
+  }
+
+  // Clears the stored key through window.api.speech; an OpenAI model selection resets to ''.
+  const clearOpenAiApiKey = async (): Promise<void> => {
+    setOpenAiKeyPending(true)
+    try {
+      await window.api.speech.clearOpenAiApiKey()
+      const usesOpenAiModel = selectedModel?.provider === 'openai'
+      const sttModel = usesOpenAiModel ? '' : voiceSettings.sttModel
+      updateVoiceSettings({ openAiApiKeyConfigured: false, sttModel })
+      await refreshModelStates()
+      setOpenAiDialogOpen(false)
+      setOpenAiApiKeyDraft('')
+      setPendingCloudModelId(null)
+      toast.success('OpenAI API key cleared')
+    } catch (error) {
+      toast.error(error instanceof Error ? error.message : 'Failed to clear OpenAI API key')
+    } finally {
+      if (mountedRef.current) {
+        setOpenAiKeyPending(false)
+      }
+    }
+  }
 
   return (
     <div ref={handlePaneRef} className="space-y-1">
@@ -232,7 +250,7 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
           <p className="text-xs text-muted-foreground">
             {selectedModel && selectedIsReady
               ? `${selectedModel.label} — ${selectedModel.description}`
-              : 'Select and download a model to enable dictation.'}
+              : 'Select a speech model. Local models run offline; cloud models require an API key.'}
           </p>
         </div>
         <DropdownMenu>
@@ -254,7 +272,8 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
               const isDownloading =
                 mState?.status === 'downloading' || mState?.status === 'extracting'
               const isActive = voiceSettings.sttModel === manifest.id
-              const sizeMb = Math.round(manifest.sizeBytes / 1_000_000)
+              const isCloud = manifest.provider === 'openai'
+              const sizeMb = manifest.sizeBytes ? Math.round(manifest.sizeBytes / 1_000_000) : null
 
               return (
                 <DropdownMenuItem
@@ -263,6 +282,8 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
                   onSelect={() => {
                     if (isReady) {
                       updateVoiceSettings({ sttModel: manifest.id })
+                    } else if (isCloud) {
+                      openOpenAiDialog(manifest.id)
                     } else if (!isDownloading) {
                       void window.api.speech
                         .downloadModel(manifest.id)
@@ -270,7 +291,7 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
                     }
                   }}
                   className={`group flex items-center gap-2.5 py-2.5 ${
-                    !isReady && !isDownloading ? 'opacity-50' : ''
+                    !isCloud && !isReady && !isDownloading ? 'opacity-50' : ''
                   }`}
                 >
                   <span className="flex size-4 shrink-0 items-center justify-center">
@@ -278,16 +299,20 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
                       <Check className="size-3.5" />
                     ) : isDownloading ? (
                       <Loader2 className="size-3.5 animate-spin text-muted-foreground" />
+                    ) : isCloud ? (
+                      <Cloud className="size-3.5 text-muted-foreground" />
                     ) : null}
                   </span>
                   <div className="min-w-0 flex-1">
                     <div className="flex items-center gap-1.5">
                       <span className="text-sm font-medium">{manifest.label}</span>
-                      <span className="text-[10px] px-1 py-px rounded-full leading-none bg-muted text-muted-foreground">
-                        {manifest.streaming ? 'streaming' : 'offline'}
-                      </span>
+                      {!isCloud && (
+                        <span className="text-[10px] px-1 py-px rounded-full leading-none bg-muted text-muted-foreground">
+                          {manifest.streaming ? 'streaming' : 'offline'}
+                        </span>
+                      )}
                       {manifest.recommended && (
-                        <span className="text-[10px] px-1 py-px rounded-full leading-none bg-emerald-500/10 text-emerald-500">
+                        <span className="text-[10px] px-1 py-px rounded-full leading-none bg-status-success-background text-status-success">
                           recommended
                         </span>
                       )}
@@ -296,14 +321,16 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
                           ? mState.status === 'extracting'
                             ? 'Extracting...'
                             : `${Math.round(mState.progress * 100)}%`
-                          : `${sizeMb} MB`}
+                          : isCloud
+                            ? null
+                            : `${sizeMb} MB`}
                       </span>
                     </div>
                     <p className="text-[11px] text-muted-foreground mt-0.5 leading-snug">
                       {manifest.description}
                     </p>
                   </div>
-                  {isReady && !isActive ? (
+                  {!isCloud && isReady && !isActive ? (
                     <button
                       onClick={(e) => {
                         e.stopPropagation()
@@ -316,7 +343,7 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
                     >
                       <Trash2 className="size-3" />
                     </button>
-                  ) : !isReady && !isDownloading ? (
+                  ) : !isCloud && !isReady && !isDownloading ? (
                     <span className="shrink-0 p-1 text-muted-foreground opacity-0 group-hover:opacity-100 transition-opacity">
                       <Download className="size-3" />
                     </span>
@@ -327,6 +354,29 @@ export function VoicePane({ settings, updateSettings }: VoicePaneProps): React.J
           </DropdownMenuContent>
         </DropdownMenu>
       </div>
+
+      {showOpenAiSettingsRow && (
+        <>
+          <Separator />
+          <OpenAiTranscriptionSettingsRow
+            configured={voiceSettings.openAiApiKeyConfigured}
+            disabled={openAiKeyPending}
+            onConfigure={() => openOpenAiDialog(null)}
+            onClear={() => void clearOpenAiApiKey()}
+          />
+        </>
+      )}
+
+      <OpenAiTranscriptionKeyDialog
+        open={openAiDialogOpen}
+        configured={voiceSettings.openAiApiKeyConfigured}
+        apiKeyDraft={openAiApiKeyDraft}
+        pending={openAiKeyPending}
+        onOpenChange={setOpenAiDialogOpen}
+        onApiKeyDraftChange={setOpenAiApiKeyDraft}
+        onSave={() => void saveOpenAiApiKey()}
+        onClear={() => void clearOpenAiApiKey()}
+      />
     </div>
   )
 }
diff --git a/src/renderer/src/components/settings/voice-dictation-toggle.ts b/src/renderer/src/components/settings/voice-dictation-toggle.ts
new file mode 100644
index 0000000000..a54356c8d3
--- /dev/null
+++ b/src/renderer/src/components/settings/voice-dictation-toggle.ts
@@ -0,0 +1,75 @@
+import type { DeveloperPermissionRequestResult } from '../../../../shared/developer-permissions-types'
+import type { FeatureTipId } from '../../../../shared/feature-tips'
+import type { VoiceSettings } from '../../../../shared/speech-types'
+
+/** Dependencies injected by the caller; the optional hooks are skipped when omitted. */
+type VoiceDictationToggleOptions = {
+  voiceEnabled: boolean
+  markFeatureTipsSeen: (ids: FeatureTipId[]) => void
+  updateVoiceSettings: (updates: Partial<VoiceSettings>) => void
+  requestMicrophonePermission: () => Promise<DeveloperPermissionRequestResult>
+  setPermissionPending?: (pending: boolean) => void
+  isMounted?: () => boolean
+  notifyPermissionGranted?: () => void
+  notifyPermissionOpenedSystemSettings?: () => void
+  notifyPermissionRequired?: () => void
+  notifyPermissionRequestFailed?: () => void
+}
+
+/**
+ * Flips the Voice Dictation setting.
+ *
+ * Turning it off is immediate. Turning it on first requests microphone
+ * permission and only enables dictation when the result is 'granted' or
+ * 'unsupported'; the matching notify hook (if provided) reports the outcome.
+ */
+export async function handleVoiceDictationToggle(
+  options: VoiceDictationToggleOptions
+): Promise<void> {
+  const {
+    voiceEnabled,
+    markFeatureTipsSeen,
+    updateVoiceSettings,
+    requestMicrophonePermission,
+    setPermissionPending,
+    isMounted,
+    notifyPermissionGranted,
+    notifyPermissionOpenedSystemSettings,
+    notifyPermissionRequired,
+    notifyPermissionRequestFailed
+  } = options
+
+  // Why: touching the switch at all means the user found the feature, so the
+  // discovery modal must stay ineligible even if dictation is disabled later.
+  markFeatureTipsSeen(['voice-dictation'])
+
+  if (voiceEnabled) {
+    updateVoiceSettings({ enabled: false })
+    return
+  }
+
+  setPermissionPending?.(true)
+  try {
+    // Why: users expect the macOS microphone prompt while enabling dictation,
+    // not after their first attempted recording has already failed.
+    const permission = await requestMicrophonePermission()
+    if (permission.status === 'granted' || permission.status === 'unsupported') {
+      updateVoiceSettings({ enabled: true })
+    }
+
+    if (permission.status === 'granted') {
+      notifyPermissionGranted?.()
+    } else if (permission.openedSystemSettings) {
+      notifyPermissionOpenedSystemSettings?.()
+    } else if (permission.status !== 'unsupported') {
+      notifyPermissionRequired?.()
+    }
+  } catch {
+    notifyPermissionRequestFailed?.()
+  } finally {
+    const stillMounted = isMounted?.() ?? true
+    if (stillMounted) {
+      setPermissionPending?.(false)
+    }
+  }
+}
diff --git a/src/renderer/src/components/settings/voice-pane-search.ts b/src/renderer/src/components/settings/voice-pane-search.ts
index 93600bbc00..da299bcda8 100644
--- a/src/renderer/src/components/settings/voice-pane-search.ts
+++ b/src/renderer/src/components/settings/voice-pane-search.ts
@@ -1,5 +1,11 @@
 import type { SettingsSearchEntry } from './settings-search'
 
+export const OPENAI_TRANSCRIPTION_SEARCH_ENTRY: SettingsSearchEntry = {
+  title: 'OpenAI Transcription',
+  description: 'Configure the OpenAI API key used for cloud speech-to-text models.',
+  keywords: ['voice', 'speech', 'stt', 'openai', 'api key', 'cloud', 'transcription']
+}
+
 export const VOICE_PANE_SEARCH_ENTRIES: SettingsSearchEntry[] = [
   {
     title: 'Enable Voice Dictation',
@@ -11,9 +17,10 @@ export const VOICE_PANE_SEARCH_ENTRIES: SettingsSearchEntry[] = [
     description: 'Toggle or hold-to-talk dictation behavior.',
     keywords: ['voice', 'dictation', 'mode', 'toggle', 'hold', 'push to talk']
   },
+  OPENAI_TRANSCRIPTION_SEARCH_ENTRY,
   {
     title: 'Speech Model',
-    description: 'Select which speech-to-text model to use for dictation.',
-    keywords: ['voice', 'model', 'speech', 'stt', 'download']
+    description: 'Select a local or cloud speech-to-text model to use for dictation.',
+    keywords: ['voice', 'model', 'speech', 'stt', 'download', 'openai', 'api key', 'cloud']
   }
 ]
diff --git a/src/shared/constants.ts b/src/shared/constants.ts
index ffe00b54e2..d7db656c07 100644
--- a/src/shared/constants.ts
+++ b/src/shared/constants.ts
@@ -346,7 +346,8 @@ export function getDefaultVoiceSettings(): VoiceSettings {
     language: 'en',
     dictationMode: 'toggle' as const,
     terminalConfirmBeforeInsert: false,
-    userModels: []
+    userModels: [],
+    openAiApiKeyConfigured: false
   }
 }
 
diff --git a/src/shared/speech-types.ts b/src/shared/speech-types.ts
index 008ab1b976..9881e9b4e9 100644
--- a/src/shared/speech-types.ts
+++ b/src/shared/speech-types.ts
@@ -1,4 +1,5 @@
-export type SpeechModelType = 'transducer' | 'paraformer' | 'whisper'
+export type SpeechModelType = 'transducer' | 'paraformer' | 'whisper' | 'openai'
+export type SpeechModelProvider = 'local' | 'openai'
 
 export type ModelingUnit = 'bpe' | 'cjkchar' | 'cjkchar+bpe'
 
@@ -7,12 +8,13 @@ export type SpeechModelManifest = {
   label: string
   description: string
   type: SpeechModelType
+  provider: SpeechModelProvider
   language: string
-  sizeBytes: number
-  downloadUrl: string
-  archiveSha256: string
-  archiveFormat: 'tar.bz2'
-  files: string[]
+  sizeBytes?: number
+  downloadUrl?: string
+  archiveSha256?: string
+  archiveFormat?: 'tar.bz2'
+  files?: string[]
   sampleRate: number
   streaming: boolean
   modelingUnit?: ModelingUnit
@@ -61,4 +63,5 @@ export type VoiceSettings = {
   dictationMode: DictationMode
   terminalConfirmBeforeInsert: boolean
   userModels: UserModelConfig[]
+  openAiApiKeyConfigured: boolean
 }

From e136dfd6c950a970e4274cfc6a2157b28ad04c5e Mon Sep 17 00:00:00 2001
From: Jinwoo-H <jinwoo0825@gmail.com>
Date: Fri, 5 Jun 2026 11:19:59 -0700
Subject: [PATCH 2/4] Avoid keychain prompts for OpenAI speech status

Co-authored-by: Orca <help@stably.ai>
---
 src/main/speech/openai-api-key-store.test.ts | 70 ++++++++++++++++++++
 src/main/speech/openai-api-key-store.ts      |  2 +
 2 files changed, 72 insertions(+)
 create mode 100644 src/main/speech/openai-api-key-store.test.ts

diff --git a/src/main/speech/openai-api-key-store.test.ts b/src/main/speech/openai-api-key-store.test.ts
new file mode 100644
index 0000000000..1049cb2977
--- /dev/null
+++ b/src/main/speech/openai-api-key-store.test.ts
@@ -0,0 +1,70 @@
+import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'fs'
+import { tmpdir } from 'os'
+import type * as Os from 'os'
+import { join } from 'path'
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+
+const safeStorageMock = vi.hoisted(() => ({
+  decryptString: vi.fn((value: Buffer) => value.toString('utf8')),
+  encryptString: vi.fn((value: string) => Buffer.from(value)),
+  isEncryptionAvailable: vi.fn(() => true)
+}))
+
+let tempHome = ''
+
+async function loadStoreModule() {
+  vi.resetModules()
+  vi.doMock('electron', () => ({
+    safeStorage: safeStorageMock
+  }))
+  vi.doMock('os', async () => {
+    const actual = await vi.importActual<typeof Os>('os')
+    return { ...actual, homedir: () => tempHome }
+  })
+  return import('./openai-api-key-store')
+}
+
+beforeEach(() => {
+  tempHome = mkdtempSync(join(tmpdir(), 'orca-openai-key-store-'))
+  Object.values(safeStorageMock).forEach((mock) => mock.mockClear())
+  safeStorageMock.isEncryptionAvailable.mockReturnValue(true)
+})
+
+// Why: every test creates a fresh fake home under the OS temp dir; removing it
+// here keeps repeated runs from accumulating stale directories.
+afterEach(() => {
+  rmSync(tempHome, { recursive: true, force: true })
+})
+
+function writeStoredOpenAiKey(value: string): void {
+  const orcaDir = join(tempHome, '.orca')
+  mkdirSync(orcaDir, { recursive: true })
+  writeFileSync(join(orcaDir, 'openai-speech-token.enc'), value)
+}
+
+describe('OpenAI speech API key store', () => {
+  it('checks configured status without decrypting or touching safeStorage', async () => {
+    writeStoredOpenAiKey('encrypted-key')
+    const store = await loadStoreModule()
+
+    expect(store.hasOpenAiSpeechApiKey()).toBe(true)
+    expect(safeStorageMock.isEncryptionAvailable).not.toHaveBeenCalled()
+    expect(safeStorageMock.decryptString).not.toHaveBeenCalled()
+  })
+
+  it('decrypts only when the key is read for an API request', async () => {
+    writeStoredOpenAiKey('encrypted-key')
+    const store = await loadStoreModule()
+
+    expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key')
+    expect(safeStorageMock.decryptString).toHaveBeenCalledOnce()
+  })
+
+  it('reports missing status without creating storage files', async () => {
+    const store = await loadStoreModule()
+
+    expect(store.hasOpenAiSpeechApiKey()).toBe(false)
+    expect(existsSync(join(tempHome, '.orca'))).toBe(false)
+    expect(safeStorageMock.decryptString).not.toHaveBeenCalled()
+  })
+})
diff --git a/src/main/speech/openai-api-key-store.ts b/src/main/speech/openai-api-key-store.ts
index 23e661fb71..0944105313 100644
--- a/src/main/speech/openai-api-key-store.ts
+++ b/src/main/speech/openai-api-key-store.ts
@@ -41,6 +41,8 @@ function readLegacyJsonStoredOpenAiKey(): StoredOpenAiKey | null {
 }
 
 export function hasOpenAiSpeechApiKey(): boolean {
+  // Why: Settings and model-state refresh call this on startup; checking file
+  // existence avoids decrypting safeStorage and triggering macOS keychain prompts.
   return existsSync(getOpenAiKeyPath())
 }
 

From f63eb0865e319f0df0a70b8668ba3d68a21bc212 Mon Sep 17 00:00:00 2001
From: Jinwoo-H <jinwoo0825@gmail.com>
Date: Fri, 5 Jun 2026 11:24:02 -0700
Subject: [PATCH 3/4] Cache OpenAI speech key after first read

Co-authored-by: Orca <help@stably.ai>
---
 src/main/speech/openai-api-key-store.test.ts | 18 ++++++++++++++++++
 src/main/speech/openai-api-key-store.ts      | 16 ++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/main/speech/openai-api-key-store.test.ts b/src/main/speech/openai-api-key-store.test.ts
index 1049cb2977..d38e013eee 100644
--- a/src/main/speech/openai-api-key-store.test.ts
+++ b/src/main/speech/openai-api-key-store.test.ts
@@ -60,6 +60,24 @@ describe('OpenAI speech API key store', () => {
     expect(safeStorageMock.decryptString).toHaveBeenCalledOnce()
   })
 
+  it('caches the decrypted key so repeated dictations do not repeatedly touch safeStorage', async () => {
+    writeStoredOpenAiKey('encrypted-key')
+    const store = await loadStoreModule()
+
+    const reads = [store.readOpenAiSpeechApiKey(), store.readOpenAiSpeechApiKey()]
+    expect(reads).toEqual(['encrypted-key', 'encrypted-key'])
+    expect(safeStorageMock.decryptString).toHaveBeenCalledOnce()
+  })
+
+  it('uses the in-memory key after save without decrypting from safeStorage', async () => {
+    const { saveOpenAiSpeechApiKey, readOpenAiSpeechApiKey } = await loadStoreModule()
+
+    saveOpenAiSpeechApiKey('saved-key')
+
+    expect(readOpenAiSpeechApiKey()).toBe('saved-key')
+    expect(safeStorageMock.decryptString).not.toHaveBeenCalled()
+  })
+
   it('reports missing status without creating storage files', async () => {
     const store = await loadStoreModule()
 
diff --git a/src/main/speech/openai-api-key-store.ts b/src/main/speech/openai-api-key-store.ts
index 0944105313..74bffee19b 100644
--- a/src/main/speech/openai-api-key-store.ts
+++ b/src/main/speech/openai-api-key-store.ts
@@ -8,6 +8,7 @@ type StoredOpenAiKey = {
 }
 
 const OPENAI_SPEECH_TOKEN_FILE = 'openai-speech-token.enc'
+let cachedOpenAiSpeechApiKey: string | null = null // in-memory key; null when nothing is cached
 
 function getOrcaDir(): string {
   return join(homedir(), '.orca')
@@ -54,6 +55,7 @@ export function saveOpenAiSpeechApiKey(apiKey: string): void {
   ensureOrcaDir()
   if (safeStorage.isEncryptionAvailable()) {
     writeFileSync(getOpenAiKeyPath(), safeStorage.encryptString(trimmed), { mode: 0o600 })
+    cachedOpenAiSpeechApiKey = trimmed
     return
   }
 
@@ -61,9 +63,14 @@ export function saveOpenAiSpeechApiKey(apiKey: string): void {
     '[speech] safeStorage encryption unavailable — storing OpenAI speech key in plaintext'
   )
   writeFileSync(getOpenAiKeyPath(), trimmed, { encoding: 'utf8', mode: 0o600 })
+  cachedOpenAiSpeechApiKey = trimmed
 }
 
 export function readOpenAiSpeechApiKey(): string {
+  if (cachedOpenAiSpeechApiKey !== null) {
+    return cachedOpenAiSpeechApiKey
+  }
+
   const keyPath = getOpenAiKeyPath()
   if (!existsSync(keyPath)) {
     throw new Error('OpenAI API key is not configured')
@@ -72,16 +79,21 @@ export function readOpenAiSpeechApiKey(): string {
     const raw = readFileSync(keyPath)
     const legacyJson = readLegacyJsonStoredOpenAiKey()
     if (legacyJson) {
-      return safeStorage.decryptString(Buffer.from(legacyJson.encryptedKeyBase64, 'base64'))
+      cachedOpenAiSpeechApiKey = safeStorage.decryptString(
+        Buffer.from(legacyJson.encryptedKeyBase64, 'base64')
+      )
+      return cachedOpenAiSpeechApiKey
     }
-    return safeStorage.isEncryptionAvailable()
+    cachedOpenAiSpeechApiKey = safeStorage.isEncryptionAvailable()
       ? safeStorage.decryptString(raw)
       : raw.toString('utf8')
+    return cachedOpenAiSpeechApiKey
   } catch {
     throw new Error('OpenAI API key could not be decrypted')
   }
 }
 
 export function clearOpenAiSpeechApiKey(): void {
+  cachedOpenAiSpeechApiKey = null // Why: reads hit the cache first; it must not outlive the file
   rmSync(getOpenAiKeyPath(), { force: true })
 }

From 9f4f00c81f91fd75a5a723ce2dd015cd980cb95d Mon Sep 17 00:00:00 2001
From: Jinwoo-H <jinwoo0825@gmail.com>
Date: Fri, 5 Jun 2026 11:29:27 -0700
Subject: [PATCH 4/4] Assert OpenAI key read waits for transcription upload

Co-authored-by: Orca <help@stably.ai>
---
 src/main/speech/stt-service.test.ts | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/main/speech/stt-service.test.ts b/src/main/speech/stt-service.test.ts
index e3aa511453..ce0bbc77d8 100644
--- a/src/main/speech/stt-service.test.ts
+++ b/src/main/speech/stt-service.test.ts
@@ -6,6 +6,7 @@ const {
   getCloudSessions,
   getCreatedWorkerCount,
   getLastWorker,
+  readOpenAiSpeechApiKeyMock,
   resetCloudSessions,
   resetWorkers
 } = vi.hoisted(() => {
@@ -98,6 +99,7 @@ const {
     getCloudSessions: () => HoistedMockOpenAiTranscriptionSession.instances,
     getCreatedWorkerCount: () => HoistedMockWorker.created,
     getLastWorker: () => HoistedMockWorker.instances.at(-1),
+    readOpenAiSpeechApiKeyMock: vi.fn(() => 'test-openai-key'),
     resetCloudSessions: () => {
       HoistedMockOpenAiTranscriptionSession.instances = []
     },
@@ -142,7 +144,7 @@ vi.mock('./model-catalog', () => ({
 }))
 
 vi.mock('./openai-api-key-store', () => ({
-  readOpenAiSpeechApiKey: () => 'test-openai-key'
+  readOpenAiSpeechApiKey: readOpenAiSpeechApiKeyMock
 }))
 
 vi.mock('./openai-transcription-client', () => ({
@@ -155,6 +157,7 @@ describe('SttService', () => {
   beforeEach(() => {
     resetCloudSessions()
     resetWorkers()
+    readOpenAiSpeechApiKeyMock.mockClear()
   })
 
   it('reuses an idle warm worker for a second dictation with the same owner', async () => {
@@ -258,6 +261,22 @@ describe('SttService', () => {
     expect(sink).toHaveBeenCalledWith({ type: 'stopped' })
   })
 
+  it('reads the OpenAI key only when finishing cloud dictation', async () => {
+    const deps = {
+      getModelState: vi.fn().mockResolvedValue({ id: 'openai-model', status: 'ready' }),
+      getModelDir: vi.fn().mockReturnValue('/tmp/model-a')
+    }
+    const service = new SttService(deps as never)
+
+    await service.startDictation('openai-model', vi.fn(), undefined, 'desktop')
+    service.feedAudio(new Float32Array([0.25]), 16000, 'desktop')
+    // Why: starting and streaming audio must not touch the key store.
+    expect(readOpenAiSpeechApiKeyMock).not.toHaveBeenCalled()
+
+    await service.stopDictation('desktop')
+    expect(readOpenAiSpeechApiKeyMock).toHaveBeenCalledTimes(1)
+  })
+
   it('keeps startup cancellation tombstoned after the worker has been created', async () => {
     const service = new SttService({
       getModelState: vi.fn().mockResolvedValue({ id: 'model-a', status: 'ready' }),