stablyai · Jinwoo-H · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/src/main/ipc/speech.test.ts b/src/main/ipc/speech.test.ts
@@ -12,6 +12,11 @@ vi.mock('electron', () => ({
   app: { getPath: vi.fn(() => '/tmp/orca-speech-test') },
   BrowserWindow: { fromWebContents: fromWebContentsMock },
   ipcMain: { handle: handleMock },
+  safeStorage: {
+    decryptString: vi.fn(),
+    encryptString: vi.fn(() => Buffer.from('encrypted')),
+    isEncryptionAvailable: vi.fn(() => true)
+  },
   systemPreferences: {
     getMediaAccessStatus: vi.fn(() => 'granted'),
     askForMediaAccess: vi.fn(() => Promise.resolve(true))

diff --git a/src/main/ipc/speech.ts b/src/main/ipc/speech.ts
@@ -4,6 +4,11 @@ import { writeFile, unlink } from 'fs/promises'
 import { createHash } from 'crypto'
 import { SPEECH_MODEL_CATALOG, getCatalogModel } from '../speech/model-catalog'
 import { getSpeechModelManager, getSpeechSttService } from '../speech/speech-runtime-service'
+import {
+  clearOpenAiSpeechApiKey,
+  hasOpenAiSpeechApiKey,
+  saveOpenAiSpeechApiKey
+} from '../speech/openai-api-key-store'
 import type { Store } from '../persistence'
 
 export function registerSpeechHandlers(store: Store): void {
@@ -15,6 +20,26 @@ export function registerSpeechHandlers(store: Store): void {
     return getSpeechModelManager(store).getModelStates()
   })
 
+  // Returns only a boolean; the stored key is not sent back to the caller.
+  ipcMain.handle('speech:getOpenAiApiKeyStatus', async () => {
+    return { configured: hasOpenAiSpeechApiKey() }
+  })
+
+  ipcMain.handle('speech:saveOpenAiApiKey', async (_event, apiKey: unknown) => {
+    // IPC arguments are not type-checked at runtime, so reject anything that
+    // is not a non-blank string before it reaches the key store.
+    if (typeof apiKey !== 'string' || apiKey.trim() === '') {
+      throw new Error('OpenAI API key must be a non-empty string')
+    }
+    saveOpenAiSpeechApiKey(apiKey)
+    return { configured: true }
+  })
+
+  ipcMain.handle('speech:clearOpenAiApiKey', async () => {
+    clearOpenAiSpeechApiKey()
+    return { configured: false }
+  })
+
   ipcMain.handle('speech:downloadModel', async (event, modelId: string) => {
     const manager = getSpeechModelManager(store)
     const window = BrowserWindow.fromWebContents(event.sender)

diff --git a/src/main/speech/model-catalog.ts b/src/main/speech/model-catalog.ts
@@ -7,6 +7,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     description:
       'Highest accuracy for 25 European languages. Punctuation, capitalization, and word-level timestamps.',
     type: 'transducer',
+    provider: 'local',
     language: 'multilingual',
     sizeBytes: 180_000_000,
     downloadUrl:
@@ -25,6 +26,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     description:
       'English only. Faster than v3 with similar accuracy. Punctuation and capitalization.',
     type: 'transducer',
+    provider: 'local',
     language: 'en',
     sizeBytes: 170_000_000,
     downloadUrl:
@@ -41,6 +43,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Zipformer Bilingual',
     description: 'Chinese + English with code-switching. Low-latency real-time streaming.',
     type: 'transducer',
+    provider: 'local',
     language: 'zh-en',
     sizeBytes: 130_000_000,
     downloadUrl:
@@ -63,6 +66,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     description:
       'Chinese (Mandarin + dialects) + English. Strong on accented and regional Chinese.',
     type: 'paraformer',
+    provider: 'local',
     language: 'zh-en',
     sizeBytes: 115_000_000,
     downloadUrl:
@@ -78,6 +82,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Zipformer Streaming EN',
     description: 'English only. Lightweight 20M-param model, good balance of speed and size.',
     type: 'transducer',
+    provider: 'local',
     language: 'en',
     sizeBytes: 128_000_000,
     downloadUrl:
@@ -99,6 +104,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Zipformer Streaming ZH',
     description: 'Chinese only. Ultra-lightweight 14M-param model, ideal for low-resource devices.',
     type: 'transducer',
+    provider: 'local',
     language: 'zh',
     sizeBytes: 74_000_000,
     downloadUrl:
@@ -120,6 +126,7 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     label: 'Whisper Tiny',
     description: '90+ languages. Lower accuracy than Parakeet but broadest language coverage.',
     type: 'whisper',
+    provider: 'local',
     language: 'multilingual',
     sizeBytes: 116_000_000,
     downloadUrl:
@@ -129,9 +136,34 @@ export const SPEECH_MODEL_CATALOG: SpeechModelManifest[] = [
     files: ['tiny-encoder.onnx', 'tiny-decoder.onnx', 'tiny-tokens.txt'],
     sampleRate: 16000,
     streaming: false
+  },
+  {
+    id: 'openai-gpt-4o-mini-transcribe',
+    label: 'GPT-4o mini Transcribe',
+    description:
+      'Cloud transcription with strong accuracy and low cost. Requires an OpenAI API key.',
+    type: 'openai',
+    provider: 'openai',
+    language: 'multilingual',
+    sampleRate: 16000,
+    streaming: false
+  },
+  {
+    id: 'openai-gpt-4o-transcribe',
+    label: 'GPT-4o Transcribe',
+    description: 'Cloud transcription with higher accuracy. Requires an OpenAI API key.',
+    type: 'openai',
+    provider: 'openai',
+    language: 'multilingual',
+    sampleRate: 16000,
+    streaming: false
   }
 ]
 
 export function getCatalogModel(id: string): SpeechModelManifest | undefined {
   return SPEECH_MODEL_CATALOG.find((m) => m.id === id)
 }
+
+export function isLocalSpeechModel(manifest: SpeechModelManifest): boolean {
+  return manifest.provider === 'local'
+}
diff --git a/src/main/speech/model-manager.test.ts b/src/main/speech/model-manager.test.ts
@@ -6,7 +6,8 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'
 import { SPEECH_MODEL_CATALOG } from './model-catalog'
 import { ModelManager } from './model-manager'
 
-const { httpsGetMock, spawnMock } = vi.hoisted(() => ({
+const { hasOpenAiSpeechApiKeyMock, httpsGetMock, spawnMock } = vi.hoisted(() => ({
+  hasOpenAiSpeechApiKeyMock: vi.fn(),
   httpsGetMock: vi.fn(),
   spawnMock: vi.fn()
 }))
@@ -27,6 +28,10 @@ vi.mock('https', async () => {
   return { ...(actual as Record<string, unknown>), get: httpsGetMock }
 })
 
+vi.mock('./openai-api-key-store', () => ({
+  hasOpenAiSpeechApiKey: hasOpenAiSpeechApiKeyMock
+}))
+
 type ModelManagerInternals = {
   verifyArchiveSha256: (archivePath: string, expectedSha256: string) => Promise<void>
   downloadFile: (
@@ -48,11 +53,16 @@ type ModelManagerInternals = {
 describe('ModelManager', () => {
   beforeEach(() => {
     httpsGetMock.mockReset()
+    hasOpenAiSpeechApiKeyMock.mockReset()
+    hasOpenAiSpeechApiKeyMock.mockReturnValue(false)
     spawnMock.mockReset()
   })
 
   it('requires pinned SHA-256 hashes for every catalog archive', () => {
     for (const manifest of SPEECH_MODEL_CATALOG) {
+      if (manifest.provider !== 'local') {
+        continue
+      }
       expect(manifest.archiveSha256).toMatch(/^[a-f0-9]{64}$/)
     }
   })
@@ -93,6 +103,27 @@ describe('ModelManager', () => {
     }
   })
 
+  it('marks OpenAI transcription models ready only when an API key is configured', async () => {
+    const dir = mkdtempSync(join(tmpdir(), 'orca-model-manager-'))
+    try {
+      const manager = new ModelManager(dir)
+
+      await expect(manager.getModelState('openai-gpt-4o-mini-transcribe')).resolves.toEqual({
+        id: 'openai-gpt-4o-mini-transcribe',
+        status: 'not-downloaded'
+      })
+
+      hasOpenAiSpeechApiKeyMock.mockReturnValue(true)
+
+      await expect(manager.getModelState('openai-gpt-4o-mini-transcribe')).resolves.toEqual({
+        id: 'openai-gpt-4o-mini-transcribe',
+        status: 'ready'
+      })
+    } finally {
+      rmSync(dir, { recursive: true, force: true })
+    }
+  })
+
   it('aborts an in-flight model download request when cancelled', async () => {
     const dir = mkdtempSync(join(tmpdir(), 'orca-model-manager-'))
     try {

diff --git a/src/main/speech/model-manager.ts b/src/main/speech/model-manager.ts
@@ -13,7 +13,8 @@ import type {
   SpeechModelState,
   SpeechModelStatus
 } from '../../shared/speech-types'
-import { SPEECH_MODEL_CATALOG, getCatalogModel } from './model-catalog'
+import { SPEECH_MODEL_CATALOG, getCatalogModel, isLocalSpeechModel } from './model-catalog'
+import { hasOpenAiSpeechApiKey } from './openai-api-key-store'
 import { resolveTarExecutable } from './tar-executable'
 
 type DownloadHandle = {
@@ -68,6 +69,13 @@ export class ModelManager {
       return { id: modelId, status: 'error', error: 'Unknown model' }
     }
 
+    if (manifest.provider === 'openai') {
+      return {
+        id: modelId,
+        status: hasOpenAiSpeechApiKey() ? 'ready' : 'not-downloaded'
+      }
+    }
+
     const modelDir = this.getModelDir(modelId)
     if (existsSync(modelDir) && this.validateModelFiles(manifest, modelDir)) {
       const state: SpeechModelState = { id: modelId, status: 'ready' }
@@ -97,6 +105,9 @@ export class ModelManager {
   }
 
   private validateModelFiles(manifest: SpeechModelManifest, modelDir: string): boolean {
+    if (!manifest.files) {
+      return false
+    }
     return manifest.files.every((f) => existsSync(join(modelDir, f)))
   }
 
@@ -109,6 +120,12 @@ export class ModelManager {
     if (!manifest) {
       throw new Error(`Unknown model: ${modelId}`)
     }
+    if (!isLocalSpeechModel(manifest)) {
+      throw new Error(`Model does not support downloads: ${modelId}`)
+    }
+    if (!manifest.downloadUrl || !manifest.archiveSha256 || !manifest.sizeBytes) {
+      throw new Error(`Model download metadata missing: ${modelId}`)
+    }
 
     const modelDir = this.getModelDir(modelId)
     if (existsSync(modelDir) && this.validateModelFiles(manifest, modelDir)) {
@@ -213,6 +230,10 @@ export class ModelManager {
-    if (!getCatalogModel(modelId)) {
+    const manifest = getCatalogModel(modelId) // single lookup shared by both guards below
+    if (!manifest) {
       throw new Error(`Unknown model: ${modelId}`)
     }
+    if (!isLocalSpeechModel(manifest)) {
+      throw new Error(`Model does not support deletion: ${modelId}`)
+    }
     this.cancelDownload(modelId)
     const modelDir = this.getModelDir(modelId)
     if (existsSync(modelDir)) {
@@ -542,6 +563,9 @@ export class ModelManager {
   }
 
   private async flattenNestedDir(modelDir: string, manifest: SpeechModelManifest): Promise<void> {
+    if (!manifest.files) {
+      return
+    }
     const entries = await readdir(modelDir, { withFileTypes: true })
     for (const entry of entries) {
       if (entry.isDirectory()) {

diff --git a/src/main/speech/openai-api-key-store.test.ts b/src/main/speech/openai-api-key-store.test.ts
@@ -0,0 +1,103 @@
+import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'fs'
+import { tmpdir } from 'os'
+import type * as Os from 'os'
+import { join } from 'path'
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+
+// Stand-in for Electron's safeStorage whose "encryption" is the identity, so
+// tests can assert exactly when the store encrypts or decrypts.
+const safeStorageMock = vi.hoisted(() => ({
+  decryptString: vi.fn((value: Buffer) => value.toString('utf8')),
+  encryptString: vi.fn((value: string) => Buffer.from(value)),
+  isEncryptionAvailable: vi.fn(() => true)
+}))
+
+let tempHome = ''
+
+// Re-imports the store after resetting the module registry, with
+// os.homedir() redirected to the per-test temp directory.
+async function loadStoreModule() {
+  vi.resetModules()
+  vi.doMock('electron', () => ({
+    safeStorage: safeStorageMock
+  }))
+  vi.doMock('os', async () => {
+    const actual = await vi.importActual<typeof Os>('os')
+    return { ...actual, homedir: () => tempHome }
+  })
+  return import('./openai-api-key-store')
+}
+
+beforeEach(() => {
+  tempHome = mkdtempLike('orca-openai-key-store-')
+  safeStorageMock.decryptString.mockClear()
+  safeStorageMock.encryptString.mockClear()
+  safeStorageMock.isEncryptionAvailable.mockClear()
+  safeStorageMock.isEncryptionAvailable.mockReturnValue(true)
+})
+
+// Remove the per-test home directory so repeated runs do not accumulate
+// temporary directories.
+afterEach(() => {
+  if (tempHome) {
+    rmSync(tempHome, { recursive: true, force: true })
+  }
+})
+
+// Creates a uniquely named directory under the OS temp dir.
+function mkdtempLike(prefix: string): string {
+  return mkdtempSync(join(tmpdir(), prefix))
+}
+
+// Seeds <tempHome>/.orca/openai-speech-token.enc with `value`, creating the
+// .orca directory when needed.
+function writeStoredOpenAiKey(value: string): void {
+  const orcaDir = join(tempHome, '.orca')
+  mkdirSync(orcaDir, { recursive: true })
+  writeFileSync(join(orcaDir, 'openai-speech-token.enc'), value)
+}
+
+describe('OpenAI speech API key store', () => {
+  it('checks configured status without decrypting or touching safeStorage', async () => {
+    writeStoredOpenAiKey('encrypted-key')
+    const store = await loadStoreModule()
+
+    expect(store.hasOpenAiSpeechApiKey()).toBe(true)
+    expect(safeStorageMock.isEncryptionAvailable).not.toHaveBeenCalled()
+    expect(safeStorageMock.decryptString).not.toHaveBeenCalled()
+  })
+
+  it('decrypts only when the key is read for an API request', async () => {
+    writeStoredOpenAiKey('encrypted-key')
+    const store = await loadStoreModule()
+
+    expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key')
+    expect(safeStorageMock.decryptString).toHaveBeenCalledOnce()
+  })
+
+  it('caches the decrypted key so repeated dictations do not repeatedly touch safeStorage', async () => {
+    writeStoredOpenAiKey('encrypted-key')
+    const store = await loadStoreModule()
+
+    expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key')
+    expect(store.readOpenAiSpeechApiKey()).toBe('encrypted-key')
+    expect(safeStorageMock.decryptString).toHaveBeenCalledOnce()
+  })
+
+  it('uses the in-memory key after save without decrypting from safeStorage', async () => {
+    const store = await loadStoreModule()
+
+    store.saveOpenAiSpeechApiKey('saved-key')
+
+    expect(store.readOpenAiSpeechApiKey()).toBe('saved-key')
+    expect(safeStorageMock.decryptString).not.toHaveBeenCalled()
+  })
+
+  it('reports missing status without creating storage files', async () => {
+    const store = await loadStoreModule()
+
+    expect(store.hasOpenAiSpeechApiKey()).toBe(false)
+    expect(existsSync(join(tempHome, '.orca'))).toBe(false)
+    expect(safeStorageMock.decryptString).not.toHaveBeenCalled()
+  })
+})