diff --git a/apps/web/src/services/transcription/worker.ts b/apps/web/src/services/transcription/worker.ts index 244bb874..8022ac51 100644 --- a/apps/web/src/services/transcription/worker.ts +++ b/apps/web/src/services/transcription/worker.ts @@ -3,10 +3,8 @@ import { type AutomaticSpeechRecognitionPipeline, type AutomaticSpeechRecognitionOutput, } from "@huggingface/transformers"; -import type { - TranscriptionSegment, - TranscriptionWord, -} from "@/transcription/types"; +import type { TranscriptionSegment } from "@/transcription/types"; +import { groupWordsIntoSegments } from "@/transcription/group-words"; import { DEFAULT_CHUNK_LENGTH_SECONDS, DEFAULT_STRIDE_SECONDS, @@ -166,57 +164,3 @@ async function handleTranscribe({ } } -interface RawChunk { - text: string; - timestamp?: [number | null | undefined, number | null | undefined]; -} - -function groupWordsIntoSegments({ - chunks, -}: { - chunks: RawChunk[] | undefined; -}): TranscriptionSegment[] { - if (!chunks || chunks.length === 0) return []; - - const words: TranscriptionWord[] = []; - for (const chunk of chunks) { - if (!chunk.timestamp || chunk.timestamp.length < 2) continue; - const start = chunk.timestamp[0]; - const end = chunk.timestamp[1] ?? start; - if (start == null || end == null) continue; - words.push({ text: chunk.text, start, end }); - } - - if (words.length === 0) return []; - - const segments: TranscriptionSegment[] = []; - const SENTENCE_END = /[.!?]$/; - const MAX_GAP = 1.0; - const MAX_WORDS = 20; - - let buffer: TranscriptionWord[] = []; - const flush = () => { - if (buffer.length === 0) return; - segments.push({ - text: buffer.map((w) => w.text).join("").trim(), - start: buffer[0].start, - end: buffer[buffer.length - 1].end, - words: buffer, - }); - buffer = []; - }; - - for (let i = 0; i < words.length; i++) { - const word = words[i]; - buffer.push(word); - const next = words[i + 1]; - const gap = next ? 
next.start - word.end : 0; - const trimmed = word.text.trim(); - const endsSentence = SENTENCE_END.test(trimmed); - if (endsSentence || gap > MAX_GAP || buffer.length >= MAX_WORDS) { - flush(); - } - } - flush(); - return segments; -} diff --git a/apps/web/src/transcript-editor/transcript-panel.tsx b/apps/web/src/transcript-editor/transcript-panel.tsx index 8e526bcd..e9768fa6 100644 --- a/apps/web/src/transcript-editor/transcript-panel.tsx +++ b/apps/web/src/transcript-editor/transcript-panel.tsx @@ -1,6 +1,7 @@ "use client"; import { + Fragment, useCallback, useEffect, useMemo, @@ -739,6 +740,9 @@ function TranscriptView({
{segment.words.map((word) => { + // Skip whitespace-only tokens (e.g. legacy transcripts) so + // they don't render as empty clickable boxes. + if (word.text.trim().length === 0) return null; if (word.id === editingWordId) { return ( - {word.text} + {word.text.trim()} + {" "} ); })} diff --git a/apps/web/src/transcription/__tests__/group-words.test.ts b/apps/web/src/transcription/__tests__/group-words.test.ts new file mode 100644 index 00000000..a3e0d863 --- /dev/null +++ b/apps/web/src/transcription/__tests__/group-words.test.ts @@ -0,0 +1,112 @@ +import { describe, it, expect } from "bun:test"; +import { groupWordsIntoSegments, type RawChunk } from "../group-words"; + +const chunk = ( + text: string, + start: number | null | undefined, + end: number | null | undefined, +): RawChunk => ({ text, timestamp: [start, end] }); + +describe("groupWordsIntoSegments", () => { + it("returns no segments for empty/undefined input", () => { + expect(groupWordsIntoSegments({ chunks: undefined })).toEqual([]); + expect(groupWordsIntoSegments({ chunks: [] })).toEqual([]); + }); + + it("trims Whisper's leading-space tokens and space-joins segment text", () => { + // Whisper emits tokens like " Hello" / " world." with leading spaces. 
+ const segments = groupWordsIntoSegments({ + chunks: [chunk(" Hello", 0, 0.5), chunk(" world.", 0.5, 1)], + }); + expect(segments).toHaveLength(1); + expect(segments[0]!.text).toBe("Hello world."); + expect(segments[0]!.words!.map((w) => w.text)).toEqual([ + "Hello", + "world.", + ]); + }); + + it("drops whitespace-only tokens (they otherwise render as empty boxes)", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" To", 0, 0.3), + chunk(" ", 0.3, 0.35), // stray whitespace token + chunk(" aim.", 0.35, 0.8), + ], + }); + expect(segments[0]!.words!.map((w) => w.text)).toEqual(["To", "aim."]); + expect(segments[0]!.text).toBe("To aim."); + }); + + it("skips chunks with missing/null timestamps", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" ok", 0, 0.4), + { text: " bad", timestamp: undefined }, + chunk(" null", null, 1), + chunk(" end.", 0.4, 0.9), + ], + }); + expect(segments[0]!.words!.map((w) => w.text)).toEqual(["ok", "end."]); + }); + + it("clamps backwards timestamps so the timeline stays monotonic", () => { + // Simulates the chunk-boundary bug: a later word reports an EARLIER start. + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" practice", 48, 49), + chunk(" room.", 40, 41), // backwards jump (49 -> 40) + ], + }); + const words = segments.flatMap((s) => s.words ?? 
[]); + // starts/ends must be non-decreasing across the whole transcript + for (let i = 1; i < words.length; i++) { + expect(words[i]!.start).toBeGreaterThanOrEqual(words[i - 1]!.end); + } + // the backwards word is pinned to the previous end, not left at 40 + expect(words[1]!.start).toBe(49); + expect(words[1]!.end).toBe(49); // end clamped up to start (was 41 < 49) + }); + + it("preserves real forward gaps (< split threshold) while clamping", () => { + const segments = groupWordsIntoSegments({ + chunks: [chunk(" a", 0, 1), chunk(" b.", 1.5, 2)], + }); + const [w0, w1] = segments[0]!.words!; + expect(w0!.start).toBe(0); + expect(w1!.start).toBe(1.5); // genuine gap kept, not collapsed to 1 + expect(w1!.end).toBe(2); + }); + + it("splits into a new segment on sentence-ending punctuation", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" One", 0, 0.4), + chunk(" two.", 0.4, 0.8), + chunk(" Three", 0.9, 1.3), + chunk(" four.", 1.3, 1.7), + ], + }); + expect(segments).toHaveLength(2); + expect(segments[0]!.text).toBe("One two."); + expect(segments[1]!.text).toBe("Three four."); + }); + + it("splits on a long silent gap between words", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" before", 0, 0.5), + chunk(" after", 5, 5.5), // > 1s gap + ], + }); + expect(segments).toHaveLength(2); + }); + + it("carries correct segment start/end from its first/last word", () => { + const segments = groupWordsIntoSegments({ + chunks: [chunk(" hi", 2, 2.5), chunk(" there.", 2.5, 3.2)], + }); + expect(segments[0]!.start).toBe(2); + expect(segments[0]!.end).toBe(3.2); + }); +}); diff --git a/apps/web/src/transcription/group-words.ts b/apps/web/src/transcription/group-words.ts new file mode 100644 index 00000000..d3f264fc --- /dev/null +++ b/apps/web/src/transcription/group-words.ts @@ -0,0 +1,79 @@ +import type { TranscriptionSegment, TranscriptionWord } from "./types"; + +/** A raw word chunk as produced by Whisper's 
// `return_timestamps: "word"`. */
export interface RawChunk {
  /** Token text as emitted by the model; may carry leading whitespace or be whitespace-only. */
  text: string;
  /** `[start, end]` in seconds; the tuple or either bound may be missing. */
  timestamp?: [number | null | undefined, number | null | undefined];
}

/** A word whose trimmed text ends in `.`, `!` or `?` closes the current segment. */
const SENTENCE_END = /[.!?]$/;
/** A pause longer than this many seconds between consecutive words closes the segment. */
const MAX_GAP_SECONDS = 1.0;
/** A segment is closed once it holds this many words, punctuation or not. */
const MAX_WORDS_PER_SEGMENT = 20;

/**
 * Turn Whisper word chunks into sentence-ish segments, normalizing what the
 * raw output gets wrong over chunked/stride long-form audio:
 *
 * 1. Tokens carry a leading space and some are whitespace-only. Every token
 *    is trimmed and empties are dropped, so segment text can be joined with
 *    single spaces.
 * 2. Word timestamps can jump backwards at chunk boundaries. Each word is
 *    clamped to start no earlier than the previous word ended (and to end no
 *    earlier than it starts), keeping the timeline monotonic.
 * 3. Chunks without a usable timestamp are skipped: a missing tuple, a
 *    null/undefined start, or a non-finite (`NaN`/`Infinity`) bound. A missing
 *    end falls back to the start. Non-finite values must be rejected before
 *    clamping, otherwise they propagate through `Math.max` into every
 *    subsequent word.
 *
 * A segment is closed after a word that ends a sentence, when the gap to the
 * next word exceeds `MAX_GAP_SECONDS`, or when it reaches
 * `MAX_WORDS_PER_SEGMENT` words.
 *
 * @param chunks Raw word chunks; `undefined` or empty yields no segments.
 * @returns Segments in input order, each carrying its words plus the start of
 *   its first word and the end of its last word.
 */
export function groupWordsIntoSegments({
  chunks,
}: {
  chunks: RawChunk[] | undefined;
}): TranscriptionSegment[] {
  if (!chunks || chunks.length === 0) return [];

  const words: TranscriptionWord[] = [];
  let prevEnd = 0;
  for (const chunk of chunks) {
    // Runtime guard: the data comes from the model, not from our own types.
    if (!chunk.timestamp || chunk.timestamp.length < 2) continue;
    const rawStart = chunk.timestamp[0];
    const rawEnd = chunk.timestamp[1] ?? rawStart;
    if (rawStart == null || rawEnd == null) continue;
    // NaN/Infinity would otherwise poison `prevEnd` for the rest of the loop.
    if (!Number.isFinite(rawStart) || !Number.isFinite(rawEnd)) continue;

    const text = chunk.text.trim();
    if (text.length === 0) continue;

    const start = Math.max(rawStart, prevEnd);
    const end = Math.max(rawEnd, start);
    prevEnd = end;
    words.push({ text, start, end });
  }

  if (words.length === 0) return [];

  const segments: TranscriptionSegment[] = [];
  let buffer: TranscriptionWord[] = [];
  const flush = (): void => {
    const first = buffer[0];
    const last = buffer[buffer.length - 1];
    // Both are undefined exactly when the buffer is empty.
    if (first === undefined || last === undefined) return;
    segments.push({
      // Tokens are already trimmed, so join with single spaces.
      text: buffer.map((w) => w.text).join(" "),
      start: first.start,
      end: last.end,
      words: buffer,
    });
    buffer = [];
  };

  for (const [index, word] of words.entries()) {
    buffer.push(word);
    const next = words[index + 1];
    // The last word has no successor, so it never splits on a gap.
    const gap = next ? next.start - word.end : 0;
    const endsSentence = SENTENCE_END.test(word.text);
    if (
      endsSentence ||
      gap > MAX_GAP_SECONDS ||
      buffer.length >= MAX_WORDS_PER_SEGMENT
    ) {
      flush();
    }
  }
  flush();
  return segments;
}