From e62e4eaa75edfa478427d0d895fa47acd8168f37 Mon Sep 17 00:00:00 2001 From: preston176 Date: Tue, 9 Jun 2026 10:55:29 +0300 Subject: [PATCH] fix(transcript): word spacing, empty boxes, and non-monotonic timestamps The transcript rendered run-together ("morePDFlost...") with stray empty boxes, and clicking a word seeked to the wrong place (timestamps were even non-monotonic, e.g. 0:49 -> 0:40). Root cause was in the Whisper post-processing + rendering: - worker stored raw tokens (Whisper emits each with a leading space) and joined segment text with "", so the editor's per-word ); })} diff --git a/apps/web/src/transcription/__tests__/group-words.test.ts b/apps/web/src/transcription/__tests__/group-words.test.ts new file mode 100644 index 00000000..a3e0d863 --- /dev/null +++ b/apps/web/src/transcription/__tests__/group-words.test.ts @@ -0,0 +1,112 @@ +import { describe, it, expect } from "bun:test"; +import { groupWordsIntoSegments, type RawChunk } from "../group-words"; + +const chunk = ( + text: string, + start: number | null | undefined, + end: number | null | undefined, +): RawChunk => ({ text, timestamp: [start, end] }); + +describe("groupWordsIntoSegments", () => { + it("returns no segments for empty/undefined input", () => { + expect(groupWordsIntoSegments({ chunks: undefined })).toEqual([]); + expect(groupWordsIntoSegments({ chunks: [] })).toEqual([]); + }); + + it("trims Whisper's leading-space tokens and space-joins segment text", () => { + // Whisper emits tokens like " Hello" / " world." with leading spaces. 
+ const segments = groupWordsIntoSegments({ + chunks: [chunk(" Hello", 0, 0.5), chunk(" world.", 0.5, 1)], + }); + expect(segments).toHaveLength(1); + expect(segments[0]!.text).toBe("Hello world."); + expect(segments[0]!.words!.map((w) => w.text)).toEqual([ + "Hello", + "world.", + ]); + }); + + it("drops whitespace-only tokens (they otherwise render as empty boxes)", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" To", 0, 0.3), + chunk(" ", 0.3, 0.35), // stray whitespace token + chunk(" aim.", 0.35, 0.8), + ], + }); + expect(segments[0]!.words!.map((w) => w.text)).toEqual(["To", "aim."]); + expect(segments[0]!.text).toBe("To aim."); + }); + + it("skips chunks with missing/null timestamps", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" ok", 0, 0.4), + { text: " bad", timestamp: undefined }, + chunk(" null", null, 1), + chunk(" end.", 0.4, 0.9), + ], + }); + expect(segments[0]!.words!.map((w) => w.text)).toEqual(["ok", "end."]); + }); + + it("clamps backwards timestamps so the timeline stays monotonic", () => { + // Simulates the chunk-boundary bug: a later word reports an EARLIER start. + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" practice", 48, 49), + chunk(" room.", 40, 41), // backwards jump (49 -> 40) + ], + }); + const words = segments.flatMap((s) => s.words ?? 
[]); + // starts/ends must be non-decreasing across the whole transcript + for (let i = 1; i < words.length; i++) { + expect(words[i]!.start).toBeGreaterThanOrEqual(words[i - 1]!.end); + } + // the backwards word is pinned to the previous end, not left at 40 + expect(words[1]!.start).toBe(49); + expect(words[1]!.end).toBe(49); // end clamped up to start (was 41 < 49) + }); + + it("preserves real forward gaps (< split threshold) while clamping", () => { + const segments = groupWordsIntoSegments({ + chunks: [chunk(" a", 0, 1), chunk(" b.", 1.5, 2)], + }); + const [w0, w1] = segments[0]!.words!; + expect(w0!.start).toBe(0); + expect(w1!.start).toBe(1.5); // genuine gap kept, not collapsed to 1 + expect(w1!.end).toBe(2); + }); + + it("splits into a new segment on sentence-ending punctuation", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" One", 0, 0.4), + chunk(" two.", 0.4, 0.8), + chunk(" Three", 0.9, 1.3), + chunk(" four.", 1.3, 1.7), + ], + }); + expect(segments).toHaveLength(2); + expect(segments[0]!.text).toBe("One two."); + expect(segments[1]!.text).toBe("Three four."); + }); + + it("splits on a long silent gap between words", () => { + const segments = groupWordsIntoSegments({ + chunks: [ + chunk(" before", 0, 0.5), + chunk(" after", 5, 5.5), // > 1s gap + ], + }); + expect(segments).toHaveLength(2); + }); + + it("carries correct segment start/end from its first/last word", () => { + const segments = groupWordsIntoSegments({ + chunks: [chunk(" hi", 2, 2.5), chunk(" there.", 2.5, 3.2)], + }); + expect(segments[0]!.start).toBe(2); + expect(segments[0]!.end).toBe(3.2); + }); +}); diff --git a/apps/web/src/transcription/group-words.ts b/apps/web/src/transcription/group-words.ts new file mode 100644 index 00000000..d3f264fc --- /dev/null +++ b/apps/web/src/transcription/group-words.ts @@ -0,0 +1,79 @@ +import type { TranscriptionSegment, TranscriptionWord } from "./types"; + +/** A raw word chunk as produced by Whisper's 
// `return_timestamps: "word"`. */
export interface RawChunk {
  /** Token text as emitted; may carry a leading space or be whitespace-only. */
  text: string;
  /** `[start, end]` of the token; the tuple or either bound may be missing. */
  timestamp?: [number | null | undefined, number | null | undefined];
}

/** A word ending in `.`, `!` or `?` closes the current segment. */
const SENTENCE_END = /[.!?]$/;
/** A pause longer than this between two consecutive words starts a new segment. */
const MAX_GAP_SECONDS = 1.0;
/** Upper bound on segment length when no punctuation or pause splits it earlier. */
const MAX_WORDS_PER_SEGMENT = 20;

/**
 * Normalize raw chunks into trimmed words on a non-decreasing timeline.
 *
 * A chunk is dropped when it has no usable timestamp tuple, when its start is
 * missing or not a finite number, or when its text is empty after trimming.
 * A missing or non-finite end falls back to the chunk's own start.
 *
 * Non-finite values must be filtered here: `Math.max(NaN, x)` is `NaN`, so a
 * single bad timestamp would otherwise leak into `prevEnd` and corrupt the
 * start/end of every word that follows it.
 */
function toMonotonicWords(chunks: RawChunk[]): TranscriptionWord[] {
  const words: TranscriptionWord[] = [];
  let prevEnd = 0;

  for (const chunk of chunks) {
    if (!chunk.timestamp || chunk.timestamp.length < 2) continue;

    const rawStart = chunk.timestamp[0];
    if (rawStart == null || !Number.isFinite(rawStart)) continue;

    const candidateEnd = chunk.timestamp[1];
    const rawEnd =
      candidateEnd != null && Number.isFinite(candidateEnd)
        ? candidateEnd
        : rawStart;

    const text = chunk.text.trim();
    if (text.length === 0) continue;

    // Pin a word that jumps backwards to where the previous word ended, and
    // never let a word end before it starts.
    const start = Math.max(rawStart, prevEnd);
    const end = Math.max(rawEnd, start);
    prevEnd = end;
    words.push({ text, start, end });
  }

  return words;
}

/**
 * Turn Whisper word chunks into sentence-ish segments, normalizing the things
 * Whisper gets wrong over chunked/stride long-form audio:
 *
 * 1. Each token carries a leading space and some chunks are whitespace-only —
 *    every token is trimmed and empties are dropped (empties otherwise render
 *    as empty clickable boxes, and the leading spaces break word-by-word
 *    rendering).
 * 2. Word timestamps can jump backwards at chunk boundaries (the stride
 *    overlap is transcribed twice), which scrambles click-to-seek. Each word
 *    is clamped to start no earlier than the previous word ended, keeping the
 *    timeline monotonic.
 * 3. Non-finite timestamps (`NaN`, `±Infinity`) are treated as missing so one
 *    bad value cannot corrupt every later word.
 *
 * A segment is closed after a word that ends in `.`, `!` or `?`, before a
 * pause longer than `MAX_GAP_SECONDS`, or once it holds
 * `MAX_WORDS_PER_SEGMENT` words.
 *
 * @param chunks Raw word chunks; `undefined` or an empty array yields `[]`.
 * @returns Segments in input order. Each segment's `text` is its words joined
 *   with single spaces, and its `start`/`end` come from its first/last word.
 */
export function groupWordsIntoSegments({
  chunks,
}: {
  chunks: RawChunk[] | undefined;
}): TranscriptionSegment[] {
  if (!chunks || chunks.length === 0) return [];

  const words = toMonotonicWords(chunks);
  const segments: TranscriptionSegment[] = [];
  let buffer: TranscriptionWord[] = [];

  const flush = (): void => {
    const first = buffer[0];
    const last = buffer[buffer.length - 1];
    if (first === undefined || last === undefined) return;
    segments.push({
      // Tokens are already trimmed, so join with single spaces.
      text: buffer.map((w) => w.text).join(" "),
      start: first.start,
      end: last.end,
      words: buffer,
    });
    buffer = [];
  };

  words.forEach((word, index) => {
    buffer.push(word);
    const next = words[index + 1];
    // The last word has no successor, so it can never trigger a gap split.
    const gap = next ? next.start - word.end : 0;
    if (
      SENTENCE_END.test(word.text) ||
      gap > MAX_GAP_SECONDS ||
      buffer.length >= MAX_WORDS_PER_SEGMENT
    ) {
      flush();
    }
  });
  flush();

  return segments;
}