From e62e4eaa75edfa478427d0d895fa47acd8168f37 Mon Sep 17 00:00:00 2001
From: preston176 <prestonnyamweya@gmail.com>
Date: Tue, 9 Jun 2026 10:55:29 +0300
Subject: [PATCH] fix(transcript): word spacing, empty boxes, and non-monotonic
 timestamps

The transcript rendered run-together ("morePDFlost...") with stray empty
boxes, and clicking a word seeked to the wrong place (timestamps were even
non-monotonic, e.g. 0:49 -> 0:40).

Root cause was in the Whisper post-processing + rendering:
- worker stored raw, untrimmed tokens (Whisper emits each with a leading
  space) and joined segment text with "", so that leading space was the only
  word separator; the editor renders each word in its own <button>, where the
  words ran together, and whitespace-only chunks became empty clickable boxes
- word timestamps from chunked/stride long-form audio jump backwards at chunk
  boundaries, scrambling click-to-seek

Fixes:
- extract groupWordsIntoSegments into a pure, tested module
  (transcription/group-words.ts): trim each token, drop empties, and clamp
  each word to start >= previous word's end (monotonic), space-join segments
- transcript panel: render a space separator between words, trim display text
  (robust for legacy docs), and skip whitespace-only tokens
- add group-words.test.ts (9 cases: trim, empties, null ts, monotonic clamp,
  gap preservation, segmentation)

Spacing + empty boxes are fixed for existing transcripts on reload (render
layer); improved timestamps apply to newly generated transcripts.
---
 apps/web/src/services/transcription/worker.ts |  60 +---------
 .../transcript-editor/transcript-panel.tsx    |   7 +-
 .../__tests__/group-words.test.ts             | 112 ++++++++++++++++++
 apps/web/src/transcription/group-words.ts     |  79 ++++++++++++
 4 files changed, 199 insertions(+), 59 deletions(-)
 create mode 100644 apps/web/src/transcription/__tests__/group-words.test.ts
 create mode 100644 apps/web/src/transcription/group-words.ts
diff --git a/apps/web/src/services/transcription/worker.ts b/apps/web/src/services/transcription/worker.ts
index 244bb874..8022ac51 100644
--- a/apps/web/src/services/transcription/worker.ts
+++ b/apps/web/src/services/transcription/worker.ts
@@ -3,10 +3,8 @@ import {
 	type AutomaticSpeechRecognitionPipeline,
 	type AutomaticSpeechRecognitionOutput,
 } from "@huggingface/transformers";
-import type {
-	TranscriptionSegment,
-	TranscriptionWord,
-} from "@/transcription/types";
+import type { TranscriptionSegment } from "@/transcription/types";
+import { groupWordsIntoSegments } from "@/transcription/group-words";
 import {
 	DEFAULT_CHUNK_LENGTH_SECONDS,
 	DEFAULT_STRIDE_SECONDS,
@@ -166,57 +164,3 @@ async function handleTranscribe({
 	}
 }
 
-interface RawChunk {
-	text: string;
-	timestamp?: [number | null | undefined, number | null | undefined];
-}
-
-function groupWordsIntoSegments({
-	chunks,
-}: {
-	chunks: RawChunk[] | undefined;
-}): TranscriptionSegment[] {
-	if (!chunks || chunks.length === 0) return [];
-
-	const words: TranscriptionWord[] = [];
-	for (const chunk of chunks) {
-		if (!chunk.timestamp || chunk.timestamp.length < 2) continue;
-		const start = chunk.timestamp[0];
-		const end = chunk.timestamp[1] ?? start;
-		if (start == null || end == null) continue;
-		words.push({ text: chunk.text, start, end });
-	}
-
-	if (words.length === 0) return [];
-
-	const segments: TranscriptionSegment[] = [];
-	const SENTENCE_END = /[.!?]$/;
-	const MAX_GAP = 1.0;
-	const MAX_WORDS = 20;
-
-	let buffer: TranscriptionWord[] = [];
-	const flush = () => {
-		if (buffer.length === 0) return;
-		segments.push({
-			text: buffer.map((w) => w.text).join("").trim(),
-			start: buffer[0].start,
-			end: buffer[buffer.length - 1].end,
-			words: buffer,
-		});
-		buffer = [];
-	};
-
-	for (let i = 0; i < words.length; i++) {
-		const word = words[i];
-		buffer.push(word);
-		const next = words[i + 1];
-		const gap = next ? next.start - word.end : 0;
-		const trimmed = word.text.trim();
-		const endsSentence = SENTENCE_END.test(trimmed);
-		if (endsSentence || gap > MAX_GAP || buffer.length >= MAX_WORDS) {
-			flush();
-		}
-	}
-	flush();
-	return segments;
-}
diff --git a/apps/web/src/transcript-editor/transcript-panel.tsx b/apps/web/src/transcript-editor/transcript-panel.tsx
index 8e526bcd..e9768fa6 100644
--- a/apps/web/src/transcript-editor/transcript-panel.tsx
+++ b/apps/web/src/transcript-editor/transcript-panel.tsx
@@ -1,6 +1,7 @@
 "use client";
 
 import {
+	Fragment,
 	useCallback,
 	useEffect,
 	useMemo,
@@ -739,6 +740,9 @@ function TranscriptView({
 					</div>
 					<p>
 						{segment.words.map((word) => {
+							// Skip whitespace-only tokens (e.g. legacy transcripts) so
+							// they don't render as empty clickable boxes.
+							if (word.text.trim().length === 0) return null;
 							if (word.id === editingWordId) {
 								return (
 									<input
@@ -821,7 +825,8 @@ function TranscriptView({
 									aria-pressed={isSelected}
 									title={isDeleted ? "Click to restore" : undefined}
 								>
-									{word.text}
+									{word.text.trim()}
+								{" "}
 								</button>
 							);
 						})}
diff --git a/apps/web/src/transcription/__tests__/group-words.test.ts b/apps/web/src/transcription/__tests__/group-words.test.ts
new file mode 100644
index 00000000..a3e0d863
--- /dev/null
+++ b/apps/web/src/transcription/__tests__/group-words.test.ts
@@ -0,0 +1,112 @@
+import { describe, it, expect } from "bun:test";
+import { groupWordsIntoSegments, type RawChunk } from "../group-words";
+
+const chunk = (
+	text: string,
+	start: number | null | undefined,
+	end: number | null | undefined,
+): RawChunk => ({ text, timestamp: [start, end] });
+
+describe("groupWordsIntoSegments", () => {
+	it("returns no segments for empty/undefined input", () => {
+		expect(groupWordsIntoSegments({ chunks: undefined })).toEqual([]);
+		expect(groupWordsIntoSegments({ chunks: [] })).toEqual([]);
+	});
+
+	it("trims Whisper's leading-space tokens and space-joins segment text", () => {
+		// Whisper emits tokens like " Hello" / " world." with leading spaces.
+		const segments = groupWordsIntoSegments({
+			chunks: [chunk(" Hello", 0, 0.5), chunk(" world.", 0.5, 1)],
+		});
+		expect(segments).toHaveLength(1);
+		expect(segments[0]!.text).toBe("Hello world.");
+		expect(segments[0]!.words!.map((w) => w.text)).toEqual([
+			"Hello",
+			"world.",
+		]);
+	});
+
+	it("drops whitespace-only tokens (they otherwise render as empty boxes)", () => {
+		const segments = groupWordsIntoSegments({
+			chunks: [
+				chunk(" To", 0, 0.3),
+				chunk("   ", 0.3, 0.35), // stray whitespace token
+				chunk(" aim.", 0.35, 0.8),
+			],
+		});
+		expect(segments[0]!.words!.map((w) => w.text)).toEqual(["To", "aim."]);
+		expect(segments[0]!.text).toBe("To aim.");
+	});
+
+	it("skips chunks with missing/null timestamps", () => {
+		const segments = groupWordsIntoSegments({
+			chunks: [
+				chunk(" ok", 0, 0.4),
+				{ text: " bad", timestamp: undefined },
+				chunk(" null", null, 1),
+				chunk(" end.", 0.4, 0.9),
+			],
+		});
+		expect(segments[0]!.words!.map((w) => w.text)).toEqual(["ok", "end."]);
+	});
+
+	it("clamps backwards timestamps so the timeline stays monotonic", () => {
+		// Simulates the chunk-boundary bug: a later word reports an EARLIER start.
+		const segments = groupWordsIntoSegments({
+			chunks: [
+				chunk(" practice", 48, 49),
+				chunk(" room.", 40, 41), // backwards jump (49 -> 40)
+			],
+		});
+		const words = segments.flatMap((s) => s.words ?? []);
+		// starts/ends must be non-decreasing across the whole transcript
+		for (let i = 1; i < words.length; i++) {
+			expect(words[i]!.start).toBeGreaterThanOrEqual(words[i - 1]!.end);
+		}
+		// the backwards word is pinned to the previous end, not left at 40
+		expect(words[1]!.start).toBe(49);
+		expect(words[1]!.end).toBe(49); // end clamped up to start (was 41 < 49)
+	});
+
+	it("preserves real forward gaps (< split threshold) while clamping", () => {
+		const segments = groupWordsIntoSegments({
+			chunks: [chunk(" a", 0, 1), chunk(" b.", 1.5, 2)],
+		});
+		const [w0, w1] = segments[0]!.words!;
+		expect(w0!.start).toBe(0);
+		expect(w1!.start).toBe(1.5); // genuine gap kept, not collapsed to 1
+		expect(w1!.end).toBe(2);
+	});
+
+	it("splits into a new segment on sentence-ending punctuation", () => {
+		const segments = groupWordsIntoSegments({
+			chunks: [
+				chunk(" One", 0, 0.4),
+				chunk(" two.", 0.4, 0.8),
+				chunk(" Three", 0.9, 1.3),
+				chunk(" four.", 1.3, 1.7),
+			],
+		});
+		expect(segments).toHaveLength(2);
+		expect(segments[0]!.text).toBe("One two.");
+		expect(segments[1]!.text).toBe("Three four.");
+	});
+
+	it("splits on a long silent gap between words", () => {
+		const segments = groupWordsIntoSegments({
+			chunks: [
+				chunk(" before", 0, 0.5),
+				chunk(" after", 5, 5.5), // > 1s gap
+			],
+		});
+		expect(segments).toHaveLength(2);
+	});
+
+	it("carries correct segment start/end from its first/last word", () => {
+		const segments = groupWordsIntoSegments({
+			chunks: [chunk(" hi", 2, 2.5), chunk(" there.", 2.5, 3.2)],
+		});
+		expect(segments[0]!.start).toBe(2);
+		expect(segments[0]!.end).toBe(3.2);
+	});
+});
diff --git a/apps/web/src/transcription/group-words.ts b/apps/web/src/transcription/group-words.ts
new file mode 100644
index 00000000..d3f264fc
--- /dev/null
+++ b/apps/web/src/transcription/group-words.ts
@@ -0,0 +1,79 @@
+import type { TranscriptionSegment, TranscriptionWord } from "./types";
+
+/** A raw word chunk as produced by Whisper's `return_timestamps: "word"`. */
+export interface RawChunk {
+	text: string;
+	timestamp?: [number | null | undefined, number | null | undefined];
+}
+
+const SENTENCE_END = /[.!?]$/;
+const MAX_GAP_SECONDS = 1.0;
+const MAX_WORDS_PER_SEGMENT = 20;
+
+/**
+ * Turn Whisper word chunks into sentence-ish segments, normalizing two quirks
+ * of its word-level output that break per-word rendering and click-to-seek:
+ *
+ *  1. Each token carries a leading space and some chunks are whitespace-only —
+ *     we trim every token and drop empties (empties otherwise render as empty
+ *     clickable boxes, and the leading spaces break word-by-word rendering).
+ *  2. Word timestamps can jump backwards at chunk boundaries (the stride
+ *     overlap is transcribed twice), which scrambles click-to-seek. We raise
+ *     each word's start to at least the previous kept word's end, and its end
+ *     to at least its own start, keeping the timeline monotonic.
+ */
+export function groupWordsIntoSegments({
+	chunks,
+}: {
+	chunks: RawChunk[] | undefined;
+}): TranscriptionSegment[] {
+	if (!chunks || chunks.length === 0) return [];
+
+	const words: TranscriptionWord[] = [];
+	let prevEnd = 0;
+	for (const chunk of chunks) {
+		if (!chunk.timestamp || chunk.timestamp.length < 2) continue;
+		const rawStart = chunk.timestamp[0];
+		const rawEnd = chunk.timestamp[1] ?? rawStart;
+		if (rawStart == null || rawEnd == null) continue;
+		const text = chunk.text.trim();
+		if (text.length === 0) continue;
+		const start = Math.max(rawStart, prevEnd);
+		const end = Math.max(rawEnd, start);
+		prevEnd = end;
+		words.push({ text, start, end });
+	}
+
+	if (words.length === 0) return [];
+
+	const segments: TranscriptionSegment[] = [];
+	let buffer: TranscriptionWord[] = [];
+	const flush = () => {
+		if (buffer.length === 0) return;
+		segments.push({
+			// Tokens are already trimmed, so join with single spaces.
+			text: buffer.map((w) => w.text).join(" "),
+			start: buffer[0].start,
+			end: buffer[buffer.length - 1].end,
+			words: buffer,
+		});
+		buffer = [];
+	};
+
+	for (let i = 0; i < words.length; i++) {
+		const word = words[i];
+		buffer.push(word);
+		const next = words[i + 1];
+		const gap = next ? next.start - word.end : 0;
+		const endsSentence = SENTENCE_END.test(word.text);
+		if (
+			endsSentence ||
+			gap > MAX_GAP_SECONDS ||
+			buffer.length >= MAX_WORDS_PER_SEGMENT
+		) {
+			flush();
+		}
+	}
+	flush();
+	return segments;
+}