Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 2 additions & 58 deletions apps/web/src/services/transcription/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@ import {
type AutomaticSpeechRecognitionPipeline,
type AutomaticSpeechRecognitionOutput,
} from "@huggingface/transformers";
import type {
TranscriptionSegment,
TranscriptionWord,
} from "@/transcription/types";
import type { TranscriptionSegment } from "@/transcription/types";
import { groupWordsIntoSegments } from "@/transcription/group-words";
import {
DEFAULT_CHUNK_LENGTH_SECONDS,
DEFAULT_STRIDE_SECONDS,
Expand Down Expand Up @@ -166,57 +164,3 @@ async function handleTranscribe({
}
}

/**
 * One raw chunk handed to `groupWordsIntoSegments`: a piece of text plus an
 * optional `[start, end]` timestamp pair.
 */
interface RawChunk {
  /** Chunk text exactly as received (not normalized here). */
  text: string;
  /**
   * Start/end pair for the chunk. The pair itself may be absent, and either
   * slot may be `null` or `undefined`.
   */
  timestamp?: [
    start: number | null | undefined,
    end: number | null | undefined,
  ];
}

/**
 * Groups word-level chunks into sentence-like segments.
 *
 * Normalization applied to each chunk before grouping:
 * - chunks without a usable (finite) start timestamp are skipped;
 * - a missing or non-finite end timestamp falls back to the start;
 * - the text is trimmed and whitespace-only chunks are dropped, so segment
 *   text can be built by joining words with single spaces;
 * - start/end are clamped so no word starts before the previous word ended,
 *   keeping the timeline non-decreasing.
 *
 * A segment is closed when a word ends with `.`, `!` or `?`, when the gap to
 * the next word exceeds `MAX_GAP`, or when it reaches `MAX_WORDS` words.
 *
 * @param chunks Raw chunks to group; `undefined` or empty yields `[]`.
 * @returns Segments in input order, each carrying its words.
 */
function groupWordsIntoSegments({
  chunks,
}: {
  chunks: RawChunk[] | undefined;
}): TranscriptionSegment[] {
  if (!chunks || chunks.length === 0) return [];

  const SENTENCE_END = /[.!?]$/;
  const MAX_GAP = 1.0;
  const MAX_WORDS = 20;

  const words: TranscriptionWord[] = [];
  let prevEnd = 0;
  for (const chunk of chunks) {
    const stamp = chunk.timestamp;
    if (!stamp || stamp.length < 2) continue;

    // `== null` alone lets NaN/Infinity through, which would then poison
    // every later clamped timestamp; require a finite number instead.
    const rawStart = stamp[0];
    if (typeof rawStart !== "number" || !Number.isFinite(rawStart)) continue;
    const candidateEnd = stamp[1];
    const rawEnd =
      typeof candidateEnd === "number" && Number.isFinite(candidateEnd)
        ? candidateEnd
        : rawStart;

    // Whitespace-only tokens would otherwise become empty words.
    const text = chunk.text.trim();
    if (text.length === 0) continue;

    // Never let a word start before the previous one ended.
    const start = Math.max(rawStart, prevEnd);
    const end = Math.max(rawEnd, start);
    prevEnd = end;
    words.push({ text, start, end });
  }

  if (words.length === 0) return [];

  const segments: TranscriptionSegment[] = [];
  let buffer: TranscriptionWord[] = [];
  const flush = (): void => {
    const first = buffer[0];
    const last = buffer[buffer.length - 1];
    if (first === undefined || last === undefined) return;
    segments.push({
      // Words are already trimmed, so a single space is the separator.
      text: buffer.map((w) => w.text).join(" "),
      start: first.start,
      end: last.end,
      words: buffer,
    });
    // Start a fresh array so the segment keeps sole ownership of its words.
    buffer = [];
  };

  for (const [i, word] of words.entries()) {
    buffer.push(word);
    const next = words[i + 1];
    const gap = next ? next.start - word.end : 0;
    if (
      SENTENCE_END.test(word.text) ||
      gap > MAX_GAP ||
      buffer.length >= MAX_WORDS
    ) {
      flush();
    }
  }
  // Emit whatever is left after the last word.
  flush();
  return segments;
}
7 changes: 6 additions & 1 deletion apps/web/src/transcript-editor/transcript-panel.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"use client";

import {
Fragment,
useCallback,
useEffect,
useMemo,
Expand Down Expand Up @@ -739,6 +740,9 @@ function TranscriptView({
</div>
<p>
{segment.words.map((word) => {
// Skip whitespace-only tokens (e.g. legacy transcripts) so
// they don't render as empty clickable boxes.
if (word.text.trim().length === 0) return null;
if (word.id === editingWordId) {
return (
<input
Expand Down Expand Up @@ -821,7 +825,8 @@ function TranscriptView({
aria-pressed={isSelected}
title={isDeleted ? "Click to restore" : undefined}
>
{word.text}
{word.text.trim()}
{" "}
</button>
);
})}
Expand Down
112 changes: 112 additions & 0 deletions apps/web/src/transcription/__tests__/group-words.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import { describe, it, expect } from "bun:test";
import { groupWordsIntoSegments, type RawChunk } from "../group-words";

/**
 * Test fixture factory: wraps a text and a start/end pair into a `RawChunk`
 * whose `timestamp` is `[start, end]`.
 */
function chunk(
  text: string,
  start: number | null | undefined,
  end: number | null | undefined,
): RawChunk {
  const pair: [typeof start, typeof end] = [start, end];
  return { text, timestamp: pair };
}

// Behavioural spec for groupWordsIntoSegments: token normalization, timestamp
// clamping, and each of the three segment-splitting rules (sentence end,
// silent gap, word cap).
describe("groupWordsIntoSegments", () => {
  it("returns no segments for empty/undefined input", () => {
    expect(groupWordsIntoSegments({ chunks: undefined })).toEqual([]);
    expect(groupWordsIntoSegments({ chunks: [] })).toEqual([]);
  });

  it("trims Whisper's leading-space tokens and space-joins segment text", () => {
    // Whisper emits tokens like " Hello" / " world." with leading spaces.
    const segments = groupWordsIntoSegments({
      chunks: [chunk(" Hello", 0, 0.5), chunk(" world.", 0.5, 1)],
    });
    expect(segments).toHaveLength(1);
    expect(segments[0]!.text).toBe("Hello world.");
    expect(segments[0]!.words!.map((w) => w.text)).toEqual([
      "Hello",
      "world.",
    ]);
  });

  it("drops whitespace-only tokens (they otherwise render as empty boxes)", () => {
    const segments = groupWordsIntoSegments({
      chunks: [
        chunk(" To", 0, 0.3),
        chunk(" ", 0.3, 0.35), // stray whitespace token
        chunk(" aim.", 0.35, 0.8),
      ],
    });
    expect(segments[0]!.words!.map((w) => w.text)).toEqual(["To", "aim."]);
    expect(segments[0]!.text).toBe("To aim.");
  });

  it("skips chunks with missing/null timestamps", () => {
    const segments = groupWordsIntoSegments({
      chunks: [
        chunk(" ok", 0, 0.4),
        { text: " bad", timestamp: undefined },
        chunk(" null", null, 1),
        chunk(" end.", 0.4, 0.9),
      ],
    });
    expect(segments[0]!.words!.map((w) => w.text)).toEqual(["ok", "end."]);
  });

  it("falls back to the start when the end timestamp is missing", () => {
    const segments = groupWordsIntoSegments({
      chunks: [chunk(" first.", 1, null), chunk(" second.", 2, undefined)],
    });
    const words = segments.flatMap((s) => s.words ?? []);
    expect(words).toHaveLength(2);
    expect(words[0]!.start).toBe(1);
    expect(words[0]!.end).toBe(1);
    expect(words[1]!.start).toBe(2);
    expect(words[1]!.end).toBe(2);
  });

  it("clamps backwards timestamps so the timeline stays monotonic", () => {
    // Simulates the chunk-boundary bug: a later word reports an EARLIER start.
    const segments = groupWordsIntoSegments({
      chunks: [
        chunk(" practice", 48, 49),
        chunk(" room.", 40, 41), // backwards jump (49 -> 40)
      ],
    });
    const words = segments.flatMap((s) => s.words ?? []);
    // starts/ends must be non-decreasing across the whole transcript
    for (let i = 1; i < words.length; i++) {
      expect(words[i]!.start).toBeGreaterThanOrEqual(words[i - 1]!.end);
    }
    // the backwards word is pinned to the previous end, not left at 40
    expect(words[1]!.start).toBe(49);
    expect(words[1]!.end).toBe(49); // end clamped up to start (was 41 < 49)
  });

  it("preserves real forward gaps (< split threshold) while clamping", () => {
    const segments = groupWordsIntoSegments({
      chunks: [chunk(" a", 0, 1), chunk(" b.", 1.5, 2)],
    });
    const [w0, w1] = segments[0]!.words!;
    expect(w0!.start).toBe(0);
    expect(w1!.start).toBe(1.5); // genuine gap kept, not collapsed to 1
    expect(w1!.end).toBe(2);
  });

  it("splits into a new segment on sentence-ending punctuation", () => {
    const segments = groupWordsIntoSegments({
      chunks: [
        chunk(" One", 0, 0.4),
        chunk(" two.", 0.4, 0.8),
        chunk(" Three", 0.9, 1.3),
        chunk(" four.", 1.3, 1.7),
      ],
    });
    expect(segments).toHaveLength(2);
    expect(segments[0]!.text).toBe("One two.");
    expect(segments[1]!.text).toBe("Three four.");
  });

  it("splits on a long silent gap between words", () => {
    const segments = groupWordsIntoSegments({
      chunks: [
        chunk(" before", 0, 0.5),
        chunk(" after", 5, 5.5), // > 1s gap
      ],
    });
    expect(segments).toHaveLength(2);
  });

  it("does not split when the gap is exactly the threshold", () => {
    // The split rule is strictly "greater than" 1s, so a 1s gap stays joined.
    const segments = groupWordsIntoSegments({
      chunks: [chunk(" a", 0, 1), chunk(" b", 2, 3)],
    });
    expect(segments).toHaveLength(1);
    expect(segments[0]!.text).toBe("a b");
  });

  it("caps a segment at 20 words when nothing else splits it", () => {
    // 25 back-to-back words, no punctuation and no gaps between them.
    const segments = groupWordsIntoSegments({
      chunks: Array.from({ length: 25 }, (_, i) => chunk(` w${i}`, i, i + 1)),
    });
    expect(segments).toHaveLength(2);
    expect(segments[0]!.words!).toHaveLength(20);
    expect(segments[1]!.words!).toHaveLength(5);
    expect(segments[0]!.end).toBe(20);
    expect(segments[1]!.start).toBe(20);
  });

  it("carries correct segment start/end from its first/last word", () => {
    const segments = groupWordsIntoSegments({
      chunks: [chunk(" hi", 2, 2.5), chunk(" there.", 2.5, 3.2)],
    });
    expect(segments[0]!.start).toBe(2);
    expect(segments[0]!.end).toBe(3.2);
  });
});
79 changes: 79 additions & 0 deletions apps/web/src/transcription/group-words.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import type { TranscriptionSegment, TranscriptionWord } from "./types";

/**
 * A raw word chunk as produced by Whisper's `return_timestamps: "word"`:
 * the token text plus an optional `[start, end]` timestamp pair.
 */
export interface RawChunk {
  /** Token text exactly as received (not yet trimmed). */
  text: string;
  /**
   * Start/end pair for the token. The pair itself may be absent, and either
   * slot may be `null` or `undefined`.
   */
  timestamp?: [
    start: number | null | undefined,
    end: number | null | undefined,
  ];
}

/** Matches a (trimmed) token that ends with `.`, `!` or `?`. */
const SENTENCE_END = /[.!?]$/;
/** A gap between consecutive words larger than this starts a new segment. */
const MAX_GAP_SECONDS = 1.0;
/** A segment is closed once it holds this many words. */
const MAX_WORDS_PER_SEGMENT = 20;

/** True only for real, finite numbers (rejects null, undefined, NaN, ±Infinity). */
function isFiniteNumber(value: unknown): value is number {
  return typeof value === "number" && Number.isFinite(value);
}

/**
 * Turn Whisper word chunks into sentence-ish segments, normalizing the things
 * Whisper gets wrong over chunked/stride long-form audio:
 *
 * 1. Each token carries a leading space and some chunks are whitespace-only —
 *    we trim every token and drop empties (empties otherwise render as empty
 *    clickable boxes, and the leading spaces break word-by-word rendering).
 * 2. Word timestamps can jump backwards at chunk boundaries (the stride
 *    overlap is transcribed twice), which scrambles click-to-seek. We clamp
 *    each word to start no earlier than the previous word ended, keeping the
 *    timeline monotonic.
 * 3. Timestamps that are missing or not finite numbers are never trusted: a
 *    chunk without a finite start is skipped, and a missing/non-finite end
 *    falls back to the start. Otherwise a single `NaN` or `Infinity` would
 *    propagate through the clamp into every later word.
 *
 * A segment is closed when a word ends a sentence, when the gap to the next
 * word exceeds `MAX_GAP_SECONDS`, or when it reaches `MAX_WORDS_PER_SEGMENT`.
 *
 * @param chunks Raw word chunks; `undefined` or empty yields `[]`.
 * @returns Segments in input order, each carrying its normalized words.
 */
export function groupWordsIntoSegments({
  chunks,
}: {
  chunks: RawChunk[] | undefined;
}): TranscriptionSegment[] {
  if (!chunks || chunks.length === 0) return [];

  const words: TranscriptionWord[] = [];
  let prevEnd = 0;
  for (const chunk of chunks) {
    const stamp = chunk.timestamp;
    if (!stamp || stamp.length < 2) continue;

    const [rawStart, maybeEnd] = stamp;
    if (!isFiniteNumber(rawStart)) continue;
    const rawEnd = isFiniteNumber(maybeEnd) ? maybeEnd : rawStart;

    const text = chunk.text.trim();
    if (text.length === 0) continue;

    // Clamp so this word never starts before the previous one ended, and
    // never ends before it starts.
    const start = Math.max(rawStart, prevEnd);
    const end = Math.max(rawEnd, start);
    prevEnd = end;
    words.push({ text, start, end });
  }

  if (words.length === 0) return [];

  const segments: TranscriptionSegment[] = [];
  let buffer: TranscriptionWord[] = [];
  const flush = (): void => {
    const first = buffer[0];
    const last = buffer[buffer.length - 1];
    if (first === undefined || last === undefined) return;
    segments.push({
      // Tokens are already trimmed, so join with single spaces.
      text: buffer.map((w) => w.text).join(" "),
      start: first.start,
      end: last.end,
      words: buffer,
    });
    // Fresh array: the pushed segment keeps the old one as its `words`.
    buffer = [];
  };

  for (const [i, word] of words.entries()) {
    buffer.push(word);
    const next = words[i + 1];
    // No next word means no gap; the trailing flush below handles the tail.
    const gap = next ? next.start - word.end : 0;
    if (
      SENTENCE_END.test(word.text) ||
      gap > MAX_GAP_SECONDS ||
      buffer.length >= MAX_WORDS_PER_SEGMENT
    ) {
      flush();
    }
  }
  flush();
  return segments;
}
Loading