From f408e832825f1ac9115d6db47d66221d48765cbc Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 8 Jun 2026 16:41:15 +0000 Subject: [PATCH] fix(translation): only extract chunks from the marked text, not its context The detailed-translation and advice models intermittently surfaced reusable patterns (chunks) drawn from the surrounding `context` rather than the user's marked selection. e.g. selecting "improvements are included" returned "related to" and "based on", which appear only in the context paragraph. Add a deterministic guardrail, `filterChunksToSelection`, that drops any chunk whose text does not appear (case- and whitespace-insensitively) inside the marked `phrase`, and apply it to both `getDetailedTranslation` and `getTranslationAdvice`. Also harden both prompts to state explicitly that the context is only for disambiguating meaning and must never be a source of chunks. https://claude.ai/code/session_01UTb6tc1wE8UyWxCKvBHmwT --- .../__tests__/chunk-filter.test.ts | 63 +++++++++++++++++++ .../src/modules/translation/chunk-filter.ts | 41 ++++++++++++ server/src/modules/translation/service.ts | 18 +++++- 3 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 server/src/modules/translation/__tests__/chunk-filter.test.ts create mode 100644 server/src/modules/translation/chunk-filter.ts diff --git a/server/src/modules/translation/__tests__/chunk-filter.test.ts b/server/src/modules/translation/__tests__/chunk-filter.test.ts new file mode 100644 index 0000000..12bc4e0 --- /dev/null +++ b/server/src/modules/translation/__tests__/chunk-filter.test.ts @@ -0,0 +1,63 @@ +import { describe, it, expect } from "@jest/globals"; +import { filterChunksToSelection } from "../chunk-filter"; + +describe("filterChunksToSelection", () => { + it("drops chunks taken from the context, keeping only those in the marked text", () => { + // The real-world failing case from the task: the marked text is + // "improvements are included", yet the model also returned "related 
to" and + // "based on", which appear only in the surrounding context. + const phrase = "improvements are included"; + const chunks = [ + { text: "improvements are included", type: "other" }, + { text: "related to", type: "collocation" }, + { text: "based on", type: "collocation" }, + ]; + + expect(filterChunksToSelection(chunks, phrase)).toEqual([ + { text: "improvements are included", type: "other" }, + ]); + }); + + it("keeps a chunk that appears verbatim inside a longer selection", () => { + const phrase = "they finally decided to give up on the project"; + const chunks = [ + { text: "give up on", type: "phrasal_verb" }, + { text: "in the end", type: "discourse_marker" }, // not in the selection + ]; + + expect(filterChunksToSelection(chunks, phrase)).toEqual([ + { text: "give up on", type: "phrasal_verb" }, + ]); + }); + + it("matches case- and whitespace-insensitively", () => { + const phrase = "In Fact, the\nresults were good"; + const chunks = [ + { text: "in fact" }, // different case + { text: "the results" }, // collapsed whitespace vs newline + ]; + + expect(filterChunksToSelection(chunks, phrase)).toEqual([ + { text: "in fact" }, + { text: "the results" }, + ]); + }); + + it("drops chunks with empty or whitespace-only text", () => { + const phrase = "a perfectly normal sentence"; + const chunks = [{ text: "" }, { text: " " }, { text: "normal" }]; + + expect(filterChunksToSelection(chunks, phrase)).toEqual([ + { text: "normal" }, + ]); + }); + + it("returns an empty array when the selection is empty", () => { + expect(filterChunksToSelection([{ text: "anything" }], "")).toEqual([]); + }); + + it("returns an empty array for missing or non-array chunks", () => { + expect(filterChunksToSelection(undefined, "some phrase")).toEqual([]); + expect(filterChunksToSelection(null, "some phrase")).toEqual([]); + }); +}); diff --git a/server/src/modules/translation/chunk-filter.ts b/server/src/modules/translation/chunk-filter.ts new file mode 100644 index 
0000000..9024d4b --- /dev/null +++ b/server/src/modules/translation/chunk-filter.ts @@ -0,0 +1,41 @@ +/** + * Normalise text for a forgiving "does this appear in the selection?" check: + * lower-case, collapse any run of whitespace to a single space, and trim. This + * lets a chunk the model capitalised differently (or that spans a line break) + * still match, while still rejecting patterns that simply are not in the + * selection. + */ +function normaliseForMatch(text: string): string { + return text.toLowerCase().replace(/\s+/g, " ").trim(); +} + +/** + * Keep only the chunks whose `text` actually appears inside the user's marked + * selection (`phrase`). + * + * The detailed-translation and advice models intermittently surface reusable + * patterns drawn from the surrounding `context` rather than the marked text + * itself — e.g. selecting "improvements are included" yet returning "related to" + * and "based on", which only exist in the context paragraph. Chunks must come + * from the marked text only, so this is the deterministic guardrail that + * enforces that contract regardless of what the model returns. + * + * Matching is whitespace- and case-insensitive but otherwise verbatim (no + * punctuation stripping), matching the "appears verbatim inside the selection" + * rule the prompts ask the model to follow. + */ +export function filterChunksToSelection<T extends { text?: unknown }>( + chunks: T[] | undefined | null, + phrase: string +): T[] { + if (!Array.isArray(chunks)) return []; + + const selection = normaliseForMatch(typeof phrase === "string" ? phrase : ""); + if (!selection) return []; + + return chunks.filter((chunk) => { + const text = + typeof chunk?.text === "string" ? 
normaliseForMatch(chunk.text) : ""; + return text.length > 0 && selection.includes(text); + }); +} diff --git a/server/src/modules/translation/service.ts b/server/src/modules/translation/service.ts index 88e5d51..0d1f3e1 100644 --- a/server/src/modules/translation/service.ts +++ b/server/src/modules/translation/service.ts @@ -9,6 +9,7 @@ import { LanguageLearningDataSchema, TranslationAdviceSchema, } from "./schema"; +import { filterChunksToSelection } from "./chunk-filter"; import { TRANSLATION_MODELS } from "../../utils/openrouter-models"; /** @@ -81,8 +82,8 @@ export async function getDetailedTranslation({ Phonetic transliteration: spell out how to pronounce the SOURCE-language "phrase" itself (${sourceLanguage} -> read by a ${targetLanguage} speaker), written using the ${targetLanguage} alphabet. Do NOT transliterate the translation. For long selections (~5 words or more), return an empty string for the top-level transliteration and rely on the per-chunk transliterations instead. - Chunks: inside the user's selection ("phrase"), find the reusable language patterns worth learning (collocations, phrasal verbs, idioms, discourse markers). - Rules: at most one chunk per 5-8 words of the selection, hard ceiling of 2 chunks. Each chunk's "text" must appear verbatim inside the selection. For each chunk, also provide: "transliteration" (how to pronounce that chunk, source language, in the ${targetLanguage} alphabet) and "definition" (a short, self-contained explanation of that chunk's meaning and usage, 1-2 sentences, in ${targetLanguage}). + Chunks: look ONLY inside the user's selection ("phrase") and find the reusable language patterns worth learning there (collocations, phrasal verbs, idioms, discourse markers). The "context" is provided ONLY to disambiguate the selection's meaning — NEVER take a chunk from the context. A pattern is a valid chunk only if its exact words appear inside the selection itself. 
+ Rules: at most one chunk per 5-8 words of the selection, hard ceiling of 2 chunks. Each chunk's "text" must appear verbatim inside the selection (not merely inside the context). For each chunk, also provide: "transliteration" (how to pronounce that chunk, source language, in the ${targetLanguage} alphabet) and "definition" (a short, self-contained explanation of that chunk's meaning and usage, 1-2 sentences, in ${targetLanguage}). Return an empty "chunks" array when the selection is under ~5 words, or when the selection is written in a different language than the target learning language.`; const userPrompt = ` @@ -117,6 +118,11 @@ export async function getDetailedTranslation({ strict: true, }); + // Guardrail: the model occasionally pulls chunks from the surrounding + // context instead of the marked text. Chunks must come from the selection + // only, so drop any whose text does not appear inside the phrase. + result.chunks = filterChunksToSelection(result.chunks, phrase); + // Result is already validated by Zod return result; } catch (error: unknown) { @@ -149,7 +155,7 @@ export async function getTranslationAdvice({ Your MAIN job is to answer the user's questions about this phrase: meaning, grammar, usage, nuance, examples, differences between words, etc. Put your answer in "reply". - Editing the highlighted patterns is a SECONDARY ability. Only return a "chunks" array when the user EXPLICITLY asks to add, remove, or change which patterns are highlighted (e.g. "highlight X", "remove that", "don't include Y"). When you do, each chunk's "text" must appear verbatim in the selection, include its "transliteration" (pronunciation in the ${targetLanguage} alphabet), and you may also add a short "reply" explaining the change. + Editing the highlighted patterns is a SECONDARY ability. Only return a "chunks" array when the user EXPLICITLY asks to add, remove, or change which patterns are highlighted (e.g. "highlight X", "remove that", "don't include Y"). 
When you do, each chunk's "text" must appear verbatim in the selection — NEVER take a chunk from the context — include its "transliteration" (pronunciation in the ${targetLanguage} alphabet), and you may also add a short "reply" explaining the change. If the user is just asking a question (even one that mentions a phrase, like "what about 'on the'?"), answer it in "reply" and DO NOT change the chunks.`; // Maintain the conversation: replay prior turns so the model has context. @@ -180,6 +186,12 @@ export async function getTranslationAdvice({ strict: true, }); + // Same guardrail as the save flow: any chunks the advisor proposes must + // come from the marked text, not the surrounding context. + if (result.chunks) { + result.chunks = filterChunksToSelection(result.chunks, phrase); + } + return result; } catch (error: unknown) { console.error("Translation advice error:", error);