From 387f96a573f83d1ec9bd27f9f89a39bd078fd020 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Tue, 30 Jun 2026 12:39:56 -0400 Subject: [PATCH 1/2] fix(en): improve normalization after voxpopuli results Align ref/hyp WER gaps for corpus aliases, possessives, article refs, elided percentages, and hundred-scale numbers misheard as trailing zero. --- docs/steps.md | 5 +- .../languages/english/number_normalizer.py | 31 ++++++++- normalization/languages/english/operators.py | 10 +++ .../languages/english/replacements.py | 6 ++ .../remove_spaces_between_adjacent_digits.py | 4 +- .../text/remove_trailing_apostrophe_space.py | 9 ++- .../english_voxpopuli_normalization_test.py | 67 +++++++++++++++++++ 7 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 tests/unit/languages/english_voxpopuli_normalization_test.py diff --git a/docs/steps.md b/docs/steps.md index 1b6d764..b1a7b7b 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -366,7 +366,10 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234 **Base class:** `TextStep` -Remove space before apostrophe (' s -> 's). +Remove space before apostrophe (' s -> 's) and orphan possessive s tokens. + +After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse +those back to the base word. ### `remove_trailing_dot_word_from_emails` diff --git a/normalization/languages/english/number_normalizer.py b/normalization/languages/english/number_normalizer.py index c6380d5..5f53826 100644 --- a/normalization/languages/english/number_normalizer.py +++ b/normalization/languages/english/number_normalizer.py @@ -165,6 +165,13 @@ def process_words(self, words: list[str]) -> Iterator[str]: # noqa: C901 prefix: str | None = None value: str | int | None = None skip = False + after_hundred = False + compound_after_hundred = False + + def reset_number_phrase_state() -> None: + nonlocal after_hundred, compound_after_hundred + after_hundred = False + compound_after_hundred = False def to_fraction(s: str | float): try: @@ -179,6 +186,7 @@ def output(result: str | int): result = prefix + result value = None prefix = None + reset_number_phrase_state() return result if len(words) == 0: @@ -225,11 +233,27 @@ def output(result: str | int): elif current_lower not in self.words: if value is not None: yield output(value) + reset_number_phrase_state() yield output(current) elif current_lower in self.zeros: - value = str(value or "") + "0" + if ( + after_hundred + and compound_after_hundred + and isinstance(value, int) + and value >= 100 + and ( + next_lower is None + or next_lower not in self.words + or next_lower in self.zeros + ) + ): + value *= 1000 + else: + value = str(value or "") + "0" elif current_lower in self.ones: ones = self.ones[current_lower] + if after_hundred: + compound_after_hundred = True if value is None: value = ones @@ -270,6 +294,8 @@ def output(result: str | int): value = None elif current_lower in self.tens: tens = self.tens[current_lower] + if after_hundred: + compound_after_hundred = True if value is None: value = tens elif isinstance(value, str): @@ -292,6 +318,9 @@ def output(result: str | int): yield output(str(value) + str(tens) + suffix) elif current_lower in self.multipliers: multiplier = self.multipliers[current_lower] + if current_lower == "hundred": + after_hundred = True + compound_after_hundred = False if value is None: value = multiplier elif isinstance(value, str) or value == 0: diff --git a/normalization/languages/english/operators.py b/normalization/languages/english/operators.py index 8693a4c..731d3a4 100644 --- a/normalization/languages/english/operators.py +++ b/normalization/languages/english/operators.py @@ -183,6 +183,16 @@ def _format_colon_time(match: re.Match) -> str: return text def fix_one_word_in_numeric_contexts(self, text: str) -> str: + # Parliamentary references: EU corpus uses both "rule" and "article". + text = re.sub(r"\brule (\d+)", r"article \1", text) + # Rejoin subsection suffixes split by expand_alphanumeric_codes (142 2 a -> 142 2a). + while True: + updated = re.sub(r"\b(\d+[a-z]?) (\d) ([a-z])\b", r"\1 \2\3", text) + if updated == text: + break + text = updated + # Spoken percentages often drop "percent" before "of" (e.g. "15 of Latvia's"). + text = re.sub(r"\b(\d+) of (?!\d)", r"\1 percent of ", text) text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text) text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text) text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text) diff --git a/normalization/languages/english/replacements.py b/normalization/languages/english/replacements.py index 91bdba5..af38e87 100644 --- a/normalization/languages/english/replacements.py +++ b/normalization/languages/english/replacements.py @@ -1743,6 +1743,12 @@ "yoghourts": "yogurts", "yoghurt": "yogurt", "yoghurts": "yogurts", + # VoxPopuli / parliamentary corpus aliases + "pttering": "pottering", + "puttering": "pottering", + "putttering": "pottering", + "puttrich": "pottering", + "guantnamo": "guantanamo", # contractions in titles/prefixes "mr": "mister", "mrs": "missus", diff --git a/normalization/steps/text/remove_spaces_between_adjacent_digits.py b/normalization/steps/text/remove_spaces_between_adjacent_digits.py index 7de4a4b..09d9093 100644 --- a/normalization/steps/text/remove_spaces_between_adjacent_digits.py +++ b/normalization/steps/text/remove_spaces_between_adjacent_digits.py @@ -5,7 +5,9 @@ from normalization.steps.base import TextStep from normalization.steps.registry import register_step -_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z]))") +_RE_SPACES_BETWEEN_DIGITS = re.compile( + r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))" +) @register_step diff --git a/normalization/steps/text/remove_trailing_apostrophe_space.py b/normalization/steps/text/remove_trailing_apostrophe_space.py index 85e49f6..1e5a55e 100644 --- a/normalization/steps/text/remove_trailing_apostrophe_space.py +++ b/normalization/steps/text/remove_trailing_apostrophe_space.py @@ -7,9 +7,14 @@ @register_step class RemoveTrailingApostropheSpaceStep(TextStep): - """Remove space before apostrophe (' s -> 's).""" + """Remove space before apostrophe (' s -> 's) and orphan possessive s tokens. + + After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse + those back to the base word. + """ name = "remove_trailing_apostrophe_space" def __call__(self, text: str, operators: LanguageOperators) -> str: - return re.sub(r"\s+'", "'", text) + text = re.sub(r"\s+'", "'", text) + return re.sub(r"\b([a-z]{2,}) s\b", r"\1", text) diff --git a/tests/unit/languages/english_voxpopuli_normalization_test.py b/tests/unit/languages/english_voxpopuli_normalization_test.py new file mode 100644 index 0000000..6514162 --- /dev/null +++ b/tests/unit/languages/english_voxpopuli_normalization_test.py @@ -0,0 +1,67 @@ +import pytest + +from normalization.languages.english.number_normalizer import EnglishNumberNormalizer +from normalization.pipeline.loader import load_pipeline + + +@pytest.fixture +def pipeline(): + return load_pipeline("gladia-3", "en") + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("pttering", "pottering"), + ("puttering", "pottering"), + ("putttering", "pottering"), + ("puttrich", "pottering"), + ("guantnamo", "guantanamo"), + ], +) +def test_voxpopuli_word_aliases(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("50%", "50 percent"), + ( + "more than fifteen of latvia population", + "more than 15 percent of latvia population", + ), + ("15 of 20 people", "15 of 20 people"), + ], +) +def test_percent_of_patterns(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("rule 142 2a 2b", "article 142 2a 2b"), + ("article 142 2A 2B", "article 142 2a 2b"), + ], +) +def test_parliamentary_references(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("latvia's population", "latvia population"), + ("latvia s population", "latvia population"), + ], +) +def test_possessive_cleanup(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +def test_hundred_compound_zero_means_thousands(): + normalizer = EnglishNumberNormalizer() + assert normalizer("three hundred and seventy two zero") == "372000" + assert normalizer("five hundred zero") == "5000" + assert normalizer("one hundred zero") == "1000" From 3d97f82150c2ed15b8a40d426b8ef98cea1a447b Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Tue, 30 Jun 2026 14:03:02 -0400 Subject: [PATCH 2/2] fix: feedback from coderabbit --- docs/steps.md | 8 +++-- normalization/languages/english/operators.py | 16 ++++++++-- normalization/presets/gladia-3.yaml | 3 +- .../remove_spaces_between_adjacent_digits.py | 4 +-- .../text/remove_trailing_apostrophe_space.py | 19 +++++++++--- .../english_voxpopuli_normalization_test.py | 7 +++++ .../remove_trailing_apostrophe_space_test.py | 31 +++++++++++++++++++ 7 files changed, 75 insertions(+), 13 deletions(-) create mode 100644 tests/unit/steps/text/remove_trailing_apostrophe_space_test.py diff --git a/docs/steps.md b/docs/steps.md index b1a7b7b..9afc058 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -366,10 +366,12 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234 **Base class:** `TextStep` -Remove space before apostrophe (' s -> 's) and orphan possessive s tokens. +Normalize apostrophe possessives before symbol stripping. -After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse -those back to the base word. +Runs before remove_symbols so ``'s`` markers are removed while the apostrophe +is still present. Also collapses orphan ``s`` tokens produced when symbol +removal splits a possessive (``latvia s``), but not literal ``letter s`` or +product names like ``model s``. ### `remove_trailing_dot_word_from_emails` diff --git a/normalization/languages/english/operators.py b/normalization/languages/english/operators.py index 731d3a4..be7ed08 100644 --- a/normalization/languages/english/operators.py +++ b/normalization/languages/english/operators.py @@ -47,6 +47,19 @@ "fifty": "50", } +# Parliamentary / legal citation prefixes: numbers after these are not percentages. +_REFERENCE_NUMBER_LOOKBEHIND = ( + "(? str: if updated == text: break text = updated - # Spoken percentages often drop "percent" before "of" (e.g. "15 of Latvia's"). - text = re.sub(r"\b(\d+) of (?!\d)", r"\1 percent of ", text) + text = _RE_SPOKEN_PERCENT_OF.sub(r"\1 percent of ", text) text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text) text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text) text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text) diff --git a/normalization/presets/gladia-3.yaml b/normalization/presets/gladia-3.yaml index ca3f6ff..c820532 100644 --- a/normalization/presets/gladia-3.yaml +++ b/normalization/presets/gladia-3.yaml @@ -65,6 +65,8 @@ stages: # --- Phase 6: Casefold and cleanup --- - casefold_text + # Strip possessive 's before remove_symbols turns it into an orphan "s" token. + - remove_trailing_apostrophe_space - remove_symbols - remove_diacritics - remove_standalone_currency_symbols @@ -88,7 +90,6 @@ stages: - fix_version_number_v_prefix - restore_decimal_separator_with_word - convert_decimal_periods_to_decimal_word - - remove_trailing_apostrophe_space - remove_non_numeric_trailing_dots # --- Phase 2: Time and digit collapsing --- diff --git a/normalization/steps/text/remove_spaces_between_adjacent_digits.py b/normalization/steps/text/remove_spaces_between_adjacent_digits.py index 09d9093..0ffe683 100644 --- a/normalization/steps/text/remove_spaces_between_adjacent_digits.py +++ b/normalization/steps/text/remove_spaces_between_adjacent_digits.py @@ -5,9 +5,7 @@ from normalization.steps.base import TextStep from normalization.steps.registry import register_step -_RE_SPACES_BETWEEN_DIGITS = re.compile( - r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))" -) +_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))") @register_step diff --git a/normalization/steps/text/remove_trailing_apostrophe_space.py b/normalization/steps/text/remove_trailing_apostrophe_space.py index 1e5a55e..2d539a4 100644 --- a/normalization/steps/text/remove_trailing_apostrophe_space.py +++ b/normalization/steps/text/remove_trailing_apostrophe_space.py @@ -4,17 +4,28 @@ from normalization.steps.base import TextStep from normalization.steps.registry import register_step +# Orphan "s" left after remove_symbols turns "Latvia's" into "latvia s". +# Exclude literal letter/model references such as "the letter s" or "model s". +_RE_ORPHAN_POSSESSIVE_S = re.compile( + r"\b(?!letter s\b)(?!model s\b)([a-z]{3,}) s\b", + re.IGNORECASE, +) +_RE_WORD_POSSESSIVE_S = re.compile(r"\b(\w+)'s\b", re.IGNORECASE) + @register_step class RemoveTrailingApostropheSpaceStep(TextStep): - """Remove space before apostrophe (' s -> 's) and orphan possessive s tokens. + """Normalize apostrophe possessives before symbol stripping. - After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse - those back to the base word. + Runs before remove_symbols so ``'s`` markers are removed while the apostrophe + is still present. Also collapses orphan ``s`` tokens produced when symbol + removal splits a possessive (``latvia s``), but not literal ``letter s`` or + product names like ``model s``. """ name = "remove_trailing_apostrophe_space" def __call__(self, text: str, operators: LanguageOperators) -> str: text = re.sub(r"\s+'", "'", text) - return re.sub(r"\b([a-z]{2,}) s\b", r"\1", text) + text = _RE_WORD_POSSESSIVE_S.sub(r"\1", text) + return _RE_ORPHAN_POSSESSIVE_S.sub(r"\1", text) diff --git a/tests/unit/languages/english_voxpopuli_normalization_test.py b/tests/unit/languages/english_voxpopuli_normalization_test.py index 6514162..e3d2be5 100644 --- a/tests/unit/languages/english_voxpopuli_normalization_test.py +++ b/tests/unit/languages/english_voxpopuli_normalization_test.py @@ -31,7 +31,14 @@ def test_voxpopuli_word_aliases(pipeline, raw, expected): "more than fifteen of latvia population", "more than 15 percent of latvia population", ), + ( + "fifteen of latvia s population", + "15 percent of latvia population", + ), ("15 of 20 people", "15 of 20 people"), + ("5 of the members", "5 of the members"), + ("rule 142 of the agenda", "article 142 of the agenda"), + ("article 142 of chapter 3", "article 142 of chapter 3"), ], ) def test_percent_of_patterns(pipeline, raw, expected): diff --git a/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py new file mode 100644 index 0000000..319d90d --- /dev/null +++ b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py @@ -0,0 +1,31 @@ +import pytest + +from normalization.languages.english import EnglishOperators +from normalization.steps.text.remove_trailing_apostrophe_space import ( + RemoveTrailingApostropheSpaceStep, +) + + +@pytest.fixture +def step(): + return RemoveTrailingApostropheSpaceStep() + + +@pytest.fixture +def operators(): + return EnglishOperators() + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("latvia's population", "latvia population"), + ("latvia 's population", "latvia population"), + ("latvia s population", "latvia population"), + ("the letter s", "the letter s"), + ("model s car", "model s car"), + ("tesla model s", "tesla model s"), + ], +) +def test_remove_trailing_apostrophe_space(step, operators, raw, expected): + assert step(raw, operators) == expected