From 387f96a573f83d1ec9bd27f9f89a39bd078fd020 Mon Sep 17 00:00:00 2001
From: egenthon-cmd <egenthon@gladia.io>
Date: Tue, 30 Jun 2026 12:39:56 -0400
Subject: [PATCH 1/2] fix(en): improve normalization after voxpopuli results

Close ref/hyp WER gaps caused by corpus aliases, possessives, article
references, elided "percent", and hundred-scale numbers followed by a
spoken "zero".
---
 docs/steps.md                                 |  5 +-
 .../languages/english/number_normalizer.py    | 31 ++++++++-
 normalization/languages/english/operators.py  | 10 +++
 .../languages/english/replacements.py         |  6 ++
 .../remove_spaces_between_adjacent_digits.py  |  4 +-
 .../text/remove_trailing_apostrophe_space.py  |  9 ++-
 .../english_voxpopuli_normalization_test.py   | 67 +++++++++++++++++++
 7 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 tests/unit/languages/english_voxpopuli_normalization_test.py

diff --git a/docs/steps.md b/docs/steps.md
index 1b6d764..b1a7b7b 100644
--- a/docs/steps.md
+++ b/docs/steps.md
@@ -366,7 +366,10 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234
 
 **Base class:** `TextStep`
 
-Remove space before apostrophe (' s -> 's).
+Remove space before apostrophe (' s -> 's) and orphan possessive s tokens.
+
+After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse
+those back to the base word.
 
 ### `remove_trailing_dot_word_from_emails`
 
diff --git a/normalization/languages/english/number_normalizer.py b/normalization/languages/english/number_normalizer.py
index c6380d5..5f53826 100644
--- a/normalization/languages/english/number_normalizer.py
+++ b/normalization/languages/english/number_normalizer.py
@@ -165,6 +165,13 @@ def process_words(self, words: list[str]) -> Iterator[str]:  # noqa: C901
         prefix: str | None = None
         value: str | int | None = None
         skip = False
+        after_hundred = False
+        compound_after_hundred = False
+
+        def reset_number_phrase_state() -> None:
+            nonlocal after_hundred, compound_after_hundred
+            after_hundred = False
+            compound_after_hundred = False
 
         def to_fraction(s: str | float):
             try:
@@ -179,6 +186,7 @@ def output(result: str | int):
                 result = prefix + result
             value = None
             prefix = None
+            reset_number_phrase_state()
             return result
 
         if len(words) == 0:
@@ -225,11 +233,27 @@ def output(result: str | int):
             elif current_lower not in self.words:
                 if value is not None:
                     yield output(value)
+                reset_number_phrase_state()
                 yield output(current)
             elif current_lower in self.zeros:
-                value = str(value or "") + "0"
+                if (
+                    after_hundred
+                    and compound_after_hundred
+                    and isinstance(value, int)
+                    and value >= 100
+                    and (
+                        next_lower is None
+                        or next_lower not in self.words
+                        or next_lower in self.zeros
+                    )
+                ):
+                    value *= 1000
+                else:
+                    value = str(value or "") + "0"
             elif current_lower in self.ones:
                 ones = self.ones[current_lower]
+                if after_hundred:
+                    compound_after_hundred = True
 
                 if value is None:
                     value = ones
@@ -270,6 +294,8 @@ def output(result: str | int):
                 value = None
             elif current_lower in self.tens:
                 tens = self.tens[current_lower]
+                if after_hundred:
+                    compound_after_hundred = True
                 if value is None:
                     value = tens
                 elif isinstance(value, str):
@@ -292,6 +318,9 @@ def output(result: str | int):
                         yield output(str(value) + str(tens) + suffix)
             elif current_lower in self.multipliers:
                 multiplier = self.multipliers[current_lower]
+                if current_lower == "hundred":
+                    after_hundred = True
+                    compound_after_hundred = False
                 if value is None:
                     value = multiplier
                 elif isinstance(value, str) or value == 0:
diff --git a/normalization/languages/english/operators.py b/normalization/languages/english/operators.py
index 8693a4c..731d3a4 100644
--- a/normalization/languages/english/operators.py
+++ b/normalization/languages/english/operators.py
@@ -183,6 +183,16 @@ def _format_colon_time(match: re.Match) -> str:
         return text
 
     def fix_one_word_in_numeric_contexts(self, text: str) -> str:
+        # Parliamentary references: EU corpus uses both "rule" and "article".
+        text = re.sub(r"\brule (\d+)", r"article \1", text)
+        # Rejoin subsection suffixes split by expand_alphanumeric_codes (142 2 a -> 142 2a).
+        while True:
+            updated = re.sub(r"\b(\d+[a-z]?) (\d) ([a-z])\b", r"\1 \2\3", text)
+            if updated == text:
+                break
+            text = updated
+        # Spoken percentages often drop "percent" before "of" (e.g. "15 of Latvia's").
+        text = re.sub(r"\b(\d+) of (?!\d)", r"\1 percent of ", text)
         text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text)
         text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text)
         text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text)
diff --git a/normalization/languages/english/replacements.py b/normalization/languages/english/replacements.py
index 91bdba5..af38e87 100644
--- a/normalization/languages/english/replacements.py
+++ b/normalization/languages/english/replacements.py
@@ -1743,6 +1743,12 @@
     "yoghourts": "yogurts",
     "yoghurt": "yogurt",
     "yoghurts": "yogurts",
+    # VoxPopuli / parliamentary corpus aliases
+    "pttering": "pottering",
+    "puttering": "pottering",
+    "putttering": "pottering",
+    "puttrich": "pottering",
+    "guantnamo": "guantanamo",
     # contractions in titles/prefixes
     "mr": "mister",
     "mrs": "missus",
diff --git a/normalization/steps/text/remove_spaces_between_adjacent_digits.py b/normalization/steps/text/remove_spaces_between_adjacent_digits.py
index 7de4a4b..09d9093 100644
--- a/normalization/steps/text/remove_spaces_between_adjacent_digits.py
+++ b/normalization/steps/text/remove_spaces_between_adjacent_digits.py
@@ -5,7 +5,9 @@
 from normalization.steps.base import TextStep
 from normalization.steps.registry import register_step
 
-_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z]))")
+_RE_SPACES_BETWEEN_DIGITS = re.compile(
+    r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))"
+)
 
 
 @register_step
diff --git a/normalization/steps/text/remove_trailing_apostrophe_space.py b/normalization/steps/text/remove_trailing_apostrophe_space.py
index 85e49f6..1e5a55e 100644
--- a/normalization/steps/text/remove_trailing_apostrophe_space.py
+++ b/normalization/steps/text/remove_trailing_apostrophe_space.py
@@ -7,9 +7,14 @@
 
 @register_step
 class RemoveTrailingApostropheSpaceStep(TextStep):
-    """Remove space before apostrophe (' s -> 's)."""
+    """Remove space before apostrophe (' s -> 's) and orphan possessive s tokens.
+
+    After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse
+    those back to the base word.
+    """
 
     name = "remove_trailing_apostrophe_space"
 
     def __call__(self, text: str, operators: LanguageOperators) -> str:
-        return re.sub(r"\s+'", "'", text)
+        text = re.sub(r"\s+'", "'", text)
+        return re.sub(r"\b([a-z]{2,}) s\b", r"\1", text)
diff --git a/tests/unit/languages/english_voxpopuli_normalization_test.py b/tests/unit/languages/english_voxpopuli_normalization_test.py
new file mode 100644
index 0000000..6514162
--- /dev/null
+++ b/tests/unit/languages/english_voxpopuli_normalization_test.py
@@ -0,0 +1,67 @@
+import pytest
+
+from normalization.languages.english.number_normalizer import EnglishNumberNormalizer
+from normalization.pipeline.loader import load_pipeline
+
+
+@pytest.fixture
+def pipeline():
+    return load_pipeline("gladia-3", "en")
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("pttering", "pottering"),
+        ("puttering", "pottering"),
+        ("putttering", "pottering"),
+        ("puttrich", "pottering"),
+        ("guantnamo", "guantanamo"),
+    ],
+)
+def test_voxpopuli_word_aliases(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("50%", "50 percent"),
+        (
+            "more than fifteen of latvia population",
+            "more than 15 percent of latvia population",
+        ),
+        ("15 of 20 people", "15 of 20 people"),
+    ],
+)
+def test_percent_of_patterns(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("rule 142 2a 2b", "article 142 2a 2b"),
+        ("article 142 2A 2B", "article 142 2a 2b"),
+    ],
+)
+def test_parliamentary_references(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("latvia's population", "latvia population"),
+        ("latvia s population", "latvia population"),
+    ],
+)
+def test_possessive_cleanup(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+def test_hundred_compound_zero_means_thousands():
+    normalizer = EnglishNumberNormalizer()
+    assert normalizer("three hundred and seventy two zero") == "372000"
+    assert normalizer("five hundred zero") == "5000"
+    assert normalizer("one hundred zero") == "1000"

From 3d97f82150c2ed15b8a40d426b8ef98cea1a447b Mon Sep 17 00:00:00 2001
From: egenthon-cmd <egenthon@gladia.io>
Date: Tue, 30 Jun 2026 14:03:02 -0400
Subject: [PATCH 2/2] fix: feedback from coderabbit

---
 docs/steps.md                                 |  8 +++--
 normalization/languages/english/operators.py  | 16 ++++++++--
 normalization/presets/gladia-3.yaml           |  3 +-
 .../remove_spaces_between_adjacent_digits.py  |  4 +--
 .../text/remove_trailing_apostrophe_space.py  | 19 +++++++++---
 .../english_voxpopuli_normalization_test.py   |  7 +++++
 .../remove_trailing_apostrophe_space_test.py  | 31 +++++++++++++++++++
 7 files changed, 75 insertions(+), 13 deletions(-)
 create mode 100644 tests/unit/steps/text/remove_trailing_apostrophe_space_test.py

diff --git a/docs/steps.md b/docs/steps.md
index b1a7b7b..9afc058 100644
--- a/docs/steps.md
+++ b/docs/steps.md
@@ -366,10 +366,12 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234
 
 **Base class:** `TextStep`
 
-Remove space before apostrophe (' s -> 's) and orphan possessive s tokens.
+Normalize apostrophe possessives before symbol stripping.
 
-After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse
-those back to the base word.
+Runs before remove_symbols so ``'s`` markers are removed while the apostrophe
+is still present. Also collapses orphan ``s`` tokens produced when symbol
+removal splits a possessive (``latvia s``), but not literal ``letter s`` or
+product names like ``model s``.
 
 ### `remove_trailing_dot_word_from_emails`
 
diff --git a/normalization/languages/english/operators.py b/normalization/languages/english/operators.py
index 731d3a4..be7ed08 100644
--- a/normalization/languages/english/operators.py
+++ b/normalization/languages/english/operators.py
@@ -47,6 +47,19 @@
     "fifty": "50",
 }
 
+# Parliamentary / legal citation prefixes: numbers after these are not percentages.
+_REFERENCE_NUMBER_LOOKBEHIND = (
+    "(?<!article )(?<!rule )(?<!section )(?<!chapter )"
+    "(?<!paragraph )(?<!part )(?<!clause )(?<!annex )(?<!appendix )"
+)
+# Spoken percentages often omit "percent" before "of" (e.g. "15 of Latvia
+# population"). What follows "of" only has to be neither a digit nor "the", so
+# "15 of 20", "5 of the members", and "article 142 of ..." are left alone.
+_RE_SPOKEN_PERCENT_OF = re.compile(
+    rf"{_REFERENCE_NUMBER_LOOKBEHIND}\b(\d+) of (?!\d)(?!the\b)",
+    re.IGNORECASE,
+)
+
 ENGLISH_CONFIG = LanguageConfig(
     code="en",
     decimal_separator=".",
@@ -191,8 +204,7 @@ def fix_one_word_in_numeric_contexts(self, text: str) -> str:
             if updated == text:
                 break
             text = updated
-        # Spoken percentages often drop "percent" before "of" (e.g. "15 of Latvia's").
-        text = re.sub(r"\b(\d+) of (?!\d)", r"\1 percent of ", text)
+        text = _RE_SPOKEN_PERCENT_OF.sub(r"\1 percent of ", text)
         text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text)
         text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text)
         text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text)
diff --git a/normalization/presets/gladia-3.yaml b/normalization/presets/gladia-3.yaml
index ca3f6ff..c820532 100644
--- a/normalization/presets/gladia-3.yaml
+++ b/normalization/presets/gladia-3.yaml
@@ -65,6 +65,8 @@ stages:
 
     # --- Phase 6: Casefold and cleanup ---
     - casefold_text
+    # Strip possessive 's before remove_symbols turns it into an orphan "s" token.
+    - remove_trailing_apostrophe_space
     - remove_symbols
     - remove_diacritics
     - remove_standalone_currency_symbols
@@ -88,7 +90,6 @@ stages:
     - fix_version_number_v_prefix
     - restore_decimal_separator_with_word
     - convert_decimal_periods_to_decimal_word
-    - remove_trailing_apostrophe_space
     - remove_non_numeric_trailing_dots
 
     # --- Phase 2: Time and digit collapsing ---
diff --git a/normalization/steps/text/remove_spaces_between_adjacent_digits.py b/normalization/steps/text/remove_spaces_between_adjacent_digits.py
index 09d9093..0ffe683 100644
--- a/normalization/steps/text/remove_spaces_between_adjacent_digits.py
+++ b/normalization/steps/text/remove_spaces_between_adjacent_digits.py
@@ -5,9 +5,7 @@
 from normalization.steps.base import TextStep
 from normalization.steps.registry import register_step
 
-_RE_SPACES_BETWEEN_DIGITS = re.compile(
-    r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))"
-)
+_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))")
 
 
 @register_step
diff --git a/normalization/steps/text/remove_trailing_apostrophe_space.py b/normalization/steps/text/remove_trailing_apostrophe_space.py
index 1e5a55e..2d539a4 100644
--- a/normalization/steps/text/remove_trailing_apostrophe_space.py
+++ b/normalization/steps/text/remove_trailing_apostrophe_space.py
@@ -4,17 +4,28 @@
 from normalization.steps.base import TextStep
 from normalization.steps.registry import register_step
 
+# Orphan "s" left after remove_symbols turns "Latvia's" into "latvia s".
+# Exclude literal letter/model references such as "the letter s" or "model s".
+_RE_ORPHAN_POSSESSIVE_S = re.compile(
+    r"\b(?!letter s\b)(?!model s\b)([a-z]{3,}) s\b",
+    re.IGNORECASE,
+)
+_RE_WORD_POSSESSIVE_S = re.compile(r"\b(\w+)'s\b", re.IGNORECASE)
+
 
 @register_step
 class RemoveTrailingApostropheSpaceStep(TextStep):
-    """Remove space before apostrophe (' s -> 's) and orphan possessive s tokens.
+    """Normalize apostrophe possessives before symbol stripping.
 
-    After remove_symbols, possessives like "Latvia's" become "latvia s"; collapse
-    those back to the base word.
+    Runs before remove_symbols so ``'s`` markers are removed while the apostrophe
+    is still present. Also collapses orphan ``s`` tokens produced when symbol
+    removal splits a possessive (``latvia s``), but not literal ``letter s`` or
+    product names like ``model s``.
     """
 
     name = "remove_trailing_apostrophe_space"
 
     def __call__(self, text: str, operators: LanguageOperators) -> str:
         text = re.sub(r"\s+'", "'", text)
-        return re.sub(r"\b([a-z]{2,}) s\b", r"\1", text)
+        text = _RE_WORD_POSSESSIVE_S.sub(r"\1", text)
+        return _RE_ORPHAN_POSSESSIVE_S.sub(r"\1", text)
diff --git a/tests/unit/languages/english_voxpopuli_normalization_test.py b/tests/unit/languages/english_voxpopuli_normalization_test.py
index 6514162..e3d2be5 100644
--- a/tests/unit/languages/english_voxpopuli_normalization_test.py
+++ b/tests/unit/languages/english_voxpopuli_normalization_test.py
@@ -31,7 +31,14 @@ def test_voxpopuli_word_aliases(pipeline, raw, expected):
             "more than fifteen of latvia population",
             "more than 15 percent of latvia population",
         ),
+        (
+            "fifteen of latvia s population",
+            "15 percent of latvia population",
+        ),
         ("15 of 20 people", "15 of 20 people"),
+        ("5 of the members", "5 of the members"),
+        ("rule 142 of the agenda", "article 142 of the agenda"),
+        ("article 142 of chapter 3", "article 142 of chapter 3"),
     ],
 )
 def test_percent_of_patterns(pipeline, raw, expected):
diff --git a/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py
new file mode 100644
index 0000000..319d90d
--- /dev/null
+++ b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py
@@ -0,0 +1,31 @@
+import pytest
+
+from normalization.languages.english import EnglishOperators
+from normalization.steps.text.remove_trailing_apostrophe_space import (
+    RemoveTrailingApostropheSpaceStep,
+)
+
+
+@pytest.fixture
+def step():
+    return RemoveTrailingApostropheSpaceStep()
+
+
+@pytest.fixture
+def operators():
+    return EnglishOperators()
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("latvia's population", "latvia population"),
+        ("latvia 's population", "latvia population"),
+        ("latvia s population", "latvia population"),
+        ("the letter s", "the letter s"),
+        ("model s car", "model s car"),
+        ("tesla model s", "tesla model s"),
+    ],
+)
+def test_remove_trailing_apostrophe_space(step, operators, raw, expected):
+    assert step(raw, operators) == expected