gladiaio · egenthon-cmd · Jun 30, 2026 · Jun 30, 2026
diff --git a/docs/steps.md b/docs/steps.md
@@ -366,7 +366,12 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234
 
 **Base class:** `TextStep`
 
-Remove space before apostrophe (' s -> 's).
+Normalize apostrophe possessives before symbol stripping.
+
+Closes up whitespace before an apostrophe (``latvia 's``), then strips ``'s``
+while the apostrophe is still present (the step runs before remove_symbols).
+Also drops an orphan ``s`` after a word of three or more letters
+(``latvia s``); ``letter s`` and ``model s`` are kept as-is.
 
 ### `remove_trailing_dot_word_from_emails`
 

diff --git a/normalization/languages/english/number_normalizer.py b/normalization/languages/english/number_normalizer.py
@@ -165,6 +165,13 @@ def process_words(self, words: list[str]) -> Iterator[str]:  # noqa: C901
         prefix: str | None = None
         value: str | int | None = None
         skip = False
+        after_hundred = False
+        compound_after_hundred = False
+
+        def reset_number_phrase_state() -> None:
+            nonlocal after_hundred, compound_after_hundred
+            after_hundred = False
+            compound_after_hundred = False
 
         def to_fraction(s: str | float):
             try:
@@ -179,6 +186,7 @@ def output(result: str | int):
                 result = prefix + result
             value = None
             prefix = None
+            reset_number_phrase_state()
             return result
 
         if len(words) == 0:
@@ -225,11 +233,27 @@ def output(result: str | int):
             elif current_lower not in self.words:
                 if value is not None:
                     yield output(value)
+                reset_number_phrase_state()
                 yield output(current)
             elif current_lower in self.zeros:
-                value = str(value or "") + "0"
+                if (
+                    after_hundred
+                    and compound_after_hundred
+                    and isinstance(value, int)
+                    and value >= 100
+                    and (
+                        next_lower is None
+                        or next_lower not in self.words
+                        or next_lower in self.zeros
+                    )
+                ):
+                    value *= 1000
+                else:
+                    value = str(value or "") + "0"
             elif current_lower in self.ones:
                 ones = self.ones[current_lower]
+                if after_hundred:
+                    compound_after_hundred = True
 
                 if value is None:
                     value = ones
@@ -270,6 +294,8 @@ def output(result: str | int):
                 value = None
             elif current_lower in self.tens:
                 tens = self.tens[current_lower]
+                if after_hundred:
+                    compound_after_hundred = True
                 if value is None:
                     value = tens
                 elif isinstance(value, str):
@@ -292,6 +318,9 @@ def output(result: str | int):
                         yield output(str(value) + str(tens) + suffix)
             elif current_lower in self.multipliers:
                 multiplier = self.multipliers[current_lower]
+                if current_lower == "hundred":
+                    after_hundred = True
+                    compound_after_hundred = False
                 if value is None:
                     value = multiplier
                 elif isinstance(value, str) or value == 0:

diff --git a/normalization/languages/english/operators.py b/normalization/languages/english/operators.py
@@ -47,6 +47,19 @@
     "fifty": "50",
 }
 
+# Parliamentary / legal citation prefixes: numbers after these are not percentages.
+_REFERENCE_NUMBER_LOOKBEHIND = (
+    "(?<!article )(?<!rule )(?<!section )(?<!chapter )"
+    "(?<!paragraph )(?<!part )(?<!clause )(?<!annex )(?<!appendix )"
+)
+# Spoken percentages often omit "percent" before "of" with a proper-noun object
+# (e.g. "15 of Latvia population"), but not for ratios ("15 of 20"),
+# partitives ("5 of the members"), or citation tails ("article 142 of the agenda").
+_RE_SPOKEN_PERCENT_OF = re.compile(
+    rf"{_REFERENCE_NUMBER_LOOKBEHIND}\b(\d+) of (?!\d)(?!the\b)",
+    re.IGNORECASE,
+)
+
 ENGLISH_CONFIG = LanguageConfig(
     code="en",
     decimal_separator=".",
@@ -183,6 +196,15 @@ def _format_colon_time(match: re.Match) -> str:
         return text
 
     def fix_one_word_in_numeric_contexts(self, text: str) -> str:
+        # Parliamentary references: EU corpus uses both "rule" and "article".
+        text = re.sub(r"\brule (\d+)", r"article \1", text)
+        # Rejoin subsection suffixes split by expand_alphanumeric_codes (142 2 a -> 142 2a).
+        while True:
+            updated = re.sub(r"\b(\d+[a-z]?) (\d) ([a-z])\b", r"\1 \2\3", text)
+            if updated == text:
+                break
+            text = updated
+        text = _RE_SPOKEN_PERCENT_OF.sub(r"\1 percent of ", text)
         text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text)
         text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text)
         text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text)

diff --git a/normalization/languages/english/replacements.py b/normalization/languages/english/replacements.py
@@ -1743,6 +1743,12 @@
     "yoghourts": "yogurts",
     "yoghurt": "yogurt",
     "yoghurts": "yogurts",
+    # VoxPopuli / parliamentary corpus aliases
+    "pttering": "pottering",
+    "puttering": "pottering",
+    "putttering": "pottering",
+    "puttrich": "pottering",
+    "guantnamo": "guantanamo",
     # contractions in titles/prefixes
     "mr": "mister",
     "mrs": "missus",

diff --git a/normalization/presets/gladia-3.yaml b/normalization/presets/gladia-3.yaml
@@ -65,6 +65,8 @@ stages:
 
     # --- Phase 6: Casefold and cleanup ---
     - casefold_text
+    # Strip possessive 's before remove_symbols turns it into an orphan "s" token.
+    - remove_trailing_apostrophe_space
     - remove_symbols
     - remove_diacritics
     - remove_standalone_currency_symbols
@@ -88,7 +90,6 @@ stages:
     - fix_version_number_v_prefix
     - restore_decimal_separator_with_word
     - convert_decimal_periods_to_decimal_word
-    - remove_trailing_apostrophe_space
     - remove_non_numeric_trailing_dots
 
     # --- Phase 2: Time and digit collapsing ---

diff --git a/normalization/steps/text/remove_spaces_between_adjacent_digits.py b/normalization/steps/text/remove_spaces_between_adjacent_digits.py
@@ -5,7 +5,10 @@
 from normalization.steps.base import TextStep
 from normalization.steps.registry import register_step
 
-_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z]))")
+# The lookahead must consume the whole following digit run: without the \d in
+# (?![\da-z]) the engine backtracks \d+ to a shorter prefix, where both guards
+# pass trivially, and the space in "142 22a" or "142 22 a" still matches.
+_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![\da-z])(?!\s+[a-z]\b))")
 
 
 @register_step

diff --git a/normalization/steps/text/remove_trailing_apostrophe_space.py b/normalization/steps/text/remove_trailing_apostrophe_space.py
@@ -4,12 +4,28 @@
 from normalization.steps.base import TextStep
 from normalization.steps.registry import register_step
 
+# Orphan "s" left after remove_symbols turns "Latvia's" into "latvia s".
+# Exclude literal letter/model references such as "the letter s" or "model s".
+_RE_ORPHAN_POSSESSIVE_S = re.compile(
+    r"\b(?!letter s\b)(?!model s\b)([a-z]{3,}) s\b",
+    re.IGNORECASE,
+)
+_RE_WORD_POSSESSIVE_S = re.compile(r"\b(\w+)'s\b", re.IGNORECASE)
+
 
 @register_step
 class RemoveTrailingApostropheSpaceStep(TextStep):
-    """Remove space before apostrophe (' s -> 's)."""
+    """Normalize apostrophe possessives before symbol stripping.
+
+    Closes up whitespace before an apostrophe (``latvia 's``), then strips ``'s``
+    while the apostrophe is still present (the step runs before remove_symbols).
+    Also drops an orphan ``s`` after a word of three or more letters
+    (``latvia s``); ``letter s`` and ``model s`` are kept as-is.
+    """
 
     name = "remove_trailing_apostrophe_space"
 
     def __call__(self, text: str, operators: LanguageOperators) -> str:
-        return re.sub(r"\s+'", "'", text)
+        text = re.sub(r"\s+'", "'", text)
+        text = _RE_WORD_POSSESSIVE_S.sub(r"\1", text)
+        return _RE_ORPHAN_POSSESSIVE_S.sub(r"\1", text)
diff --git a/tests/unit/languages/english_voxpopuli_normalization_test.py b/tests/unit/languages/english_voxpopuli_normalization_test.py
@@ -0,0 +1,74 @@
+import pytest
+
+from normalization.languages.english.number_normalizer import EnglishNumberNormalizer
+from normalization.pipeline.loader import load_pipeline
+
+
+@pytest.fixture
+def pipeline():
+    return load_pipeline("gladia-3", "en")
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("pttering", "pottering"),
+        ("puttering", "pottering"),
+        ("putttering", "pottering"),
+        ("puttrich", "pottering"),
+        ("guantnamo", "guantanamo"),
+    ],
+)
+def test_voxpopuli_word_aliases(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("50%", "50 percent"),
+        (
+            "more than fifteen of latvia population",
+            "more than 15 percent of latvia population",
+        ),
+        (
+            "fifteen of latvia s population",
+            "15 percent of latvia population",
+        ),
+        ("15 of 20 people", "15 of 20 people"),
+        ("5 of the members", "5 of the members"),
+        ("rule 142 of the agenda", "article 142 of the agenda"),
+        ("article 142 of chapter 3", "article 142 of chapter 3"),
+    ],
+)
+def test_percent_of_patterns(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("rule 142 2a 2b", "article 142 2a 2b"),
+        ("article 142 2A 2B", "article 142 2a 2b"),
+    ],
+)
+def test_parliamentary_references(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("latvia's population", "latvia population"),
+        ("latvia s population", "latvia population"),
+    ],
+)
+def test_possessive_cleanup(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+def test_hundred_compound_zero_means_thousands():
+    normalizer = EnglishNumberNormalizer()
+    assert normalizer("three hundred and seventy two zero") == "372000"
+    assert normalizer("five hundred zero") == "5000"
+    assert normalizer("one hundred zero") == "1000"
diff --git a/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py
@@ -0,0 +1,31 @@
+import pytest
+
+from normalization.languages.english import EnglishOperators
+from normalization.steps.text.remove_trailing_apostrophe_space import (
+    RemoveTrailingApostropheSpaceStep,
+)
+
+
+@pytest.fixture
+def step():
+    return RemoveTrailingApostropheSpaceStep()
+
+
+@pytest.fixture
+def operators():
+    return EnglishOperators()
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("latvia's population", "latvia population"),
+        ("latvia 's population", "latvia population"),
+        ("latvia s population", "latvia population"),
+        ("the letter s", "the letter s"),
+        ("model s car", "model s car"),
+        ("tesla model s", "tesla model s"),
+    ],
+)
+def test_remove_trailing_apostrophe_space(step, operators, raw, expected):
+    assert step(raw, operators) == expected