diff --git a/docs/steps.md b/docs/steps.md index 1b6d764..9afc058 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -366,7 +366,12 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234 **Base class:** `TextStep` -Remove space before apostrophe (' s -> 's). +Normalize apostrophe possessives before symbol stripping. + +Runs before remove_symbols so ``'s`` markers are removed while the apostrophe +is still present. Also collapses orphan ``s`` tokens produced when symbol +removal splits a possessive (``latvia s``), but not literal ``letter s`` or +product names like ``model s``. ### `remove_trailing_dot_word_from_emails` diff --git a/normalization/languages/english/number_normalizer.py b/normalization/languages/english/number_normalizer.py index c6380d5..5f53826 100644 --- a/normalization/languages/english/number_normalizer.py +++ b/normalization/languages/english/number_normalizer.py @@ -165,6 +165,13 @@ def process_words(self, words: list[str]) -> Iterator[str]: # noqa: C901 prefix: str | None = None value: str | int | None = None skip = False + after_hundred = False + compound_after_hundred = False + + def reset_number_phrase_state() -> None: + nonlocal after_hundred, compound_after_hundred + after_hundred = False + compound_after_hundred = False def to_fraction(s: str | float): try: @@ -179,6 +186,7 @@ def output(result: str | int): result = prefix + result value = None prefix = None + reset_number_phrase_state() return result if len(words) == 0: @@ -225,11 +233,27 @@ def output(result: str | int): elif current_lower not in self.words: if value is not None: yield output(value) + reset_number_phrase_state() yield output(current) elif current_lower in self.zeros: - value = str(value or "") + "0" + if ( + after_hundred + and compound_after_hundred + and isinstance(value, int) + and value >= 100 + and ( + next_lower is None + or next_lower not in self.words + or next_lower in self.zeros + ) + ): + value *= 1000 + else: + value = str(value or "") + "0" elif current_lower in self.ones: ones = self.ones[current_lower] + if after_hundred: + compound_after_hundred = True if value is None: value = ones @@ -270,6 +294,8 @@ def output(result: str | int): value = None elif current_lower in self.tens: tens = self.tens[current_lower] + if after_hundred: + compound_after_hundred = True if value is None: value = tens elif isinstance(value, str): @@ -292,6 +318,9 @@ def output(result: str | int): yield output(str(value) + str(tens) + suffix) elif current_lower in self.multipliers: multiplier = self.multipliers[current_lower] + if current_lower == "hundred": + after_hundred = True + compound_after_hundred = False if value is None: value = multiplier elif isinstance(value, str) or value == 0: diff --git a/normalization/languages/english/operators.py b/normalization/languages/english/operators.py index 8693a4c..be7ed08 100644 --- a/normalization/languages/english/operators.py +++ b/normalization/languages/english/operators.py @@ -47,6 +47,19 @@ "fifty": "50", } +# Parliamentary / legal citation prefixes: numbers after these are not percentages. +_REFERENCE_NUMBER_LOOKBEHIND = ( + "(? str: return text def fix_one_word_in_numeric_contexts(self, text: str) -> str: + # Parliamentary references: EU corpus uses both "rule" and "article". + text = re.sub(r"\brule (\d+)", r"article \1", text) + # Rejoin subsection suffixes split by expand_alphanumeric_codes (142 2 a -> 142 2a). + while True: + updated = re.sub(r"\b(\d+[a-z]?) (\d) ([a-z])\b", r"\1 \2\3", text) + if updated == text: + break + text = updated + text = _RE_SPOKEN_PERCENT_OF.sub(r"\1 percent of ", text) text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text) text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text) text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text) diff --git a/normalization/languages/english/replacements.py b/normalization/languages/english/replacements.py index 91bdba5..af38e87 100644 --- a/normalization/languages/english/replacements.py +++ b/normalization/languages/english/replacements.py @@ -1743,6 +1743,12 @@ "yoghourts": "yogurts", "yoghurt": "yogurt", "yoghurts": "yogurts", + # VoxPopuli / parliamentary corpus aliases + "pttering": "pottering", + "puttering": "pottering", + "putttering": "pottering", + "puttrich": "pottering", + "guantnamo": "guantanamo", # contractions in titles/prefixes "mr": "mister", "mrs": "missus", diff --git a/normalization/presets/gladia-3.yaml b/normalization/presets/gladia-3.yaml index ca3f6ff..c820532 100644 --- a/normalization/presets/gladia-3.yaml +++ b/normalization/presets/gladia-3.yaml @@ -65,6 +65,8 @@ stages: # --- Phase 6: Casefold and cleanup --- - casefold_text + # Strip possessive 's before remove_symbols turns it into an orphan "s" token. + - remove_trailing_apostrophe_space - remove_symbols - remove_diacritics - remove_standalone_currency_symbols @@ -88,7 +90,6 @@ stages: - fix_version_number_v_prefix - restore_decimal_separator_with_word - convert_decimal_periods_to_decimal_word - - remove_trailing_apostrophe_space - remove_non_numeric_trailing_dots # --- Phase 2: Time and digit collapsing --- diff --git a/normalization/steps/text/remove_spaces_between_adjacent_digits.py b/normalization/steps/text/remove_spaces_between_adjacent_digits.py index 7de4a4b..0ffe683 100644 --- a/normalization/steps/text/remove_spaces_between_adjacent_digits.py +++ b/normalization/steps/text/remove_spaces_between_adjacent_digits.py @@ -5,7 +5,7 @@ from normalization.steps.base import TextStep from normalization.steps.registry import register_step -_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z]))") +_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))") @register_step diff --git a/normalization/steps/text/remove_trailing_apostrophe_space.py b/normalization/steps/text/remove_trailing_apostrophe_space.py index 85e49f6..2d539a4 100644 --- a/normalization/steps/text/remove_trailing_apostrophe_space.py +++ b/normalization/steps/text/remove_trailing_apostrophe_space.py @@ -4,12 +4,28 @@ from normalization.steps.base import TextStep from normalization.steps.registry import register_step +# Orphan "s" left after remove_symbols turns "Latvia's" into "latvia s". +# Exclude literal letter/model references such as "the letter s" or "model s". +_RE_ORPHAN_POSSESSIVE_S = re.compile( + r"\b(?!letter s\b)(?!model s\b)([a-z]{3,}) s\b", + re.IGNORECASE, +) +_RE_WORD_POSSESSIVE_S = re.compile(r"\b(\w+)'s\b", re.IGNORECASE) + @register_step class RemoveTrailingApostropheSpaceStep(TextStep): - """Remove space before apostrophe (' s -> 's).""" + """Normalize apostrophe possessives before symbol stripping. + + Runs before remove_symbols so ``'s`` markers are removed while the apostrophe + is still present. Also collapses orphan ``s`` tokens produced when symbol + removal splits a possessive (``latvia s``), but not literal ``letter s`` or + product names like ``model s``. + """ name = "remove_trailing_apostrophe_space" def __call__(self, text: str, operators: LanguageOperators) -> str: - return re.sub(r"\s+'", "'", text) + text = re.sub(r"\s+'", "'", text) + text = _RE_WORD_POSSESSIVE_S.sub(r"\1", text) + return _RE_ORPHAN_POSSESSIVE_S.sub(r"\1", text) diff --git a/tests/unit/languages/english_voxpopuli_normalization_test.py b/tests/unit/languages/english_voxpopuli_normalization_test.py new file mode 100644 index 0000000..e3d2be5 --- /dev/null +++ b/tests/unit/languages/english_voxpopuli_normalization_test.py @@ -0,0 +1,74 @@ +import pytest + +from normalization.languages.english.number_normalizer import EnglishNumberNormalizer +from normalization.pipeline.loader import load_pipeline + + +@pytest.fixture +def pipeline(): + return load_pipeline("gladia-3", "en") + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("pttering", "pottering"), + ("puttering", "pottering"), + ("putttering", "pottering"), + ("puttrich", "pottering"), + ("guantnamo", "guantanamo"), + ], +) +def test_voxpopuli_word_aliases(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("50%", "50 percent"), + ( + "more than fifteen of latvia population", + "more than 15 percent of latvia population", + ), + ( + "fifteen of latvia s population", + "15 percent of latvia population", + ), + ("15 of 20 people", "15 of 20 people"), + ("5 of the members", "5 of the members"), + ("rule 142 of the agenda", "article 142 of the agenda"), + ("article 142 of chapter 3", "article 142 of chapter 3"), + ], +) +def test_percent_of_patterns(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("rule 142 2a 2b", "article 142 2a 2b"), + ("article 142 2A 2B", "article 142 2a 2b"), + ], +) +def test_parliamentary_references(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("latvia's population", "latvia population"), + ("latvia s population", "latvia population"), + ], +) +def test_possessive_cleanup(pipeline, raw, expected): + assert pipeline.normalize(raw) == expected + + +def test_hundred_compound_zero_means_thousands(): + normalizer = EnglishNumberNormalizer() + assert normalizer("three hundred and seventy two zero") == "372000" + assert normalizer("five hundred zero") == "5000" + assert normalizer("one hundred zero") == "1000" diff --git a/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py new file mode 100644 index 0000000..319d90d --- /dev/null +++ b/tests/unit/steps/text/remove_trailing_apostrophe_space_test.py @@ -0,0 +1,31 @@ +import pytest + +from normalization.languages.english import EnglishOperators +from normalization.steps.text.remove_trailing_apostrophe_space import ( + RemoveTrailingApostropheSpaceStep, +) + + +@pytest.fixture +def step(): + return RemoveTrailingApostropheSpaceStep() + + +@pytest.fixture +def operators(): + return EnglishOperators() + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("latvia's population", "latvia population"), + ("latvia 's population", "latvia population"), + ("latvia s population", "latvia population"), + ("the letter s", "the letter s"), + ("model s car", "model s car"), + ("tesla model s", "tesla model s"), + ], +) +def test_remove_trailing_apostrophe_space(step, operators, raw, expected): + assert step(raw, operators) == expected