Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/steps.md
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,12 @@ English uses comma (1,234 -> 1234), European languages use period (1.234 -> 1234

**Base class:** `TextStep`

Remove space before apostrophe (' s -> 's).
Normalize apostrophe possessives before symbol stripping.

Runs before remove_symbols so ``'s`` markers are removed while the apostrophe
is still present. Also collapses orphan ``s`` tokens produced when symbol
removal splits a possessive (``latvia s``), but not literal ``letter s`` or
product names like ``model s``.

### `remove_trailing_dot_word_from_emails`

Expand Down
31 changes: 30 additions & 1 deletion normalization/languages/english/number_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,13 @@ def process_words(self, words: list[str]) -> Iterator[str]: # noqa: C901
prefix: str | None = None
value: str | int | None = None
skip = False
after_hundred = False
compound_after_hundred = False

def reset_number_phrase_state() -> None:
nonlocal after_hundred, compound_after_hundred
after_hundred = False
compound_after_hundred = False

def to_fraction(s: str | float):
try:
Expand All @@ -179,6 +186,7 @@ def output(result: str | int):
result = prefix + result
value = None
prefix = None
reset_number_phrase_state()
return result

if len(words) == 0:
Expand Down Expand Up @@ -225,11 +233,27 @@ def output(result: str | int):
elif current_lower not in self.words:
if value is not None:
yield output(value)
reset_number_phrase_state()
yield output(current)
elif current_lower in self.zeros:
value = str(value or "") + "0"
if (
after_hundred
and compound_after_hundred
and isinstance(value, int)
and value >= 100
and (
next_lower is None
or next_lower not in self.words
or next_lower in self.zeros
)
):
value *= 1000
else:
value = str(value or "") + "0"
elif current_lower in self.ones:
ones = self.ones[current_lower]
if after_hundred:
compound_after_hundred = True

if value is None:
value = ones
Expand Down Expand Up @@ -270,6 +294,8 @@ def output(result: str | int):
value = None
elif current_lower in self.tens:
tens = self.tens[current_lower]
if after_hundred:
compound_after_hundred = True
if value is None:
value = tens
elif isinstance(value, str):
Expand All @@ -292,6 +318,9 @@ def output(result: str | int):
yield output(str(value) + str(tens) + suffix)
elif current_lower in self.multipliers:
multiplier = self.multipliers[current_lower]
if current_lower == "hundred":
after_hundred = True
compound_after_hundred = False
if value is None:
value = multiplier
elif isinstance(value, str) or value == 0:
Expand Down
22 changes: 22 additions & 0 deletions normalization/languages/english/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@
"fifty": "50",
}

# Parliamentary / legal citation prefixes: numbers after these are not percentages.
# Each prefix gets its own lookbehind because Python's ``re`` only accepts
# fixed-width lookbehind patterns.
_REFERENCE_NUMBER_LOOKBEHIND = (
    "(?<!article )(?<!rule )(?<!section )(?<!chapter )"
    "(?<!paragraph )(?<!part )(?<!clause )(?<!annex )(?<!appendix )"
)
# Words that, directly after "<number> of", mark a partitive ("5 of the members",
# "3 of them", "2 of our colleagues") rather than a spoken percentage.
_PARTITIVE_HEADS = (
    "the",
    "these",
    "those",
    "them",
    "us",
    "you",
    "my",
    "our",
    "your",
    "his",
    "her",
    "its",
    "their",
    "which",
    "whom",
)
_PARTITIVE_ALTERNATION = "|".join(_PARTITIVE_HEADS)
# Spoken percentages often omit "percent" before "of" with a proper-noun object
# (e.g. "15 of Latvia population"), but not for ratios ("15 of 20"),
# partitives ("5 of the members", "3 of them"), or citation tails
# ("article 142 of the agenda").
# Group 1 is the number; the match consumes "<number> of " so the replacement
# must re-emit the " of " part.
_RE_SPOKEN_PERCENT_OF = re.compile(
    rf"{_REFERENCE_NUMBER_LOOKBEHIND}\b(\d+) of (?!\d)"
    rf"(?!(?:{_PARTITIVE_ALTERNATION})\b)",
    re.IGNORECASE,
)

ENGLISH_CONFIG = LanguageConfig(
code="en",
decimal_separator=".",
Expand Down Expand Up @@ -183,6 +196,15 @@ def _format_colon_time(match: re.Match) -> str:
return text

def fix_one_word_in_numeric_contexts(self, text: str) -> str:
# Parliamentary references: EU corpus uses both "rule" and "article".
text = re.sub(r"\brule (\d+)", r"article \1", text)
# Rejoin subsection suffixes split by expand_alphanumeric_codes (142 2 a -> 142 2a).
while True:
updated = re.sub(r"\b(\d+[a-z]?) (\d) ([a-z])\b", r"\1 \2\3", text)
if updated == text:
break
text = updated
text = _RE_SPOKEN_PERCENT_OF.sub(r"\1 percent of ", text)
text = re.sub(r"(\d+)\s+one\s+one\b", r"\1 1 1", text)
text = re.sub(r"\bone\s+one\s+(\d)", r"1 1 \1", text)
text = re.sub(r"(\d+)\s+one\s+(\d)", r"\1 1 \2", text)
Expand Down
6 changes: 6 additions & 0 deletions normalization/languages/english/replacements.py
Original file line number Diff line number Diff line change
Expand Up @@ -1743,6 +1743,12 @@
"yoghourts": "yogurts",
"yoghurt": "yogurt",
"yoghurts": "yogurts",
# VoxPopuli / parliamentary corpus aliases
"pttering": "pottering",
"puttering": "pottering",
"putttering": "pottering",
"puttrich": "pottering",
"guantnamo": "guantanamo",
Comment thread
egenthon-cmd marked this conversation as resolved.
# contractions in titles/prefixes
"mr": "mister",
"mrs": "missus",
Expand Down
3 changes: 2 additions & 1 deletion normalization/presets/gladia-3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ stages:

# --- Phase 6: Casefold and cleanup ---
- casefold_text
# Strip possessive 's before remove_symbols turns it into an orphan "s" token.
- remove_trailing_apostrophe_space
- remove_symbols
- remove_diacritics
- remove_standalone_currency_symbols
Expand All @@ -88,7 +90,6 @@ stages:
- fix_version_number_v_prefix
- restore_decimal_separator_with_word
- convert_decimal_periods_to_decimal_word
- remove_trailing_apostrophe_space
- remove_non_numeric_trailing_dots

# --- Phase 2: Time and digit collapsing ---
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from normalization.steps.base import TextStep
from normalization.steps.registry import register_step

# Matches a digit plus the whitespace after it when more digits follow.
# The lookahead has two guards: the following digits must not be directly
# followed by a lowercase letter ("3a"), nor by whitespace and a standalone
# lowercase letter ("2 a").
# NOTE(review): ``\d+`` may backtrack to a shorter digit run, so for multi-digit
# tokens (e.g. "1 23 a") the guards can be bypassed -- confirm this is intended.
_RE_SPACES_BETWEEN_DIGITS = re.compile(r"(\d)\s+(?=\d+(?![a-z])(?!\s+[a-z]\b))")


@register_step
Expand Down
20 changes: 18 additions & 2 deletions normalization/steps/text/remove_trailing_apostrophe_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,28 @@
from normalization.steps.base import TextStep
from normalization.steps.registry import register_step

# remove_symbols rewrites "Latvia's" as "latvia s", leaving a stray "s" token.
# Group 1 is the owner word (three or more letters); the two lookaheads keep
# literal mentions such as "the letter s" or "model s" from matching.
_RE_ORPHAN_POSSESSIVE_S = re.compile(
    r"\b(?!letter s\b)(?!model s\b)([a-z]{3,}) s\b", flags=re.IGNORECASE
)
# A word immediately followed by 's; group 1 is the word without the marker.
_RE_WORD_POSSESSIVE_S = re.compile(r"\b(\w+)'s\b", flags=re.IGNORECASE)


@register_step
class RemoveTrailingApostropheSpaceStep(TextStep):
    """Normalize apostrophe possessives before symbol stripping.

    Meant to run before remove_symbols so ``'s`` markers are removed while the
    apostrophe is still present. Also collapses orphan ``s`` tokens of the kind
    left behind when a possessive has been split (``latvia s``), but not literal
    ``letter s`` or product names like ``model s``.
    """

    name = "remove_trailing_apostrophe_space"

    def __call__(self, text: str, operators: LanguageOperators) -> str:
        """Return *text* with possessive ``'s`` markers and orphan ``s`` removed.

        Args:
            text: The text to clean.
            operators: Language operators; not used by this step.

        Returns:
            The text after the three substitutions below, applied in order.
        """
        # Re-attach apostrophes that were separated by whitespace
        # ("latvia 's" -> "latvia's") so the possessive pattern can see them.
        text = re.sub(r"\s+'", "'", text)
        # Drop the 's marker itself ("latvia's" -> "latvia").
        text = _RE_WORD_POSSESSIVE_S.sub(r"\1", text)
        # Finally collapse an already-split possessive ("latvia s" -> "latvia").
        return _RE_ORPHAN_POSSESSIVE_S.sub(r"\1", text)
74 changes: 74 additions & 0 deletions tests/unit/languages/english_voxpopuli_normalization_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pytest

from normalization.languages.english.number_normalizer import EnglishNumberNormalizer
from normalization.pipeline.loader import load_pipeline


@pytest.fixture
def pipeline():
    """Provide the pipeline loaded from the ``gladia-3`` preset for ``en``."""
    loaded = load_pipeline("gladia-3", "en")
    return loaded


@pytest.mark.parametrize(
    "raw,expected",
    [
        ("pttering", "pottering"),
        ("puttering", "pottering"),
        ("putttering", "pottering"),
        ("puttrich", "pottering"),
        ("guantnamo", "guantanamo"),
    ],
)
def test_voxpopuli_word_aliases(pipeline, raw, expected):
    """Each corpus alias is normalized to its canonical spelling."""
    normalized = pipeline.normalize(raw)
    assert normalized == expected


@pytest.mark.parametrize(
    "raw,expected",
    [
        # Explicit percent sign.
        ("50%", "50 percent"),
        # "percent" omitted before "of": it is inserted.
        (
            "more than fifteen of latvia population",
            "more than 15 percent of latvia population",
        ),
        (
            "fifteen of latvia s population",
            "15 percent of latvia population",
        ),
        # Ratios, partitives and citations: no "percent" is inserted.
        ("15 of 20 people", "15 of 20 people"),
        ("5 of the members", "5 of the members"),
        ("rule 142 of the agenda", "article 142 of the agenda"),
        ("article 142 of chapter 3", "article 142 of chapter 3"),
    ],
)
def test_percent_of_patterns(pipeline, raw, expected):
    """``<number> of`` phrases gain "percent" only in the expected cases."""
    normalized = pipeline.normalize(raw)
    assert normalized == expected


@pytest.mark.parametrize(
    "raw,expected",
    [
        ("rule 142 2a 2b", "article 142 2a 2b"),
        ("article 142 2A 2B", "article 142 2a 2b"),
    ],
)
def test_parliamentary_references(pipeline, raw, expected):
    """Rule/article references keep their subsection suffixes attached."""
    normalized = pipeline.normalize(raw)
    assert normalized == expected


@pytest.mark.parametrize(
    "raw,expected",
    [
        ("latvia's population", "latvia population"),
        ("latvia s population", "latvia population"),
    ],
)
def test_possessive_cleanup(pipeline, raw, expected):
    """Possessive markers are dropped whether or not the apostrophe is present."""
    normalized = pipeline.normalize(raw)
    assert normalized == expected


def test_hundred_compound_zero_means_thousands():
    """A trailing "zero" after a hundred-phrase yields the expected digit string."""
    normalizer = EnglishNumberNormalizer()
    cases = (
        ("three hundred and seventy two zero", "372000"),
        ("five hundred zero", "5000"),
        ("one hundred zero", "1000"),
    )
    for spoken, digits in cases:
        assert normalizer(spoken) == digits
31 changes: 31 additions & 0 deletions tests/unit/steps/text/remove_trailing_apostrophe_space_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pytest

from normalization.languages.english import EnglishOperators
from normalization.steps.text.remove_trailing_apostrophe_space import (
RemoveTrailingApostropheSpaceStep,
)


@pytest.fixture
def step():
    """Provide a fresh ``RemoveTrailingApostropheSpaceStep`` instance."""
    instance = RemoveTrailingApostropheSpaceStep()
    return instance


@pytest.fixture
def operators():
    """Provide a fresh ``EnglishOperators`` instance to pass to the step."""
    english = EnglishOperators()
    return english


@pytest.mark.parametrize(
    "raw,expected",
    [
        # Possessive forms are reduced to the bare owner word.
        ("latvia's population", "latvia population"),
        ("latvia 's population", "latvia population"),
        ("latvia s population", "latvia population"),
        # Literal letter / model references stay untouched.
        ("the letter s", "the letter s"),
        ("model s car", "model s car"),
        ("tesla model s", "tesla model s"),
    ],
)
def test_remove_trailing_apostrophe_space(step, operators, raw, expected):
    """The step strips possessive markers but keeps literal ``s`` tokens."""
    cleaned = step(raw, operators)
    assert cleaned == expected