diff --git a/normalization/languages/german/replacements.py b/normalization/languages/german/replacements.py
index 804c528..1ac7dea 100644
--- a/normalization/languages/german/replacements.py
+++ b/normalization/languages/german/replacements.py
@@ -1,4 +1,9 @@
 GERMAN_REPLACEMENTS: dict[str, str] = {
+    # Colloquial / ASR variants (VoxPopuli DE)
+    "einmal": "mal",
+    "eines": "eins",
+    "konnt": "konnen",
+    "kottonou": "cotonou",
     "u.": "unter",
     "chr.": "christus",
     "rissströmungen": "riss-strömungen",
diff --git a/normalization/languages/german/sentence_replacements.py b/normalization/languages/german/sentence_replacements.py
index 31086fa..7923fd3 100644
--- a/normalization/languages/german/sentence_replacements.py
+++ b/normalization/languages/german/sentence_replacements.py
@@ -1,4 +1,7 @@
 GERMAN_SENTENCE_REPLACEMENTS: dict[str, str] = {
+    # VoxPopuli DE: split compound / expanded policy term
+    "nun mehr": "nunmehr",
+    "handelspolitik und entwicklungspolitik": "handels und entwicklungspolitik",
     "regimeet kritischen": "regimekritischen",
     "cannabis joints": "cannabisjoints",
     "kampf handlungen": "kampfhandlungen",
diff --git a/tests/unit/languages/german_voxpopuli_normalization_test.py b/tests/unit/languages/german_voxpopuli_normalization_test.py
new file mode 100644
index 0000000..8f7e799
--- /dev/null
+++ b/tests/unit/languages/german_voxpopuli_normalization_test.py
@@ -0,0 +1,43 @@
+import pytest
+
+from normalization.pipeline.loader import load_pipeline
+
+
+@pytest.fixture
+def pipeline():
+    return load_pipeline("gladia-3", "de")
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("irgendwann einmal", "irgendwann mal"),
+        ("irgendwann mal", "irgendwann mal"),
+        ("noch eines sagen", "noch eins sagen"),
+        ("noch eins sagen", "noch eins sagen"),
+        ("nunmehr", "nunmehr"),
+        ("nun mehr", "nunmehr"),
+        ("können", "konnen"),
+        ("könnt", "konnen"),
+        (
+            "handels und entwicklungspolitik",
+            "handels und entwicklungspolitik",
+        ),
+        (
+            "handelspolitik und entwicklungspolitik",
+            "handels und entwicklungspolitik",
+        ),
+        ("cotonou", "cotonou"),
+        ("kottonou", "cotonou"),
+        ("zweitausendsieben", "2007"),
+        ("zwotausendsieben", "2007"),
+        ("2007", "2007"),
+    ],
+)
+def test_voxpopuli_german_aliases(pipeline, raw, expected):
+    assert pipeline.normalize(raw) == expected
+
+
+def test_mal_particle_not_expanded_to_einmal(pipeline):
+    """Colloquial particle 'mal' must stay when it is not 'einmal'."""
+    assert pipeline.normalize("halt mal so") == "mal so"