Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions normalization/languages/german/replacements.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
GERMAN_REPLACEMENTS: dict[str, str] = {
# Colloquial / ASR variants (VoxPopuli DE)
"einmal": "mal",
"eines": "eins",
"konnt": "konnen",
"kottonou": "cotonou",
"u.": "unter",
"chr.": "christus",
"rissströmungen": "riss-strömungen",
Expand Down
3 changes: 3 additions & 0 deletions normalization/languages/german/sentence_replacements.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
GERMAN_SENTENCE_REPLACEMENTS: dict[str, str] = {
# VoxPopuli DE: split compound / expanded policy term
"nun mehr": "nunmehr",
"handelspolitik und entwicklungspolitik": "handels und entwicklungspolitik",
"regimeet kritischen": "regimekritischen",
"cannabis joints": "cannabisjoints",
"kampf handlungen": "kampfhandlungen",
Expand Down
43 changes: 43 additions & 0 deletions tests/unit/languages/german_voxpopuli_normalization_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pytest

from normalization.pipeline.loader import load_pipeline


@pytest.fixture
def pipeline():
    """Provide the pipeline that ``load_pipeline("gladia-3", "de")`` returns."""
    german_pipeline = load_pipeline("gladia-3", "de")
    return german_pipeline


# (raw input, expected normalized output) pairs. Most variants sit next to
# their target spelling, which is itself expected to come back unchanged.
_VOXPOPULI_ALIAS_CASES = [
    # "einmal" -> "mal"
    ("irgendwann einmal", "irgendwann mal"),
    ("irgendwann mal", "irgendwann mal"),
    # "eines" -> "eins"
    ("noch eines sagen", "noch eins sagen"),
    ("noch eins sagen", "noch eins sagen"),
    # split "nun mehr" -> "nunmehr"
    ("nunmehr", "nunmehr"),
    ("nun mehr", "nunmehr"),
    # both forms are expected to come out as "konnen"
    ("können", "konnen"),
    ("könnt", "konnen"),
    # expanded policy term -> shortened form
    (
        "handels und entwicklungspolitik",
        "handels und entwicklungspolitik",
    ),
    (
        "handelspolitik und entwicklungspolitik",
        "handels und entwicklungspolitik",
    ),
    # "kottonou" -> "cotonou"
    ("cotonou", "cotonou"),
    ("kottonou", "cotonou"),
    # spelled-out year variants -> digits
    ("zweitausendsieben", "2007"),
    ("zwotausendsieben", "2007"),
    ("2007", "2007"),
]


@pytest.mark.parametrize("raw,expected", _VOXPOPULI_ALIAS_CASES)
def test_voxpopuli_german_aliases(pipeline, raw, expected):
    """Check that normalizing each raw phrase yields its expected form."""
    normalized = pipeline.normalize(raw)
    assert normalized == expected


def test_mal_particle_not_expanded_to_einmal(pipeline):
    """Colloquial particle 'mal' must stay when it is not 'einmal'."""
    # NOTE(review): the expected output also omits the leading "halt";
    # presumably another pipeline step removes it -- confirm.
    normalized = pipeline.normalize("halt mal so")
    assert normalized == "mal so"
Loading