diff --git a/Jenkinsfile b/Jenkinsfile index 24ac047eb..d9c3a5984 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-13-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv b/nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv new file mode 100644 index 000000000..5e44cb502 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv @@ -0,0 +1,10 @@ +१/२ आधा +१/३ तिहाई +२/३ दो तिहाई +१/४ चौथाई +३/४ तीन चौथाई +1/2 आधा +1/3 तिहाई +2/3 दो तिहाई +1/4 चौथाई +3/4 तीन चौथाई \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index b5528deba..8b72b25b2 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -26,18 +26,18 @@ ) from nemo_text_processing.text_normalization.hi.utils import get_abs_path -HI_ONE_HALF = "१/२" # 1/2 -HI_ONE_QUARTER = "१/४" # 1/4 -HI_THREE_QUARTERS = "३/४" # 3/4 +HI_ONE_HALF = "१/२" +HI_ONE_QUARTER = "१/४" +HI_THREE_QUARTERS = "३/४" class FractionFst(GraphFst): """ Finite state transducer for classifying fraction "२३ ४/६" -> - fraction { integer: "तेईस" numerator: "चार" denominator: "छः"} + fraction { integer: "तेईस" numerator: "चार" denominator: "छह"} ४/६" -> - fraction { numerator: "चार" denominator: "छः"} + fraction { numerator: "चार" denominator: "छह"} Args: @@ -54,13 +54,16 @@ def 
__init__(self, cardinal, deterministic: bool = True): self.optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1 ) + self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + self.numerator = ( pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"") + pynutil.insert(NEMO_SPACE) ) + self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") dedh_dhai_graph = pynini.string_map( @@ -77,6 +80,15 @@ def __init__(self, cardinal, deterministic: bool = True): paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + common_fraction_map = pynini.string_file(get_abs_path("data/fraction/common_fractions.tsv")) + + graph_common_fraction = ( + pynutil.insert("morphosyntactic_features: \"") + + common_fraction_map + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + graph_dedh_dhai = ( pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph @@ -114,10 +126,11 @@ def __init__(self, cardinal, deterministic: bool = True): weighted_graph = ( final_graph + | pynutil.add_weight(graph_common_fraction, -0.3) | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_paune, -0.2) | pynutil.add_weight(graph_savva, -0.1) | pynutil.add_weight(graph_sadhe, -0.1) - | pynutil.add_weight(graph_paune, -0.2) ) self.graph = weighted_graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index a07c41eae..66d944ea7 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -21,8 +21,8 @@ class FractionFst(GraphFst): """ Finite state transducer for verbalizing 
fraction - e.g. fraction { integer: "तेईस" numerator: "चार" denominator: "छः" }-> तेईस चार बटा छः - e.g. fraction { numerator: "चार" denominator: "छः" } -> चार बटा छः + e.g. fraction { integer_part: "तेईस" numerator: "चार" denominator: "छह" } -> तेईस और चार बटा छह + e.g. fraction { numerator: "चार" denominator: "छह" } -> चार बटा छह Args: diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index 4184ae9ee..6778978b7 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -20,4 +20,8 @@ १०००००००००००००००/८~एक पद्म बटा आठ 100000000000000000/412~एक शंख बटा चार सौ बारह २ २/७~दो और दो बटा सात -120 75/90~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file +120 75/90~एक सौ बीस और पचहत्तर बटा नब्बे +१/२~आधा +१/३~तिहाई +1/4~चौथाई +3/4~तीन चौथाई \ No newline at end of file