From ca0122997dc83f39d492c70b84f53d4d3e9622c2 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Tue, 28 Apr 2026 08:48:24 +0000 Subject: [PATCH 1/5] Fix: fraction class improvements with lexical mappings Signed-off-by: Shreyas Pawar --- Jenkinsfile | 2 +- .../text_normalization/hi/taggers/fraction.py | 47 ++++++++++++------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 565f5df27..c8177d681 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-28-26-0' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index b5528deba..bb46dd998 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -32,20 +32,6 @@ class FractionFst(GraphFst): - """ - Finite state transducer for classifying fraction - "२३ ४/६" -> - fraction { integer: "तेईस" numerator: "चार" denominator: "छः"} - ४/६" -> - fraction { numerator: "चार" denominator: "छः"} - - - Args: - cardinal: cardinal GraphFst - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ - def __init__(self, cardinal, deterministic: bool = True): super().__init__(name="fraction", kind="classify", deterministic=deterministic) @@ -54,15 +40,20 @@ def __init__(self, cardinal, deterministic: bool = True): self.optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1 ) + self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + self.numerator = ( pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"") + pynutil.insert(NEMO_SPACE) ) + self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") + # ---------------- EXISTING SPECIAL FORMS ---------------- + dedh_dhai_graph = pynini.string_map( [("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)] ) @@ -77,6 +68,25 @@ def __init__(self, cardinal, deterministic: bool = True): paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + # ---------------- COMMON FRACTION FORMS ---------------- + + common_fraction_map = pynini.string_map([ + ("१/२", "आधा"), + ("१/३", "तिहाई"), + ("२/३", "दो तिहाई"), + ("१/४", "चौथाई"), + ("३/४", "तीन चौथाई"), + ]) + + graph_common_fraction = ( + pynutil.insert("morphosyntactic_features: \"") + + common_fraction_map + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + # ---------------- WRAPPING GRAPHS ---------------- + graph_dedh_dhai = ( pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph @@ -105,6 +115,8 @@ def __init__(self, cardinal, deterministic: bool = True): + pynutil.insert(NEMO_SPACE) ) + # ---------------- DEFAULT FRACTION ---------------- + final_graph = ( self.optional_graph_negative + pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1) @@ -112,16 +124,19 @@ def __init__(self, cardinal, deterministic: bool = True): + self.denominator ) + # ---------------- PRIORITY HANDLING ---------------- + weighted_graph = ( final_graph + | pynutil.add_weight(graph_common_fraction, -0.3) # ensures override of "बटा" | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_paune, -0.2) | pynutil.add_weight(graph_savva, -0.1) | pynutil.add_weight(graph_sadhe, -0.1) - | pynutil.add_weight(graph_paune, -0.2) ) self.graph = weighted_graph graph = self.graph graph = self.add_tokens(graph) - self.fst = graph.optimize() + self.fst = graph.optimize() \ No newline at end of file From 2761cfba6a4caf0438ff8828e1166ec955ec7263 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:59:15 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/fraction.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index bb46dd998..b8ce7ac62 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -70,13 +70,15 @@ def __init__(self, cardinal, deterministic: bool = True): # ---------------- COMMON FRACTION FORMS ---------------- - common_fraction_map = pynini.string_map([ - ("१/२", "आधा"), - ("१/३", "तिहाई"), - ("२/३", "दो तिहाई"), - ("१/४", "चौथाई"), - ("३/४", "तीन चौथाई"), - ]) + common_fraction_map = pynini.string_map( + [ + ("१/२", "आधा"), + ("१/३", "तिहाई"), + ("२/३", "दो तिहाई"), + ("१/४", "चौथाई"), + ("३/४", "तीन चौथाई"), + ] + ) graph_common_fraction = ( pynutil.insert("morphosyntactic_features: \"") @@ -139,4 +141,4 @@ def __init__(self, cardinal, deterministic: bool = True): graph = self.graph graph = self.add_tokens(graph) - self.fst = graph.optimize() \ No newline at end of file + self.fst = graph.optimize() From c8411c2ed8ccdf5342b4cc8db9bd3bb8d3d9824c Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Wed, 13 May 2026 13:12:06 +0000 Subject: [PATCH 3/5] move common fraction mappings to TSV and add ASCII digit support Signed-off-by: Shreyas Pawar --- Jenkinsfile | 5 +++ .../hi/data/fraction/common_fractions.tsv | 10 ++++++ .../text_normalization/hi/taggers/fraction.py | 35 ++++++++++++------- .../hi/verbalizers/fraction.py | 4 +-- .../test_cases_fraction.txt | 6 +++- 5 files changed, 45 insertions(+), 15 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv diff --git a/Jenkinsfile b/Jenkinsfile index f2bfcd408..8d7cb0b41 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,8 +27,13 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' +<<<<<<< HEAD KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-28-26-0' +======= + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-13-26-0' + KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0' +>>>>>>> 9f8266e4 (move common fraction mappings to TSV and add ASCII digit support) DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv b/nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv new file mode 100644 index 000000000..5e44cb502 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/fraction/common_fractions.tsv @@ -0,0 +1,10 @@ +१/२ आधा +१/३ तिहाई +२/३ दो तिहाई +१/४ चौथाई +३/४ तीन चौथाई +1/2 आधा +1/3 तिहाई +2/3 दो तिहाई +1/4 चौथाई +3/4 तीन चौथाई \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index b8ce7ac62..de8e9c386 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -26,12 +26,26 @@ ) from nemo_text_processing.text_normalization.hi.utils import get_abs_path -HI_ONE_HALF = "१/२" # 1/2 -HI_ONE_QUARTER = "१/४" # 1/4 -HI_THREE_QUARTERS = "३/४" # 3/4 +HI_ONE_HALF = "१/२" +HI_ONE_QUARTER = "१/४" +HI_THREE_QUARTERS = "३/४" class FractionFst(GraphFst): + """ + Finite state transducer for classifying fraction + "२३ ४/६" -> + fraction { integer: "तेईस" numerator: "चार" denominator: "छह"} + ४/६" -> + fraction { numerator: "चार" denominator: "छह"} + + + Args: + cardinal: cardinal GraphFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + def __init__(self, cardinal, deterministic: bool = True): super().__init__(name="fraction", kind="classify", deterministic=deterministic) @@ -52,8 +66,6 @@ def __init__(self, cardinal, deterministic: bool = True): self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") - # ---------------- EXISTING SPECIAL FORMS ---------------- - dedh_dhai_graph = pynini.string_map( [("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)] ) @@ -68,6 +80,7 @@ def __init__(self, cardinal, deterministic: bool = True): paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers +<<<<<<< HEAD # ---------------- COMMON FRACTION FORMS ---------------- common_fraction_map = pynini.string_map( @@ -80,6 +93,10 @@ def __init__(self, cardinal, deterministic: bool = True): ] ) +======= + common_fraction_map = pynini.string_file(get_abs_path("data/fraction/common_fractions.tsv")) + +>>>>>>> 9f8266e4 (move common fraction mappings to TSV and add ASCII digit support) graph_common_fraction = ( pynutil.insert("morphosyntactic_features: \"") + common_fraction_map @@ -87,8 +104,6 @@ def __init__(self, cardinal, deterministic: bool = True): + pynutil.insert(NEMO_SPACE) ) - # ---------------- WRAPPING GRAPHS ---------------- - graph_dedh_dhai = ( pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph @@ -117,8 +132,6 @@ def __init__(self, cardinal, deterministic: bool = True): + pynutil.insert(NEMO_SPACE) ) - # ---------------- DEFAULT FRACTION ---------------- - final_graph = ( self.optional_graph_negative + pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1) @@ -126,11 +139,9 @@ def __init__(self, cardinal, deterministic: bool = True): + self.denominator ) - # ---------------- PRIORITY HANDLING ---------------- - weighted_graph = ( final_graph - | pynutil.add_weight(graph_common_fraction, -0.3) # ensures override of "बटा" + | pynutil.add_weight(graph_common_fraction, -0.3) | pynutil.add_weight(graph_dedh_dhai, -0.2) | pynutil.add_weight(graph_paune, -0.2) | pynutil.add_weight(graph_savva, -0.1) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index a07c41eae..66d944ea7 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -21,8 +21,8 @@ class FractionFst(GraphFst): """ Finite state transducer for verbalizing fraction - e.g. fraction { integer: "तेईस" numerator: "चार" denominator: "छः" }-> तेईस चार बटा छः - e.g. fraction { numerator: "चार" denominator: "छः" } -> चार बटा छः + e.g. fraction { integer: "तेईस" numerator: "चार" denominator: "छह" }-> तेईस और चार बटा छह + e.g. fraction { numerator: "चार" denominator: "छह" } -> चार बटा छह Args: diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index 4184ae9ee..6778978b7 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -20,4 +20,8 @@ १०००००००००००००००/८~एक पद्म बटा आठ 100000000000000000/412~एक शंख बटा चार सौ बारह २ २/७~दो और दो बटा सात -120 75/90~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file +120 75/90~एक सौ बीस और पचहत्तर बटा नब्बे +१/२~आधा +१/३~तिहाई +1/4~चौथाई +3/4~तीन चौथाई \ No newline at end of file From 7e386998fc0776f21f88f63983592d0c41728cd2 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Wed, 13 May 2026 14:01:10 +0000 Subject: [PATCH 4/5] Jenkins date conflict resolved Signed-off-by: Shreyas Pawar --- Jenkinsfile | 5 ----- .../text_normalization/hi/taggers/fraction.py | 17 +---------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8d7cb0b41..d9c3a5984 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,13 +27,8 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' -<<<<<<< HEAD KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-28-26-0' -======= HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-13-26-0' - KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0' ->>>>>>> 9f8266e4 (move common fraction mappings to TSV and add ASCII digit support) DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index de8e9c386..785ab83e6 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -80,23 +80,8 @@ def __init__(self, cardinal, deterministic: bool = True): paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers -<<<<<<< HEAD - # ---------------- COMMON FRACTION FORMS ---------------- - - common_fraction_map = pynini.string_map( - [ - ("१/२", "आधा"), - ("१/३", "तिहाई"), - ("२/३", "दो तिहाई"), - ("१/४", "चौथाई"), - ("३/४", "तीन चौथाई"), - ] - ) - -======= common_fraction_map = pynini.string_file(get_abs_path("data/fraction/common_fractions.tsv")) ->>>>>>> 9f8266e4 (move common fraction mappings to TSV and add ASCII digit support) graph_common_fraction = ( pynutil.insert("morphosyntactic_features: \"") + common_fraction_map @@ -152,4 +137,4 @@ def __init__(self, cardinal, deterministic: bool = True): graph = self.graph graph = self.add_tokens(graph) - self.fst = graph.optimize() + self.fst = graph.optimize() \ No newline at end of file From 70c3ea12ea4066c56e2005d61a301a82f0328854 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 May 2026 14:02:14 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/fraction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index 785ab83e6..8b72b25b2 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -26,9 +26,9 @@ ) from nemo_text_processing.text_normalization.hi.utils import get_abs_path -HI_ONE_HALF = "१/२" -HI_ONE_QUARTER = "१/४" -HI_THREE_QUARTERS = "३/४" +HI_ONE_HALF = "१/२" +HI_ONE_QUARTER = "१/४" +HI_THREE_QUARTERS = "३/४" class FractionFst(GraphFst): @@ -81,7 +81,7 @@ def __init__(self, cardinal, deterministic: bool = True): paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers common_fraction_map = pynini.string_file(get_abs_path("data/fraction/common_fractions.tsv")) - + graph_common_fraction = ( pynutil.insert("morphosyntactic_features: \"") + common_fraction_map @@ -137,4 +137,4 @@ def __init__(self, cardinal, deterministic: bool = True): graph = self.graph graph = self.add_tokens(graph) - self.fst = graph.optimize() \ No newline at end of file + self.fst = graph.optimize()