From a97ee80fb106bfbb2c8e751a33d1b9789d17d14c Mon Sep 17 00:00:00 2001 From: Omar Ibrahim Date: Sat, 20 Jun 2026 05:54:59 -0500 Subject: [PATCH] fix: exclude Arabic diacritics (harakat) from spam character-density check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arabic harakat — the vowel diacritics attached to Arabic letters — are Unicode nonspacing combining marks (general category Mn). c.isalpha() returns False for them, so they were counted as special characters in the density check. A heavily voweled Arabic query like اللَّهُمَّ صَلِّ عَلَى مُحَمَّدٍ has ~52% Mn characters and was wrongly rejected with 400. Among the top 100 queries by search volume, 12 of 30 Arabic queries were being rejected, including لَا إِلَهَ إِلَّا اَللَّهُ (#8, 152k searches) and the Basmala. Fix: add a unicodedata.category(c) != 'Mn' guard so nonspacing marks are not counted as punctuation/symbols in the density ratio. Co-Authored-By: Claude Sonnet 4.6 --- query_router.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/query_router.py b/query_router.py index 41b2e51..21d94b6 100644 --- a/query_router.py +++ b/query_router.py @@ -14,6 +14,7 @@ """ import re +import unicodedata from config import SearchMode @@ -44,10 +45,15 @@ def is_spam(query): if _SPAM_WA_RE.search(q): return True # High special-character density: >40% of non-space chars are punctuation/symbols. - # .isalpha() handles Arabic and other Unicode letters correctly. + # Exclude Unicode combining marks (Mn) — Arabic diacritics (harakat) are Mn + # and are part of the word, not punctuation. Without this, heavily voweled + # Arabic like "اللَّهُمَّ صَلِّ عَلَى مُحَمَّدٍ" exceeds the threshold and is + # wrongly rejected. 
non_space = [c for c in q if not c.isspace()] if len(non_space) >= 8: - special = sum(1 for c in non_space if not c.isalpha() and not c.isdigit()) + special = sum(1 for c in non_space + if not c.isalpha() and not c.isdigit() + and unicodedata.category(c) != 'Mn') if special / len(non_space) > 0.4: return True return False