sign · AmitMY · May 22, 2026 · May 22, 2026 · May 22, 2026 · coderabbitai
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,19 @@
+# Project rules for Claude
+
+## Before opening a PR
+
+Run the CI checks locally and fix every failure before pushing or opening
+a pull request. Do not push or open a PR if any of these fail:
+
+```bash
+hatch fmt --linter --check     # ruff lint (CI: `lint` job, step "Lint")
+hatch run mypy:check           # mypy   (CI: `lint` job, step "Type Check")
+hatch build                    # sdist + wheel build  (CI: step "Check Buildable")
+hatch test                     # pytest suite (CI: `tests` matrix)
+```
+
+When the user reports a CI failure, fix it locally and re-verify all four
+checks before re-pushing. Do not bypass formatting/lint rules with broad
+`# noqa` suppressions. A suppression is acceptable only for a deliberate,
+narrow exception (e.g. an intentionally ambiguous Unicode literal), and it
+must always be paired with a comment explaining why.
diff --git a/extensions/wikidata-lexemes/README.md b/extensions/wikidata-lexemes/README.md
@@ -29,7 +29,19 @@ python create_extensions.py
 This will:
 1. Filter lexemes to exclude nouns, verbs, adjectives, adverbs, and phrases
 2. Build an interlingual index (ILI) linking senses across languages via English
-3. Generate XML extension files in `extensions/` for each language
+3. For lexemes with no Wikidata senses, fall back to the English Wiktionary REST API, filtering out reference-only definitions, onomatopoeia, and dialectal/archaic terms not covered by omw-en
+4. Generate XML extension files in `output/` for each language
+
+Set `LANG_FILTER=en` to restrict generation to a single language while iterating.
+
+### Caching
+
+Web requests are cached on disk under `extras/` (gitignored):
+- `extras/wikidata/` — POS/language Q-code metadata
+- `extras/wiktionary/` — Wiktionary REST `definition` responses
+- `extras/wiktionary-cats/` — Wiktionary page categories (action API)
+
+To force a refresh of a cached entry, delete the corresponding file.
 
 ## Output
 

diff --git a/extensions/wikidata-lexemes/_omw_en.py b/extensions/wikidata-lexemes/_omw_en.py
@@ -0,0 +1,17 @@
+"""Cached omw-en lemma → POS coverage."""
+from functools import cache
+
+
+@cache
+def omw_en_pos() -> dict[str, frozenset[str]]:
+    """Map each lower-cased omw-en form to the set of POS tags it occurs under.
+
+    The result is memoised, so the lexicon is walked at most once per process.
+    If importing ``wn`` or opening the ``omw-en`` lexicon fails for any reason,
+    an empty mapping is returned instead of raising.
+    """
+    try:
+        import wn
+        lexicon = wn.Wordnet(lexicon="omw-en")
+    except Exception:
+        return {}
+    pos_sets: dict[str, set[str]] = {}
+    for entry in lexicon.words():
+        # The lemma is listed explicitly alongside every other recorded form.
+        surfaces = [entry.lemma(), *entry.forms()]
+        for surface in surfaces:
+            key = surface.lower()
+            if key not in pos_sets:
+                pos_sets[key] = set()
+            pos_sets[key].add(entry.pos)
+    return {key: frozenset(tags) for key, tags in pos_sets.items()}
diff --git a/extensions/wikidata-lexemes/_pos_map.py b/extensions/wikidata-lexemes/_pos_map.py
@@ -16,6 +16,11 @@
     VERB,
 )
 
+# Content POS codes: those that omw-en covers natively. Used to decide
+# whether a SKIP_POS-classified Wikidata lemma should still be included
+# because omw-en doesn't have it under that POS.
+CONTENT_TARGETS = frozenset({NOUN, VERB, ADJ, ADV})
+
 POS_MAP: dict[str, str] = {
     # --- Pronoun (h) ---
     "pronoun": PRON,
@@ -50,12 +55,15 @@
     "quantifier": DET,
     "partitive": DET,
     # --- Noun (n) ---
+    "noun": NOUN,
+    "proper noun": NOUN,
     "common noun": NOUN,
     "abstract noun": NOUN,
     "compound noun": NOUN,
     "count noun": NOUN,
     "mass noun": NOUN,
     "personal noun": NOUN,
+    "agent noun": NOUN,
     "locative noun": NOUN,
     "indeclinable noun": NOUN,
     "verbal noun": NOUN,
@@ -68,6 +76,9 @@
     "location": NOUN,
     "jukugo": NOUN,  # Japanese kanji compound noun
     # --- Verb (v) ---
+    "verb": VERB,
+    "proper verb": VERB,
+    "phrasal verb": VERB,
     "auxiliary verb": VERB,
     "japanese auxiliary verb": VERB,
     "passive verb": VERB,
@@ -84,6 +95,9 @@
     "imperative form": VERB,
     "infinitive": VERB,
     # --- Adjective (a) ---
+    "adjective": ADJ,
+    "satellite adjective": ADJ,
+    "proper adjective": ADJ,
     "prenominal adjective": ADJ,
     "adnominal adjective": ADJ,
     "na-adjective": ADJ,
@@ -100,6 +114,7 @@
     "predicative": ADJ,
     "nominal modifier": ADJ,
     # --- Adverb (r) ---
+    "adverb": ADV,
     "adverbial phrase": ADV,
     "adverbial locution": ADV,
     "adverbial particle": ADV,
@@ -188,3 +203,7 @@
     "acronym": OTHER,
     "abbreviation": OTHER,
 }
+
+# Subset of POS_MAP: only the labels whose target code is in CONTENT_TARGETS.
+CONTENT_POS_MAP: dict[str, str] = {
+    label: code for label, code in POS_MAP.items() if code in CONTENT_TARGETS
+}
diff --git a/extensions/wikidata-lexemes/_wikidata.py b/extensions/wikidata-lexemes/_wikidata.py
@@ -0,0 +1,68 @@
+"""Cached Wikidata entity fetcher (used to resolve POS/language Q-codes)."""
+import json
+import re
+from collections.abc import Callable
+from functools import cache
+from pathlib import Path
+
+import requests
+
+EXTRAS_DIR = Path(__file__).parent / "extras" / "wikidata"
+USER_AGENT = (
+    "WikidataLexemesBot/1.0 "
+    "(https://github.com/sign-language-processing/dictionary)"
+)
+
+
+def safe_filename(name: str, max_len: int = 80) -> str:
+    """Turn *name* into a filesystem-friendly token.
+
+    Every character other than an ASCII letter, digit, underscore or hyphen
+    is replaced by ``_``. The result is cut to *max_len* characters, and
+    ``"_"`` is returned when nothing is left.
+    """
+    sanitized = re.sub(r"[^A-Za-z0-9_\-]", "_", name)
+    truncated = sanitized[:max_len]
+    return truncated if truncated else "_"
+
+
+def cached_json_fetch(path: Path, fetch: Callable[[], dict]) -> dict:
+    """Return the JSON document cached at *path*, calling *fetch* on a miss.
+
+    Args:
+        path: Cache file location; missing parent directories are created.
+        fetch: Zero-argument callable producing the JSON-serialisable payload.
+
+    Returns:
+        The cached payload, or the freshly fetched one after it was written.
+
+    A cache file that is missing or cannot be parsed as JSON counts as a
+    miss. Exceptions raised by *fetch* propagate and nothing is written.
+    """
+    try:
+        with open(path, encoding="utf-8") as f:
+            return json.load(f)
+    except FileNotFoundError:
+        pass
+    except (json.JSONDecodeError, UnicodeDecodeError):
+        # A truncated or corrupt entry (e.g. from an interrupted earlier run)
+        # would otherwise fail on every call until deleted by hand; refetch.
+        pass
+    data = fetch()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # Write to a sibling temp file and rename it into place, so an interrupted
+    # run never leaves a half-written cache entry at *path*.
+    tmp_path = path.with_name(path.name + ".tmp")
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        tmp_path.replace(path)
+    finally:
+        # No-op after a successful rename; cleans up if the dump failed.
+        tmp_path.unlink(missing_ok=True)
+    return data
+
+
+@cache
+def fetch_wikidata_entity(q_code: str) -> dict:
+    """Return the Wikidata entity record for *q_code*.
+
+    The ``Special:EntityData`` JSON is read from the on-disk cache under
+    ``EXTRAS_DIR`` when present and downloaded otherwise; results are also
+    memoised in-process.
+
+    Raises:
+        requests.HTTPError: If the download returns an error status.
+        ValueError: If the payload contains no entities.
+    """
+    def _fetch() -> dict:
+        url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_code}.json"
+        response = requests.get(
+            url, headers={"User-Agent": USER_AGENT}, timeout=30,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    data = cached_json_fetch(EXTRAS_DIR / f"{q_code}.json", _fetch)
+    entities = data["entities"]
+    if not entities:
+        # Without this guard the fallback below leaks a bare StopIteration.
+        raise ValueError(f"No entities returned for {q_code}")
+    # Fall back to the first entity when the payload is not keyed by the
+    # requested ID (presumably a redirected/merged item -- TODO confirm).
+    return entities.get(q_code) or next(iter(entities.values()))
+
+
+@cache
+def get_label(q_code: str) -> str:
+    """Return a lower-cased label for the entity *q_code*.
+
+    The English label is preferred; failing that, the first label listed on
+    the entity is used. When the entity has no labels, *q_code* itself is
+    returned unchanged.
+    """
+    labels = fetch_wikidata_entity(q_code).get("labels", {})
+    if "en" in labels:
+        picked = labels["en"]
+    elif labels:
+        picked = next(iter(labels.values()))
+    else:
+        return q_code
+    return picked["value"].lower()
+
+
+@cache
+def get_language_iso(q_code: str) -> str | None:
+    """Return the value of the entity's first ``P218`` claim, if it has one.
+
+    ``None`` is returned when the entity carries no ``P218`` claim or when
+    that claim's main snak has no datavalue.
+    NOTE(review): P218 is presumably Wikidata's ISO 639-1 code property,
+    going by this function's name -- confirm.
+    """
+    statements = fetch_wikidata_entity(q_code).get("claims", {}).get("P218", [])
+    if not statements:
+        return None
+    datavalue = statements[0].get("mainsnak", {}).get("datavalue")
+    return datavalue["value"] if datavalue else None