Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Project rules for Claude

## Before opening a PR

Run the CI checks locally and fix every failure before pushing or opening
a pull request. Do not push or open a PR if any of these fail:

```bash
hatch fmt --linter --check # ruff lint (CI: `lint` job, step "Lint")
hatch run mypy:check # mypy (CI: `lint` job, step "Type Check")
hatch build # sdist + wheel build (CI: step "Check Buildable")
hatch test # pytest suite (CI: `tests` matrix)
```

When the user reports a CI failure, fix it locally and re-verify all four
before re-pushing. Do not bypass formatting/lint rules with broad `# noqa`
suppressions unless the failure is a deliberate, narrow exception (e.g. an
intentionally-ambiguous Unicode literal), and always pair the suppression
with a comment explaining why.
14 changes: 13 additions & 1 deletion extensions/wikidata-lexemes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,19 @@ python create_extensions.py
This will:
1. Filter lexemes to exclude nouns, verbs, adjectives, adverbs, and phrases
2. Build an interlingual index (ILI) linking senses across languages via English
3. Generate XML extension files in `extensions/` for each language
3. For lexemes with no Wikidata senses, fall back to the English Wiktionary REST API, filtering out reference-only definitions, onomatopoeia, and dialectal/archaic terms not covered by omw-en
4. Generate XML extension files in `output/` for each language

Set `LANG_FILTER=en` to restrict generation to a single language while iterating.

### Caching

Web requests are cached on disk under `extras/` (gitignored):
- `extras/wikidata/` — POS/language Q-code metadata
- `extras/wiktionary/` — Wiktionary REST `definition` responses
- `extras/wiktionary-cats/` — Wiktionary page categories (action API)

To force a refresh of a cached entry, delete the corresponding file.

## Output

Expand Down
17 changes: 17 additions & 0 deletions extensions/wikidata-lexemes/_omw_en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Cached omw-en lemma → POS coverage."""
from functools import cache


@cache
def omw_en_pos() -> dict[str, frozenset[str]]:
    """Map each lower-cased omw-en lemma/form to the WN POS tags it appears under.

    The result is memoized, so the lexicon is walked at most once per process.
    If importing ``wn`` or opening the ``omw-en`` lexicon raises, an empty
    mapping is returned instead of propagating the error.
    """
    try:
        import wn
        lexicon = wn.Wordnet(lexicon="omw-en")
    except Exception:
        # Best effort: without omw-en there is simply no coverage data.
        return {}

    pos_sets: dict[str, set[str]] = {}
    for entry in lexicon.words():
        # The lemma and every other written form all count towards coverage.
        surface_forms = [entry.lemma(), *entry.forms()]
        for surface in surface_forms:
            key = surface.lower()
            if key not in pos_sets:
                pos_sets[key] = set()
            pos_sets[key].add(entry.pos)

    # Freeze the per-lemma sets so the cached result cannot be mutated.
    return {key: frozenset(tags) for key, tags in pos_sets.items()}
19 changes: 19 additions & 0 deletions extensions/wikidata-lexemes/_pos_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
VERB,
)

# POS codes for the content word classes (noun, verb, adjective, adverb),
# i.e. those that omw-en covers natively. Used to decide whether a
# SKIP_POS-classified Wikidata lemma should still be included because
# omw-en doesn't have it under that POS.
CONTENT_TARGETS = frozenset({NOUN, VERB, ADJ, ADV})

POS_MAP: dict[str, str] = {
# --- Pronoun (h) ---
"pronoun": PRON,
Expand Down Expand Up @@ -50,12 +55,15 @@
"quantifier": DET,
"partitive": DET,
# --- Noun (n) ---
"noun": NOUN,
"proper noun": NOUN,
"common noun": NOUN,
"abstract noun": NOUN,
"compound noun": NOUN,
"count noun": NOUN,
"mass noun": NOUN,
"personal noun": NOUN,
"agent noun": NOUN,
"locative noun": NOUN,
"indeclinable noun": NOUN,
"verbal noun": NOUN,
Expand All @@ -68,6 +76,9 @@
"location": NOUN,
"jukugo": NOUN, # Japanese kanji compound noun
# --- Verb (v) ---
"verb": VERB,
"proper verb": VERB,
"phrasal verb": VERB,
"auxiliary verb": VERB,
"japanese auxiliary verb": VERB,
"passive verb": VERB,
Expand All @@ -84,6 +95,9 @@
"imperative form": VERB,
"infinitive": VERB,
# --- Adjective (a) ---
"adjective": ADJ,
"satellite adjective": ADJ,
"proper adjective": ADJ,
"prenominal adjective": ADJ,
"adnominal adjective": ADJ,
"na-adjective": ADJ,
Expand All @@ -100,6 +114,7 @@
"predicative": ADJ,
"nominal modifier": ADJ,
# --- Adverb (r) ---
"adverb": ADV,
"adverbial phrase": ADV,
"adverbial locution": ADV,
"adverbial particle": ADV,
Expand Down Expand Up @@ -188,3 +203,7 @@
"acronym": OTHER,
"abbreviation": OTHER,
}

# Subset of POS_MAP restricted to labels whose target code is one of
# CONTENT_TARGETS.
CONTENT_POS_MAP: dict[str, str] = {
    pos_label: target
    for pos_label, target in POS_MAP.items()
    if target in CONTENT_TARGETS
}
68 changes: 68 additions & 0 deletions extensions/wikidata-lexemes/_wikidata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Cached Wikidata entity fetcher (used to resolve POS/language Q-codes)."""
import json
import re
from collections.abc import Callable
from functools import cache
from pathlib import Path

import requests

EXTRAS_DIR = Path(__file__).parent / "extras" / "wikidata"
USER_AGENT = (
"WikidataLexemesBot/1.0 "
"(https://github.com/sign-language-processing/dictionary)"
)


def safe_filename(name: str, max_len: int = 80) -> str:
    """Turn *name* into a string that is safe to use as a file name.

    Every character other than ASCII letters, digits, ``_`` and ``-`` is
    replaced with ``_``; the result is then cut to *max_len* characters.
    If nothing is left after truncation, ``"_"`` is returned so the caller
    never gets an empty file name.
    """
    sanitized = re.sub(r"[^A-Za-z0-9_\-]", "_", name)
    truncated = sanitized[:max_len]
    if not truncated:
        return "_"
    return truncated


def cached_json_fetch(path: Path, fetch: Callable[[], dict]) -> dict:
    """Return the JSON cached at *path*, calling *fetch* to populate it on a miss.

    Args:
        path: Location of the on-disk cache file.
        fetch: Zero-argument callable producing the JSON-serializable payload;
            only invoked when the cache is missing or unreadable.

    Returns:
        The cached payload, or the freshly fetched one.

    Raises:
        Whatever *fetch* raises, plus any error from writing the cache file.
    """
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # A missing *or corrupted* cache file (e.g. left behind by an
        # interrupted run) is a cache miss, so the cache can self-heal.
        pass

    data = fetch()
    path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temp file and rename it over the target so readers
    # never observe a half-written cache entry.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        tmp_path.replace(path)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)
    return data
Comment on lines +22 to +31

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Make cache writes atomic and recover from corrupted cache JSON.

An interrupted write can leave a partial file; next run then crashes on JSON parsing and won’t self-heal.

Proposed fix
 def cached_json_fetch(path: Path, fetch: Callable[[], dict]) -> dict:
     try:
-        with open(path, encoding="utf-8") as f:
+        with open(path, encoding="utf-8") as f:
             return json.load(f)
-    except FileNotFoundError:
+    except (FileNotFoundError, json.JSONDecodeError):
         pass
     data = fetch()
     path.parent.mkdir(parents=True, exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
+    tmp_path = path.with_suffix(path.suffix + ".tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=2)
+    tmp_path.replace(path)
     return data
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@extensions/wikidata-lexemes/_wikidata.py` around lines 22 - 31, The cache
loader currently crashes on corrupted JSON and leaves partial files from
interrupted writes; modify the logic around json.load/open and the post-fetch
write to (1) catch json.JSONDecodeError alongside FileNotFoundError and treat
either as a cache miss so you call fetch(), and (2) write the fetched data
atomically by writing to a temporary file in the same directory (use
tempfile.NamedTemporaryFile or path.with_suffix(".tmp")), flush and os.fsync the
file, close it, then os.replace(temp_path, path) to atomically replace the
cache; ensure path.parent.mkdir(...) runs before creating the temp file and
clean up the temp file on errors. Reference the existing identifiers: path,
fetch(), json.load, json.dump.



@cache
def fetch_wikidata_entity(q_code: str) -> dict:
    """Return the Wikidata entity record for *q_code*.

    The raw ``Special:EntityData`` JSON is cached on disk under
    ``EXTRAS_DIR`` (one file per Q-code) and the parsed entity is also
    memoized in-process.

    Args:
        q_code: Wikidata entity id used both in the request URL and as the
            cache file name.

    Returns:
        The entity dict stored under *q_code* in the payload's ``entities``
        mapping, or the first entity in that mapping when *q_code* is not a
        key (presumably the redirect case, where the payload is keyed by the
        target id; confirm against the Wikidata API).

    Raises:
        ValueError: If the payload contains no entities at all.
        requests.HTTPError: If the HTTP request returns an error status.
    """

    def _fetch() -> dict:
        url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_code}.json"
        response = requests.get(
            url, headers={"User-Agent": USER_AGENT}, timeout=30,
        )
        response.raise_for_status()
        return response.json()

    data = cached_json_fetch(EXTRAS_DIR / f"{q_code}.json", _fetch)
    entities = data["entities"]
    if not entities:
        # Without this guard the fallback below would raise a bare
        # StopIteration, hiding which Q-code produced the empty payload.
        raise ValueError(f"No entities returned for {q_code}")
    return entities.get(q_code) or next(iter(entities.values()))

Comment on lines +45 to +47

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Guard against empty entities payloads before fallback selection.

If entities is empty, next(iter(...)) raises StopIteration and obscures the root cause.

Proposed fix
     data = cached_json_fetch(EXTRAS_DIR / f"{q_code}.json", _fetch)
     entities = data["entities"]
-    return entities.get(q_code) or next(iter(entities.values()))
+    if not entities:
+        raise ValueError(f"No entities returned for {q_code}")
+    return entities.get(q_code) or next(iter(entities.values()))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
entities = data["entities"]
return entities.get(q_code) or next(iter(entities.values()))
entities = data["entities"]
if not entities:
raise ValueError(f"No entities returned for {q_code}")
return entities.get(q_code) or next(iter(entities.values()))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@extensions/wikidata-lexemes/_wikidata.py` around lines 45 - 47, The code
accesses entities = data["entities"] and does return entities.get(q_code) or
next(iter(entities.values())), but if entities is empty next(iter(...)) raises
StopIteration; fix by checking for an empty entities dict before using the
fallback: after setting entities, if not entities raise a clear exception or
return None (or a sentinel) with a helpful message, otherwise return
entities.get(q_code) or next(iter(entities.values())); update callers if you
change the return contract.


@cache
def get_label(q_code: str) -> str:
    """Return a lower-cased label for the Wikidata entity *q_code*.

    The English label is preferred; otherwise the first label listed on the
    entity is used. When the entity has no labels, *q_code* itself is
    returned unchanged.
    """
    label_map = fetch_wikidata_entity(q_code).get("labels", {})
    if "en" in label_map:
        chosen = label_map["en"]
    elif label_map:
        # No English label: settle for whichever language comes first.
        chosen = next(iter(label_map.values()))
    else:
        return q_code
    return chosen["value"].lower()


@cache
def get_language_iso(q_code: str) -> str | None:
    """Return the value of the first ``P218`` claim on entity *q_code*.

    Only the first ``P218`` claim is inspected. ``None`` is returned when the
    entity has no such claim or when that claim's main snak carries no
    ``datavalue``.
    """
    p218_claims = fetch_wikidata_entity(q_code).get("claims", {}).get("P218", [])
    if not p218_claims:
        return None
    datavalue = p218_claims[0].get("mainsnak", {}).get("datavalue")
    if not datavalue:
        return None
    return datavalue["value"]
Loading
Loading