-
Notifications
You must be signed in to change notification settings - Fork 1
feat(wikidata-lexemes): forms, content-POS gap escape, quality filters #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| # Project rules for Claude | ||
|
|
||
| ## Before opening a PR | ||
|
|
||
| Run the CI checks locally and fix every failure before pushing or opening | ||
| a pull request. Do not push or open a PR if any of these fail: | ||
|
|
||
| ```bash | ||
| hatch fmt --linter --check # ruff lint (CI: `lint` job, step "Lint") | ||
| hatch run mypy:check # mypy (CI: `lint` job, step "Type Check") | ||
| hatch build # sdist + wheel build (CI: step "Check Buildable") | ||
| hatch test # pytest suite (CI: `tests` matrix) | ||
| ``` | ||
|
|
||
| When the user reports a CI failure, fix it locally and re-verify all four | ||
| before re-pushing. Do not bypass formatting/lint rules with broad `# noqa` | ||
| suppressions unless the failure is a deliberate, narrow exception (e.g. an | ||
| intentionally-ambiguous Unicode literal), and always pair the suppression | ||
| with a comment explaining why. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| """Cached omw-en lemma → POS coverage.""" | ||
| from functools import cache | ||
|
|
||
|
|
||
@cache
def omw_en_pos() -> dict[str, frozenset[str]]:
    """Build the lowercase-form -> part-of-speech table for the omw-en lexicon.

    Every lemma and every listed form of each word is lowercased and mapped
    to the set of ``pos`` values of the words it appears under. The result is
    memoised, so the lexicon is walked at most once per process.

    Returns:
        A mapping ``{form_lower: frozenset_of_pos}``; an empty dict when the
        ``wn`` package cannot be imported or the lexicon cannot be opened.
    """
    try:
        import wn

        lexicon = wn.Wordnet(lexicon="omw-en")
    except Exception:
        # Best effort: any failure to load the lexicon means "no coverage".
        return {}

    coverage: dict[str, set[str]] = {}
    for entry in lexicon.words():
        spellings = [entry.lemma()]
        spellings.extend(entry.forms())
        for spelling in spellings:
            key = spelling.lower()
            if key not in coverage:
                coverage[key] = set()
            coverage[key].add(entry.pos)

    # Freeze the per-form sets so the cached result cannot be mutated in place.
    return {key: frozenset(tags) for key, tags in coverage.items()}
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,68 @@ | ||||||||||||||
| """Cached Wikidata entity fetcher (used to resolve POS/language Q-codes).""" | ||||||||||||||
| import json | ||||||||||||||
| import re | ||||||||||||||
| from collections.abc import Callable | ||||||||||||||
| from functools import cache | ||||||||||||||
| from pathlib import Path | ||||||||||||||
|
|
||||||||||||||
| import requests | ||||||||||||||
|
|
||||||||||||||
# Directory, next to this module, where fetched entity JSON is cached on disk.
EXTRAS_DIR = Path(__file__).parent.joinpath("extras", "wikidata")

# User-Agent header value sent with every entity request.
USER_AGENT = (
    "WikidataLexemesBot/1.0 "
    "(https://github.com/sign-language-processing/dictionary)"
)
|
|
||||||||||||||
|
|
||||||||||||||
def safe_filename(name: str, max_len: int = 80) -> str:
    """Turn *name* into a string usable as a file name.

    Every character other than ASCII letters, digits, ``_`` and ``-`` is
    replaced by ``_``; the result is then cut to at most *max_len*
    characters. If nothing is left, ``"_"`` is returned instead of an empty
    string.
    """
    sanitized = re.sub(r"[^A-Za-z0-9_\-]", "_", name)
    truncated = sanitized[:max_len]
    if truncated:
        return truncated
    return "_"
|
|
||||||||||||||
|
|
||||||||||||||
def cached_json_fetch(path: Path, fetch: Callable[[], dict]) -> dict:
    """Return the JSON cached at *path*, fetching and caching it if needed.

    Args:
        path: Location of the cache file. Missing parent directories are
            created before writing.
        fetch: Zero-argument callable producing the data when the cache is
            missing or unreadable.

    Returns:
        The parsed cache contents, or the freshly fetched data.

    Raises:
        Whatever *fetch* raises, plus ``OSError`` from the file system.
    """
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # A missing cache is the normal cold path. A truncated or corrupt
        # file (e.g. from an interrupted earlier write) is treated the same
        # way so the cache heals itself instead of failing on every run.
        pass
    data = fetch()
    path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it over the target so readers
    # never observe a half-written cache file.
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    tmp_path.replace(path)
    return data
|
|
||||||||||||||
|
|
||||||||||||||
@cache
def fetch_wikidata_entity(q_code: str) -> dict:
    """Return the Wikidata entity document for *q_code*.

    The raw ``Special:EntityData`` response is cached on disk under
    ``EXTRAS_DIR`` via ``cached_json_fetch``, and the result is memoised
    in-process per *q_code*.

    Raises:
        ValueError: If the response contains no entities at all.
        KeyError: If the response has no ``"entities"`` key.
        requests.RequestException: If the HTTP request fails.
    """

    def _fetch() -> dict:
        url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_code}.json"
        response = requests.get(
            url, headers={"User-Agent": USER_AGENT}, timeout=30,
        )
        response.raise_for_status()
        return response.json()

    data = cached_json_fetch(EXTRAS_DIR / f"{q_code}.json", _fetch)
    entities = data["entities"]
    if not entities:
        # Without this guard the fallback below would leak a bare
        # StopIteration, which is hard to diagnose.
        raise ValueError(f"No entities returned for {q_code}")
    # Fall back to the first entity when the response is keyed by a different
    # id than the one requested (presumably a redirect - confirm).
    return entities.get(q_code) or next(iter(entities.values()))
|
|
||||||||||||||
|
Comment on lines
+45
to
+47
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Guard against an empty `entities` mapping. If `entities` is empty, `next(iter(entities.values()))` raises `StopIteration`. Proposed fix: data = cached_json_fetch(EXTRAS_DIR / f"{q_code}.json", _fetch)
entities = data["entities"]
- return entities.get(q_code) or next(iter(entities.values()))
+ if not entities:
+ raise ValueError(f"No entities returned for {q_code}")
+ return entities.get(q_code) or next(iter(entities.values()))📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||
|
|
||||||||||||||
@cache
def get_label(q_code: str) -> str:
    """Return a lowercased label for the entity *q_code*.

    The English label is preferred; otherwise the first label present is
    used. When the entity has no labels, *q_code* itself is returned
    unchanged.
    """
    available = fetch_wikidata_entity(q_code).get("labels", {})
    if "en" in available:
        chosen = available["en"]
    elif available:
        # No English label: take whichever label comes first.
        chosen = next(iter(available.values()))
    else:
        return q_code
    return chosen["value"].lower()
|
|
||||||||||||||
|
|
||||||||||||||
@cache
def get_language_iso(q_code: str) -> str | None:
    """Return the value of the first ``P218`` claim on entity *q_code*.

    Returns ``None`` when the entity has no ``P218`` claim or when the first
    claim's main snak carries no data value.
    """
    claims = fetch_wikidata_entity(q_code).get("claims", {})
    statements = claims.get("P218", [])
    if not statements:
        return None
    # Only the first statement is consulted; any others are ignored.
    payload = statements[0].get("mainsnak", {}).get("datavalue")
    return payload["value"] if payload else None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make cache writes atomic and recover from corrupted cache JSON.
An interrupted write can leave a partial file; next run then crashes on JSON parsing and won’t self-heal.
Proposed fix
 def cached_json_fetch(path: Path, fetch: Callable[[], dict]) -> dict:
     try:
-        with open(path, encoding="utf-8") as f:
+        with open(path, encoding="utf-8") as f:
             return json.load(f)
-    except FileNotFoundError:
+    except (FileNotFoundError, json.JSONDecodeError):
         pass
     data = fetch()
     path.parent.mkdir(parents=True, exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
+    tmp_path = path.with_suffix(path.suffix + ".tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=2)
+    tmp_path.replace(path)
     return data
🤖 Prompt for AI Agents