-
Notifications
You must be signed in to change notification settings - Fork 324
(skill-eval) add filename match fast path #2140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
6f2aa10
(skill-eval) add filename match fast path
edknv ab1489e
add lancedb search script
edknv 700d52d
drop the text field from summary echo
edknv 391f77b
Merge branch 'main' into edwardk/skill-single-source
edknv 4082e04
move regex search section
edknv 31caf8e
Merge branch 'main' into edwardk/skill-single-source
sosahi 29c6a58
greptile fixes
edknv db4c702
Merge branch 'edwardk/skill-single-source' of github.com:edknv/nv-ing…
edknv 4c9deb9
Wrapping the per-file call in a try/except
edknv File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,173 @@ | ||
| """Query-turn filename fast path for the nemo-retriever skill. | ||
|
|
||
| Reads `./pdfs/` from the current working directory. If the query string | ||
| literally contains any PDF basename **including the `.pdf` extension** | ||
| (stem ≥6 chars, case-insensitive), runs `retriever pdf stage page-elements` | ||
| on each matched file via pdfium, ranks pages by query-token frequency, | ||
| and emits a top-10 ranking + the top page's raw text. | ||
|
|
||
| Invoked from SKILL.md as: | ||
| <RETRIEVER_VENV>/bin/python <skill_dir>/scripts/filename_fast_path.py "$QUERY" | ||
|
|
||
| The retriever binary is resolved from sys.executable's directory, so the | ||
| script is portable across venvs. | ||
|
|
||
| Stdout protocol (exactly one of): | ||
| - `NO_MATCH\n` — no PDF basename in the query. | ||
| - `NO_TEXT\n` — matches found but no extracted page contains any | ||
| query token (e.g. image-only PDFs or failed extraction). | ||
| - `<JSON>\n---TOP_PAGE_TEXT---\n<text>` — JSON with a "ranking" list of | ||
| {doc_id, page_number, rank} (1-indexed | ||
| pages, up to 10), followed by the top- | ||
| ranked page's raw text (first 4000 chars). | ||
|
|
||
| Exit code is 0 in all three success outcomes; non-zero only on hard errors | ||
| (bad usage, unreadable ./pdfs, malformed sidecar JSON). A page-elements | ||
| failure on a single file is logged to stderr as a warning and that file is skipped. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import os | ||
| import re | ||
| import subprocess | ||
| import sys | ||
|
|
||
# Directory, relative to the current working directory, that is listed for PDFs.
PDF_DIR = "./pdfs"
# Directory passed as `--json-output-dir`; extraction sidecars are looked up here.
# NOTE(review): fixed shared path with no visible cleanup, so sidecars from an
# earlier run may still be present -- confirm this is intended.
EXTRACT_OUT = "/tmp/pdf_text"
# Basenames whose stem is shorter than this are never matched against the query.
MIN_STEM_LEN = 6
# Maximum number of entries emitted in the "ranking" list.
TOP_K = 10
# Maximum number of characters printed from the top-ranked page's text.
TOP_PAGE_TEXT_CHARS = 4000

# Lowercase words that tokenize() drops from the query before pages are scored.
STOPWORDS = frozenset(
    "the a an of in on for to and or is are was were what which how when "
    "where who why this that these those with by from as at be it its do "
    "does did please could would should tell me you i we us our my".split()
)
|
|
||
|
|
||
def find_matches(query_lower: str, basenames: list[str]) -> list[str]:
    """Select the PDF basenames that are quoted verbatim in the query.

    A basename qualifies when its extension is `.pdf` (any case), its stem has
    at least MIN_STEM_LEN characters, and its lowercased full name -- extension
    included -- is a substring of `query_lower`. Matching on the full name
    rather than the bare stem keeps ordinary words that double as PDF stems
    (e.g. `report.pdf`, `market.pdf`) from triggering the fast path.

    Args:
        query_lower: The user query, already lowercased by the caller.
        basenames: Candidate file names (no directory component).

    Returns:
        The qualifying basenames, in their input order.
    """

    def _qualifies(name: str) -> bool:
        stem, ext = os.path.splitext(name)
        long_enough = len(stem) >= MIN_STEM_LEN
        return ext.lower() == ".pdf" and long_enough and name.lower() in query_lower

    return [name for name in basenames if _qualifies(name)]
|
|
||
|
|
||
def extract_pages(retriever_bin: str, matches: list[str]) -> None:
    """Run `retriever pdf stage page-elements` (pdfium) on each matched PDF.

    Output is written as compact JSON into EXTRACT_OUT, which is created if
    missing. A non-zero exit for one file is logged to stderr and the loop
    continues, so a single bad PDF does not block the remaining matches.

    Args:
        retriever_bin: Path of the `retriever` executable to invoke.
        matches: PDF basenames located under PDF_DIR.

    Raises:
        OSError: if EXTRACT_OUT cannot be created or `retriever_bin` cannot be
            executed at all (missing or not executable).
    """
    os.makedirs(EXTRACT_OUT, exist_ok=True)
    for m in matches:
        cmd = [
            retriever_bin,
            "pdf",
            "stage",
            "page-elements",
            f"{PDF_DIR}/{m}",
            "--method",
            "pdfium",
            "--json-output-dir",
            EXTRACT_OUT,
            "--compact-json",
        ]
        try:
            # This script's stdout carries a strict protocol (NO_MATCH / NO_TEXT /
            # JSON + page text). Send anything the child prints to stderr so it
            # cannot be interleaved with that protocol.
            subprocess.run(cmd, check=True, stdout=sys.stderr)
        except subprocess.CalledProcessError as exc:
            print(f"WARN: page-elements failed on {m}: exit {exc.returncode}", file=sys.stderr)
|
|
||
|
|
||
def sidecar_path(pdf_name: str) -> str | None:
    """Locate the extraction sidecar JSON written for `pdf_name`.

    Two file names are probed under EXTRACT_OUT, in this order: the full
    basename followed by `.pdf_extraction.json`, then the stem (basename
    without its extension) followed by the same suffix.

    Returns:
        The first probed path that exists, or None when neither does.
    """
    stem, _ext = os.path.splitext(pdf_name)
    full_name_probe = f"{EXTRACT_OUT}/{pdf_name}.pdf_extraction.json"
    stem_probe = f"{EXTRACT_OUT}/{stem}.pdf_extraction.json"
    return next((p for p in (full_name_probe, stem_probe) if os.path.exists(p)), None)
|
|
||
|
|
||
def page_records(sidecar: str) -> list[dict]:
    """Load the per-page records from an extraction sidecar JSON file.

    Accepted layouts: a top-level list of records, or a top-level object whose
    "pages" (preferred) or "documents" value is such a list. Any other layout
    yields an empty list. Entries that are not JSON objects are dropped so
    callers can rely on every record supporting `.get`.

    Args:
        sidecar: Path of the sidecar JSON file.

    Returns:
        The dict records, in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid UTF-8 JSON (this includes
            json.JSONDecodeError and UnicodeDecodeError).
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(sidecar, encoding="utf-8") as fh:
        data = json.load(fh)
    if isinstance(data, dict):
        data = data.get("pages") or data.get("documents") or []
    if not isinstance(data, list):
        return []
    return [rec for rec in data if isinstance(rec, dict)]
|
|
||
|
|
||
def page_text(rec: dict) -> str:
    """Return the text of one page record, or "" when it has none.

    Lookup order: the record's "text" value, then "content", then -- only when
    both are empty and "primitives" is a list -- the "text" values of the dict
    entries in "primitives", joined with single spaces.

    Values that are not strings (None, numbers, nested containers) are treated
    as empty, so the result is always a str that callers can lowercase and
    slice safely.
    """
    for key in ("text", "content"):
        value = rec.get(key)
        if isinstance(value, str) and value:
            return value
    primitives = rec.get("primitives")
    if isinstance(primitives, list):
        parts = (p.get("text") for p in primitives if isinstance(p, dict))
        # A primitive may carry "text": null; str.join would raise on None.
        return " ".join(part if isinstance(part, str) else "" for part in parts)
    return ""
|
|
||
|
|
||
def tokenize(query: str) -> list[str]:
    """Split a query into lowercase scoring tokens.

    The lowercased query is cut at every run of characters outside `a-z0-9`.
    Pieces of one or two characters and pieces found in STOPWORDS are dropped.
    Order and duplicates are preserved.
    """
    kept = []
    for piece in re.split(r"[^a-z0-9]+", query.lower()):
        if len(piece) <= 2 or piece in STOPWORDS:
            continue
        kept.append(piece)
    return kept
|
|
||
|
|
||
def rank_pages(matches: list[str], toks: list[str]) -> list[tuple[int, int, str, str]]:
    """Score every extracted page of the matched PDFs against the query tokens.

    A page's score is the total number of occurrences of each token in the
    page's lowercased text. Pages scoring zero are omitted, as are PDFs for
    which no sidecar file is found.

    Args:
        matches: PDF basenames whose sidecars should be scanned.
        toks: Lowercase query tokens.

    Returns:
        A list of (score, page_number, doc_stem, text) tuples sorted by
        descending score, then ascending page number. `page_number` is the
        record's "page_number" value, else "page", else 0.
    """
    scored = []
    for m in matches:
        sidecar = sidecar_path(m)
        if sidecar is None:
            continue
        stem = os.path.splitext(m)[0]
        for rec in page_records(sidecar):
            # Malformed entries must not abort ranking of the remaining pages.
            if not isinstance(rec, dict):
                continue
            txt = page_text(rec)
            if not isinstance(txt, str):
                continue
            lowered = txt.lower()  # lowercase once, not once per token
            score = sum(lowered.count(t) for t in toks)
            if score > 0:
                pn = rec.get("page_number") or rec.get("page") or 0
                scored.append((score, pn, stem, txt))
    scored.sort(key=lambda r: (-r[0], r[1]))
    return scored
|
|
||
|
|
||
def main() -> int:
    """Run the filename fast path for the query given as the only CLI argument.

    Lists PDF_DIR, matches basenames against the query, extracts the matched
    PDFs, ranks their pages, and prints one of the stdout outcomes: `NO_MATCH`,
    `NO_TEXT`, or a JSON ranking followed by `---TOP_PAGE_TEXT---` and the
    top page's text (truncated to TOP_PAGE_TEXT_CHARS).

    Returns:
        0 for any of the three outcomes above, 2 on wrong usage, 1 when
        PDF_DIR cannot be listed.
    """
    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <query>", file=sys.stderr)
        return 2
    query = sys.argv[1]
    ql = query.lower()
    # The retriever CLI is expected next to the running interpreter (same venv bin dir).
    retriever_bin = os.path.join(os.path.dirname(sys.executable), "retriever")

    try:
        basenames = sorted(p for p in os.listdir(PDF_DIR) if p.lower().endswith(".pdf"))
    except OSError as exc:
        # OSError also covers NotADirectoryError (./pdfs is a regular file) and
        # other listing failures, which would otherwise surface as a traceback.
        print(f"ERROR: cannot list {PDF_DIR}: {exc}", file=sys.stderr)
        return 1
    matches = find_matches(ql, basenames)
    if not matches:
        print("NO_MATCH")
        return 0

    extract_pages(retriever_bin, matches)
    scored = rank_pages(matches, tokenize(ql))
    if not scored:
        print("NO_TEXT")
        return 0

    ranking = [{"doc_id": s[2], "page_number": s[1], "rank": i + 1} for i, s in enumerate(scored[:TOP_K])]
    print(json.dumps({"ranking": ranking}))
    print("---TOP_PAGE_TEXT---")
    print(scored[0][3][:TOP_PAGE_TEXT_CHARS])
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
edknv marked this conversation as resolved.
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| """Case-insensitive keyword/regex search over the corpus via the LanceDB index. | ||
|
|
||
| This script scans the already-built LanceDB table, so it returns matches | ||
| across every chunk `retriever ingest` indexed (text, table, chart, image | ||
| transcriptions where present) without re-reading any PDF. | ||
|
|
||
| Usage: | ||
| <RETRIEVER_VENV>/bin/python <skill_dir>/scripts/grep_corpus.py <pattern> \\ | ||
| [--max-hits 50] [--lancedb-uri ./lancedb] [--table-name nemo-retriever] | ||
|
|
||
| `pattern` is a Python regex, case-insensitive. For a literal-string search, | ||
| just write the string: letters, digits, `-` and `_` match themselves. Escape | ||
| regex metacharacters (`.`, `(`, `)`, `|`, `*`, `+`, `?`, `[`, `]`, `{`, `}`, `\\`, `^`, `$`) | ||
| with a backslash; an unescaped `.` matches a literal dot but also any other character. | ||
|
|
||
| Output (one line per hit; sorted by pdf_basename then page_number): | ||
| <pdf_basename>:p<page_number>:<type>: ...<snippet around match>... | ||
|
|
||
| Prints `NO_MATCH` on zero hits. Caps at `--max-hits` to keep the turn output | ||
| bounded; raise it if you really want more. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import argparse | ||
| import json | ||
| import re | ||
| import sys | ||
|
|
||
|
|
||
def _metadata_dict(raw: object) -> dict:
    """Return a row's metadata as a dict; anything missing or unparseable becomes {}."""
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str) and raw:
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            return {}
        # Valid JSON need not be an object (e.g. "[]" or "3").
        return parsed if isinstance(parsed, dict) else {}
    return {}


def _page_sort_key(page: object) -> int:
    """Return an integer sort key for a page value; non-numeric, NaN or infinite pages sort as 0."""
    if not isinstance(page, (int, float)):
        return 0
    try:
        return int(page)
    except (ValueError, OverflowError):  # int(nan) / int(inf)
        return 0


def main() -> int:
    """Search the `text` column of a LanceDB table with a case-insensitive regex.

    Prints one line per matching row as
    `<pdf_basename>:p<page_number>:<type>: ...<snippet>...`, sorted by
    pdf_basename then page number and capped at `--max-hits`; prints
    `NO_MATCH` when nothing matches, and a truncation notice when hits were
    cut off.

    Returns:
        0 after a completed search, 1 if lancedb cannot be imported, the table
        cannot be opened, or it has no `text` column, and 2 if the pattern is
        not a valid regex.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("pattern", help="Python regex (case-insensitive)")
    ap.add_argument("--max-hits", type=int, default=50)
    ap.add_argument("--snippet-pad", type=int, default=60)
    ap.add_argument("--lancedb-uri", default="./lancedb")
    ap.add_argument("--table-name", default="nemo-retriever")
    args = ap.parse_args()

    # Imported here so a wrong interpreter yields a readable hint, not a traceback.
    try:
        import lancedb
    except ImportError:
        print("ERROR: lancedb not importable. Run with <RETRIEVER_VENV>/bin/python.", file=sys.stderr)
        return 1

    try:
        pat = re.compile(args.pattern, re.IGNORECASE)
    except re.error as e:
        print(f"ERROR: bad regex {args.pattern!r}: {e}", file=sys.stderr)
        return 2

    try:
        db = lancedb.connect(args.lancedb_uri)
        tbl = db.open_table(args.table_name)
    except Exception as e:
        print(f"ERROR: can't open lancedb table {args.table_name!r} at " f"{args.lancedb_uri!r}: {e}", file=sys.stderr)
        return 1

    # NOTE(review): this appears to materialise every column of the table in
    # memory; confirm that is acceptable for large indexes.
    rows = tbl.to_pandas()

    if "text" not in rows.columns:
        print(f"ERROR: lancedb table has no 'text' column. columns={list(rows.columns)}", file=sys.stderr)
        return 1

    hits = []
    for row in rows.itertuples(index=False):
        text = getattr(row, "text", None)
        if not isinstance(text, str):
            # Missing cells can surface as None or NaN; search them as empty text.
            text = ""
        m = pat.search(text)
        if not m:
            continue
        pdf = getattr(row, "pdf_basename", "?")
        page = getattr(row, "page_number", "?")
        meta = _metadata_dict(getattr(row, "metadata", None))
        type_ = meta.get("type", "?")
        start = max(0, m.start() - args.snippet_pad)
        end = min(len(text), m.end() + args.snippet_pad)
        snippet = text[start:end].replace("\n", " ")
        hits.append((pdf, page, type_, snippet))

    hits.sort(key=lambda h: (str(h[0]), _page_sort_key(h[1])))
    # A negative cap would slice from the end of the list; treat it as zero.
    limit = max(args.max_hits, 0)
    for pdf, page, type_, snippet in hits[:limit]:
        print(f"{pdf}:p{page}:{type_}: ...{snippet}...")
    if not hits:
        print("NO_MATCH")
    elif len(hits) > limit:
        print(f"... ({len(hits) - limit} more matches truncated; " f"raise --max-hits to see them)")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.