From 6f2aa10564cc58081607460b6a437fe652d741e2 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 27 May 2026 13:54:22 -0700 Subject: [PATCH 1/6] (skill-eval) add filename match fast path --- .claude/skills/nemo-retriever/SKILL.md | 30 +++- .../scripts/filename_fast_path.py | 155 ++++++++++++++++++ 2 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 .claude/skills/nemo-retriever/scripts/filename_fast_path.py diff --git a/.claude/skills/nemo-retriever/SKILL.md b/.claude/skills/nemo-retriever/SKILL.md index 4aa373d5d8..3d591cf605 100644 --- a/.claude/skills/nemo-retriever/SKILL.md +++ b/.claude/skills/nemo-retriever/SKILL.md @@ -33,6 +33,32 @@ Don't pre-OCR, don't pre-chunk, don't write Python wrappers — the CLI handles ## Query turn — the WHOLE workflow +### Filename fast path (try BEFORE `retriever query`) + +When the query literally names a PDF in `./pdfs/`, skip `retriever query`. Direct pdfium extraction on the named file is faster and avoids semantic-search misses: the right doc is given, and pages can be ranked by query-token overlap against the file's text. + +**Trigger:** some PDF basename in `./pdfs/` (stem ≥6 chars, with or without the `.pdf` extension) appears verbatim, case-insensitive, as a substring of the query. It fires when the user pastes or quotes the filename — including paths, basenames-with-extension, or bare stems that match a real file. It does NOT fire when the user describes a document semantically without quoting a filename (paraphrased titles, entity names, document types, or even the same words separated differently from how the file is named). Those flow through the standard `retriever query` workflow below. + +Run this single Bash call to detect, extract, rank, and report — atomically. It always runs FIRST on a query turn: + +```bash +/bin/python /scripts/filename_fast_path.py "" +``` + +`` is the "Base directory for this skill" printed when the skill loads (e.g. `/path/to/.claude/skills/nemo-retriever`). Quote the question. The script reads `./pdfs/` from the current workdir and writes per-PDF extraction sidecars under `/tmp/pdf_text/`. Source: `scripts/filename_fast_path.py` next to this file. + +Read stdout. **Three outcomes:** + +- `NO_MATCH` — no literal basename in the query. Fall through to the standard `retriever query` workflow below as if this section didn't exist. +- `NO_TEXT` — matched file is image-only / pdfium got no text. Also fall through. +- A JSON object with `"ranking"` followed by `---TOP_PAGE_TEXT---` and the top page's raw text. That's the fast-path hit — write `./output.json` directly: + - `ranked_retrieved`: copy the `"ranking"` entries verbatim (`{doc_id, page_number, rank}`, 1-indexed pages). Apply the 0-/1-indexed page adjustment from the task description as usual (`page_number - 1` if the task says 0-indexed). + - `final_answer`: synthesize from the printed `TOP_PAGE_TEXT` following the existing rules (exact number/name/date; one paragraph; honest "not in the retrieved pages" if the fact genuinely isn't there). No chart/image hedging — pdfium extracts text only. + + Then **Write `./output.json` and STOP.** Fast-path total: 2 tool calls (this Bash + Write). Do NOT also call `retriever query` — it's mutually exclusive. + +### Standard path: `retriever query` + ```bash /bin/retriever query "" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --rerank \ | tee /tmp/hits.json \ @@ -41,7 +67,7 @@ Don't pre-OCR, don't pre-chunk, don't write Python wrappers — the CLI handles Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | /bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full JSON sits at `/tmp/hits.json` if you need to re-parse it (`/bin/python -c "import json; print(json.load(open('/tmp/hits.json'))[6])"` for the rank-7 hit), but in the common case the summary above is all you need. -That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or list PDFs before this — those duplicate what `retriever query` already did. +This is your tool call when the fast path above printed `NO_MATCH` or `NO_TEXT`. Do not Read, Glob, Grep, or `ls` separately — the fast path's Bash call already listed `./pdfs/`, and `retriever query` indexes the content. **No narration between tool calls.** Do not write "Let me search…", "I'll now analyze…", "The retriever returned…", or any other commentary. Every assistant token you emit between the `retriever query` Bash call and the `Write` of `./output.json` becomes input tokens (and cached input tokens) for every subsequent turn in this session — quadratic cost. Go straight from reading the summary to writing the JSON file. The only assistant text in a query turn should be the tool calls themselves. @@ -71,7 +97,7 @@ After writing the file, STOP. No print, no summary, no further tool calls. ### Hard limits (cost discipline) -- ONE `retriever query` per turn. ONE optional targeted text-extract on the rank-1 PDF if the chunks miss the asked-for fact. That's the budget — it is a hard cap, not a soft preference. +- ONE call from {filename fast path, `retriever query`} per turn — they are mutually exclusive. The fast path's all-in-one Bash call counts as that one call; if it hits, write `output.json` and stop (2 tool calls total). If it printed `NO_MATCH`/`NO_TEXT`, run `retriever query` once and then take ONE optional targeted `retriever pdf stage page-elements` text-extract on the rank-1 PDF if the chunks miss the asked-for fact. That's the budget — it is a hard cap, not a soft preference. - After your 2nd tool call, write `final_answer` with what you have and STOP. If both calls left the asked-for fact unresolved, write `final_answer` that **explicitly states the retrieved pages don't contain the requested fact** (naming the closest related content if any) — **do not run more tool calls hunting for it, and do not extrapolate a plausible value.** Long-running query turns (5+ tool calls, 1M+ cache-read tokens) cost ~5× a disciplined turn and usually still produce the wrong answer. - Don't read whole PDFs. - Don't make speculative Read/Glob/Grep calls "to confirm". The retriever already found the relevant pages — trust the ranking. diff --git a/.claude/skills/nemo-retriever/scripts/filename_fast_path.py b/.claude/skills/nemo-retriever/scripts/filename_fast_path.py new file mode 100644 index 0000000000..881bcbf242 --- /dev/null +++ b/.claude/skills/nemo-retriever/scripts/filename_fast_path.py @@ -0,0 +1,155 @@ +"""Query-turn filename fast path for the nemo-retriever skill. + +Reads `./pdfs/` from the current working directory. If the query string +literally contains any PDF basename (with or without the `.pdf` extension, +stem ≥6 chars, case-insensitive), runs `retriever pdf stage page-elements` +on each matched file via pdfium, ranks pages by query-token frequency, +and emits a top-10 ranking + the top page's raw text. + +Invoked from SKILL.md as: + /bin/python /scripts/filename_fast_path.py "$QUERY" + +The retriever binary is resolved from sys.executable's directory, so the +script is portable across venvs. + +Stdout protocol (exactly one of): +- `NO_MATCH\n` — no PDF basename in the query. +- `NO_TEXT\n` — matches found but extraction produced no + text on any page (image-only PDFs). +- `\n---TOP_PAGE_TEXT---\n` — JSON with a "ranking" list of + {doc_id, page_number, rank} (1-indexed + pages, up to 10), followed by the top- + ranked page's raw text (first 4000 chars). + +Exit code is 0 in all three success outcomes; non-zero only on hard errors +(missing ./pdfs, page-elements subprocess failure, malformed sidecar JSON). +""" + +from __future__ import annotations + +import json +import os +import re +import subprocess +import sys + +PDF_DIR = "./pdfs" +EXTRACT_OUT = "/tmp/pdf_text" +MIN_STEM_LEN = 6 +TOP_K = 10 +TOP_PAGE_TEXT_CHARS = 4000 + +STOPWORDS = frozenset( + "the a an of in on for to and or is are was were what which how when " + "where who why this that these those with by from as at be it its do " + "does did please could would should tell me you i we us our my".split() +) + + +def find_matches(query_lower: str, basenames: list[str]) -> list[str]: + """Return PDF basenames whose name (with or without .pdf) appears verbatim + in the lowercased query. Skip stems shorter than MIN_STEM_LEN.""" + matches = [] + for name in basenames: + stem, ext = os.path.splitext(name) + if ext.lower() != ".pdf" or len(stem) < MIN_STEM_LEN: + continue + if name.lower() in query_lower or stem.lower() in query_lower: + matches.append(name) + return matches + + +def extract_pages(retriever_bin: str, matches: list[str]) -> None: + os.makedirs(EXTRACT_OUT, exist_ok=True) + for m in matches: + subprocess.run( + [retriever_bin, "pdf", "stage", "page-elements", f"{PDF_DIR}/{m}", + "--method", "pdfium", "--json-output-dir", EXTRACT_OUT, + "--compact-json"], + check=True, + ) + + +def sidecar_path(pdf_name: str) -> str | None: + stem = os.path.splitext(pdf_name)[0] + candidates = ( + f"{EXTRACT_OUT}/{pdf_name}.pdf_extraction.json", + f"{EXTRACT_OUT}/{stem}.pdf.pdf_extraction.json", + ) + for c in candidates: + if os.path.exists(c): + return c + return None + + +def page_records(sidecar: str) -> list[dict]: + data = json.load(open(sidecar)) + if isinstance(data, list): + return data + if isinstance(data, dict): + return data.get("pages") or data.get("documents") or [] + return [] + + +def page_text(rec: dict) -> str: + txt = rec.get("text") or rec.get("content") or "" + if not txt and isinstance(rec.get("primitives"), list): + txt = " ".join(p.get("text", "") for p in rec["primitives"] + if isinstance(p, dict)) + return txt or "" + + +def tokenize(query: str) -> list[str]: + return [t for t in re.split(r"[^a-z0-9]+", query.lower()) + if t and t not in STOPWORDS and len(t) > 2] + + +def rank_pages(matches: list[str], toks: list[str]) -> list[tuple[int, int, str, str]]: + """Return list of (score, page_number, doc_stem, text) sorted by + descending score, ascending page number.""" + scored = [] + for m in matches: + sidecar = sidecar_path(m) + if sidecar is None: + continue + stem = os.path.splitext(m)[0] + for rec in page_records(sidecar): + pn = rec.get("page_number") or rec.get("page") or 0 + txt = page_text(rec) + score = sum(txt.lower().count(t) for t in toks) + if score > 0: + scored.append((score, pn, stem, txt)) + scored.sort(key=lambda r: (-r[0], r[1])) + return scored + + +def main() -> int: + if len(sys.argv) != 2: + print(f"usage: {sys.argv[0]} ", file=sys.stderr) + return 2 + query = sys.argv[1] + ql = query.lower() + retriever_bin = os.path.join(os.path.dirname(sys.executable), "retriever") + + basenames = sorted(p for p in os.listdir(PDF_DIR) if p.lower().endswith(".pdf")) + matches = find_matches(ql, basenames) + if not matches: + print("NO_MATCH") + return 0 + + extract_pages(retriever_bin, matches) + scored = rank_pages(matches, tokenize(ql)) + if not scored: + print("NO_TEXT") + return 0 + + ranking = [{"doc_id": s[2], "page_number": s[1], "rank": i + 1} + for i, s in enumerate(scored[:TOP_K])] + print(json.dumps({"ranking": ranking})) + print("---TOP_PAGE_TEXT---") + print(scored[0][3][:TOP_PAGE_TEXT_CHARS]) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From ab1489e42ca7579a14e7aab870e6692fa4ca5824 Mon Sep 17 00:00:00 2001 From: edknv Date: Wed, 27 May 2026 17:15:13 -0700 Subject: [PATCH 2/6] add lancedb search script --- .claude/skills/nemo-retriever/SKILL.md | 12 +++ .../scripts/filename_fast_path.py | 24 +++-- .../nemo-retriever/scripts/grep_corpus.py | 99 +++++++++++++++++++ 3 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 .claude/skills/nemo-retriever/scripts/grep_corpus.py diff --git a/.claude/skills/nemo-retriever/SKILL.md b/.claude/skills/nemo-retriever/SKILL.md index 3d591cf605..a282a8409c 100644 --- a/.claude/skills/nemo-retriever/SKILL.md +++ b/.claude/skills/nemo-retriever/SKILL.md @@ -95,6 +95,18 @@ When both a chart hit and a text hit cover the same fact, always prefer the text After writing the file, STOP. No print, no summary, no further tool calls. +### Keyword search across the corpus + +If you need exact text matches the semantic `retriever query` may have missed: + +```bash +/bin/python /scripts/grep_corpus.py "" [--max-hits 50] +``` + +It scans the LanceDB table the retriever already built — no PDF re-extraction. Output is `:p:: ......` per hit; `NO_MATCH` if nothing matches. Counts against the same "one optional follow-up call" budget as the targeted text-extract (mutually exclusive — pick one). + +Don't reach for `pdftotext`, `pdftohtml`, or `pdfgrep` — they're system tools that aren't guaranteed installed on the user's machine. The retriever venv bundles pdfium and `lancedb`; `grep_corpus.py` and `retriever pdf stage page-elements --method pdfium` cover the same use cases without that dependency. + ### Hard limits (cost discipline) - ONE call from {filename fast path, `retriever query`} per turn — they are mutually exclusive. The fast path's all-in-one Bash call counts as that one call; if it hits, write `output.json` and stop (2 tool calls total). If it printed `NO_MATCH`/`NO_TEXT`, run `retriever query` once and then take ONE optional targeted `retriever pdf stage page-elements` text-extract on the rank-1 PDF if the chunks miss the asked-for fact. That's the budget — it is a hard cap, not a soft preference. diff --git a/.claude/skills/nemo-retriever/scripts/filename_fast_path.py b/.claude/skills/nemo-retriever/scripts/filename_fast_path.py index 881bcbf242..f11bfd8223 100644 --- a/.claude/skills/nemo-retriever/scripts/filename_fast_path.py +++ b/.claude/skills/nemo-retriever/scripts/filename_fast_path.py @@ -63,9 +63,18 @@ def extract_pages(retriever_bin: str, matches: list[str]) -> None: os.makedirs(EXTRACT_OUT, exist_ok=True) for m in matches: subprocess.run( - [retriever_bin, "pdf", "stage", "page-elements", f"{PDF_DIR}/{m}", - "--method", "pdfium", "--json-output-dir", EXTRACT_OUT, - "--compact-json"], + [ + retriever_bin, + "pdf", + "stage", + "page-elements", + f"{PDF_DIR}/{m}", + "--method", + "pdfium", + "--json-output-dir", + EXTRACT_OUT, + "--compact-json", + ], check=True, ) @@ -94,14 +103,12 @@ def page_records(sidecar: str) -> list[dict]: def page_text(rec: dict) -> str: txt = rec.get("text") or rec.get("content") or "" if not txt and isinstance(rec.get("primitives"), list): - txt = " ".join(p.get("text", "") for p in rec["primitives"] - if isinstance(p, dict)) + txt = " ".join(p.get("text", "") for p in rec["primitives"] if isinstance(p, dict)) return txt or "" def tokenize(query: str) -> list[str]: - return [t for t in re.split(r"[^a-z0-9]+", query.lower()) - if t and t not in STOPWORDS and len(t) > 2] + return [t for t in re.split(r"[^a-z0-9]+", query.lower()) if t and t not in STOPWORDS and len(t) > 2] def rank_pages(matches: list[str], toks: list[str]) -> list[tuple[int, int, str, str]]: @@ -143,8 +150,7 @@ def main() -> int: print("NO_TEXT") return 0 - ranking = [{"doc_id": s[2], "page_number": s[1], "rank": i + 1} - for i, s in enumerate(scored[:TOP_K])] + ranking = [{"doc_id": s[2], "page_number": s[1], "rank": i + 1} for i, s in enumerate(scored[:TOP_K])] print(json.dumps({"ranking": ranking})) print("---TOP_PAGE_TEXT---") print(scored[0][3][:TOP_PAGE_TEXT_CHARS]) diff --git a/.claude/skills/nemo-retriever/scripts/grep_corpus.py b/.claude/skills/nemo-retriever/scripts/grep_corpus.py new file mode 100644 index 0000000000..1471b6e4c0 --- /dev/null +++ b/.claude/skills/nemo-retriever/scripts/grep_corpus.py @@ -0,0 +1,99 @@ +"""Case-insensitive keyword/regex search over the corpus via the LanceDB index. + +This script scans the already-built LanceDB table, so it returns matches +across every chunk `retriever ingest` indexed (text, table, chart, image +transcriptions where present) without re-reading any PDF. + +Usage: + /bin/python /scripts/grep_corpus.py \\ + [--max-hits 50] [--lancedb-uri ./lancedb] [--table-name nemo-retriever] + +`pattern` is a Python regex, case-insensitive. For a literal-string search, +just write the string — most identifier characters (`.`, `-`, `_`, digits, +letters) are unambiguous unless you include regex metacharacters +(`(`, `|`, `*`, `?`, `[`, `]`, `\\`, `^`, `$`). + +Output (one line per hit; sorted by pdf_basename then page_number): + :p:: ...... + +Prints `NO_MATCH` on zero hits. Caps at `--max-hits` to keep the turn output +bounded; raise it if you really want more. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("pattern", help="Python regex (case-insensitive)") + ap.add_argument("--max-hits", type=int, default=50) + ap.add_argument("--snippet-pad", type=int, default=60) + ap.add_argument("--lancedb-uri", default="./lancedb") + ap.add_argument("--table-name", default="nemo-retriever") + args = ap.parse_args() + + try: + import lancedb + except ImportError: + print("ERROR: lancedb not importable. Run with /bin/python.", file=sys.stderr) + return 1 + + try: + pat = re.compile(args.pattern, re.IGNORECASE) + except re.error as e: + print(f"ERROR: bad regex {args.pattern!r}: {e}", file=sys.stderr) + return 2 + + try: + db = lancedb.connect(args.lancedb_uri) + tbl = db.open_table(args.table_name) + except Exception as e: + print(f"ERROR: can't open lancedb table {args.table_name!r} at " f"{args.lancedb_uri!r}: {e}", file=sys.stderr) + return 1 + + rows = tbl.to_pandas() + if "text" not in rows.columns: + print(f"ERROR: lancedb table has no 'text' column. columns={list(rows.columns)}", file=sys.stderr) + return 1 + + hits = [] + for row in rows.itertuples(index=False): + text = getattr(row, "text", "") or "" + m = pat.search(text) + if not m: + continue + pdf = getattr(row, "pdf_basename", "?") + page = getattr(row, "page_number", "?") + meta_raw = getattr(row, "metadata", "") or "" + if isinstance(meta_raw, str): + try: + meta = json.loads(meta_raw) if meta_raw else {} + except json.JSONDecodeError: + meta = {} + elif isinstance(meta_raw, dict): + meta = meta_raw + else: + meta = {} + type_ = meta.get("type", "?") + start = max(0, m.start() - args.snippet_pad) + end = min(len(text), m.end() + args.snippet_pad) + snippet = text[start:end].replace("\n", " ") + hits.append((pdf, page, type_, snippet)) + + hits.sort(key=lambda h: (str(h[0]), int(h[1]) if isinstance(h[1], (int, float)) else 0)) + for pdf, page, type_, snippet in hits[: args.max_hits]: + print(f"{pdf}:p{page}:{type_}: ...{snippet}...") + if not hits: + print("NO_MATCH") + elif len(hits) > args.max_hits: + print(f"... ({len(hits) - args.max_hits} more matches truncated; " f"raise --max-hits to see them)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 700d52dbe1a2e4892911c84883fdf9ad94059c5e Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 28 May 2026 10:04:13 -0700 Subject: [PATCH 3/6] drop the text field from summary echo --- .claude/skills/nemo-retriever/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/skills/nemo-retriever/SKILL.md b/.claude/skills/nemo-retriever/SKILL.md index a282a8409c..f5e0dc5862 100644 --- a/.claude/skills/nemo-retriever/SKILL.md +++ b/.claude/skills/nemo-retriever/SKILL.md @@ -62,10 +62,10 @@ Read stdout. **Three outcomes:** ```bash /bin/retriever query "" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --rerank \ | tee /tmp/hits.json \ - | /bin/python -c "import json,sys; [print(f'rank={h.get(\"rank\",0)} page={h[\"page_number\"]} pdf={h[\"pdf_basename\"]} type={h.get(\"metadata\",{}).get(\"type\",\"?\")} text={h[\"text\"][:200]}') for h in json.load(sys.stdin)]" + | /bin/python -c "import json,sys; [print(f'rank={h.get(\"rank\",0)} page={h[\"page_number\"]} pdf={h[\"pdf_basename\"]} type={h.get(\"metadata\",{}).get(\"type\",\"?\")}') for h in json.load(sys.stdin)]" ``` -Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | /bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full JSON sits at `/tmp/hits.json` if you need to re-parse it (`/bin/python -c "import json; print(json.load(open('/tmp/hits.json'))[6])"` for the rank-7 hit), but in the common case the summary above is all you need. +Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | /bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The summary above lists only rank/page/pdf/type — to read hit text for synthesizing `final_answer`, parse `/tmp/hits.json` directly. The top hit's text is one one-liner away: `/bin/python -c "import json; print(json.load(open('/tmp/hits.json'))[0]['text'])"` (or `[i]` for the rank-(i+1) hit). Fetch only what you need — pulling all 10 hits' text into context inflates cached prompt size on every subsequent turn. This is your tool call when the fast path above printed `NO_MATCH` or `NO_TEXT`. Do not Read, Glob, Grep, or `ls` separately — the fast path's Bash call already listed `./pdfs/`, and `retriever query` indexes the content. From 4082e0446eeee10078e3f73c9b6124c0b9ec4a00 Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 28 May 2026 13:38:36 -0700 Subject: [PATCH 4/6] move regex search section --- skills/nemo-retriever/references/pitfalls.md | 12 ------------ skills/nemo-retriever/references/query.md | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/skills/nemo-retriever/references/pitfalls.md b/skills/nemo-retriever/references/pitfalls.md index 27711ac76e..9d43574871 100644 --- a/skills/nemo-retriever/references/pitfalls.md +++ b/skills/nemo-retriever/references/pitfalls.md @@ -15,18 +15,6 @@ Do NOT keep doing text-extract calls across many PDFs to hunt — that exhausts For an unlisted subcommand: `/bin/retriever --help`. -## Keyword/regex search across the corpus - -If you need exact text matches that semantic `retriever query` may have skipped — e.g. "find every mention of 'mRNA-1273' across all PDFs" — use: - -```bash -/bin/python /scripts/grep_corpus.py "" [--max-hits 50] -``` - -It scans the LanceDB table the retriever already built — no PDF re-extraction. Output is `:p:: ......` per hit; `NO_MATCH` if nothing. Counts against the same "one optional follow-up call" budget as the targeted text-extract (mutually exclusive — pick one). - -Don't reach for `pdftotext`, `pdftohtml`, or `pdfgrep` — they're system tools that aren't guaranteed installed on the user's machine. The retriever venv bundles pdfium and `lancedb`; `grep_corpus.py` and `retriever pdf stage page-elements --method pdfium` cover the same use cases without that dependency. - ## Failure modes (expected, not errors) - **First `ingest` takes ~60s+** — vLLM warmup. Expected. diff --git a/skills/nemo-retriever/references/query.md b/skills/nemo-retriever/references/query.md index 42e9a87021..fc98e08c52 100644 --- a/skills/nemo-retriever/references/query.md +++ b/skills/nemo-retriever/references/query.md @@ -19,10 +19,10 @@ If the user's question literally contains a PDF basename from `./pdfs/` (stem ```bash /bin/retriever query "" --top-k 10 --embed-model-name nvidia/llama-nemotron-embed-1b-v2 --rerank \ | tee /tmp/hits.json \ - | /bin/python -c "import json,sys; [print(f'rank={h.get(\"rank\",0)} page={h[\"page_number\"]} pdf={h[\"pdf_basename\"]} type={h.get(\"metadata\",{}).get(\"type\",\"?\")} text={h[\"text\"][:200]}') for h in json.load(sys.stdin)]" + | /bin/python -c "import json,sys; [print(f'rank={h.get(\"rank\",0)} page={h[\"page_number\"]} pdf={h[\"pdf_basename\"]} type={h.get(\"metadata\",{}).get(\"type\",\"?\")}') for h in json.load(sys.stdin)]" ``` -Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | /bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The full JSON sits at `/tmp/hits.json` if you need to re-parse it (`/bin/python -c "import json; print(json.load(open('/tmp/hits.json'))[6])"` for the rank-7 hit), but in the common case the summary above is all you need. +Run that **exactly** as a single pipeline — do not split it into `HITS=$(...)` + `echo "$HITS" | /bin/python -c ...` (the assignment swallows stdout, the pipe sees nothing, you waste 3 bash calls recovering). Stdout is clean JSON (model-init logs are silenced at the CLI layer); leave stderr unredirected so real errors surface on the first call. The summary above lists only rank/page/pdf/type — to read hit text for synthesizing `final_answer`, parse `/tmp/hits.json` directly. The top hit's text is one one-liner away: `/bin/python -c "import json; print(json.load(open('/tmp/hits.json'))[0]['text'])"` (or `[i]` for the rank-(i+1) hit). Fetch only what you need — pulling all 10 hits' text into context inflates cached prompt size on every subsequent turn. That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or list PDFs before this — those duplicate what `retriever query` already did. @@ -30,6 +30,18 @@ That's your FIRST tool call on every query turn. Do not Read, Glob, Grep, or lis Each hit has: `text`, `pdf_basename`, `page_number` (int, **1-indexed**: the first page of a PDF is page `1`), `pdf_page` (string composite key `"_"` — not a number, don't use it as one), `_distance`, and `metadata` (JSON with `type` ∈ `text|table|chart|image`). +## Keyword/regex search across the corpus + +If you need exact text matches that semantic `retriever query` may have skipped — e.g. "find every mention of 'mRNA-1273' across all PDFs" — use: + +```bash +/bin/python /scripts/grep_corpus.py "" [--max-hits 50] +``` + +It scans the LanceDB table the retriever already built — no PDF re-extraction. Output is `:p:: ......` per hit; `NO_MATCH` if nothing. Counts against the same "one optional follow-up call" budget as the targeted text-extract (mutually exclusive — pick one). + +Don't reach for `pdftotext`, `pdftohtml`, or `pdfgrep` — they're system tools that aren't guaranteed installed on the user's machine. The retriever venv bundles pdfium and `lancedb`; `grep_corpus.py` and `retriever pdf stage page-elements --method pdfium` cover the same use cases without that dependency. + ## Write `./output.json` directly from the hits - `final_answer`: synthesize from the top hits' `text`. Include the exact number / name / date / row / column the question asks for, plus the source PDF and 0-indexed page. One paragraph. No restating the question, no hedging caveats. If the chunks talk *around* the fact but don't state it, run ONE `/bin/retriever pdf stage page-elements ./pdfs --method pdfium --json-output-dir /tmp/pdf_text --compact-json` and `Read` `/tmp/pdf_text/.pdf.pdf_extraction.json` for the rank-1 page (or rank-2 if rank-1 is metadata) — that almost always surfaces the exact figure. Then synthesize. **If after both calls the asked-for fact still isn't in the evidence, write `final_answer` that says so explicitly** — e.g. "The retrieved pages do not state [X] for [entity]; the closest content is [Y]." Do NOT invent, extrapolate, or generate plausible-sounding content from adjacent material. A confidently-wrong answer scores worse than an honest "not in the retrieved pages". From 29c6a58d0d33bebe996a074e548aca0893b9e498 Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 28 May 2026 16:05:11 -0700 Subject: [PATCH 5/6] greptile fixes --- skills/nemo-retriever/references/query.md | 2 +- .../scripts/filename_fast_path.py | 23 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/skills/nemo-retriever/references/query.md b/skills/nemo-retriever/references/query.md index fc98e08c52..669c5da4f3 100644 --- a/skills/nemo-retriever/references/query.md +++ b/skills/nemo-retriever/references/query.md @@ -2,7 +2,7 @@ ## Filename fast path — try BEFORE `retriever query` -If the user's question literally contains a PDF basename from `./pdfs/` (stem ≥6 chars, with or without `.pdf`, case-insensitive), skip semantic search. Direct pdfium extraction on the named file is faster and avoids semantic-search misses — the right doc is given, and pages rank by query-token overlap. +If the user's question literally contains a PDF basename from `./pdfs/` **including the `.pdf` extension** (stem ≥6 chars, case-insensitive), skip semantic search. Direct pdfium extraction on the named file is faster and avoids semantic-search misses — the right doc is given, and pages rank by query-token overlap. ```bash /bin/python /scripts/filename_fast_path.py "" diff --git a/skills/nemo-retriever/scripts/filename_fast_path.py b/skills/nemo-retriever/scripts/filename_fast_path.py index f11bfd8223..47a2f78c27 100644 --- a/skills/nemo-retriever/scripts/filename_fast_path.py +++ b/skills/nemo-retriever/scripts/filename_fast_path.py @@ -1,8 +1,8 @@ """Query-turn filename fast path for the nemo-retriever skill. Reads `./pdfs/` from the current working directory. If the query string -literally contains any PDF basename (with or without the `.pdf` extension, -stem ≥6 chars, case-insensitive), runs `retriever pdf stage page-elements` +literally contains any PDF basename **including the `.pdf` extension** +(stem ≥6 chars, case-insensitive), runs `retriever pdf stage page-elements` on each matched file via pdfium, ranks pages by query-token frequency, and emits a top-10 ranking + the top page's raw text. @@ -47,14 +47,16 @@ def find_matches(query_lower: str, basenames: list[str]) -> list[str]: - """Return PDF basenames whose name (with or without .pdf) appears verbatim - in the lowercased query. Skip stems shorter than MIN_STEM_LEN.""" + """Return PDF basenames whose full name (including the `.pdf` extension) + appears verbatim in the lowercased query. Skip stems shorter than MIN_STEM_LEN. + Requiring the extension avoids false positives on common English words that + happen to appear as PDF stems (e.g. `report.pdf`, `market.pdf`).""" matches = [] for name in basenames: stem, ext = os.path.splitext(name) if ext.lower() != ".pdf" or len(stem) < MIN_STEM_LEN: continue - if name.lower() in query_lower or stem.lower() in query_lower: + if name.lower() in query_lower: matches.append(name) return matches @@ -83,7 +85,7 @@ def sidecar_path(pdf_name: str) -> str | None: stem = os.path.splitext(pdf_name)[0] candidates = ( f"{EXTRACT_OUT}/{pdf_name}.pdf_extraction.json", - f"{EXTRACT_OUT}/{stem}.pdf.pdf_extraction.json", + f"{EXTRACT_OUT}/{stem}.pdf_extraction.json", ) for c in candidates: if os.path.exists(c): @@ -92,7 +94,8 @@ def sidecar_path(pdf_name: str) -> str | None: def page_records(sidecar: str) -> list[dict]: - data = json.load(open(sidecar)) + with open(sidecar) as fh: + data = json.load(fh) if isinstance(data, list): return data if isinstance(data, dict): @@ -138,7 +141,11 @@ def main() -> int: ql = query.lower() retriever_bin = os.path.join(os.path.dirname(sys.executable), "retriever") - basenames = sorted(p for p in os.listdir(PDF_DIR) if p.lower().endswith(".pdf")) + try: + basenames = sorted(p for p in os.listdir(PDF_DIR) if p.lower().endswith(".pdf")) + except (FileNotFoundError, PermissionError) as exc: + print(f"ERROR: cannot list {PDF_DIR}: {exc}", file=sys.stderr) + return 1 matches = find_matches(ql, basenames) if not matches: print("NO_MATCH") From 4c9deb90c0500fcf2a719633e71ca9e31b31702c Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 28 May 2026 16:16:37 -0700 Subject: [PATCH 6/6] Wrapping the per-file call in a try/except --- .../scripts/filename_fast_path.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/skills/nemo-retriever/scripts/filename_fast_path.py b/skills/nemo-retriever/scripts/filename_fast_path.py index 47a2f78c27..33243912b3 100644 --- a/skills/nemo-retriever/scripts/filename_fast_path.py +++ b/skills/nemo-retriever/scripts/filename_fast_path.py @@ -62,23 +62,28 @@ def find_matches(query_lower: str, basenames: list[str]) -> list[str]: def extract_pages(retriever_bin: str, matches: list[str]) -> None: + """Extract each matched PDF; log per-file failures and continue so a single + bad PDF doesn't block remaining matches.""" os.makedirs(EXTRACT_OUT, exist_ok=True) for m in matches: - subprocess.run( - [ - retriever_bin, - "pdf", - "stage", - "page-elements", - f"{PDF_DIR}/{m}", - "--method", - "pdfium", - "--json-output-dir", - EXTRACT_OUT, - "--compact-json", - ], - check=True, - ) + try: + subprocess.run( + [ + retriever_bin, + "pdf", + "stage", + "page-elements", + f"{PDF_DIR}/{m}", + "--method", + "pdfium", + "--json-output-dir", + EXTRACT_OUT, + "--compact-json", + ], + check=True, + ) + except subprocess.CalledProcessError as exc: + print(f"WARN: page-elements failed on {m}: exit {exc.returncode}", file=sys.stderr) def sidecar_path(pdf_name: str) -> str | None: