From 584932633d14fbaf498da3bfd1bb5a2662583d73 Mon Sep 17 00:00:00 2001 From: drknowhow Date: Tue, 2 Jun 2026 06:37:11 -0400 Subject: [PATCH] feat(v0.2.0): port reference scholar + synthesis_doc_builder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports the Python reference implementation into the public repo with agent-runtime couplings stripped. lib/scholar.py - stdlib-urllib adapter over OpenAlex, Semantic Scholar, PubMed, arXiv, Europe PMC, Crossref, and Unpaywall. - Five actions: search, multi_search, get, find_doi, resolve_oa. - Uniform normalized hit schema across all sources. - Polite-pool contact email is no longer hard-coded: * configure(contact_email, app_name) sets module-global UA + mailto. * SCHOLAR_CONTACT_EMAIL env var honored at import time. * Without configuration, mailto params are omitted (APIs still work, polite-pool benefits forfeited). - Embedding-based dedup in multi_search is now pluggable via set_embedding_deduper(fn) — no hard dep on any embeddings module. Unregistered = no-op pass-through; hash dedup remains the safety net. lib/synthesis_doc_builder.py - python-docx + matplotlib helper that renders forest plot, PRISMA flow, stance heat-table, and assembles the .docx with native heading hierarchy + tables. - Drive upload decoupled behind a DI Uploader callable: Uploader = Callable[[Path, str, str], dict] build_synthesis_doc(inputs, *, uploader=None) Without an uploader the helper returns the local .docx path and uploaded=False; with one, doc_id / web_url come back populated. - matplotlib + python-docx remain soft imports; RuntimeError on use when missing, not ImportError at load. pyproject.toml - Installable package. Core deps: stdlib only. - [viz] extra: python-docx>=1.0, matplotlib>=3.7, numpy>=1.24. tests/ - test_scholar_smoke.py — 14 tests. urllib.request.urlopen monkey- patched with per-source fake responses. 
Confirms normalized hit schema, configure() mutates UA + email, find_doi best-match, OA flattening, dedup behavior. Network-free. - test_synthesis_doc_builder_smoke.py — 7 tests. tempfile.mkdtemp + pathlib.Path only (no \ separator literals — Linux CI safe). Asserts .docx valid zip with word/document.xml containing heading text + table tag; uploader called with correct args + name; uploader exception preserves local artifact. matplotlib/python-docx tests SkipTest cleanly when soft deps absent. Docs - README "What ships in v0.1.0" -> "What ships in v0.2.0"; new "What's new in v0.2.0" section; pip install "deep-research[viz]" in Quickstart. - SKILL.md: drops the "when ported in v0.2" hedge; bumps version: 0.2.0. - manifests/deep-research.v0.4.json: tool.version 0.1.1 -> 0.2.0; runtime.install.ref v0.1.0 -> v0.2.0; description rewritten to name the shipped library surface. - CHANGELOG.md: new file with 0.2.0, 0.1.1, 0.1.0 entries. Abstraction surfaces in one breath - Polite-pool: configure() + env var; no embedded contact details. - Embedding dedup: register a runtime-specific function or accept the no-op fallback. - Document upload: pass an Uploader or accept "local file only". - Soft deps: matplotlib + python-docx live behind [viz]; tests skip. 21/21 tests pass locally on Python 3.13 (Windows + matplotlib + python-docx installed). No surname references in the shipped tree; README.md keeps one upstream link to yepgent.com. 
--- CHANGELOG.md | 72 + README.md | 70 +- SKILL.md | 11 +- lib/__init__.py | 15 + lib/scholar.py | 1473 +++++++++++++++++++++ lib/synthesis_doc_builder.py | 657 +++++++++ manifests/deep-research.v0.4.json | 6 +- pyproject.toml | 43 + tests/__init__.py | 0 tests/test_scholar_smoke.py | 314 +++++ tests/test_synthesis_doc_builder_smoke.py | 191 +++ 11 files changed, 2823 insertions(+), 29 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 lib/__init__.py create mode 100644 lib/scholar.py create mode 100644 lib/synthesis_doc_builder.py create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_scholar_smoke.py create mode 100644 tests/test_synthesis_doc_builder_smoke.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..11b5026 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,72 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is loosely based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.2.0] - 2026-06-02 + +### Added +- `lib/scholar.py` — stdlib-urllib adapter over OpenAlex, Semantic Scholar, + PubMed, arXiv, Europe PMC, Crossref, and Unpaywall. Five actions: + `search`, `multi_search`, `get`, `find_doi`, `resolve_oa`. Returns a + uniform normalized hit schema across all sources. +- `lib/synthesis_doc_builder.py` — `python-docx` + `matplotlib` helper + that renders the synthesis doc (forest plot, PRISMA flow, stance + heat-table) with a pluggable upload callback. +- `pyproject.toml` — installable package. Core is stdlib-only; the + `[viz]` extra adds `python-docx`, `matplotlib`, and `numpy` for the + synthesis doc builder. +- `tests/test_scholar_smoke.py` — stdlib `unittest` coverage with + monkeypatched `urllib.request.urlopen`. Network-free. 
+- `tests/test_synthesis_doc_builder_smoke.py` — stdlib `unittest` + coverage; tests that require `python-docx` or `matplotlib` skip + cleanly when those soft deps are absent. + +### Changed +- `lib/scholar.py` exposes `configure(contact_email, app_name)` and + honors `SCHOLAR_CONTACT_EMAIL` env at import-time. Without it, + polite-pool `mailto=` params are omitted and a generic User-Agent + is sent. Embedding-based dedup in `multi_search` is now pluggable + via `set_embedding_deduper(fn)`; without a registered deduper, + embedding modes degrade to no-op. +- `lib/synthesis_doc_builder.py` decouples Drive/upload concerns + behind an `Uploader = Callable[[Path, str, str], dict]` parameter + on `build_synthesis_doc`. Without an uploader, the helper returns + the local `.docx` path and the result is `uploaded=False`. +- `manifests/deep-research.v0.4.json` bumps `tool.version` to `0.2.0`. +- `SKILL.md` drops the v0.1 hedge — `lib/synthesis_doc_builder.py` + is now part of the shipped reference implementation. +- `README.md` "What ships" tables updated for v0.2.0; "Quickstart" + shows the new `pip install "deep-research[viz]"` form. + +### Notes +- No runtime breaking changes for skill-bundle consumers — the SKILL + protocol surface is unchanged. The library additions are reference + code that orchestrators can wire into their own runtime. + +## [0.1.1] - 2026-06-01 + +### Fixed +- Redacted author surname from `LICENSE`, `README.md`, and the manifest + to comply with the project's public_identity rule. +- Removed `$schema` property from `manifests/deep-research.v0.4.json` + to satisfy `additionalProperties: false` in the v0.4 manifest schema. + +## [0.1.0] - 2026-06-01 + +### Added +- Initial public release. +- `SKILL.md`: full protocol, agent-runtime-agnostic. +- `schema/schema.sql` + `schema/schema_sqlite.sql`: three append-only + research tables. +- `agents/{scout,skeptic,methodologist,synthesizer,critic}.md`: role + prompts as plain text. 
+- `manifests/deep-research.v0.4.json`: install-manifest-spec v0.4 + declaration. +- `examples/cholesterol_primary_prevention/`: a real run, end to end. + +[0.2.0]: https://github.com/drknowhow/deep-research/releases/tag/v0.2.0 +[0.1.1]: https://github.com/drknowhow/deep-research/releases/tag/v0.1.1 +[0.1.0]: https://github.com/drknowhow/deep-research/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 17d324d..4575da6 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The output is a structured document with native heading hierarchy, effect-size table, forest plot, blockquoted citations, and a full reference list. It is meant to be defensible under adversarial re-read. -## What ships in v0.1.0 +## What ships in v0.2.0 | Path | What's in it | |---|---| @@ -46,20 +46,33 @@ reference list. It is meant to be defensible under adversarial re-read. | `agents/methodologist.md` | Pass-1 design-grading role prompt. | | `agents/synthesizer.md` | Phase 4 no-fabrication role prompt. | | `agents/critic.md` | Continuation mode `critique` role prompt. | +| `lib/scholar.py` | stdlib-urllib adapter over OpenAlex, Semantic Scholar, PubMed, arXiv, Europe PMC, Crossref, and Unpaywall. | +| `lib/synthesis_doc_builder.py` | `python-docx` + `matplotlib` helper that renders the synthesis doc (forest plot, PRISMA flow, stance heat-table) with a pluggable upload callback. | | `manifests/deep-research.v0.4.json` | install-manifest-spec v0.4 declaration. | | `examples/cholesterol_primary_prevention/` | A real run, end to end. | - -## What's NOT in v0.1.0 (lands in v0.2.0) - -- `lib/scholar.py` — a stdlib-urllib adapter over OpenAlex, Semantic - Scholar, PubMed, arXiv, Europe PMC, Crossref, and Unpaywall. -- `lib/synthesis_doc_builder.py` — `python-docx` + `matplotlib` helper - that renders the synthesis doc (forest plot, PRISMA flow, stance heat-table) - with a pluggable upload callback. - -A working reference implementation of both lives upstream in the Yep agent -codebase. 
Pointers in the SKILL.md text. v0.2.0 will extract them with -their agent-runtime couplings stripped. +| `tests/` | stdlib `unittest` smoke tests. Network-free. | + +## What's new in v0.2.0 + +- **`lib/scholar.py`** — six free academic APIs behind one normalized hit + schema. stdlib only. Configure the polite-pool contact email at install + time via `scholar.configure(contact_email=...)` or the + `SCHOLAR_CONTACT_EMAIL` env var. Embedding-based dedup in `multi_search` + is pluggable — wire your runtime's embeddings provider via + `scholar.set_embedding_deduper(fn)` or leave it unregistered and + fall back to DOI / title-hash dedup. +- **`lib/synthesis_doc_builder.py`** — decoupled upload via dependency + injection. Pass an `Uploader` callable + (`(local_path, name, mime_type) -> dict`) and the helper hands your + builder the local `.docx`; no uploader = no upload, you keep the file. + matplotlib + python-docx are soft deps gated behind the `[viz]` extra. +- **`pyproject.toml`** — installable package. Core is stdlib-only. +- **Tests** — `tests/test_scholar_smoke.py` and + `tests/test_synthesis_doc_builder_smoke.py`. Run with + `python -m unittest discover tests`. + +A working reference of both modules also lives upstream in the +[Yep agent](https://yepgent.com) codebase. ## Agent-runtime compatibility @@ -78,7 +91,16 @@ agent runtime provides. ## Quickstart -1. Pick your database: +1. Install (the `[viz]` extra pulls `python-docx` + `matplotlib` + `numpy` + for the synthesis doc builder; omit it if you only need the protocol + + scholar adapter): + ```bash + pip install "deep-research[viz]" + # or, from a clone: + pip install -e ".[viz]" + ``` + +2. Pick your database: ```bash # PostgreSQL psql "$DATABASE_URL" < schema/schema.sql @@ -87,23 +109,27 @@ agent runtime provides. sqlite3 deep_research.db < schema/schema_sqlite.sql ``` -2. Read `SKILL.md` end to end. The protocol is short. The gates are not +3. Read `SKILL.md` end to end. The protocol is short. 
The gates are not optional. -3. Pick a question. Draft the protocol JSON per the schema in `SKILL.md`. +4. Pick a question. Draft the protocol JSON per the schema in `SKILL.md`. Save it. Walk through Gate 1 with a human. -4. On approval, fan out the three Pass-1 subagents with the prompts in +5. On approval, fan out the three Pass-1 subagents with the prompts in `agents/`. Each writes its `research_searches` audit rows and - `research_evidence` candidate rows. + `research_evidence` candidate rows. Wire `lib.scholar.scholar(...)` + in for the search calls. -5. Roll up the corpus. Walk through Gate 2. Approve, revise, or abort. +6. Roll up the corpus. Walk through Gate 2. Approve, revise, or abort. -6. On approval, run Pass-2 retrieval. Stage extracted text. +7. On approval, run Pass-2 retrieval. Stage extracted text. -7. Run the Synthesizer. Read the hard-rules block at the top of +8. Run the Synthesizer. Read the hard-rules block at the top of `agents/synthesizer.md` first. The Synthesizer's job is to NOT make - anything up; that job is harder than it sounds. + anything up; that job is harder than it sounds. Use + `lib.synthesis_doc_builder.build_synthesis_doc(inputs, uploader=...)` + for the artifact — pass your own uploader for Drive/S3 wiring, or + omit it to keep the local `.docx`. ## Worked example: cholesterol primary prevention diff --git a/SKILL.md b/SKILL.md index f8772a8..db141c0 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,7 +1,7 @@ --- name: deep_research description: Protocol-first, gated, multi-agent literature investigation. Use when a question warrants citation-grade synthesis with traceable provenance — "what does the evidence say about X", "meta-analysis on Y", "systematic review of Z". Two human gates, four-role crew, no-fabrication enforced via verbatim quote spans. 
-version: 0.1.0 +version: 0.2.0 license: Apache-2.0 homepage: https://github.com/drknowhow/deep-research --- @@ -221,9 +221,12 @@ The synthesis ships as a structured document with: `— Author Year, DOI:...` in italic. - References list with full DOI per row. -`lib/synthesis_doc_builder.py` (when ported in v0.2) produces a `.docx` with -all of the above via `python-docx` + `matplotlib`. v0.1.0 ships the data -model and rendering spec; orchestrators wire in their own document builder. +`lib/synthesis_doc_builder.py` produces a `.docx` with all of the above +via `python-docx` + `matplotlib`. Install the optional `[viz]` extra +(`pip install "deep-research[viz]"`) and pass an `Uploader` callable — +`(local_path, name, mime_type) -> {"doc_id": ..., "web_url": ...}` — +to wire upload into your runtime. Without an uploader, the helper +returns the local `.docx` path and lets you ship it yourself. UPDATE `research_evidence` rows the synthesizer cites to set `supports_or_refutes` to the correct stance. diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..0f82f33 --- /dev/null +++ b/lib/__init__.py @@ -0,0 +1,15 @@ +"""deep-research reference implementation library. + +Two modules: + +- ``lib.scholar`` — stdlib-urllib adapter over OpenAlex, Semantic + Scholar, PubMed, arXiv, Europe PMC, Crossref, + and Unpaywall. +- ``lib.synthesis_doc_builder`` — python-docx + matplotlib helper that renders + the synthesis doc (forest plot, PRISMA flow, + stance heat-table) with a pluggable upload + callback. + +Both are agent-runtime-agnostic. Wire them into your own orchestration layer. +""" +__version__ = "0.2.0" diff --git a/lib/scholar.py b/lib/scholar.py new file mode 100644 index 0000000..2b1369a --- /dev/null +++ b/lib/scholar.py @@ -0,0 +1,1473 @@ +"""scholar — unified search adapter across six free academic APIs. + +Reference implementation for the ``deep_research`` protocol. One module, +five actions, six sources. 
Returns a uniform normalized hit schema across +all sources so subagents (Scout / Skeptic / Methodologist / Synthesizer) +don't have to learn six response shapes. + +Actions +------- +- ``search`` — one query against one source. +- ``multi_search`` — fan out the same query across N sources concurrently, + dedupe by DOI (fallback: title hash), return merged hits. +- ``get`` — fetch one record by source-native id (DOI/PMID/arxiv id/…). +- ``find_doi`` — best-match DOI for a paper title (uses openalex+crossref). +- ``resolve_oa`` — DOI -> Unpaywall OA-location lookup. + +Sources (all free, no API keys; "polite-pool" contact email is sent in +queries so the APIs queue you favourably — call :func:`configure` to set +your own contact email): + +- ``openalex`` — broadest metadata, citation graph, abstract-via-inverted-index. +- ``semantic_scholar`` — unique TLDR one-liners; citation counts; OA PDF links. +- ``pubmed`` — biomedical gold standard. esearch + esummary (no efetch in v1 + — abstract is left null and Pass-2 full-text retrieval handles it). +- ``arxiv`` — preprints. Atom XML parsed with stdlib xml.etree. +- ``europe_pmc`` — DOI/PMID/PMC IDs, abstracts, citation counts. +- ``crossref`` — DOI-first, good for verification. + +Returns +------- +On success: ``{"ok": True, "action": ..., "source": ..., "query": ..., "hits": [...], ...}``. +On failure: ``{"ok": False, "error": ..., ...}``. + +Normalized hit schema (uniform across all sources):: + + { + "id": "{source}:{source-native-id}", + "doi": "10.x/..." | None, + "title": str, + "authors": [str, ...], + "year": int | None, + "venue": str, + "abstract": str | None, + "citation_count": int | None, + "url": str, + "open_access_url": str | None, + "tldr": str | None, # semantic_scholar only + "source_tier_hint": 1 | 2 | 3 | None, + "raw": {original-source-response-fragment}, + } + +Design rules +------------ +- **stdlib urllib only** — no third-party HTTP deps. 
+- **Direct HTTPS, never proxy.** Routing scholarly traffic through a generic + HTTP proxy interferes with polite-pool routing — reach upstream APIs + directly. +- Per-request timeout: 15s. On socket.timeout / HTTP-5xx, one retry with 1s + backoff, then surface ``{ok: False, source, error}``. In ``multi_search`` + partial failure is fine — successful sources' hits + an ``errors`` list. +- Hard limits: query length 1000, per-source limit 100, raw response 5 MB + (``truncated=True`` flag when cut). +- Concurrent fan-out: ``concurrent.futures.ThreadPoolExecutor`` — stdlib + urllib is sync. + +Configuration +------------- +Set the polite-pool contact email and the User-Agent string via +:func:`configure` at import time, or via the ``SCHOLAR_CONTACT_EMAIL`` +environment variable. With no configuration the module sends a generic +User-Agent and omits the contact email from polite-pool params — APIs +still work but you forfeit the polite-pool benefits. + + from lib import scholar + scholar.configure(contact_email="me@example.org", app_name="my-agent") +""" + +from __future__ import annotations + +import concurrent.futures +import json +import os +import re +import socket +import time +import urllib.error +import urllib.parse +import urllib.request +import xml.etree.ElementTree as ET +from typing import Any, Optional + + +# --------------------------------------------------------------------------- +# Configuration (module-global; set via configure() or env) +# --------------------------------------------------------------------------- + +_DEFAULT_APP_NAME = "deep-research-scholar/1.0" +_USER_AGENT = _DEFAULT_APP_NAME +_CONTACT_EMAIL: Optional[str] = None + + +def configure(contact_email: Optional[str] = None, + app_name: Optional[str] = None) -> None: + """Set the polite-pool contact email and User-Agent string. + + Args: + contact_email: address sent in ``mailto=`` query params for the + APIs that support it (OpenAlex, Crossref, Unpaywall, PubMed). 
+ When None, keeps the prior value; an empty string clears it. + app_name: User-Agent string. When None, keeps the prior value + (or the default ``deep-research-scholar/1.0``). + + Calling configure() with no arguments leaves the current configuration + unchanged. Call it again later to update — module-global state is + mutated in place. + """ + global _USER_AGENT, _CONTACT_EMAIL + if contact_email is not None: + _CONTACT_EMAIL = contact_email.strip() or None + if app_name is not None: + _USER_AGENT = app_name.strip() or _DEFAULT_APP_NAME + + +# Honor SCHOLAR_CONTACT_EMAIL as a fallback at import-time. +_env_email = os.environ.get("SCHOLAR_CONTACT_EMAIL") +if _env_email: + _CONTACT_EMAIL = _env_email.strip() or None + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +REQUEST_TIMEOUT_S = 15 +RETRY_BACKOFF_S = 1.0 +MAX_QUERY_LEN = 1000 +MAX_PER_SOURCE_LIMIT = 100 +DEFAULT_LIMIT = 20 +MAX_RAW_BYTES = 5 * 1024 * 1024 # 5 MB + +SOURCES: tuple[str, ...] = ( + "openalex", + "semantic_scholar", + "pubmed", + "arxiv", + "europe_pmc", + "crossref", +) + +# Unpaywall (DOI -> OA-PDF resolver) — free, no key, polite-pool via email. +UNPAYWALL_BASE = "https://api.unpaywall.org/v2" + +# Valid values for multi_search's ``dedupe_mode`` parameter. +# - "hash": DOI / title-hash dedup only (the default). +# - "none": skip dedup — return the raw merged list. +# Embedding-based dedup ("embedding" / "both") is supported when a caller-provided embedding +# dedup function is wired in; see _dedupe_hits_with_embedding for the +# extension point. +_DEDUPE_MODES: tuple[str, ...] 
= ("hash", "embedding", "both", "none") + + +# --------------------------------------------------------------------------- +# Result envelope helpers +# --------------------------------------------------------------------------- + +def _ok(**payload: Any) -> dict: + return {"ok": True, **payload} + + +def _err(msg: str, **extra: Any) -> dict: + return {"ok": False, "error": msg, **extra} + + +# --------------------------------------------------------------------------- +# HTTP helper with timeout + single retry + byte cap. +# +# Returns (body_bytes, truncated_flag, error_message_or_None); errors are returned, not raised. +# Callers wrap a non-None error in an {ok: False} envelope so multi_search can collect +# partial results cleanly. +# --------------------------------------------------------------------------- + +def _http_get( + url: str, + *, + extra_headers: Optional[dict[str, str]] = None, + accept: str = "application/json", +) -> tuple[Optional[bytes], bool, Optional[str]]: + headers = {"User-Agent": _USER_AGENT, "Accept": accept} + if extra_headers: + headers.update(extra_headers) + + last_err: Optional[str] = None + for attempt in (1, 2): + try: + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_S) as resp: + # Cap the read to MAX_RAW_BYTES + 1 so we can detect overflow. + raw = resp.read(MAX_RAW_BYTES + 1) + truncated = len(raw) > MAX_RAW_BYTES + if truncated: + raw = raw[:MAX_RAW_BYTES] + return raw, truncated, None + except (socket.timeout, urllib.error.URLError) as e: + # urllib.error.HTTPError is a subclass of URLError — branch on + # whether the server responded with a 5xx (retryable) or not. 
+ if isinstance(e, urllib.error.HTTPError): + if e.code < 500: + return None, False, f"HTTP {e.code}: {e.reason}" + last_err = f"HTTP {e.code}: {e.reason}" + else: + last_err = f"{type(e).__name__}: {e}" + except Exception as e: # noqa: BLE001 + last_err = f"{type(e).__name__}: {e}" + + if attempt == 1: + time.sleep(RETRY_BACKOFF_S) + return None, False, last_err + + +# --------------------------------------------------------------------------- +# Validation helpers +# --------------------------------------------------------------------------- + +def _validate_query(query: Any) -> Optional[str]: + if not isinstance(query, str) or not query.strip(): + return "query must be a non-empty string" + if len(query) > MAX_QUERY_LEN: + return f"query exceeds {MAX_QUERY_LEN} chars" + return None + + +def _validate_source(source: Any) -> Optional[str]: + if source not in SOURCES: + return ( + f"unknown source {source!r}; available: {list(SOURCES)}" + ) + return None + + +def _validate_limit(limit: Any) -> tuple[int, Optional[str]]: + try: + n = int(limit) if limit is not None else DEFAULT_LIMIT + except (TypeError, ValueError): + return DEFAULT_LIMIT, "limit must be an integer" + if n < 1: + return DEFAULT_LIMIT, "limit must be >= 1" + if n > MAX_PER_SOURCE_LIMIT: + n = MAX_PER_SOURCE_LIMIT + return n, None + + +# --------------------------------------------------------------------------- +# Shared normalization helpers +# --------------------------------------------------------------------------- + +def _reconstruct_openalex_abstract(inverted: Optional[dict]) -> Optional[str]: + """Invert OpenAlex's abstract_inverted_index back into a string. + + The format is ``{"word": [position, ...]}`` — sort all (position, word) + pairs by position and join. Returns None when the input is empty/falsy. 
+ """ + if not inverted or not isinstance(inverted, dict): + return None + positions: list[tuple[int, str]] = [] + for word, idxs in inverted.items(): + if not isinstance(idxs, list): + continue + for idx in idxs: + if isinstance(idx, int): + positions.append((idx, word)) + if not positions: + return None + positions.sort() + return " ".join(w for _, w in positions) + + +def _normalize_doi(raw: Any) -> Optional[str]: + """Strip URL prefix and lowercase the registrant. None when not a DOI.""" + if not isinstance(raw, str): + return None + s = raw.strip() + if not s: + return None + s = re.sub(r"^https?://(?:dx\.)?doi\.org/", "", s, flags=re.IGNORECASE) + s = re.sub(r"^doi:\s*", "", s, flags=re.IGNORECASE) + if not s.startswith("10."): + return None + return s.lower() + + +def _title_hash_key(title: Optional[str]) -> Optional[str]: + """Normalized title for dedup: lowercase, alphanumeric only.""" + if not title: + return None + return re.sub(r"[^a-z0-9]+", "", title.lower()) or None + + +def _polite_email_param(extra: Optional[dict] = None) -> dict[str, str]: + """Inject ``mailto=`` into a params dict when the contact email is set.""" + params = dict(extra) if extra else {} + if _CONTACT_EMAIL: + params["mailto"] = _CONTACT_EMAIL + return params + + +# --------------------------------------------------------------------------- +# Per-source: OpenAlex +# --------------------------------------------------------------------------- + +OPENALEX_BASE = "https://api.openalex.org/works" + + +def _build_openalex_search_url(query: str, limit: int, filters: dict) -> str: + params = _polite_email_param({ + "search": query, + "per-page": str(limit), + }) + f_parts: list[str] = [] + if filters.get("date_from"): + f_parts.append(f"from_publication_date:{filters['date_from']}") + if filters.get("date_to"): + f_parts.append(f"to_publication_date:{filters['date_to']}") + if filters.get("open_access") is True: + f_parts.append("is_oa:true") + if f_parts: + params["filter"] = 
",".join(f_parts) + return f"{OPENALEX_BASE}?{urllib.parse.urlencode(params)}" + + +def _openalex_tier_hint(work: dict) -> Optional[int]: + """1 if journal article; 3 if review; 2 if posted-content (preprint).""" + work_type = (work.get("type") or "").lower() + if work_type == "review": + return 3 + if work_type in ("posted-content", "preprint"): + return 2 + loc = (work.get("primary_location") or {}).get("source") or {} + if (loc.get("type") or "").lower() == "journal" and work_type == "article": + return 1 + return None + + +def _normalize_openalex_work(work: dict) -> dict: + doi = _normalize_doi(work.get("doi")) + authorships = work.get("authorships") or [] + authors = [ + (a.get("author") or {}).get("display_name") + for a in authorships + if isinstance(a, dict) + ] + authors = [a for a in authors if a] + venue = "" + loc = work.get("primary_location") or {} + if isinstance(loc, dict): + src = loc.get("source") or {} + if isinstance(src, dict): + venue = src.get("display_name") or "" + oa = work.get("open_access") or {} + return { + "id": f"openalex:{(work.get('id') or '').rsplit('/', 1)[-1]}", + "doi": doi, + "title": work.get("title") or work.get("display_name") or "", + "authors": authors, + "year": work.get("publication_year"), + "venue": venue, + "abstract": _reconstruct_openalex_abstract( + work.get("abstract_inverted_index") + ), + "citation_count": work.get("cited_by_count"), + "url": work.get("id") or "", + "open_access_url": (oa.get("oa_url") if isinstance(oa, dict) else None), + "tldr": None, + "source_tier_hint": _openalex_tier_hint(work), + "raw": work, + } + + +def _search_openalex(query: str, limit: int, filters: dict) -> dict: + url = _build_openalex_search_url(query, limit, filters) + body, truncated, err = _http_get(url) + if err is not None: + return _err(err, source="openalex") + try: + data = json.loads(body or b"{}") + except json.JSONDecodeError as e: + return _err(f"json parse failed: {e}", source="openalex") + works = 
data.get("results") or [] + return _ok( + source="openalex", + total=(data.get("meta") or {}).get("count"), + hits=[_normalize_openalex_work(w) for w in works], + truncated=truncated, + ) + + +def _get_openalex(id_: str) -> dict: + """``id_`` accepts a bare openalex id (Wxxxxx), full URL, or DOI.""" + doi = _normalize_doi(id_) + if doi is not None: + ident = f"doi:{doi}" + elif id_.startswith("http"): + ident = id_.rsplit("/", 1)[-1] + else: + ident = id_ + suffix = f"?mailto={_CONTACT_EMAIL}" if _CONTACT_EMAIL else "" + url = f"{OPENALEX_BASE}/{urllib.parse.quote(ident, safe='')}{suffix}" + body, truncated, err = _http_get(url) + if err is not None: + return _err(err, source="openalex") + try: + work = json.loads(body or b"{}") + except json.JSONDecodeError as e: + return _err(f"json parse failed: {e}", source="openalex") + return _ok(source="openalex", hit=_normalize_openalex_work(work), truncated=truncated) + + +# --------------------------------------------------------------------------- +# Per-source: Semantic Scholar +# --------------------------------------------------------------------------- + +SS_BASE = "https://api.semanticscholar.org/graph/v1" +SS_FIELDS = ( + "title,abstract,authors,year,venue,externalIds," + "citationCount,tldr,openAccessPdf" +) + + +def _normalize_ss_paper(p: dict) -> dict: + external = p.get("externalIds") or {} + doi = _normalize_doi(external.get("DOI")) + authors = [ + a.get("name") for a in (p.get("authors") or []) + if isinstance(a, dict) and a.get("name") + ] + tldr = (p.get("tldr") or {}).get("text") if isinstance(p.get("tldr"), dict) else None + oa = p.get("openAccessPdf") or {} + oa_url = oa.get("url") if isinstance(oa, dict) else None + pid = p.get("paperId") or "" + return { + "id": f"semantic_scholar:{pid}", + "doi": doi, + "title": p.get("title") or "", + "authors": authors, + "year": p.get("year"), + "venue": p.get("venue") or "", + "abstract": p.get("abstract"), + "citation_count": p.get("citationCount"), + "url": 
f"https://www.semanticscholar.org/paper/{pid}" if pid else "", + "open_access_url": oa_url, + "tldr": tldr, + # We can't tell tier reliably from SS metadata alone; leave null. + "source_tier_hint": None, + "raw": p, + } + + +def _search_semantic_scholar(query: str, limit: int, filters: dict) -> dict: + params = {"query": query, "limit": str(limit), "fields": SS_FIELDS} + if filters.get("date_from") or filters.get("date_to"): + lo = filters.get("date_from") or "" + hi = filters.get("date_to") or "" + # SS uses "year" param with "YYYY:YYYY" range or "YYYY-". + lo_y = lo.split("-")[0] if lo else "" + hi_y = hi.split("-")[0] if hi else "" + params["year"] = f"{lo_y}-{hi_y}".strip("-") or "" + if not params["year"]: + params.pop("year") + if filters.get("open_access") is True: + params["openAccessPdf"] = "" + url = f"{SS_BASE}/paper/search?{urllib.parse.urlencode(params)}" + body, truncated, err = _http_get(url) + if err is not None: + return _err(err, source="semantic_scholar") + try: + data = json.loads(body or b"{}") + except json.JSONDecodeError as e: + return _err(f"json parse failed: {e}", source="semantic_scholar") + papers = data.get("data") or [] + return _ok( + source="semantic_scholar", + total=data.get("total"), + hits=[_normalize_ss_paper(p) for p in papers], + truncated=truncated, + ) + + +def _get_semantic_scholar(id_: str) -> dict: + doi = _normalize_doi(id_) + if doi is not None: + ident = f"DOI:{doi}" + elif id_.lower().startswith("pmid:") or id_.isdigit(): + # SS supports PMID:nnn lookup + ident = id_ if ":" in id_ else f"PMID:{id_}" + elif id_.lower().startswith("arxiv:"): + ident = id_ # SS accepts ARXIV:xxx but also arxiv:xxx + else: + ident = id_ + url = f"{SS_BASE}/paper/{urllib.parse.quote(ident, safe=':')}?fields={SS_FIELDS}" + body, truncated, err = _http_get(url) + if err is not None: + return _err(err, source="semantic_scholar") + try: + paper = json.loads(body or b"{}") + except json.JSONDecodeError as e: + return _err(f"json parse failed: 
{e}", source="semantic_scholar") + return _ok( + source="semantic_scholar", + hit=_normalize_ss_paper(paper), + truncated=truncated, + ) + + +# --------------------------------------------------------------------------- +# Per-source: PubMed (esearch + esummary; no efetch in v1) +# --------------------------------------------------------------------------- + +PUBMED_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" + + +def _pubmed_polite_params() -> dict[str, str]: + params = {"tool": "deep-research-scholar"} + if _CONTACT_EMAIL: + params["email"] = _CONTACT_EMAIL + return params + + +def _normalize_pubmed_summary(pmid: str, s: dict) -> dict: + authors = [ + a.get("name") for a in (s.get("authors") or []) + if isinstance(a, dict) and a.get("name") + ] + article_ids = s.get("articleids") or [] + doi = None + for aid in article_ids: + if isinstance(aid, dict) and (aid.get("idtype") or "").lower() == "doi": + doi = _normalize_doi(aid.get("value")) + if doi: + break + year: Optional[int] = None + pub_date = s.get("pubdate") or "" + m = re.match(r"(\d{4})", pub_date) + if m: + try: + year = int(m.group(1)) + except ValueError: + year = None + return { + "id": f"pubmed:{pmid}", + "doi": doi, + "title": s.get("title") or "", + "authors": authors, + "year": year, + "venue": s.get("fulljournalname") or s.get("source") or "", + # v1 deliberately skips efetch (XML abstract retrieval); the Pass-2 + # full-text step will populate abstract via europe_pmc / openalex. + "abstract": None, + "citation_count": None, + "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/", + "open_access_url": None, + "tldr": None, + "source_tier_hint": 1, # PubMed-indexed = peer-reviewed by default. 
+ "raw": s, + } + + +def _search_pubmed(query: str, limit: int, filters: dict) -> dict: + esearch_params = { + "db": "pubmed", + "term": query, + "retmode": "json", + "retmax": str(limit), + **_pubmed_polite_params(), + } + if filters.get("date_from") or filters.get("date_to"): + # Use mindate/maxdate (YYYY/MM/DD) with date_type=pdat by convention. + if filters.get("date_from"): + esearch_params["mindate"] = filters["date_from"].replace("-", "/") + if filters.get("date_to"): + esearch_params["maxdate"] = filters["date_to"].replace("-", "/") + esearch_params["datetype"] = "pdat" + esearch_url = f"{PUBMED_BASE}/esearch.fcgi?{urllib.parse.urlencode(esearch_params)}" + body, truncated_es, err = _http_get(esearch_url) + if err is not None: + return _err(err, source="pubmed") + try: + es_data = json.loads(body or b"{}") + except json.JSONDecodeError as e: + return _err(f"esearch json parse failed: {e}", source="pubmed") + es_result = es_data.get("esearchresult") or {} + pmids = es_result.get("idlist") or [] + total_raw = es_result.get("count") + try: + total = int(total_raw) if total_raw is not None else None + except (TypeError, ValueError): + total = None + if not pmids: + return _ok(source="pubmed", total=total, hits=[], truncated=truncated_es) + + esummary_params = { + "db": "pubmed", + "id": ",".join(pmids), + "retmode": "json", + **_pubmed_polite_params(), + } + esummary_url = f"{PUBMED_BASE}/esummary.fcgi?{urllib.parse.urlencode(esummary_params)}" + body2, truncated_su, err2 = _http_get(esummary_url) + if err2 is not None: + return _err(err2, source="pubmed") + try: + su_data = json.loads(body2 or b"{}") + except json.JSONDecodeError as e: + return _err(f"esummary json parse failed: {e}", source="pubmed") + result = su_data.get("result") or {} + hits = [] + for pmid in pmids: + s = result.get(pmid) + if isinstance(s, dict): + hits.append(_normalize_pubmed_summary(pmid, s)) + return _ok( + source="pubmed", + total=total, + hits=hits, + truncated=(truncated_es or 
def _normalize_arxiv_entry(entry: ET.Element) -> dict:
    """Convert one Atom ``<entry>`` from the arXiv API into a normalized hit.

    Args:
        entry: an ``<entry>`` element; children are looked up through the
            ``ARXIV_NS`` prefix map.

    Returns:
        The normalized hit dict. ``venue`` falls back to ``"arXiv"`` when no
        primary category is present, and ``raw`` keeps only the arXiv id,
        primary category and ``published`` string.
    """
    def _text(path: str) -> str:
        el = entry.find(path, ARXIV_NS)
        return (el.text or "").strip() if el is not None and el.text else ""

    def _flat(path: str) -> str:
        # Atom text is hard-wrapped with newline + indentation; collapse all
        # whitespace runs so a wrapped title does not keep stray spaces.
        return " ".join(_text(path).split())

    arxiv_url = _text("a:id")
    if "/abs/" in arxiv_url:
        # Keep everything after "/abs/" so old-style identifiers retain
        # their archive prefix ("hep-th/9901001v1", not "9901001v1").
        arxiv_id = arxiv_url.split("/abs/", 1)[1]
    else:
        arxiv_id = arxiv_url.rsplit("/", 1)[-1] if arxiv_url else ""

    title = _flat("a:title")
    summary = _flat("a:summary")
    published = _text("a:published")
    year: Optional[int] = None
    if len(published) >= 4 and published[:4].isdigit():
        year = int(published[:4])

    authors = [
        a.text.strip()
        for a in entry.findall("a:author/a:name", ARXIV_NS)
        if a.text
    ]

    doi_el = entry.find("arxiv:doi", ARXIV_NS)
    doi = _normalize_doi(doi_el.text) if doi_el is not None and doi_el.text else None

    # The category is carried in the ``term`` attribute; the primary one is
    # used as a venue surrogate so the field is never empty.
    primary_cat = entry.find("arxiv:primary_category", ARXIV_NS)
    primary_term = primary_cat.get("term") if primary_cat is not None else None
    venue = primary_term or "arXiv"

    return {
        "id": f"arxiv:{arxiv_id}",
        "doi": doi,
        "title": title,
        "authors": authors,
        "year": year,
        "venue": venue,
        "abstract": summary or None,
        "citation_count": None,  # arXiv API doesn't expose citations.
        "url": arxiv_url or (f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else ""),
        "open_access_url": (
            f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else None
        ),
        "tldr": None,
        "source_tier_hint": 2,  # arXiv = preprint by definition.
        "raw": {
            "arxiv_id": arxiv_id,
            "primary_category": primary_term,
            "published": published,
        },
    }
def _get_arxiv(id_: str) -> dict:
    """Fetch a single arXiv record via the API's ``id_list`` parameter.

    Accepts ``"arxiv:2401.12345v1"``, a bare id (``"2401.12345"`` or the
    old-style ``"hep-th/9901001"``), or a full ``arxiv.org`` abs/pdf URL.

    Returns:
        ``_ok(hit=...)`` with the first returned entry normalized, or
        ``_err(...)`` on transport failure, Atom parse failure, or when the
        feed contains no entry.
    """
    raw_id = re.sub(r"^arxiv:", "", id_, flags=re.IGNORECASE)
    url_match = re.search(
        r"arxiv\.org/(?:abs|pdf)/(.+)$", raw_id, flags=re.IGNORECASE
    )
    if url_match:
        # Take everything after /abs/ or /pdf/ so an old-style archive
        # prefix survives; a trailing ".pdf" is not part of the identifier.
        raw_id = re.sub(r"\.pdf$", "", url_match.group(1), flags=re.IGNORECASE)
    elif "://" in raw_id:
        # Some other URL shape: fall back to its last path segment.
        raw_id = raw_id.rsplit("/", 1)[-1]
    # A bare id is passed through untouched. Splitting it on "/" would
    # truncate "hep-th/9901001" to "9901001".

    params = {"id_list": raw_id, "max_results": "1"}
    url = f"{ARXIV_BASE}?{urllib.parse.urlencode(params)}"
    body, truncated, err = _http_get(url, accept="application/atom+xml")
    if err is not None:
        return _err(err, source="arxiv")
    try:
        root = ET.fromstring(body or b"")
    except ET.ParseError as e:
        return _err(f"atom parse failed: {e}", source="arxiv")
    entries = root.findall("a:entry", ARXIV_NS)
    if not entries:
        return _err(f"arxiv id {raw_id!r} not found", source="arxiv")
    return _ok(
        source="arxiv",
        hit=_normalize_arxiv_entry(entries[0]),
        truncated=truncated,
    )
def _search_europe_pmc(query: str, limit: int, filters: dict) -> dict:
    """Run a Europe PMC ``/search`` request and normalize the results.

    ``filters`` may carry ``date_from`` / ``date_to`` (a missing bound
    defaults to 1900-01-01 / 2099-12-31) and ``open_access`` (applied only
    when it is exactly ``True``). Both are folded into the query expression
    rather than sent as separate parameters.
    """
    date_from = filters.get("date_from")
    date_to = filters.get("date_to")

    expr = query
    if date_from or date_to:
        start = date_from or "1900-01-01"
        end = date_to or "2099-12-31"
        expr = f"({expr}) AND (FIRST_PDATE:[{start} TO {end}])"
    if filters.get("open_access") is True:
        expr = f"({expr}) AND OPEN_ACCESS:Y"

    query_string = urllib.parse.urlencode({
        "query": expr,
        "format": "json",
        "pageSize": str(limit),
        "resultType": "core",
    })
    body, truncated, err = _http_get(f"{EUROPE_PMC_BASE}/search?{query_string}")
    if err is not None:
        return _err(err, source="europe_pmc")

    try:
        payload = json.loads(body or b"{}")
    except json.JSONDecodeError as e:
        return _err(f"json parse failed: {e}", source="europe_pmc")

    records = (payload.get("resultList") or {}).get("result") or []
    return _ok(
        source="europe_pmc",
        total=payload.get("hitCount"),
        hits=[_normalize_europe_pmc_result(rec) for rec in records],
        truncated=truncated,
    )
def _normalize_crossref_item(it: dict) -> dict:
    """Convert one Crossref work record into the normalized hit dict.

    Title and venue are the first elements of the ``title`` and
    ``container-title`` arrays. Each author is rendered as "given family",
    falling back to ``name``. The year is the first component of
    ``issued.date-parts``. ``source_tier_hint`` is derived from the Crossref
    ``type`` field.
    """
    doi = _normalize_doi(it.get("DOI"))

    titles = it.get("title") or []
    title = titles[0] if titles else ""

    authors = []
    for person in it.get("author") or []:
        if not isinstance(person, dict):
            continue
        pieces = [person[key] for key in ("given", "family") if person.get(key)]
        if pieces:
            authors.append(" ".join(pieces))
        elif person.get("name"):
            authors.append(person["name"])

    containers = it.get("container-title") or []
    venue = containers[0] if containers else ""

    # issued.date-parts is the most reliable place to find the year.
    year = None
    date_parts = ((it.get("issued") or {}).get("date-parts")) or []
    if date_parts and isinstance(date_parts[0], list) and date_parts[0]:
        try:
            year = int(date_parts[0][0])
        except (TypeError, ValueError):
            year = None

    tier_by_type = {
        "journal-article": 1,
        "posted-content": 2,
        "preprint": 2,
        "review-article": 3,
    }
    tier_hint = tier_by_type.get((it.get("type") or "").lower())

    if doi:
        url = f"https://doi.org/{doi}"
    else:
        url = it.get("URL") or ""

    return {
        "id": f"crossref:{doi or ''}",
        "doi": doi,
        "title": title,
        "authors": authors,
        "year": year,
        "venue": venue,
        # Some records carry the abstract as a JATS XML string; it is passed
        # through untouched and the consumer can strip the tags.
        "abstract": it.get("abstract"),
        "citation_count": it.get("is-referenced-by-count"),
        "url": url,
        "open_access_url": None,
        "tldr": None,
        "source_tier_hint": tier_hint,
        "raw": it,
    }
def _get_crossref(id_: str) -> dict:
    """Fetch a single Crossref work by DOI.

    Args:
        id_: a DOI in any form ``_normalize_doi`` understands; when it
            cannot be normalized the raw string is used as the path segment.

    Returns:
        ``_ok(hit=...)`` with the normalized ``message`` object, or
        ``_err(...)`` on transport failure, JSON-parse failure, or a
        response without a ``message`` object.
    """
    doi = _normalize_doi(id_) or id_
    url = f"{CROSSREF_BASE}/{urllib.parse.quote(doi, safe='/')}"
    if _CONTACT_EMAIL:
        # Encode the address like _search_crossref does via urlencode; a raw
        # "+" or "&" in it would otherwise corrupt the query string.
        url += "?" + urllib.parse.urlencode({"mailto": _CONTACT_EMAIL})
    body, truncated, err = _http_get(url)
    if err is not None:
        return _err(err, source="crossref")
    try:
        data = json.loads(body or b"{}")
    except json.JSONDecodeError as e:
        return _err(f"json parse failed: {e}", source="crossref")
    msg = data.get("message")
    if not isinstance(msg, dict):
        return _err(f"crossref id {id_!r} not found", source="crossref")
    return _ok(source="crossref", hit=_normalize_crossref_item(msg), truncated=truncated)
res.setdefault("action", "search") + res.setdefault("query", p["query"]) + return res + + +# --------------------------------------------------------------------------- +# Action: multi_search (concurrent fan-out + dedupe) +# --------------------------------------------------------------------------- + +def _dedupe_hits(hits: list[dict]) -> list[dict]: + """Dedupe by DOI when present, else by normalized-title hash. + + Preserves first-occurrence order. When a duplicate is encountered, we + record the duplicate's source in the kept hit's ``also_found_in`` list + so multi_search can show cross-source corroboration without dropping + the audit trail. + """ + seen: dict[str, int] = {} + out: list[dict] = [] + for h in hits: + key = ( + f"doi:{h.get('doi')}" if h.get("doi") else + (f"title:{_title_hash_key(h.get('title'))}" if h.get("title") else None) + ) + if key is None: + out.append(h) + continue + if key in seen: + idx = seen[key] + also = out[idx].setdefault("also_found_in", []) + src = (h.get("id") or "").split(":", 1)[0] + if src and src not in also: + also.append(src) + continue + seen[key] = len(out) + out.append(h) + return out + + +def _dedupe_key_title(h: dict) -> Optional[str]: + """Embedding key for a hit: title + first-author surname when available. + + The first author disambiguates papers with near-identical titles + (review vs original) — embedding sees the author as part of the + signature, not just the topic words. + """ + title = h.get("title") + if not isinstance(title, str) or not title.strip(): + return None + authors = h.get("authors") or [] + first = "" + if isinstance(authors, list) and authors: + a0 = authors[0] + if isinstance(a0, str): + first = a0.strip() + if first: + return f"{title.strip()} | {first}" + return title.strip() + + +# Pluggable embedding-dedup hook. Default is a no-op pass-through. Wire a +# real implementation via ``set_embedding_deduper`` if your runtime offers +# an embeddings provider. 
def _dedupe_hits_with_embedding(
    hits: list[dict],
    threshold: float,
) -> list[dict]:
    """Fold near-duplicate hits through the registered embedding deduper.

    When no deduper is registered, or the registered one raises, a shallow
    copy of ``hits`` is returned unchanged. The hash-based pass is the
    safety net, so failures here are deliberately non-fatal.
    """
    deduper = _embedding_deduper
    if deduper is not None:
        try:
            return deduper(hits, threshold=threshold, key_fn=_dedupe_key_title)
        except Exception:  # noqa: BLE001
            # Best-effort by design: fall through to the pass-through copy.
            pass
    return list(hits)
+ raw_mode = p.get("dedupe_mode") + if raw_mode is None: + raw_mode = "hash" if p.get("dedupe", True) else "none" + if raw_mode not in _DEDUPE_MODES: + return _err( + f"unknown dedupe_mode {raw_mode!r}; " + f"valid: {list(_DEDUPE_MODES)}" + ) + embedding_threshold = p.get("embedding_threshold", 0.88) + if not isinstance(embedding_threshold, (int, float)): + return _err("embedding_threshold must be a number in [-1.0, 1.0]") + if not (-1.0 <= float(embedding_threshold) <= 1.0): + return _err( + f"embedding_threshold must be in [-1.0, 1.0], " + f"got {embedding_threshold!r}" + ) + + results: dict[str, dict] = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=len(sources)) as ex: + futures = { + ex.submit(_SEARCH_DISPATCH[s], p["query"], limit, filters): s + for s in sources + } + for fut in concurrent.futures.as_completed(futures): + s = futures[fut] + try: + results[s] = fut.result() + except Exception as e: # noqa: BLE001 + results[s] = _err(f"{type(e).__name__}: {e}", source=s) + + merged: list[dict] = [] + per_source: dict[str, dict] = {} + errors: list[dict] = [] + for s in sources: + r = results.get(s) or {"ok": False, "error": "no result"} + per_source[s] = { + "ok": bool(r.get("ok")), + "hit_count": len(r.get("hits") or []), + "total": r.get("total"), + "error": r.get("error"), + "truncated": r.get("truncated", False), + } + if r.get("ok"): + merged.extend(r.get("hits") or []) + else: + errors.append({"source": s, "error": r.get("error")}) + + if raw_mode == "hash": + merged = _dedupe_hits(merged) + elif raw_mode == "embedding": + merged = _dedupe_hits_with_embedding(merged, float(embedding_threshold)) + elif raw_mode == "both": + merged = _dedupe_hits(merged) + merged = _dedupe_hits_with_embedding(merged, float(embedding_threshold)) + # "none" → no-op + + return _ok( + action="multi_search", + query=p["query"], + sources=list(sources), + hits=merged, + per_source=per_source, + errors=errors, + dedupe_mode=raw_mode, + 
embedding_threshold=float(embedding_threshold), + ) + + +# --------------------------------------------------------------------------- +# Action: get +# --------------------------------------------------------------------------- + +def _action_get(p: dict) -> dict: + source = p.get("source") + err = _validate_source(source) + if err: + return _err(err) + id_ = p.get("id") + if not isinstance(id_, str) or not id_.strip(): + return _err("id must be a non-empty string", source=source) + res = _GET_DISPATCH[source](id_.strip()) + res.setdefault("action", "get") + return res + + +# --------------------------------------------------------------------------- +# Action: find_doi (title-search across openalex + crossref, best-match score) +# --------------------------------------------------------------------------- + +def _normalize_title_for_match(s: Optional[str]) -> str: + if not isinstance(s, str): + return "" + return re.sub(r"[^a-z0-9 ]+", "", s.lower()).strip() + + +def _title_match_score(query_title: str, candidate_title: str) -> float: + """Crude Jaccard-on-tokens. Good enough to discriminate match vs no-match. + + Range [0, 1]. We avoid pulling in rapidfuzz/Levenshtein to stay stdlib. 
def _action_find_doi(p: dict) -> dict:
    """Resolve a title (and optional year) to its most likely DOI.

    OpenAlex and Crossref are queried concurrently for up to five hits each.
    Every hit that carries a DOI is scored with ``_title_match_score``, and
    the score is halved when both years are known and differ by more than
    one. The success envelope reports the best candidate, the top five
    candidates and any per-source errors. With no candidates, ``doi`` is
    ``None`` and ``confidence`` is ``0.0``.
    """
    title = p.get("title")
    if not (isinstance(title, str) and title.strip()):
        return _err("title must be a non-empty string")
    if len(title) > MAX_QUERY_LEN:
        return _err(f"title exceeds {MAX_QUERY_LEN} chars")

    raw_year = p.get("year")
    year_int = None
    if raw_year is not None:
        try:
            year_int = int(raw_year)
        except (TypeError, ValueError):
            year_int = None

    # Leaving the ``with`` block waits for both lookups to finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        pending = [
            ("openalex", pool.submit(_search_openalex, title, 5, {})),
            ("crossref", pool.submit(_search_crossref, title, 5, {})),
        ]

    candidates: list[dict] = []
    errors: list[dict] = []
    for src, fut in pending:
        try:
            outcome = fut.result()
        except Exception as e:  # noqa: BLE001
            errors.append({"source": src, "error": f"{type(e).__name__}: {e}"})
            continue
        if not outcome.get("ok"):
            errors.append({"source": src, "error": outcome.get("error")})
            continue
        for hit in outcome.get("hits") or []:
            if not hit.get("doi"):
                continue
            score = _title_match_score(title, hit.get("title") or "")
            both_years_known = year_int is not None and hit.get("year") is not None
            if both_years_known and abs(hit["year"] - year_int) > 1:
                score *= 0.5  # year mismatch is a strong penalty
            candidates.append({
                "doi": hit["doi"],
                "source": src,
                "score": round(score, 4),
                "title": hit.get("title"),
                "year": hit.get("year"),
            })

    if not candidates:
        return _ok(action="find_doi", doi=None, confidence=0.0,
                   candidates=[], errors=errors)

    ranked = sorted(candidates, key=lambda c: c["score"], reverse=True)
    top = ranked[0]
    return _ok(
        action="find_doi",
        doi=top["doi"],
        confidence=top["score"],
        best=top,
        candidates=ranked[:5],
        errors=errors,
    )
(Unpaywall DOI -> Open-Access PDF/HTML) +# --------------------------------------------------------------------------- + +def _normalize_oa_location(loc: dict) -> dict: + """Flatten an Unpaywall ``oa_locations[]`` entry to the fields we care + about in Pass-2 (full-text retrieval). + """ + return { + "url": loc.get("url"), + "url_for_pdf": loc.get("url_for_pdf"), + "url_for_landing": loc.get("url_for_landing_page"), + "host_type": loc.get("host_type"), # publisher | repository + "version": loc.get("version"), # publishedVersion | acceptedVersion | submittedVersion + "license": loc.get("license"), + "is_best": bool(loc.get("is_best")), + "evidence": loc.get("evidence"), + "repository_institution": loc.get("repository_institution"), + } + + +def _action_resolve_oa(p: dict) -> dict: + """DOI -> Unpaywall OA-location lookup. + + Params:: + + { + "doi": "10.x/y" | "https://doi.org/10.x/y" | "doi:10.x/y", + "email": "" (required if configure() was + not called with a contact email; Unpaywall requires it) + } + + Returns success envelope with ``is_oa``, ``oa_status``, + ``best_oa_location`` (flattened), and ``oa_locations`` (list of + flattened entries) — these feed the deep_research skill's Pass-2 + full-text retrieval step. + """ + raw_doi = p.get("doi") + doi = _normalize_doi(raw_doi) + if not doi: + return _err("doi must look like 10.x/y", source="unpaywall") + email = p.get("email") or _CONTACT_EMAIL + if not isinstance(email, str) or not email.strip(): + return _err( + "email must be a non-empty string (or configure a contact email " + "via scholar.configure(contact_email=...))", + source="unpaywall", + ) + email = email.strip() + + url = ( + f"{UNPAYWALL_BASE}/{urllib.parse.quote(doi, safe='/:')}" + f"?email={urllib.parse.quote(email)}" + ) + body, truncated, err = _http_get(url, extra_headers={}, accept="application/json") + if err is not None: + # _http_get returns STRING errors like "HTTP 404: Not Found". 
def scholar(action: str, params: Optional[dict] = None) -> dict:
    """Composite ``scholar`` tool entry point.

    Args:
        action: name of the handler to run; must be a key of ``_ACTIONS``.
        params: action-specific dict, passed to the handler (``None`` is
            treated as an empty dict).

    Returns:
        ``{"ok": True, ...}`` on success, ``{"ok": False, "error": ...}``
        for an unknown action or when the handler raises. Handler exceptions
        are converted to error envelopes, not propagated.
    """
    handler = _ACTIONS.get(action)
    if handler is not None:
        try:
            return handler(params or {})
        except Exception as e:  # noqa: BLE001
            return _err(f"{type(e).__name__}: {e}", action=action)
    return _err(
        f"unknown action {action!r}",
        available=sorted(_ACTIONS.keys()),
    )
@dataclass
class CitedEvidence:
    """One cited row from research_evidence.

    Attributes:
        study_label: short study name, e.g. "Smith et al.".
        year: publication year, e.g. 2026.
        design: study design, e.g. "Meta-analysis (10 RCTs, n=85,829)".
        outcome: outcome measured, e.g. "All-cause mortality".
        estimate_str: formatted effect estimate with interval,
            e.g. "RR 0.98 (0.89-1.07)".
        population: population studied, e.g. "Strict primary prevention".
        doi: DOI of the source, e.g. "10.x/example".
        quote_span: verbatim quote from the source.
        is_acm_no_benefit: when True the table cell is shaded in the
            warning colour.
    """

    study_label: str
    year: int
    design: str
    outcome: str
    estimate_str: str
    population: str
    doi: str
    quote_span: str
    is_acm_no_benefit: bool = False
Rows = stances, columns = tiers.""" + # Order: stances ["background","supports","refutes","mixed"]; + # tiers ["Tier 1","Tier 2","Tier 3","Tier 4"] + stances: list[str] = field(default_factory=lambda: ["background", "supports", "refutes", "mixed"]) + tiers: list[str] = field(default_factory=lambda: ["Tier 1\nmeta-analysis / RCT", "Tier 2", "Tier 3", "Tier 4"]) + counts: list[list[int]] = field(default_factory=list) # 4 rows × 4 cols + + +@dataclass +class PriorVersion: + """One earlier synthesis revision, surfaced in the lineage block.""" + doc_id: str + doc_url: str + headline: Optional[str] = None + completed_at: Optional[str] = None # iso timestamp string + mode: str = "original" # original | refresh | deepen | rescope | critique + + +@dataclass +class SynthesisInputs: + """Everything the helper needs to produce a synthesis doc.""" + project_id: str # research_projects.id + project_slug: str # short identifier + title: str # main doc title + subtitle: str # one-line subtitle (italic) + question: str # verbatim PICO question + headline_paragraphs: list[str] # 2–4 paragraphs of headline answer + calibrated_bottom_line: str # one-sentence calibrated takeaway + cited: list[CitedEvidence] # for the effect-size table + forest_acm: list[ForestRow] # top panel of forest plot + forest_mace: list[ForestRow] = field(default_factory=list) # bottom panel (optional) + body_sections: list[BodySection] = field(default_factory=list) + stance_tier: Optional[StanceTierMatrix] = None # heat-table input (optional) + prisma: Optional[PrismaCounts] = None # PRISMA flow (optional) + limitations: list[str] = field(default_factory=list) + bottom_line_paragraphs: list[str] = field(default_factory=list) + references: list[str] = field(default_factory=list) # full citations + # ── Lineage (populated when this synthesis is a continuation, not v1) ── + prior_versions: list[PriorVersion] = field(default_factory=list) + change_summary: Optional[str] = None # "what changed since v" + 
# ────────────────────────────────────────────────────────────────────────────
# Soft-import guards
# ────────────────────────────────────────────────────────────────────────────


def _import_matplotlib():
    """Check that matplotlib + numpy import, and select the ``Agg`` backend.

    Called at the top of every plot renderer so a missing optional dependency
    surfaces as one clear error at use time instead of at module import.

    Raises:
        RuntimeError: if matplotlib or numpy cannot be imported. The original
            ``ImportError`` is chained as ``__cause__``.
    """
    try:
        import matplotlib  # noqa: F401
        # Select the non-interactive backend before pyplot is imported so
        # rendering works without a display.
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt  # noqa: F401
        import matplotlib.patches  # noqa: F401
        import matplotlib.colors  # noqa: F401
        import numpy as np  # noqa: F401
    except ImportError as e:
        # Chain explicitly so tracebacks read "caused by" rather than
        # "during handling of the above exception, another occurred".
        raise RuntimeError(
            "synthesis_doc_builder requires matplotlib + numpy. "
            "Install via `pip install \"deep-research[viz]\"`. "
            f"Underlying ImportError: {e}"
        ) from e


def _import_docx():
    """Check that python-docx imports.

    Raises:
        RuntimeError: if python-docx cannot be imported. The original
            ``ImportError`` is chained as ``__cause__``.
    """
    try:
        import docx  # noqa: F401
        from docx.shared import Pt  # noqa: F401
    except ImportError as e:
        raise RuntimeError(
            "synthesis_doc_builder requires python-docx. "
            "Install via `pip install \"deep-research[viz]\"`. "
            f"Underlying ImportError: {e}"
        ) from e
def _forest_xlim(rows, floor=0.3, ceiling=3.0):
    """Choose log-axis x-limits ``(lo, hi)`` for one forest-plot panel.

    The window hugs the widest confidence interval with a 15% margin and is
    clamped to ``[floor, ceiling]`` so one very wide interval cannot flatten
    the panel. The clamp is relaxed just enough to keep every positive point
    estimate inside the window, and the result always satisfies ``lo < hi``.

    Args:
        rows: objects exposing ``point``, ``ci_low`` and ``ci_high``.
        floor: smallest lower limit the clamp allows.
        ceiling: largest upper limit the clamp allows.
    """
    lo_min = min((r.ci_low for r in rows), default=0.5)
    hi_max = max((r.ci_high for r in rows), default=2.0)
    lo = max(floor, lo_min * 0.85)
    hi = min(ceiling, hi_max * 1.15)

    # Without this, rows lying wholly above `ceiling` (or below `floor`)
    # produce an inverted or empty window and every marker falls off-plot.
    points = [r.point for r in rows if r.point > 0]
    if points:
        if min(points) <= lo:
            lo = min(points) * 0.85
        if max(points) >= hi:
            hi = max(points) * 1.15
    if not lo < hi:
        lo, hi = floor, ceiling
    return lo, hi


def render_forest_plot(
    acm: Sequence["ForestRow"],
    mace: Sequence["ForestRow"],
    output_path: Path,
    title: str = "Forest plot — effect estimates with 95% CI",
) -> Path:
    """Render a one- or two-panel forest plot to PNG.

    Args:
        acm: rows for the top (primary outcome) panel.
        mace: rows for the optional bottom panel; when empty, only the top
            panel is drawn.
        output_path: destination PNG (``str`` or ``Path``); parent
            directories are created.
        title: figure-level title.

    Returns:
        ``output_path`` as a ``Path``.

    Raises:
        RuntimeError: if matplotlib / numpy are not installed.
    """
    _import_matplotlib()
    import matplotlib.pyplot as plt
    import numpy as np

    output_path = Path(output_path)
    panels = [("All-cause mortality / primary outcome", list(acm))]
    if mace:
        panels.append(("Secondary outcomes (MACE / MI / stroke / revasc)", list(mace)))

    height = 1.4 + 0.55 * sum(len(rows) for _, rows in panels) + 1.6 * (len(panels) - 1)
    fig, axes = plt.subplots(
        len(panels), 1, figsize=(10, max(4.0, height)),
        gridspec_kw={"height_ratios": [max(2, len(rows)) for _, rows in panels]},
    )
    if len(panels) == 1:
        axes = [axes]

    # try/finally: pyplot keeps every open figure alive in a global registry,
    # so a failure while drawing or saving must still close this one.
    try:
        for ax, (subtitle, rows) in zip(axes, panels):
            ys = np.arange(len(rows))[::-1]  # first row at the top
            xlim = _forest_xlim(rows)
            for y, r in zip(ys, rows):
                # Green: interval entirely below 1; red: entirely above 1;
                # grey: interval crosses 1.
                if r.ci_high < 1.0:
                    color = "#2a8f3e"
                elif r.ci_low > 1.0:
                    color = "#b8443e"
                else:
                    color = "#555555"
                ax.plot([r.ci_low, r.ci_high], [y, y], color=color, lw=2.2,
                        solid_capstyle="round")
                ax.plot([r.point], [y], marker="s", markersize=11,
                        color=color, markeredgecolor="black", markeredgewidth=0.5)
                # Numeric summary sits just right of the axes area.
                ax.text(xlim[1] * 1.02, y,
                        f" {r.metric} {r.point:.2f} ({r.ci_low:.2f}–{r.ci_high:.2f})",
                        va="center", ha="left", fontsize=9, family="monospace")
            ax.axvline(1.0, color="black", lw=0.9, linestyle="--", alpha=0.7)
            ax.set_yticks(ys)
            ax.set_yticklabels([r.label for r in rows], fontsize=9)
            ax.set_xscale("log")
            ax.set_xlim(*xlim)
            ax.set_xlabel("Effect estimate (log scale) — point + 95% CI", fontsize=9)
            ax.set_title(subtitle, fontsize=11, fontweight="bold", loc="left")
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)

        fig.suptitle(title, fontsize=12, fontweight="bold", y=0.995)
        fig.tight_layout(rect=(0, 0, 1, 0.97))
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, dpi=180, bbox_inches="tight", facecolor="white")
    finally:
        plt.close(fig)
    return output_path
def _prisma_exclusions(counts):
    """Return ``(not_selected, not_cited)`` for the side annotations.

    ``not_selected`` is the Pass-1 rows that never became Pass-2 candidates;
    ``not_cited`` is the retrieved rows (full text + abstract only) that were
    not cited. Both are floored at zero so inconsistent counts never render
    as negative numbers.
    """
    not_selected = max(0, counts.pass1_total - counts.pass2_selected)
    retrieved = counts.pass2_retrieved_full_text + counts.pass2_abstract_only
    not_cited = max(0, retrieved - counts.cited)
    return not_selected, not_cited


def render_prisma_flow(counts: "PrismaCounts", output_path: Path,
                       title: str = "PRISMA-style flow") -> Path:
    """Render a PRISMA-style 4-stage flow diagram to PNG.

    Args:
        counts: flow counts printed inside the stage boxes.
        output_path: destination PNG (``str`` or ``Path``); parent
            directories are created.
        title: diagram title.

    Returns:
        ``output_path`` as a ``Path``.

    Raises:
        RuntimeError: if matplotlib / numpy are not installed.
    """
    _import_matplotlib()
    import matplotlib.pyplot as plt
    from matplotlib.patches import FancyBboxPatch

    output_path = Path(output_path)
    navy = "#1f4e79"

    fig, ax = plt.subplots(figsize=(9, 8))
    # try/finally: close the figure even when drawing or saving fails, so it
    # does not linger in pyplot's global figure registry.
    try:
        ax.set_xlim(0, 10)
        ax.set_ylim(0, 12)
        ax.axis("off")

        def draw_box(x, y, w, h, text, facecolor, **text_kwargs):
            # Rounded rectangle centred on (x, y) with centred text.
            ax.add_patch(FancyBboxPatch(
                (x - w / 2, y - h / 2), w, h,
                boxstyle="round,pad=0.05,rounding_size=0.15",
                linewidth=1.3, edgecolor=navy, facecolor=facecolor,
            ))
            ax.text(x, y, text, ha="center", va="center", **text_kwargs)

        stages = (
            (11.2, 0.95, "Identification",
             f"Pass-1 corpus assembled\n(Scout · Skeptic · Methodologist)\nn = {counts.pass1_total} evidence rows"),
            (8.6, 0.95, "Screening",
             f"Pass-2 candidates selected\n(top-ranked by stance × tier × methodology)\nn = {counts.pass2_selected}"),
            (5.8, 1.30, "Eligibility",
             f"Pass-2 full-text retrieval\nRetrieved (full text): {counts.pass2_retrieved_full_text}\n"
             f"Abstract only: {counts.pass2_abstract_only}\n"
             f"Paywalled / failed: {counts.pass2_unavailable}"),
            (2.5, 0.95, "Included (cited)",
             f"Directly cited in synthesis\n(verbatim quote_span + numerics)\nn = {counts.cited}"),
        )
        # Left column: solid stage labels.
        for y, _, stage_name, _ in stages:
            draw_box(2, y, 3.4, 0.7, stage_name, navy,
                     color="white", fontsize=11, fontweight="bold")
        # Right column: count boxes.
        for y, box_height, _, body in stages:
            draw_box(6.5, y, 4.6, box_height, body, "#dfeaf7", fontsize=10)

        # Downward arrows between consecutive count boxes.
        for y_from, y_to in [(10.7, 9.1), (8.1, 6.5), (5.1, 3.0)]:
            ax.annotate("", xy=(6.5, y_to), xytext=(6.5, y_from),
                        arrowprops=dict(arrowstyle="->", lw=1.4, color="#333"))

        # Side connectors annotating what dropped out between stages.
        not_selected, not_cited = _prisma_exclusions(counts)
        ax.annotate("", xy=(8.9, 8.6), xytext=(8.9, 11.0),
                    arrowprops=dict(arrowstyle="-", lw=0.8, color="#888"))
        ax.text(9.4, 9.9, f"{not_selected} not selected\n(stance · tier · scope)",
                fontsize=8.5, color="#666", va="center")
        ax.annotate("", xy=(8.9, 2.5), xytext=(8.9, 5.0),
                    arrowprops=dict(arrowstyle="-", lw=0.8, color="#888"))
        ax.text(9.4, 3.7,
                f"{not_cited} retrieved but\nnot quotable\nfor primary outcome",
                fontsize=8.5, color="#666", va="center")

        ax.set_title(title, fontsize=12, fontweight="bold", loc="left")
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, dpi=180, bbox_inches="tight", facecolor="white")
    finally:
        plt.close(fig)
    return output_path
def _stance_grid(matrix):
    """Return ``matrix.counts`` as a 2-D array shaped (stances, tiers).

    Empty ``counts`` yields an all-zero integer grid.

    Raises:
        ValueError: if ``counts`` is non-empty and its shape is not
            ``(len(matrix.stances), len(matrix.tiers))``.
    """
    import numpy as np

    expected = (len(matrix.stances), len(matrix.tiers))
    if not matrix.counts:
        return np.zeros(expected, dtype=int)
    grid = np.array(matrix.counts)
    # A mismatched grid would otherwise fail deep inside the annotation loop
    # with a bare IndexError, or silently draw cells that have no label.
    if grid.shape != expected:
        raise ValueError(
            f"StanceTierMatrix.counts has shape {grid.shape}; expected "
            f"{expected} (stances × tiers)"
        )
    return grid


def render_stance_heat(matrix: "StanceTierMatrix", output_path: Path,
                       title: str = "Corpus composition — stance × evidence tier") -> Path:
    """Render the stance × tier heat-table to PNG.

    Args:
        matrix: row/column labels plus the counts grid.
        output_path: destination PNG (``str`` or ``Path``); parent
            directories are created.
        title: heading; the grand total is appended as ``(n=...)``.

    Returns:
        ``output_path`` as a ``Path``.

    Raises:
        RuntimeError: if matplotlib / numpy are not installed.
        ValueError: if ``matrix.counts`` does not match the label lists.
    """
    _import_matplotlib()
    import matplotlib.pyplot as plt
    import matplotlib.colors as mcolors

    output_path = Path(output_path)
    arr = _stance_grid(matrix)
    n_rows, n_cols = len(matrix.stances), len(matrix.tiers)

    fig, ax = plt.subplots(figsize=(8, 4))
    # try/finally: always release the figure from pyplot's registry.
    try:
        cmap = mcolors.LinearSegmentedColormap.from_list("custom", ["#f5f7fa", "#1f4e79"])
        ax.imshow(arr, cmap=cmap, aspect="auto")
        ax.set_xticks(range(n_cols))
        ax.set_xticklabels(matrix.tiers, fontsize=9)
        ax.set_yticks(range(n_rows))
        ax.set_yticklabels(matrix.stances, fontsize=10)

        # Switch to white text on cells in the darker part of the colour
        # ramp; `cap` is at least 1 so an all-zero grid never divides the
        # threshold down to 0.
        cap = max(arr.max() if arr.size else 1, 1)
        for i in range(n_rows):
            for j in range(n_cols):
                v = int(arr[i, j])
                ax.text(j, i, str(v), ha="center", va="center",
                        color="white" if v > cap * 0.45 else "black",
                        fontsize=11, fontweight="bold" if v > 0 else "normal")

        ax.set_title(f"{title} (n={int(arr.sum())})", fontsize=11,
                     fontweight="bold", loc="left")
        fig.tight_layout()
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, dpi=180, bbox_inches="tight", facecolor="white")
    finally:
        plt.close(fig)
    return output_path
# ────────────────────────────────────────────────────────────────────────────
# docx assembly
# ────────────────────────────────────────────────────────────────────────────

# Children that the WordprocessingML schema orders *after* <w:shd> inside
# <w:tcPr>. Shading must be inserted ahead of whichever of these comes first.
_TCPR_AFTER_SHD = (
    "w:noWrap", "w:tcMar", "w:textDirection", "w:tcFitText", "w:vAlign",
    "w:hideMark", "w:headers", "w:cellIns", "w:cellDel", "w:cellMerge",
    "w:tcPrChange",
)

_TABLE_COLUMNS = ("Study", "Year", "Design", "Outcome", "Estimate (95% CI)", "Population")
_TABLE_WIDTHS_IN = (1.2, 0.5, 1.7, 1.1, 1.6, 1.4)


def _set_cell_bg(cell, hex_color: str) -> None:
    """Shade a table cell with ``hex_color`` (e.g. ``"1F4E79"``).

    Any existing ``<w:shd>`` is replaced rather than duplicated, and the new
    element is placed in schema order (before ``<w:vAlign>`` and friends)
    instead of being appended at the end of ``<w:tcPr>``.
    """
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    tc_pr = cell._tc.get_or_add_tcPr()
    for stale in tc_pr.findall(qn("w:shd")):
        tc_pr.remove(stale)

    shd = OxmlElement("w:shd")
    shd.set(qn("w:val"), "clear")
    shd.set(qn("w:color"), "auto")
    shd.set(qn("w:fill"), hex_color)

    later_tags = {qn(tag) for tag in _TCPR_AFTER_SHD}
    successor = next((child for child in tc_pr if child.tag in later_tags), None)
    if successor is None:
        tc_pr.append(shd)
    else:
        successor.addprevious(shd)


def _split_paragraphs(prose):
    """Split prose on blank lines; return the stripped, non-empty paragraphs."""
    return [chunk.strip() for chunk in (prose or "").split("\n\n") if chunk.strip()]


def _prior_version_parts(version):
    """Return ``(label, url)`` for one prior-version bullet in the lineage list."""
    when = f" — completed {version.completed_at}" if version.completed_at else ""
    head = f" — \u201c{version.headline}\u201d" if version.headline else ""
    label = f"[{version.mode}]{when}{head} "
    url = version.doc_url or f"https://docs.google.com/document/d/{version.doc_id}/edit"
    return label, url


def _add_title_block(doc, inputs) -> None:
    """Add the title, the italic grey subtitle and the project id / slug line."""
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.shared import Pt, RGBColor

    heading = doc.add_heading(inputs.title, level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT
    subtitle_run = doc.add_paragraph().add_run(inputs.subtitle)
    subtitle_run.italic = True
    subtitle_run.font.size = Pt(12)
    subtitle_run.font.color.rgb = RGBColor(0x55, 0x55, 0x55)

    meta = doc.add_paragraph()
    meta.add_run("Project id: ").bold = True
    meta.add_run(f"{inputs.project_id} ")
    meta.add_run("Slug: ").bold = True
    meta.add_run(f"{inputs.project_slug}")


def _add_revision_context(doc, inputs) -> None:
    """Add the continuation banner, change summary and prior-version list."""
    from docx.shared import RGBColor

    if not (inputs.prior_versions or inputs.continuation_mode or inputs.change_summary):
        return  # first version of the synthesis: nothing to show
    doc.add_heading("Revision context", level=1)
    if inputs.continuation_mode:
        banner_run = doc.add_paragraph().add_run(
            f"This synthesis is a continuation of an earlier deep_research project "
            f"(mode: {inputs.continuation_mode}). The prior versions of the "
            f"synthesis are preserved and listed below."
        )
        banner_run.italic = True
    if inputs.change_summary:
        summary = doc.add_paragraph()
        summary.add_run("What changed since the previous version: ").bold = True
        summary.add_run(inputs.change_summary)
    if inputs.prior_versions:
        doc.add_paragraph().add_run("Prior versions").bold = True
        for version in inputs.prior_versions:
            label, url = _prior_version_parts(version)
            bullet = doc.add_paragraph(style="List Bullet")
            bullet.add_run(label)
            link = bullet.add_run(url)
            link.italic = True
            link.font.color.rgb = RGBColor(0x1F, 0x4E, 0x79)


def _add_question_and_headline(doc, inputs) -> None:
    """Add the verbatim question and the headline-answer paragraphs."""
    doc.add_heading("Question", level=1)
    doc.add_paragraph().add_run(inputs.question).italic = True

    doc.add_heading("Headline answer", level=1)
    for text in inputs.headline_paragraphs:
        doc.add_paragraph(text)
    if inputs.calibrated_bottom_line:
        bottom = doc.add_paragraph()
        bottom.add_run("Calibrated bottom line: ").bold = True
        bottom.add_run(inputs.calibrated_bottom_line)


def _add_effect_table(doc, cited) -> None:
    """Add the effect-size summary as a native Word table (no-op when empty)."""
    from docx.enum.table import WD_ALIGN_VERTICAL
    from docx.shared import Inches, Pt, RGBColor

    if not cited:
        return
    doc.add_heading("Effect-size summary — cited evidence", level=1)
    doc.add_paragraph(
        "All numerics below are verbatim from the cited row's quote_span field in "
        "research_evidence. DOIs link to source."
    )
    widths = [Inches(w) for w in _TABLE_WIDTHS_IN]
    table = doc.add_table(rows=len(cited) + 1, cols=len(_TABLE_COLUMNS))
    table.style = "Light Grid Accent 1"

    # Header row: bold white text on a navy fill.
    for cell, width, caption in zip(table.rows[0].cells, widths, _TABLE_COLUMNS):
        cell.width = width
        run = cell.paragraphs[0].add_run(caption)
        run.bold = True
        run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
        run.font.size = Pt(10)
        _set_cell_bg(cell, "1F4E79")

    for row_index, evidence in enumerate(cited, start=1):
        values = (evidence.study_label, str(evidence.year), evidence.design,
                  evidence.outcome, evidence.estimate_str, evidence.population)
        # Warning shade for all-cause-mortality rows flagged as "no benefit".
        shade = (evidence.is_acm_no_benefit
                 and evidence.outcome.lower().startswith("all-cause"))
        for cell, width, value in zip(table.rows[row_index].cells, widths, values):
            cell.width = width
            cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
            cell.paragraphs[0].add_run(value).font.size = Pt(9.5)
            if shade:
                _set_cell_bg(cell, "FDECEA")


def _add_figure(doc, plots, key, heading, caption, width_in) -> None:
    """Add heading + caption + picture for ``plots[key]`` when that plot exists."""
    from docx.shared import Inches

    if key not in plots:
        return
    doc.add_heading(heading, level=1)
    doc.add_paragraph(caption)
    doc.add_picture(str(plots[key]), width=Inches(width_in))


def _add_body_sections(doc, sections) -> None:
    """Add each thematic section: heading, prose, optional quote + source."""
    for section in sections:
        doc.add_heading(section.heading, level=1)
        for paragraph in _split_paragraphs(section.prose):
            doc.add_paragraph(paragraph)
        if section.quote:
            doc.add_paragraph(style="Intense Quote").add_run(section.quote)
            if section.quote_source:
                doc.add_paragraph().add_run(section.quote_source).italic = True


def _add_closing_sections(doc, inputs) -> None:
    """Add limitations, the bottom-line answer and the reference list."""
    from docx.shared import Pt

    if inputs.limitations:
        doc.add_heading("Limitations", level=1)
        for item in inputs.limitations:
            doc.add_paragraph(item, style="List Bullet")
    if inputs.bottom_line_paragraphs:
        doc.add_heading("Bottom-line answer", level=1)
        for text in inputs.bottom_line_paragraphs:
            doc.add_paragraph(text)
    if inputs.references:
        doc.add_heading("References (cited)", level=1)
        for reference in inputs.references:
            entry = doc.add_paragraph(reference, style="List Number")
            for run in entry.runs:
                run.font.size = Pt(9.5)


def build_docx(inputs: "SynthesisInputs", plots: "dict[str, Path]", docx_path: Path) -> Path:
    """Build the .docx synthesis and save it to ``docx_path``.

    Section order: title block, revision context (continuations only),
    question + headline, effect-size table, forest plot, body sections,
    stance × tier heat-table, PRISMA flow, limitations, bottom line,
    references. Optional sections are skipped when their input is empty.

    Args:
        inputs: structured synthesis content.
        plots: rendered PNG paths keyed by ``'forest'``, ``'prisma'`` and
            ``'heat'``; every key is optional.
        docx_path: destination file (``str`` or ``Path``); parent
            directories are created.

    Returns:
        ``docx_path`` as a ``Path``.

    Raises:
        RuntimeError: if python-docx is not installed.
    """
    _import_docx()
    from docx import Document
    from docx.shared import Inches, Pt

    docx_path = Path(docx_path)
    doc = Document()
    for page in doc.sections:
        page.left_margin = Inches(0.9)
        page.right_margin = Inches(0.9)
        page.top_margin = Inches(0.8)
        page.bottom_margin = Inches(0.8)
    normal = doc.styles["Normal"]
    normal.font.name = "Calibri"
    normal.font.size = Pt(11)

    _add_title_block(doc, inputs)
    _add_revision_context(doc, inputs)
    _add_question_and_headline(doc, inputs)
    _add_effect_table(doc, inputs.cited)
    _add_figure(
        doc, plots, "forest", "Forest plot — effect estimates with 95% CI",
        "Point estimates with 95% confidence intervals for each cited row whose "
        "effect_size was populated. Estimates whose CI crosses 1.0 are drawn in "
        "grey; clear-benefit intervals in green; harm-side intervals in red.",
        6.8,
    )
    _add_body_sections(doc, inputs.body_sections)
    _add_figure(
        doc, plots, "heat", "Corpus composition — stance × evidence tier",
        "Distribution of evidence rows after the synthesis pass updated stance labels "
        "based on quote-spanned numerics.",
        6.5,
    )
    _add_figure(
        doc, plots, "prisma", "Evidence flow (PRISMA-style)",
        "Identification → screening → eligibility → inclusion counts for this project.",
        6.0,
    )
    _add_closing_sections(doc, inputs)

    docx_path.parent.mkdir(parents=True, exist_ok=True)
    doc.save(docx_path)
    return docx_path
def _merge_upload_response(result: dict, response) -> None:
    """Fold an uploader's return value into the build ``result`` in place.

    ``None`` is treated as "nothing to report". Any other non-dict value,
    including falsy ones such as ``[]`` or ``""``, is recorded in
    ``upload_error``. For a dict, ``doc_id`` / ``web_url`` are filled from
    their accepted aliases, ``uploaded`` becomes true when either is present,
    and all remaining keys are kept under ``upload_extra``.
    """
    if response is None:
        response = {}
    if not isinstance(response, dict):
        result["upload_error"] = (
            f"uploader returned non-dict: {type(response).__name__}"
        )
        return

    doc_id = response.get("doc_id") or response.get("id")
    web_url = (response.get("web_url") or response.get("webViewLink")
               or response.get("url"))
    result["doc_id"] = doc_id
    result["web_url"] = web_url
    result["uploaded"] = bool(doc_id or web_url)

    consumed = {"doc_id", "id", "web_url", "webViewLink", "url"}
    extras = {k: v for k, v in response.items() if k not in consumed}
    if extras:
        result["upload_extra"] = extras


def build_synthesis_doc(
    inputs: "SynthesisInputs",
    *,
    uploader: Optional["Uploader"] = None,
) -> dict[str, Any]:
    """Build local PNGs + ``.docx``, optionally hand the docx to an uploader.

    Args:
        inputs: structured synthesis content.
        uploader: optional callable ``fn(local_path, name, mime_type) -> dict``
            invoked after the local ``.docx`` is built. ``doc_id`` (alias
            ``id``) and ``web_url`` (aliases ``webViewLink``, ``url``) are
            surfaced at the top level; any other returned keys appear under
            ``upload_extra``. When ``None`` (the default), no upload is
            attempted and ``doc_id`` / ``web_url`` come back as ``None``.

    Returns:
        ``{
            "local_docx_path": str,
            "plots": {name: str},   # only the plots actually rendered, from
                                    # "forest" / "prisma" / "heat"
            "doc_id": str | None,
            "web_url": str | None,
            "uploaded": bool,       # True when the uploader gave an id or URL
            "upload_error": str | None,
            "upload_extra": dict,   # present only when the uploader returned
                                    # additional keys
        }``

    The local ``.docx`` is always written before any upload is attempted,
    and uploader exceptions are captured in ``upload_error`` rather than
    raised, so a network/upload failure never loses the artifact.
    """
    out_dir = Path(inputs.output_dir or f".scratch/synthesis_{inputs.project_slug}")
    out_dir.mkdir(parents=True, exist_ok=True)

    plots: dict[str, Path] = {}
    # The forest plot is rendered whenever the primary panel has rows.
    if inputs.forest_acm:
        plots["forest"] = render_forest_plot(
            inputs.forest_acm, inputs.forest_mace, out_dir / "forest_plot.png",
            title=f"Forest plot — {inputs.title}",
        )
    if inputs.prisma is not None:
        plots["prisma"] = render_prisma_flow(
            inputs.prisma, out_dir / "prisma_flow.png",
            title=f"PRISMA-style flow — {inputs.title}",
        )
    if inputs.stance_tier is not None and inputs.stance_tier.counts:
        plots["heat"] = render_stance_heat(
            inputs.stance_tier, out_dir / "stance_heat.png",
            title=f"Corpus composition — {inputs.title}",
        )

    docx_path = out_dir / f"{inputs.project_slug}_synthesis.docx"
    build_docx(inputs, plots, docx_path)

    result: dict[str, Any] = {
        "local_docx_path": str(docx_path),
        "plots": {name: str(path) for name, path in plots.items()},
        "doc_id": None,
        "web_url": None,
        "uploaded": False,
        "upload_error": None,
    }
    if uploader is None:
        return result

    upload_name = inputs.upload_name or f"SYNTHESIS__{inputs.project_slug}"
    try:
        response = uploader(docx_path, upload_name, _DOCX_MIME)
    except Exception as e:  # noqa: BLE001
        # Deliberately broad: the uploader is caller-supplied code and the
        # local artifact must survive whatever it raises.
        result["upload_error"] = f"{type(e).__name__}: {e}"
    else:
        _merge_upload_response(result, response)
    return result
The Python reference implementation (scholar adapter + synthesis_doc_builder) lands in v0.2.0.", + "description": "deep_research is an agent-runtime-agnostic workflow that turns an empirical research question into a citation-grade synthesis. Two human-in-the-loop gates (protocol pre-registration, Pass-2 spend) bracket a four-role subagent crew (Scout / Skeptic / Methodologist / Synthesizer). Every claim a synthesis ships must be backed by a row in the research_evidence table with a verbatim quote_span; claims without their quote get cut. v0.2.0 adds the Python reference implementation: a stdlib-urllib scholar adapter (OpenAlex / Semantic Scholar / PubMed / arXiv / Europe PMC / Crossref / Unpaywall) and a python-docx + matplotlib synthesis builder with pluggable upload. Core stays stdlib-only; the docx builder is gated behind the optional `[viz]` extra.", "homepage": "https://github.com/drknowhow/deep-research", "author": { "name": "Dimitri T", @@ -28,7 +28,7 @@ "install": { "method": "git", "url": "https://github.com/drknowhow/deep-research", - "ref": "v0.1.0", + "ref": "v0.2.0", "layout": "skill-bundle" }, "entrypoint": { diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..cbe9125 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "deep-research" +version = "0.2.0" +description = "Protocol-first, gated, multi-agent literature investigation with no-fabrication enforced via verbatim quote spans." 
+readme = "README.md" +requires-python = ">=3.11" +license = { text = "Apache-2.0" } +authors = [{ name = "Dimitri T", email = "" }] +keywords = [ + "research", + "literature-review", + "meta-analysis", + "agents", + "no-fabrication", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +# Core deps: stdlib only. The scholar adapter uses urllib + xml.etree. +dependencies = [] + +[project.optional-dependencies] +# Synthesis doc builder (forest plot, PRISMA flow, heat-table, .docx). +viz = [ + "python-docx>=1.0", + "matplotlib>=3.7", + "numpy>=1.24", +] + +[project.urls] +Homepage = "https://github.com/drknowhow/deep-research" +Issues = "https://github.com/drknowhow/deep-research/issues" + +[tool.setuptools.packages.find] +include = ["lib*"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_scholar_smoke.py b/tests/test_scholar_smoke.py new file mode 100644 index 0000000..b592d8f --- /dev/null +++ b/tests/test_scholar_smoke.py @@ -0,0 +1,314 @@ +"""Smoke tests for lib.scholar. + +stdlib-unittest only. No network. urllib.request.urlopen is monkeypatched +with per-source fake responses so the tests run identically on Windows, +macOS, and Linux CI. +""" +from __future__ import annotations + +import io +import json +import sys +import unittest +import urllib.request +from pathlib import Path +from typing import Any +from unittest import mock + +# Make `lib.scholar` importable when running the test file directly. 
# ────────────────────────────────────────────────────────────────────────────
# Fake-response helpers
# ────────────────────────────────────────────────────────────────────────────


class _FakeResponse(io.BytesIO):
    """In-memory stand-in for the object ``urlopen()`` returns.

    Usable in a ``with`` statement: entering yields the response itself and
    leaving closes the buffer without suppressing exceptions.
    """

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False


def _as_json_bytes(payload: dict) -> bytes:
    """Serialize ``payload`` to UTF-8 encoded JSON."""
    return json.dumps(payload).encode("utf-8")


def _fake_openalex_works() -> bytes:
    """Return a one-result OpenAlex ``/works`` response body."""
    work = {
        "id": "https://openalex.org/W123",
        "doi": "https://doi.org/10.1234/example",
        "title": "Example study",
        "display_name": "Example study",
        "publication_year": 2024,
        "type": "article",
        "primary_location": {
            "source": {"type": "journal", "display_name": "J. Examples"}
        },
        "authorships": [
            {"author": {"display_name": name}} for name in ("Alice A.", "Bob B.")
        ],
        "cited_by_count": 42,
        "open_access": {"oa_url": "https://example.org/oa.pdf"},
        "abstract_inverted_index": {"hello": [0], "world": [1]},
    }
    return _as_json_bytes({"meta": {"count": 1}, "results": [work]})


def _fake_crossref_works() -> bytes:
    """Return a one-item Crossref ``/works`` response body."""
    item = {
        "DOI": "10.1234/example",
        "title": ["Example study"],
        "type": "journal-article",
        "container-title": ["J. Examples"],
        "author": [{"given": "Alice", "family": "A."}],
        "issued": {"date-parts": [[2024, 1, 1]]},
        "is-referenced-by-count": 42,
        "URL": "https://doi.org/10.1234/example",
    }
    return _as_json_bytes({"message": {"total-results": 1, "items": [item]}})


def _fake_unpaywall() -> bytes:
    """Return an Unpaywall record with a single gold-OA publisher location."""
    location = {
        "url": "https://example.org/oa.pdf",
        "url_for_pdf": "https://example.org/oa.pdf",
        "host_type": "publisher",
        "version": "publishedVersion",
        "license": "cc-by",
        "is_best": True,
    }
    record = {
        "is_oa": True,
        "oa_status": "gold",
        "title": "Example study",
        "best_oa_location": dict(location),
        "oa_locations": [dict(location)],
    }
    return _as_json_bytes(record)


# Substring of the request URL -> factory producing the canned body.
_FAKE_BODIES = (
    ("api.openalex.org", _fake_openalex_works),
    ("api.crossref.org", _fake_crossref_works),
    ("api.unpaywall.org", _fake_unpaywall),
)


def _route(req_or_url: Any, *args: Any, **kwargs: Any) -> _FakeResponse:
    """Serve the canned response matching the requested URL.

    Used as the ``side_effect`` of a patched ``urllib.request.urlopen``, so
    the first argument may be a ``urllib.request.Request`` or a plain string;
    extra positional/keyword arguments are accepted and ignored. URLs that
    match no known API host get an empty JSON object.
    """
    if hasattr(req_or_url, "full_url"):
        target = req_or_url.full_url
    elif hasattr(req_or_url, "get_full_url"):
        target = req_or_url.get_full_url()
    else:
        target = str(req_or_url)

    for host, make_body in _FAKE_BODIES:
        if host in target:
            return _FakeResponse(make_body())
    return _FakeResponse(b"{}")
# ────────────────────────────────────────────────────────────────────────────
# Tests
# ────────────────────────────────────────────────────────────────────────────


def _patched_urlopen():
    """Patch ``urllib.request.urlopen`` so every request is served by ``_route``."""
    return mock.patch("urllib.request.urlopen", side_effect=_route)


class ConfigureTests(unittest.TestCase):
    """``configure()`` updates the module-level User-Agent and contact email."""

    def setUp(self):
        # Put the module globals back after each test, pass or fail.
        self.addCleanup(setattr, scholar, "_USER_AGENT", scholar._USER_AGENT)
        self.addCleanup(setattr, scholar, "_CONTACT_EMAIL", scholar._CONTACT_EMAIL)

    def _assert_identity(self, user_agent, email):
        self.assertEqual(scholar._USER_AGENT, user_agent)
        self.assertEqual(scholar._CONTACT_EMAIL, email)

    def test_configure_mutates_user_agent_and_email(self):
        scholar.configure(contact_email="alice@example.org",
                          app_name="my-agent/1.2")
        self._assert_identity("my-agent/1.2", "alice@example.org")

        # configure() with no args leaves values unchanged.
        scholar.configure()
        self._assert_identity("my-agent/1.2", "alice@example.org")

    def test_configure_passes_email_into_polite_param(self):
        scholar.configure(contact_email="bob@example.org")
        with_email = scholar._polite_email_param({"q": "x"})
        self.assertEqual(with_email.get("mailto"), "bob@example.org")
        self.assertEqual(with_email.get("q"), "x")

        # Empty string clears the contact email.
        scholar.configure(contact_email="")
        without_email = scholar._polite_email_param({"q": "x"})
        self.assertNotIn("mailto", without_email)


class NormalizationTests(unittest.TestCase):
    """Pure helpers: DOI normalization, title hashing, abstract rebuild."""

    def test_normalize_doi_strips_url_prefix(self):
        accepted = (
            ("https://doi.org/10.1234/Foo-BAR", "10.1234/foo-bar"),
            ("doi:10.1234/abc", "10.1234/abc"),
        )
        for raw, expected in accepted:
            with self.subTest(raw=raw):
                self.assertEqual(scholar._normalize_doi(raw), expected)
        for rejected in ("not-a-doi", None):
            with self.subTest(raw=rejected):
                self.assertIsNone(scholar._normalize_doi(rejected))

    def test_title_hash_collapses_punctuation(self):
        self.assertEqual(scholar._title_hash_key("Hello, World!"), "helloworld")
        for blank in ("", None):
            with self.subTest(title=blank):
                self.assertIsNone(scholar._title_hash_key(blank))

    def test_reconstruct_openalex_abstract(self):
        # word -> list of positions
        inverted = {"hello": [1], "world": [0]}
        rebuilt = scholar._reconstruct_openalex_abstract(inverted)
        self.assertEqual(rebuilt, "world hello")
        self.assertIsNone(scholar._reconstruct_openalex_abstract(None))


class SearchOpenAlexTests(unittest.TestCase):
    """``search`` against the faked OpenAlex endpoint."""

    # Every required field on the normalized schema.
    REQUIRED_KEYS = (
        "id", "doi", "title", "authors", "year", "venue", "abstract",
        "citation_count", "url", "open_access_url", "tldr",
        "source_tier_hint", "raw",
    )

    def test_search_openalex_normalizes_hit_shape(self):
        request = {"source": "openalex", "query": "example", "limit": 5}
        with _patched_urlopen():
            res = scholar.scholar("search", request)
        self.assertTrue(res.get("ok"), msg=res)
        self.assertEqual(res["source"], "openalex")
        self.assertEqual(len(res["hits"]), 1)

        hit = res["hits"][0]
        for key in self.REQUIRED_KEYS:
            self.assertIn(key, hit, msg=f"missing key {key!r}")
        self.assertEqual(hit["doi"], "10.1234/example")
        self.assertEqual(hit["year"], 2024)
        self.assertEqual(hit["source_tier_hint"], 1)  # journal-article
        self.assertEqual(hit["authors"], ["Alice A.", "Bob B."])
        self.assertEqual(hit["abstract"], "hello world")


class FindDoiTests(unittest.TestCase):
    """``find_doi`` against the faked Crossref endpoint."""

    def test_find_doi_returns_best_candidate(self):
        with _patched_urlopen():
            res = scholar.scholar("find_doi", {"title": "Example study", "year": 2024})
        self.assertTrue(res.get("ok"), msg=res)
        self.assertEqual(res["doi"], "10.1234/example")
        self.assertGreater(res["confidence"], 0.5)


class ResolveOaTests(unittest.TestCase):
    """``resolve_oa`` against the faked Unpaywall endpoint."""

    def test_resolve_oa_requires_email(self):
        self.addCleanup(setattr, scholar, "_CONTACT_EMAIL", scholar._CONTACT_EMAIL)
        scholar._CONTACT_EMAIL = None
        with _patched_urlopen():
            res = scholar.scholar("resolve_oa", {"doi": "10.1234/example"})
        self.assertFalse(res.get("ok"))
        self.assertIn("email", res["error"])

    def test_resolve_oa_returns_flattened_locations(self):
        request = {"doi": "10.1234/example", "email": "qa@example.org"}
        with _patched_urlopen():
            res = scholar.scholar("resolve_oa", request)
        self.assertTrue(res.get("ok"), msg=res)
        self.assertTrue(res["is_oa"])
        self.assertEqual(res["oa_status"], "gold")
        self.assertEqual(res["best_oa_location"]["url"], "https://example.org/oa.pdf")
        self.assertEqual(len(res["oa_locations"]), 1)


class DedupeTests(unittest.TestCase):
    """Hash-based dedup plus the pluggable embedding pass."""

    def test_dedupe_hits_collapses_by_doi(self):
        duplicates = [
            {"id": f"{source}:A", "doi": "10.1/x", "title": "X"}
            for source in ("openalex", "crossref")
        ]
        survivors = scholar._dedupe_hits(duplicates)
        # Two DOI-identical hits collapse to one; the duplicate's source
        # gets recorded in also_found_in for the survivor.
        self.assertEqual(len(survivors), 1)
        self.assertIn("also_found_in", survivors[0])
        self.assertIn("crossref", survivors[0]["also_found_in"])

    def test_dedupe_hits_falls_back_to_title_hash(self):
        # No DOIs on either hit; identical normalized title collapses them.
        same_title = [
            {"id": "arxiv:A", "doi": None, "title": "Hello, World!"},
            {"id": "openalex:B", "doi": None, "title": "hello world"},
        ]
        self.assertEqual(len(scholar._dedupe_hits(same_title)), 1)

    def test_embedding_deduper_pass_through_when_unregistered(self):
        hits = [{"id": "x", "title": "Y"}, {"id": "z", "title": "Y"}]
        kept = scholar._dedupe_hits_with_embedding(hits, threshold=0.9)
        # No registered deduper -> pass-through (length preserved)
        self.assertEqual(len(kept), 2)


class ActionDispatchTests(unittest.TestCase):
    """Dispatcher-level validation errors."""

    def test_unknown_action_returns_error(self):
        res = scholar.scholar("not-a-real-action", {})
        self.assertFalse(res.get("ok"))
        self.assertIn("available", res)

    def test_search_with_bad_source(self):
        res = scholar.scholar("search", {"source": "nope", "query": "x"})
        self.assertFalse(res.get("ok"))


if __name__ == "__main__":
    unittest.main()
+""" +from __future__ import annotations + +import sys +import tempfile +import unittest +import zipfile +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from lib import synthesis_doc_builder as sdb # noqa: E402 + + +def _has_python_docx() -> bool: + try: + import docx # noqa: F401 + return True + except ImportError: + return False + + +def _has_matplotlib() -> bool: + try: + import matplotlib # noqa: F401 + import numpy # noqa: F401 + return True + except ImportError: + return False + + +def _minimal_inputs(out_dir: Path) -> sdb.SynthesisInputs: + return sdb.SynthesisInputs( + project_id="00000000-0000-0000-0000-000000000000", + project_slug="smoke-test", + title="Smoke synthesis", + subtitle="unit-test subtitle", + question="Does the smoke test produce a .docx?", + headline_paragraphs=["Yes. The smoke test produces a .docx."], + calibrated_bottom_line="Pass.", + cited=[ + sdb.CitedEvidence( + study_label="Smith et al.", + year=2024, + design="Meta-analysis", + outcome="All-cause mortality", + estimate_str="RR 0.98 (0.89-1.07)", + population="Adults", + doi="10.1234/example", + quote_span="No statistically significant reduction.", + ) + ], + forest_acm=[], # skip the forest plot in the no-matplotlib case + body_sections=[ + sdb.BodySection( + heading="Findings", + prose="Some prose.\n\nA second paragraph.", + quote="Verbatim quote", + quote_source="- Smith 2024", + ) + ], + references=["Smith et al. (2024). Example. J. 
Examples."], + output_dir=str(out_dir), + ) + + +class DataModelTests(unittest.TestCase): + """These don't require matplotlib/python-docx — pure dataclass smoke.""" + + def test_cited_evidence_defaults(self): + c = sdb.CitedEvidence( + study_label="x", year=2024, design="d", outcome="o", + estimate_str="e", population="p", doi="10.1/x", quote_span="q", + ) + self.assertFalse(c.is_acm_no_benefit) + + def test_stance_tier_default_axes(self): + m = sdb.StanceTierMatrix() + self.assertEqual(len(m.stances), 4) + self.assertEqual(len(m.tiers), 4) + + def test_uploader_type_present(self): + # Just confirm the Uploader alias and the build_synthesis_doc entry point exist + self.assertTrue(hasattr(sdb, "Uploader")) + self.assertTrue(hasattr(sdb, "build_synthesis_doc")) + + +class BuildDocxSmokeTests(unittest.TestCase): + def setUp(self): + if not _has_python_docx(): + raise unittest.SkipTest("python-docx not installed (viz extra)") + self.tmp = Path(tempfile.mkdtemp(prefix="dr_smoke_")) + + def test_build_produces_local_docx_no_uploader(self): + inputs = _minimal_inputs(self.tmp) + result = sdb.build_synthesis_doc(inputs) + self.assertIn("local_docx_path", result) + docx_path = Path(result["local_docx_path"]) + self.assertTrue(docx_path.exists()) + self.assertGreater(docx_path.stat().st_size, 1000) + + # No uploader -> doc_id/web_url None, uploaded False. + self.assertIsNone(result["doc_id"]) + self.assertIsNone(result["web_url"]) + self.assertFalse(result["uploaded"]) + + # Verify the docx is a valid zip containing the expected XML part + # and that our heading text + a table tag both appear. + with zipfile.ZipFile(docx_path) as zf: + names = set(zf.namelist()) + self.assertIn("word/document.xml", names) + doc_xml = zf.read("word/document.xml").decode("utf-8") + self.assertIn("Smoke synthesis", doc_xml) + self.assertIn("Findings", doc_xml) + # python-docx emits a w:tbl element for any add_table().
+ self.assertIn("<w:tbl", doc_xml) + + def test_uploader_called_with_correct_args(self): + inputs = _minimal_inputs(self.tmp) + captured = {} + + def fake_uploader(local_path: Path, name: str, mime_type: str) -> dict: + captured["local_path"] = local_path + captured["name"] = name + captured["mime_type"] = mime_type + return {"doc_id": "DOC123", "web_url": "https://example.org/doc/DOC123"} + + result = sdb.build_synthesis_doc(inputs, uploader=fake_uploader) + self.assertEqual(captured.get("name"), "SYNTHESIS__smoke-test") + self.assertTrue( + captured["mime_type"].endswith("wordprocessingml.document"), + captured["mime_type"], + ) + # local_path passed to uploader is the same as result['local_docx_path']. + self.assertEqual( + str(captured["local_path"]), + result["local_docx_path"], + ) + self.assertTrue(result["uploaded"]) + self.assertEqual(result["doc_id"], "DOC123") + self.assertEqual(result["web_url"], "https://example.org/doc/DOC123") + self.assertIsNone(result["upload_error"]) + + def test_uploader_exception_does_not_lose_local_artifact(self): + inputs = _minimal_inputs(self.tmp) + + def boom(local_path: Path, name: str, mime_type: str) -> dict: + raise RuntimeError("network down") + + result = sdb.build_synthesis_doc(inputs, uploader=boom) + self.assertFalse(result["uploaded"]) + self.assertIn("network down", result["upload_error"]) + # Local artifact survives the upload failure.
+ self.assertTrue(Path(result["local_docx_path"]).exists()) + + +class ForestPlotSmokeTests(unittest.TestCase): + def setUp(self): + if not _has_matplotlib(): + raise unittest.SkipTest("matplotlib not installed (viz extra)") + if not _has_python_docx(): + raise unittest.SkipTest("python-docx not installed (viz extra)") + self.tmp = Path(tempfile.mkdtemp(prefix="dr_forest_")) + + def test_build_with_forest_plot_embeds_image(self): + inputs = _minimal_inputs(self.tmp) + inputs.forest_acm = [ + sdb.ForestRow(label="Smith 2024", metric="RR", + point=0.98, ci_low=0.89, ci_high=1.07), + ] + result = sdb.build_synthesis_doc(inputs) + self.assertIn("forest", result["plots"]) + forest_png = Path(result["plots"]["forest"]) + self.assertTrue(forest_png.exists()) + self.assertGreater(forest_png.stat().st_size, 1000) + + # The image is embedded in the docx zip under word/media/. + with zipfile.ZipFile(result["local_docx_path"]) as zf: + media = [n for n in zf.namelist() if n.startswith("word/media/")] + self.assertTrue(media, "expected embedded media in docx") + + +if __name__ == "__main__": + unittest.main()