From 88bdcdced79773d9db0661d63ca8ba6762f1451b Mon Sep 17 00:00:00 2001
From: Anne Fouilloux
Date: Sun, 28 Jun 2026 10:44:59 +0000
Subject: [PATCH] import-nanopub-chain: follow the FORRT backbone, not just refersToNanopub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The constellation importer walked only the curated KnowledgePixels
npa:refersToNanopub graph, which in practice links just CiTO <-> Outcome —
so a BFS from the CiTO stopped after 2 nodes and missed the Study, Claim,
AIDA and Quote. Those steps are connected by domain predicates the network
graph does not index:

  Outcome --isOutcomeOf--> Study --targetsClaim--> Claim
  Claim --asAidaStatement--> <aida-uri> --(asserted by)--> AIDA
  AIDA --related--> Quote

Add backbone_neighbours(): read every nanopub a node points at from its
TriG, resolve the Claim->AIDA hop via a new aida-sentence-nanopub.rq query,
and keep only targets that are themselves FORRT chain steps
(chain_step_kind) so value-lists/templates/papers are dropped and never
crawled. walk() now merges these with the refersToNanopub neighbours (edge
relation 'backbone').

Verified against a published 6-step chain: entering from the CiTO now
returns all six steps (Quote, AIDA, Claim, Study, Outcome, CiTO) instead of
two.
---
 scripts/import-nanopub-chain.py          | 113 ++++++++++++++++++++++-
 scripts/queries/aida-sentence-nanopub.rq |  21 +++++
 2 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 scripts/queries/aida-sentence-nanopub.rq

diff --git a/scripts/import-nanopub-chain.py b/scripts/import-nanopub-chain.py
index c62dccb..308a155 100644
--- a/scripts/import-nanopub-chain.py
+++ b/scripts/import-nanopub-chain.py
@@ -364,6 +364,107 @@ def discover_neighbours(uri: str) -> set[str]:
     return out
 
 
+# --- FORRT-backbone discovery (edges the curated graph doesn't materialise) ---
+#
+# The KnowledgePixels `npa:refersToNanopub` graph (used by discover_neighbours)
+# only links some FORRT steps — in practice CiTO <-> Outcome. The rest of the
+# chain is connected by *domain* predicates that the network graph doesn't
+# index, so a refersToNanopub-only BFS stops after two nodes. We recover the
+# full chain by reading those predicates out of each node's TriG:
+#
+#   Outcome --isOutcomeOf--> Study
+#   Study --targetsClaim--> Claim
+#   Claim --asAidaStatement--> <aida-uri> --(asserted by)--> AIDA
+#   AIDA --related--> Quote
+#
+# To stay robust we don't hard-code the predicate list (templates evolve):
+# we follow *every* nanopub a node points at, then keep only the targets that
+# are themselves FORRT chain steps (so value-lists, templates, papers and other
+# noise are dropped and never crawled).
+
+# Ordered most-specific first; the first match wins.
+_CHAIN_STEP_PATTERNS = [
+    ("Outcome", re.compile(r"/o/terms/[A-Za-z-]*Replication-Outcome", re.I)),
+    ("Study", re.compile(r"/o/terms/[A-Za-z-]*Replication-Study", re.I)),
+    ("Claim", re.compile(r"/o/terms/FORRT-Claim", re.I)),
+    ("AIDA", re.compile(r"/petapico/o/hycl#AIDA-Sentence", re.I)),
+    ("Quote", re.compile(r"hasQuotedText", re.I)),
+    ("CiTO", re.compile(r"/spar/cito/", re.I)),
+]
+
+
+def chain_step_kind(trig_text: str) -> str | None:
+    """Classify a nanopub's TriG as a FORRT chain step (or None for non-steps
+    like templates, value-lists, papers)."""
+    for kind, rx in _CHAIN_STEP_PATTERNS:
+        if rx.search(trig_text):
+            return kind
+    return None
+
+
+def _cached_trig_text(uri: str, cache_dir: Path, timeout: int) -> str:
+    """Fetch (and cache, as UTF-8) a nanopub's TriG, returning '' on failure."""
+    ra_id = uri.rsplit("/", 1)[-1]
+    path = cache_dir / f"{ra_id}.trig"
+    if not path.exists():
+        try:
+            path.write_text(fetch_trig(uri, timeout=timeout), encoding="utf-8")
+        except Exception:  # noqa: BLE001
+            return ""
+    return path.read_text(encoding="utf-8", errors="replace")
+
+
+def find_aida_nanopubs(aida_uri: str) -> list[str]:
+    """Find the AIDA-Sentence nanopub(s) asserting a given AIDA-sentence URI
+    (the Claim -> AIDA hop that refersToNanopub doesn't link)."""
+    try:
+        rows = sparql_query(substitute(load_query("aida-sentence-nanopub"),
+                                       aidaUri=aida_uri))
+    except Exception:  # noqa: BLE001
+        return []
+    out = []
+    for r in rows:
+        canon = canonical_nanopub_uri(r["np"]) if "np" in r else None
+        if canon:
+            out.append(canon)
+    return out
+
+
+def backbone_neighbours(uri: str, trig_path: Path, cache_dir: Path,
+                        timeout: int) -> set[str]:
+    """FORRT-backbone neighbours of a node that the curated refersToNanopub
+    graph misses: every nanopub the node points at (plus the asAidaStatement ->
+    AIDA hop), filtered to those that are themselves FORRT chain steps."""
+    out: set[str] = set()
+    graph = ConjunctiveGraph()
+    try:
+        graph.parse(source=str(trig_path), format="trig")
+    except Exception:  # noqa: BLE001
+        return out
+
+    candidates: set[str] = set()
+    aida_uris: set[str] = set()
+    for _s, p, o in graph.triples((None, None, None)):
+        if not isinstance(o, URIRef):
+            continue
+        if str(p).endswith("asAidaStatement"):
+            aida_uris.add(str(o))  # http://purl.org/aida/...
+            continue
+        canon = canonical_nanopub_uri(str(o))
+        if canon and canon != uri:
+            candidates.add(canon)
+    for au in aida_uris:
+        candidates.update(find_aida_nanopubs(au))
+
+    for canon in candidates:
+        if canon == uri:
+            continue
+        text = _cached_trig_text(canon, cache_dir, timeout)
+        if text and chain_step_kind(text):
+            out.add(canon)
+    return out
+
+
 # --- BFS using SPARQL neighbourhood --------------------------------------
 
 def walk(entry_uri: str, depth_limit: int, max_nodes: int, timeout: int,
@@ -424,14 +525,22 @@ def walk(entry_uri: str, depth_limit: int, max_nodes: int, timeout: int,
         except Exception as e:  # noqa: BLE001
             print(f" ! neighbour discovery failed: {e}", file=sys.stderr)
             neighbours = set()
+        # The curated refersToNanopub graph misses most FORRT-chain edges
+        # (Outcome->Study->Claim->AIDA->Quote); recover them from the TriG.
+        try:
+            backbone = backbone_neighbours(uri, trig_path, cache_dir, timeout)
+        except Exception as e:  # noqa: BLE001
+            print(f" ! backbone discovery failed: {e}", file=sys.stderr)
+            backbone = set()
         # Exclude template URIs the node was created from — those are
         # template definitions, not chain steps. Same for any URI that
         # appears anywhere as the target of `wasCreatedFromTemplate`.
         template_targets = {node.template_uri} if node.template_uri else set()
-        for n in neighbours:
+        for n in neighbours | backbone:
             if n in template_targets:
                 continue
-            edges.append(EdgeSummary(source=uri, target=n, relation="refersTo"))
+            relation = "refersTo" if n in neighbours else "backbone"
+            edges.append(EdgeSummary(source=uri, target=n, relation=relation))
             if n not in visited:
                 queue.append((n, depth + 1))
 
diff --git a/scripts/queries/aida-sentence-nanopub.rq b/scripts/queries/aida-sentence-nanopub.rq
new file mode 100644
index 0000000..184b49e
--- /dev/null
+++ b/scripts/queries/aida-sentence-nanopub.rq
@@ -0,0 +1,21 @@
+# Resolve a FORRT Claim's AIDA sentence to its AIDA-Sentence nanopub.
+#
+# A FORRT Claim points at its AIDA sentence with
+#   sciencelive:asAidaStatement <http://purl.org/aida/...>
+# and a *separate* AIDA-Sentence nanopub asserts that same
+# `http://purl.org/aida/...` resource (typed petapico hycl:AIDA-Sentence).
+# The curated `npa:refersToNanopub` graph does NOT materialise the Claim -> AIDA
+# edge, so the constellation importer resolves it here: find the nanopub whose
+# assertion has the AIDA-sentence URI as a subject.
+#
+# Placeholder: `${aidaUri}` - the `http://purl.org/aida/...` sentence URI.
+
+prefix np: <http://www.nanopub.org/nschema#>
+
+select distinct ?np where {
+  ?np np:hasAssertion ?assertion .
+  graph ?assertion {
+    <${aidaUri}> ?p ?o .
+  }
+}
+limit 20