diff --git a/pyproject.toml b/pyproject.toml index 60f1d39..884554f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,11 +51,14 @@ dependencies = [ ] [project.optional-dependencies] -embeddings = ["sqlite-vec>=0.1.6"] +embeddings = ["sqlite-vec>=0.1.6", "sentence-transformers>=3.0"] +# sqlite-vec is a tiny wheel; including it in dev lets CI exercise the +# vector-store tests without pulling in torch via sentence-transformers. dev = [ "pytest>=8.0", "pytest-asyncio>=0.23", "ruff>=0.5", + "sqlite-vec>=0.1.6", ] # Heavyweight, optional. Required only by `benchmarks/faithfulness/` which # downloads the HaluBench subsample and self-hosts HHEM-2.1-Open for the diff --git a/src/orc/cli_commands/ingest.py b/src/orc/cli_commands/ingest.py index 063f6a4..d43eeca 100644 --- a/src/orc/cli_commands/ingest.py +++ b/src/orc/cli_commands/ingest.py @@ -35,6 +35,8 @@ def ingest_command(source: str, workspace: str | None, no_recursive: bool) -> No console.print( f"[green]Ingested[/green] {len(ids)} evidence item(s) into [bold]{ws.name}[/bold]" ) + if ws.has_embeddings: + console.print(f" embeddings: {ws.embedding_model}") for eid in ids[:10]: console.print(f" [dim]{eid}[/dim]") if len(ids) > 10: diff --git a/src/orc/cli_commands/search.py b/src/orc/cli_commands/search.py index c85363f..e680487 100644 --- a/src/orc/cli_commands/search.py +++ b/src/orc/cli_commands/search.py @@ -50,7 +50,7 @@ def search_command(query: str, workspace: str | None, k: int, as_json: bool) -> console.print("[yellow]No chunks matched[/yellow]") return - table = Table(title=f"BM25 results for '{query}'") + table = Table(title=f"Retrieval results for '{query}'") table.add_column("Rank", justify="right") table.add_column("Score", justify="right") table.add_column("Title") diff --git a/src/orc/cli_commands/workspace.py b/src/orc/cli_commands/workspace.py index 4ea5798..9be64c9 100644 --- a/src/orc/cli_commands/workspace.py +++ b/src/orc/cli_commands/workspace.py @@ -1,16 +1,33 @@ -"""`orc workspace ...` 
commands.""" +"""`orc workspace ...` commands. + +The embedding model is pinned in the workspace row — there is deliberately no +env var to override it at retrieval time, because the column is the +replay-pinned truth for which model embedded the corpus. +""" from __future__ import annotations import click from rich.console import Console +from rich.markup import escape from rich.table import Table -from orc.errors import WorkspaceExistsError +from orc.errors import EmbeddingsUnavailableError, WorkspaceExistsError, WorkspaceNotFoundError +from orc.paths import workspace_db_path +from orc.retrieval.embedder import DEFAULT_EMBEDDING_MODEL, embedder_available, get_embedder from orc.storage import workspace as ws_module +from orc.storage.db import open_connection, transaction +from orc.storage.embeddings_store import ( + backfill_embeddings, + ensure_chunk_vec, + load_vec_extension, + vec_extension_available, +) console = Console() +_INSTALL_HINT = 'pip install "orc-ai[embeddings]"' + @click.group("workspace") def workspace() -> None: @@ -19,10 +36,34 @@ def workspace() -> None: @workspace.command("create") @click.argument("name") -def create_command(name: str) -> None: +@click.option( + "--embeddings", + "embeddings", + is_flag=True, + help="Enable hybrid (BM25 + vector) retrieval for this workspace.", +) +@click.option( + "--embedding-model", + "embedding_model", + default=None, + help=f"Embedding model id (default: {DEFAULT_EMBEDDING_MODEL}). Requires --embeddings.", +) +def create_command(name: str, embeddings: bool, embedding_model: str | None) -> None: """Create a new workspace.""" + if embedding_model is not None and not embeddings: + raise click.UsageError("--embedding-model requires --embeddings") + model = (embedding_model or DEFAULT_EMBEDDING_MODEL) if embeddings else None + + # Warn-but-create: the flag records intent in the workspace row; the user + # can install the extra and run `orc workspace embed` later. 
@workspace.command("embed")
@click.argument("name")
@click.option(
    "--model",
    default=None,
    help="Embedding model id. Only needed when the workspace has none set yet.",
)
def embed_command(name: str, model: str | None) -> None:
    """Backfill vector embeddings for all unembedded chunks in a workspace."""
    # NOTE: the docstring above is the command's --help text, so the longer
    # explanation lives in comments.
    #
    # Flow: resolve the workspace, pick the model (the pinned one wins; a
    # conflicting --model is refused), check the optional dependencies, then
    # open the workspace DB, prepare chunk_vec, pin the model if the workspace
    # had none, and backfill. Every expected failure is converted into a
    # ClickException so the user sees a message rather than a traceback.
    import sqlite3  # local import: only needed to classify extension-load failures

    try:
        ws = ws_module.resolve(name)
    except WorkspaceNotFoundError as exc:
        raise click.ClickException(str(exc)) from exc

    if ws.embedding_model is None:
        effective_model = model or DEFAULT_EMBEDDING_MODEL
    elif model is not None and model != ws.embedding_model:
        raise click.ClickException(
            f"Workspace {ws.name!r} is pinned to embedding model "
            f"{ws.embedding_model!r}; refusing to embed with {model!r}. "
            "Vectors from different models cannot be mixed."
        )
    else:
        effective_model = ws.embedding_model

    if not vec_extension_available():
        raise click.ClickException(
            f"The sqlite-vec extension is unavailable; run: {_INSTALL_HINT}"
        )
    try:
        embedder = get_embedder(effective_model)
    except EmbeddingsUnavailableError as exc:
        raise click.ClickException(str(exc)) from exc

    with open_connection(workspace_db_path(ws.name)) as conn:
        # vec_extension_available() only proves the wheel is importable and
        # that this sqlite3 build exposes enable_load_extension; the actual
        # load can still fail, so report that as a CLI error.
        try:
            load_vec_extension(conn)
        except sqlite3.Error as exc:
            raise click.ClickException(
                f"Could not load the sqlite-vec extension: {exc}"
            ) from exc
        try:
            ensure_chunk_vec(conn, embedder.dim)
        except ValueError as exc:
            raise click.ClickException(str(exc)) from exc
        if ws.embedding_model is None:
            # Pin the model before backfilling so an interrupted backfill
            # resumes with the same model on the next run.
            with transaction(conn):
                conn.execute(
                    "UPDATE workspace SET embedding_model = ? WHERE name = ?",
                    (effective_model, ws.name),
                )
        count = backfill_embeddings(conn, embedder)
    # The model id may come straight from --model; escape it so square
    # brackets in it are not interpreted as rich markup.
    console.print(
        f"[green]Embedded[/green] {count} chunk(s) with [bold]{escape(effective_model)}[/bold]"
    )
candidates = pool[:k] - run.record_retrieval(candidates, method="bm25", candidates_considered=len(pool)) if not candidates: return { diff --git a/src/orc/directives/research/skills/search_evidence.py b/src/orc/directives/research/skills/search_evidence.py index f013274..ebf02e0 100644 --- a/src/orc/directives/research/skills/search_evidence.py +++ b/src/orc/directives/research/skills/search_evidence.py @@ -4,7 +4,7 @@ from typing import Any -from orc.retrieval import bm25_search +from orc.retrieval import retrieve from orc.runs.runner import Run from orc.storage.workspace import Workspace @@ -22,8 +22,11 @@ def run( corpus_version: int | None = None, **_unused: Any, ) -> dict[str, Any]: - chunks = bm25_search(run.conn, query, limit=k, corpus_version=corpus_version) - run.record_retrieval(chunks, method="bm25", candidates_considered=len(chunks)) + res = retrieve(run.conn, query, workspace=workspace, limit=k, corpus_version=corpus_version) + chunks = res.chunks + run.record_retrieval( + chunks, method=res.method, candidates_considered=res.candidates_considered + ) return { "query": query, "k": k, diff --git a/src/orc/directives/research/skills/verify_claim.py b/src/orc/directives/research/skills/verify_claim.py index 985f566..8345e3f 100644 --- a/src/orc/directives/research/skills/verify_claim.py +++ b/src/orc/directives/research/skills/verify_claim.py @@ -19,7 +19,7 @@ from orc.llm.cache import build_verify_messages, format_corpus from orc.llm.client import get_client, messages_create, resolve_model_for_provider from orc.llm.models import resolve_verify_model -from orc.retrieval import bm25_search +from orc.retrieval import retrieve from orc.runs.runner import Run from orc.storage.workspace import Workspace @@ -372,11 +372,17 @@ def run( candidates, method=f"{mode}_all", candidates_considered=len(candidates) ) else: - pool = bm25_search( - run.conn, claim, limit=retrieval_pool, corpus_version=corpus_version + res = retrieve( + run.conn, + claim, + 
workspace=workspace, + limit=retrieval_pool, + corpus_version=corpus_version, + ) + candidates = res.chunks[:k] + run.record_retrieval( + candidates, method=res.method, candidates_considered=res.candidates_considered ) - candidates = pool[:k] - run.record_retrieval(candidates, method="bm25", candidates_considered=len(pool)) if not candidates: return _make_not_found(claim=claim, model=resolved_model, run=run) diff --git a/src/orc/errors.py b/src/orc/errors.py index a9b8a7f..7cd9fba 100644 --- a/src/orc/errors.py +++ b/src/orc/errors.py @@ -28,3 +28,7 @@ class TraceNotFoundError(OrcError): class IngestError(OrcError): pass + + +class EmbeddingsUnavailableError(OrcError): + """Embeddings were requested but the optional dependencies are missing.""" diff --git a/src/orc/ingest/pipeline.py b/src/orc/ingest/pipeline.py index de827ff..b9647a4 100644 --- a/src/orc/ingest/pipeline.py +++ b/src/orc/ingest/pipeline.py @@ -80,6 +80,11 @@ def _ingest_one(workspace: Workspace, doc: LoadedDoc) -> list[str]: # Chunk before any disk write so a chunker failure leaves nothing behind. chunks = chunk_text(doc.text) + # Embed BEFORE the write transaction: model inference can be slow and + # must not hold the BEGIN IMMEDIATE write lock. The vectors are then + # inserted in the same transaction as the chunk rows (atomic). + embeddings = _embed_chunks_for_ingest(conn, workspace=workspace, chunks=chunks) + # Stage the evidence bytes to a temp file and only promote it into place # once the DB transaction commits. A failure anywhere leaves neither an # orphaned file nor a dangling row — the corpus stays consistent. 
@@ -95,6 +100,7 @@ def _ingest_one(workspace: Workspace, doc: LoadedDoc) -> list[str]: sha=sha, doc=doc, chunks=chunks, + embeddings=embeddings, ) except BaseException: tmp_path.unlink(missing_ok=True) @@ -103,6 +109,49 @@ def _ingest_one(workspace: Workspace, doc: LoadedDoc) -> list[str]: return [evidence_id] +def _embed_chunks_for_ingest( + conn: Any, + *, + workspace: Workspace, + chunks: list, +) -> list[list[float]] | None: + """Embed chunk texts when the workspace opts into embeddings. + + Fail-loud by design: a workspace with embedding_model set has promised + hybrid retrieval, so silently ingesting unembedded chunks would corrupt + that promise. Missing deps surface as IngestError with an install hint. + Also prepares chunk_vec (extension + table) before the write transaction. + """ + if workspace.embedding_model is None or not chunks: + return None + + from orc.errors import EmbeddingsUnavailableError + from orc.retrieval.embedder import get_embedder + from orc.storage.embeddings_store import ( + ensure_chunk_vec, + load_vec_extension, + vec_extension_available, + ) + + try: + if not vec_extension_available(): + raise EmbeddingsUnavailableError( + "the sqlite-vec extension is unavailable; " + 'run: pip install "orc-ai[embeddings]"' + ) + embedder = get_embedder(workspace.embedding_model) + except EmbeddingsUnavailableError as exc: + raise IngestError( + f"Workspace {workspace.name!r} requires embeddings " + f"(embedding_model={workspace.embedding_model!r}) but they are " + f"unavailable: {exc}" + ) from exc + + load_vec_extension(conn) + ensure_chunk_vec(conn, embedder.dim) + return embedder.embed_texts([c.text for c in chunks]) + + def _commit_evidence( conn: Any, *, @@ -112,6 +161,7 @@ def _commit_evidence( sha: str, doc: LoadedDoc, chunks: list, + embeddings: list[list[float]] | None = None, ) -> None: with transaction(conn): conn.execute( @@ -136,12 +186,13 @@ def _commit_evidence( new_corpus_version, ), ) - for c in chunks: + chunk_ids = 
[new_chunk_id() for _ in chunks] + for chunk_id, c in zip(chunk_ids, chunks, strict=True): conn.execute( "INSERT INTO chunk(chunk_id, evidence_id, seq, text, token_count, " "headings_path, start_offset, end_offset) VALUES (?,?,?,?,?,?,?,?)", ( - new_chunk_id(), + chunk_id, evidence_id, c.seq, c.text, @@ -151,6 +202,16 @@ def _commit_evidence( c.end_offset, ), ) + if embeddings is not None: + from orc.storage.embeddings_store import store_chunk_embeddings + + store_chunk_embeddings( + conn, + [ + (chunk_id, new_corpus_version, vector) + for chunk_id, vector in zip(chunk_ids, embeddings, strict=True) + ], + ) def _iter_files(root: Path, *, recursive: bool) -> Iterator[Path]: diff --git a/src/orc/retrieval/__init__.py b/src/orc/retrieval/__init__.py index 13db9f2..946c484 100644 --- a/src/orc/retrieval/__init__.py +++ b/src/orc/retrieval/__init__.py @@ -1,5 +1,13 @@ """Retrieval primitives. Pure functions over a sqlite connection.""" from orc.retrieval.bm25 import RetrievedChunk, bm25_search +from orc.retrieval.hybrid import RetrievalResult, retrieve, rrf_fuse, vector_search -__all__ = ["RetrievedChunk", "bm25_search"] +__all__ = [ + "RetrievalResult", + "RetrievedChunk", + "bm25_search", + "retrieve", + "rrf_fuse", + "vector_search", +] diff --git a/src/orc/retrieval/embedder.py b/src/orc/retrieval/embedder.py new file mode 100644 index 0000000..5940b45 --- /dev/null +++ b/src/orc/retrieval/embedder.py @@ -0,0 +1,103 @@ +"""Embedding model access for hybrid retrieval. + +The model is pinned per workspace (workspace.embedding_model column) — there is +deliberately NO env var override at retrieval time, because the workspace column +is the replay-pinned truth: a frozen replay must embed with the same model the +original run used. + +sentence-transformers (and its torch dependency) is heavyweight, so the import +is lazy and everything that only needs the dimension consults the module-level +registry instead of loading the model. 
class SentenceTransformerEmbedder:
    """Embedder backed by a sentence-transformers model.

    sentence_transformers is imported inside ``__init__`` so that importing
    this module does not require the optional dependency.
    """

    def __init__(self, model_id: str = DEFAULT_EMBEDDING_MODEL) -> None:
        """Load ``model_id`` and record its embedding dimension.

        Raises:
            EmbeddingsUnavailableError: sentence-transformers is not installed.
            ValueError: the loaded model does not report an embedding dimension.
        """
        try:
            import sentence_transformers
        except ImportError as exc:
            raise EmbeddingsUnavailableError(
                f"sentence-transformers is not installed; run: {_INSTALL_HINT}"
            ) from exc
        self.model_id = model_id
        self._model = sentence_transformers.SentenceTransformer(model_id)
        # Renamed in sentence-transformers 5.x; support both spellings.
        # The legacy attribute is looked up only when the new one is absent:
        # passing it as getattr()'s default would evaluate it eagerly and
        # raise AttributeError on a release that has dropped the old name.
        get_dim = getattr(self._model, "get_embedding_dimension", None)
        if get_dim is None:
            get_dim = self._model.get_sentence_embedding_dimension
        dim = get_dim()
        if dim is None:
            raise ValueError(
                f"Embedding model {model_id!r} does not report an embedding dimension"
            )
        self.dim = int(dim)

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Return one normalized embedding per input text, in input order."""
        if not texts:
            # Nothing to encode; skip the model call entirely.
            return []
        # Normalized embeddings make L2 distance rank-equivalent to cosine.
        return self._model.encode(texts, normalize_embeddings=True).tolist()
+ + Raises EmbeddingsUnavailableError with an install hint when the optional + dependency is missing, so callers can decide between fail-loud (ingest) + and warn-and-fall-back (retrieval). + """ + if model_id in _cache: + return _cache[model_id] + if _factory is not None: + embedder = _factory(model_id) + elif not embedder_available(): + raise EmbeddingsUnavailableError( + f"Embedding model {model_id!r} requested but sentence-transformers " + f"is not installed; run: {_INSTALL_HINT}" + ) + else: + embedder = SentenceTransformerEmbedder(model_id) + _cache[model_id] = embedder + return embedder + + +def set_embedder_factory(factory: Callable[[str], Embedder] | None) -> None: + """Test hook. Pass None to clear. Clears the cache either way.""" + global _factory + _factory = factory + _cache.clear() diff --git a/src/orc/retrieval/hybrid.py b/src/orc/retrieval/hybrid.py new file mode 100644 index 0000000..a45404c --- /dev/null +++ b/src/orc/retrieval/hybrid.py @@ -0,0 +1,182 @@ +"""Hybrid retrieval: BM25 + dense vectors fused with Reciprocal Rank Fusion. + +Opt-in per workspace via the embedding_model column — workspaces without it +take the plain BM25 path and produce byte-identical results to before. + +Residual replay nondeterminism (documented, accepted): +- The QUERY embedding is recomputed at replay time. chunk_vec rows are pinned + by corpus_version, but torch/SIMD/BLAS variance across machines or library + versions can perturb the query vector in the last few ulps, which can flip + near-tie KNN orderings. Frozen replay is therefore best-effort for the + vector leg; the trace records the method actually used. +- If embedding deps are absent at replay time, retrieve() falls back to BM25 + and records method="bm25" honestly rather than failing the replay. The + replay engine warns when the method differs from the original trace. 
+""" + +from __future__ import annotations + +import dataclasses +import sqlite3 +import warnings +from dataclasses import dataclass + +from orc.errors import EmbeddingsUnavailableError +from orc.retrieval.bm25 import RetrievedChunk, bm25_search +from orc.retrieval.embedder import get_embedder +from orc.storage.embeddings_store import ( + knn_chunk_ids, + load_vec_extension, + vec_extension_available, +) +from orc.storage.workspace import Workspace + + +@dataclass(frozen=True) +class RetrievalResult: + chunks: list[RetrievedChunk] + method: str + candidates_considered: int + + +# Same column set as bm25._SELECT minus the FTS score: vector hits hydrate into +# the same RetrievedChunk shape so downstream consumers can't tell legs apart. +_HYDRATE_SELECT = """ +SELECT + chunk.chunk_id AS chunk_id, + chunk.evidence_id AS evidence_id, + chunk.seq AS seq, + chunk.text AS text, + chunk.headings_path AS headings_path, + chunk.token_count AS token_count, + evidence.title AS evidence_title, + evidence.source_path AS evidence_source_path +FROM chunk +JOIN evidence ON evidence.evidence_id = chunk.evidence_id +WHERE chunk.chunk_id IN ({placeholders}) +""" + + +def vector_search( + conn: sqlite3.Connection, + query_embedding: list[float], + *, + limit: int, + corpus_version: int | None, +) -> list[RetrievedChunk]: + """KNN over chunk_vec, hydrated to RetrievedChunk in KNN (nearest-first) order. + + bm25_score is 0.0 for vector hits: the field carries the FTS score and a + vector distance is not comparable, so we keep the sentinel explicit. + """ + hits = knn_chunk_ids(conn, query_embedding, limit=limit, corpus_version=corpus_version) + if not hits: + return [] + ids = [chunk_id for chunk_id, _ in hits] + placeholders = ", ".join("?" 
def rrf_fuse(
    bm25_results: list[RetrievedChunk],
    vector_results: list[RetrievedChunk],
    *,
    k: int = 60,
    limit: int,
) -> list[RetrievedChunk]:
    """Reciprocal Rank Fusion over the two legs, rank-only.

    score(chunk) = sum over lists containing it of 1 / (k + rank + 1), with
    0-based ranks. Rank-only fusion sidesteps the incomparability of BM25
    scores and vector distances. For overlapping chunk_ids the BM25 instance
    is kept so the real bm25_score survives into the result. Ties sort by
    chunk_id for deterministic output.

    Args:
        bm25_results: BM25 leg, best first.
        vector_results: Vector leg, nearest first.
        k: RRF damping constant; must be non-negative.
        limit: Maximum number of fused chunks to return. A non-positive
            limit yields an empty list.

    Returns:
        Up to ``limit`` chunks ordered by descending fused score, each copied
        with ``rank`` rewritten to its 0-based position in the fused list.

    Raises:
        ValueError: if ``k`` is negative (the denominator could reach zero).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k!r}")
    if limit <= 0:
        # A negative limit would otherwise slice to "all but the last N".
        return []

    scores: dict[str, float] = {}
    instances: dict[str, RetrievedChunk] = {}
    # Vector leg first so the BM25 leg, processed second, overwrites the
    # stored instance on overlap.
    for leg in (vector_results, bm25_results):
        seen_in_leg: set[str] = set()
        for rank, chunk in enumerate(leg):
            cid = chunk.chunk_id
            if cid in seen_in_leg:
                # Each list contributes at most once per chunk (its best
                # rank), matching the formula in the docstring.
                continue
            seen_in_leg.add(cid)
            scores[cid] = scores.get(cid, 0.0) + 1.0 / (k + rank + 1)
            instances[cid] = chunk

    ordered = sorted(scores, key=lambda cid: (-scores[cid], cid))[:limit]
    return [dataclasses.replace(instances[cid], rank=i) for i, cid in enumerate(ordered)]
def _vector_leg_unavailable_reason(conn: sqlite3.Connection, model: str) -> str | None:
    """Return None when the vector leg can run; otherwise a short reason.

    The reason is phrased to be embedded in a sentence ("... but <reason>;
    falling back to BM25"). Checks run in order: the sqlite-vec wheel and
    extension-loading support, the embedding model dependencies, loading the
    extension into ``conn``, and the existence of the chunk_vec table.
    """
    if not vec_extension_available():
        return "the sqlite-vec extension is unavailable"
    try:
        get_embedder(model)
    except EmbeddingsUnavailableError:
        return "the embedding model dependencies are not installed"
    try:
        load_vec_extension(conn)
    except sqlite3.Error as exc:
        # The availability check cannot prove the load will succeed. Report
        # a failure as a reason so the caller degrades to BM25 instead of
        # failing the read path.
        return f"the sqlite-vec extension failed to load ({exc})"
    row = conn.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table' AND name = 'chunk_vec'"
    ).fetchone()
    if row is None:
        return "the chunk_vec table does not exist yet"
    return None
"new_run_id": run.run_id, @@ -82,6 +86,27 @@ def replay(run_id: str, *, live: bool = False) -> dict[str, Any]: } +def _warn_on_retrieval_method_drift( + *, + original_trace: dict[str, Any], + new_retrieval: dict[str, Any] | None, +) -> None: + """Frozen replay promises reproduction; a retrieval method change (e.g. + hybrid_rrf -> bm25 because embedding deps are absent at replay time) means + the chunk pool may differ even with corpus_version pinned. Surface it + rather than letting the drift pass silently.""" + original_method = (original_trace.get("retrieval") or {}).get("method") + new_method = (new_retrieval or {}).get("method") + if original_method and new_method and original_method != new_method: + warnings.warn( + f"Frozen replay used a different retrieval method than the original " + f"run: {original_method!r} -> {new_method!r}. Retrieved chunks may " + "differ; check embedding dependencies and chunk_vec state.", + RuntimeWarning, + stacklevel=3, + ) + + def _resolve_replay_kwargs( *, spec: Any, diff --git a/src/orc/storage/embeddings_store.py b/src/orc/storage/embeddings_store.py new file mode 100644 index 0000000..6a3e9fe --- /dev/null +++ b/src/orc/storage/embeddings_store.py @@ -0,0 +1,179 @@ +"""chunk_vec vector store backed by the sqlite-vec extension. + +The table is created lazily (only workspaces that opt into embeddings pay for +it), and its dimension is stamped into schema_meta so a later open with a +different embedding model fails loudly instead of silently mixing vector +spaces. +""" + +from __future__ import annotations + +import sqlite3 +from importlib.util import find_spec +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from orc.retrieval.embedder import Embedder + +_DIM_META_KEY = "chunk_vec_dim" + + +def vec_extension_available() -> bool: + """True when sqlite-vec can actually be loaded into this interpreter. 
def ensure_chunk_vec(conn: sqlite3.Connection, dim: int) -> None:
    """Create chunk_vec for `dim`-dimensional vectors, or verify the stamp.

    A dim mismatch means the workspace's embedding model changed under us —
    distances across models are meaningless, so we refuse rather than guess.

    Args:
        conn: Connection to the workspace database.
        dim: Vector dimension; must be a positive integer.

    Raises:
        ValueError: if ``dim`` is not a positive integer, or if chunk_vec was
            already stamped with a different dimension.
    """
    import operator

    # `dim` is interpolated into DDL below (it cannot be bound as a
    # parameter), so insist on a real positive integer first.
    try:
        dim = operator.index(dim)
    except TypeError:
        raise ValueError(f"chunk_vec dim must be a positive integer, got {dim!r}") from None
    if dim <= 0:
        raise ValueError(f"chunk_vec dim must be a positive integer, got {dim!r}")

    stamped = _stamped_dim(conn)
    if stamped is not None and stamped != dim:
        raise ValueError(
            f"chunk_vec dim mismatch: table was created with dim={stamped}, "
            f"requested dim={dim}. Re-embed the workspace with one model."
        )
    conn.execute(
        "CREATE VIRTUAL TABLE IF NOT EXISTS chunk_vec USING vec0("
        f"chunk_id TEXT PRIMARY KEY, embedding FLOAT[{dim}], corpus_version INTEGER)"
    )
    if stamped is None:
        # First creation: record the dimension so later calls can verify it.
        conn.execute(
            "INSERT OR REPLACE INTO schema_meta(key, value) VALUES (?, ?)",
            (_DIM_META_KEY, str(dim)),
        )
def knn_chunk_ids(
    conn: sqlite3.Connection,
    query_vec: list[float],
    *,
    limit: int,
    corpus_version: int | None = None,
) -> list[tuple[str, float]]:
    """K-nearest chunk_ids for a query vector, nearest first.

    The outer ORDER BY adds chunk_id as a tie-break: sqlite-vec guarantees
    distance order but not tie order, and replayable retrieval needs full
    determinism.

    Args:
        conn: Connection with sqlite-vec loaded and chunk_vec present; rows
            are read by column name.
        query_vec: Query embedding.
        limit: Number of neighbours requested. A non-positive limit returns
            an empty list without touching the database.
        corpus_version: When given, only vectors stamped with a
            corpus_version less than or equal to it are considered.

    Returns:
        ``(chunk_id, distance)`` pairs ordered by distance, then chunk_id.
    """
    if limit <= 0:
        # Nothing was asked for; do not hand a zero or negative k to vec0.
        return []

    import sqlite_vec

    inner = "SELECT chunk_id, distance FROM chunk_vec WHERE embedding MATCH ? AND k = ?"
    params: tuple = (sqlite_vec.serialize_float32(query_vec), limit)
    if corpus_version is not None:
        inner += " AND corpus_version <= ?"
        params = (*params, corpus_version)
    # MATERIALIZED stops SQLite from flattening the subquery: vec0 KNN scans
    # only accept a bare ORDER BY distance, so the tie-break must apply outside.
    rows = conn.execute(
        f"WITH knn AS MATERIALIZED ({inner}) "
        "SELECT chunk_id, distance FROM knn ORDER BY distance, chunk_id",
        params,
    ).fetchall()
    return [(row["chunk_id"], float(row["distance"])) for row in rows]
def backfill_embeddings(
    conn: sqlite3.Connection, embedder: Embedder, batch_size: int = 64
) -> int:
    """Embed every chunk that has no chunk_vec row yet. Returns rows written.

    Only chunks missing from chunk_vec are touched, so re-running after a
    crash (or on an already-complete corpus) picks up just the remainder.
    Each vector is stamped with the corpus_version of the chunk's evidence
    row, not the current one, so corpus_version filtering stays truthful.
    Batches commit independently — a failure mid-backfill keeps the batches
    that already committed.

    Args:
        conn: Connection with sqlite-vec loaded and chunk_vec created.
        embedder: Embedder used to compute the vectors.
        batch_size: Chunks embedded and committed per transaction; must be
            positive.

    Raises:
        ValueError: if ``batch_size`` is not positive, or if the embedder
            returns a different number of vectors than texts in a batch.
    """
    if batch_size <= 0:
        # A negative step would make the loop below a silent no-op and
        # report 0 rows written while chunks remain unembedded.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    from orc.storage.db import transaction

    missing = chunks_missing_embeddings(conn)
    written = 0
    for start in range(0, len(missing), batch_size):
        batch = missing[start : start + batch_size]
        # Inference happens outside the transaction so a slow model does not
        # hold the write lock.
        vectors = embedder.embed_texts([row["text"] for row in batch])
        with transaction(conn):
            store_chunk_embeddings(
                conn,
                [
                    (row["chunk_id"], row["corpus_version"], vector)
                    for row, vector in zip(batch, vectors, strict=True)
                ],
            )
        written += len(batch)
    return written
workspace. Schema: --- CREATE VIRTUAL TABLE chunk_vec USING vec0(chunk_id TEXT PRIMARY KEY, embedding FLOAT[N]); +-- chunk_vec is created lazily by storage/embeddings_store.py (ensure_chunk_vec) +-- when embeddings are enabled for a workspace. Requires the sqlite-vec extension. +-- The vector dimension N is stamped in schema_meta under 'chunk_vec_dim'. Schema: +-- CREATE VIRTUAL TABLE chunk_vec USING vec0( +-- chunk_id TEXT PRIMARY KEY, embedding FLOAT[N], corpus_version INTEGER); CREATE TABLE IF NOT EXISTS run ( run_id TEXT PRIMARY KEY, diff --git a/tests/_fake_embedder.py b/tests/_fake_embedder.py new file mode 100644 index 0000000..258019c --- /dev/null +++ b/tests/_fake_embedder.py @@ -0,0 +1,44 @@ +"""Fake embedder for tests. Deterministic keyword -> one-hot mapping. + +Scripting semantic hits: pass a vocabulary mapping keyword -> dimension index. +Any text containing a vocabulary keyword embeds with 1.0 at that index, so a +query sharing the keyword lands at L2 distance 0 from the chunk. Texts with no +vocabulary hit fall back to 0.5 at a CRC32 bucket (stable across processes, +unlike Python's randomized str hash); the 0.5 magnitude guarantees a fallback +vector never equals a scripted one-hot even when the buckets collide. 
+""" + +from __future__ import annotations + +import re +import zlib + + +class FakeEmbedder: + def __init__( + self, + dim: int = 8, + *, + model_id: str = "fake-embedder", + vocabulary: dict[str, int] | None = None, + ) -> None: + self.model_id = model_id + self.dim = dim + self.vocabulary = dict(vocabulary or {}) + self.calls: list[list[str]] = [] + + def embed_texts(self, texts: list[str]) -> list[list[float]]: + self.calls.append(list(texts)) + return [self._embed(t) for t in texts] + + def _embed(self, text: str) -> list[float]: + vec = [0.0] * self.dim + words = set(re.findall(r"\w+", text.lower())) + hit = False + for keyword, index in sorted(self.vocabulary.items()): + if keyword in words: + vec[index % self.dim] = 1.0 + hit = True + if not hit: + vec[zlib.crc32(text.encode("utf-8")) % self.dim] = 0.5 + return vec diff --git a/tests/conftest.py b/tests/conftest.py index 722bef4..9d6af61 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,9 +8,13 @@ import os from collections.abc import Iterator from pathlib import Path +from typing import TYPE_CHECKING import pytest +if TYPE_CHECKING: + from tests._fake_embedder import FakeEmbedder + # Every env var that lets orc.llm.client.get_client() construct a live provider. # get_client() PREFERS OPENROUTER_API_KEY over ANTHROPIC_API_KEY, and ORC_PROVIDER # can force either path, so stripping only the Anthropic key is not enough. @@ -31,3 +35,19 @@ def _no_live_llm_env(monkeypatch: pytest.MonkeyPatch) -> None: if not os.environ.get("ORC_TEST_ALLOW_LIVE_LLM"): for var in _LIVE_LLM_ENV_VARS: monkeypatch.delenv(var, raising=False) + + +@pytest.fixture +def fake_embedder() -> Iterator[FakeEmbedder]: + """Install a deterministic FakeEmbedder via the embedder factory hook. + + Tests script semantic hits through fake.vocabulary (keyword -> dimension). + The factory is reset afterwards so the cache never leaks across tests. 
+ """ + from orc.retrieval.embedder import set_embedder_factory + from tests._fake_embedder import FakeEmbedder + + fake = FakeEmbedder(dim=8) + set_embedder_factory(lambda model_id: fake) + yield fake + set_embedder_factory(None) diff --git a/tests/unit/test_embedder.py b/tests/unit/test_embedder.py new file mode 100644 index 0000000..3bd608d --- /dev/null +++ b/tests/unit/test_embedder.py @@ -0,0 +1,66 @@ +"""Embedder protocol tests: registry, factory hook, availability, errors.""" + +from __future__ import annotations + +import pytest + +from orc.errors import EmbeddingsUnavailableError, OrcError +from orc.retrieval import embedder as embedder_module +from orc.retrieval.embedder import ( + DEFAULT_EMBEDDING_MODEL, + embedder_available, + get_embedder, + model_dim, + set_embedder_factory, +) +from tests._fake_embedder import FakeEmbedder + + +def test_registry_knows_default_model_dim_without_loading() -> None: + assert model_dim(DEFAULT_EMBEDDING_MODEL) == 384 + + +def test_registry_returns_none_for_unknown_model() -> None: + assert model_dim("not/a-model") is None + + +def test_get_embedder_uses_factory_and_caches() -> None: + fake = FakeEmbedder(dim=8) + set_embedder_factory(lambda model_id: fake) + try: + first = get_embedder("any-model") + second = get_embedder("any-model") + assert first is fake + assert second is fake + finally: + set_embedder_factory(None) + + +def test_get_embedder_raises_with_install_hint_when_deps_missing( + monkeypatch: pytest.MonkeyPatch, +) -> None: + set_embedder_factory(None) + monkeypatch.setattr(embedder_module, "find_spec", lambda name: None) + with pytest.raises(EmbeddingsUnavailableError, match=r'pip install "orc-ai\[embeddings\]"'): + get_embedder(DEFAULT_EMBEDDING_MODEL) + + +def test_embeddings_unavailable_error_is_orc_error() -> None: + assert issubclass(EmbeddingsUnavailableError, OrcError) + + +def test_embedder_available_false_when_find_spec_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + 
monkeypatch.setattr(embedder_module, "find_spec", lambda name: None) + assert embedder_available() is False + + +def test_fake_embedder_is_deterministic_and_scriptable() -> None: + fake = FakeEmbedder(dim=8, vocabulary={"skills": 2}) + [a] = fake.embed_texts(["the skills api"]) + [b] = fake.embed_texts(["SKILLS everywhere"]) + assert a == b + assert a[2] == 1.0 + [unrelated] = fake.embed_texts(["kubernetes pods"]) + assert unrelated != a diff --git a/tests/unit/test_embeddings_store.py b/tests/unit/test_embeddings_store.py new file mode 100644 index 0000000..f1b4245 --- /dev/null +++ b/tests/unit/test_embeddings_store.py @@ -0,0 +1,107 @@ +"""Embeddings store tests: chunk_vec lifecycle, KNN, and availability probing.""" + +from __future__ import annotations + +import sqlite3 + +import pytest + +from orc.storage import embeddings_store +from orc.storage.db import bootstrap_schema + + +def _connect() -> sqlite3.Connection: + conn = sqlite3.connect(":memory:", isolation_level=None) + conn.row_factory = sqlite3.Row + bootstrap_schema(conn) + return conn + + +def _vec_conn(dim: int) -> sqlite3.Connection: + pytest.importorskip("sqlite_vec") + conn = _connect() + embeddings_store.load_vec_extension(conn) + embeddings_store.ensure_chunk_vec(conn, dim) + return conn + + +def test_store_and_knn_roundtrip() -> None: + conn = _vec_conn(4) + embeddings_store.store_chunk_embeddings( + conn, + [ + ("c1", 1, [1.0, 0.0, 0.0, 0.0]), + ("c2", 1, [0.0, 1.0, 0.0, 0.0]), + ], + ) + hits = embeddings_store.knn_chunk_ids(conn, [1.0, 0.0, 0.0, 0.0], limit=2) + assert [cid for cid, _ in hits] == ["c1", "c2"] + assert hits[0][1] == pytest.approx(0.0) + assert hits[0][1] < hits[1][1] + + +def test_knn_corpus_version_filter() -> None: + conn = _vec_conn(4) + embeddings_store.store_chunk_embeddings( + conn, + [ + ("c1", 1, [1.0, 0.0, 0.0, 0.0]), + ("c2", 2, [1.0, 0.0, 0.0, 0.0]), + ], + ) + hits = embeddings_store.knn_chunk_ids(conn, [1.0, 0.0, 0.0, 0.0], limit=5, corpus_version=1) + assert 
[cid for cid, _ in hits] == ["c1"] + + +def test_knn_equal_distances_tie_break_on_chunk_id() -> None: + conn = _vec_conn(4) + # Insert in reverse-lexicographic order to prove ordering is not insertion order. + embeddings_store.store_chunk_embeddings( + conn, + [ + ("c2", 1, [0.0, 1.0, 0.0, 0.0]), + ("c1", 1, [0.0, 1.0, 0.0, 0.0]), + ], + ) + hits = embeddings_store.knn_chunk_ids(conn, [0.0, 1.0, 0.0, 0.0], limit=2) + assert [cid for cid, _ in hits] == ["c1", "c2"] + + +def test_ensure_chunk_vec_dim_mismatch_raises() -> None: + conn = _vec_conn(4) + with pytest.raises(ValueError, match="dim"): + embeddings_store.ensure_chunk_vec(conn, 8) + + +def test_store_rejects_wrong_length_vector() -> None: + conn = _vec_conn(4) + with pytest.raises(ValueError, match="dim"): + embeddings_store.store_chunk_embeddings(conn, [("c1", 1, [1.0, 0.0])]) + + +def test_vec_extension_available_false_when_find_spec_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(embeddings_store, "find_spec", lambda name: None) + assert embeddings_store.vec_extension_available() is False + + +def test_chunks_missing_embeddings_lists_unembedded_chunks() -> None: + conn = _vec_conn(4) + conn.execute( + "INSERT INTO evidence(evidence_id, source_path, stored_path, sha256, mime_type, " + "ingested_at, corpus_version) VALUES (?,?,?,?,?,?,?)", + ("ev1", "/x", "/y", "deadbeef", "text/plain", "2026-06-12T00:00:00Z", 3), + ) + conn.execute( + "INSERT INTO chunk(chunk_id, evidence_id, seq, text, token_count, headings_path, " + "start_offset, end_offset) VALUES (?,?,?,?,?,?,?,?)", + ("c1", "ev1", 0, "hello world", 2, None, 0, 11), + ) + missing = embeddings_store.chunks_missing_embeddings(conn) + assert [(m["chunk_id"], m["text"], m["corpus_version"]) for m in missing] == [ + ("c1", "hello world", 3) + ] + + embeddings_store.store_chunk_embeddings(conn, [("c1", 3, [1.0, 0.0, 0.0, 0.0])]) + assert embeddings_store.chunks_missing_embeddings(conn) == [] diff --git a/tests/unit/test_hybrid.py 
b/tests/unit/test_hybrid.py new file mode 100644 index 0000000..d0ed4fb --- /dev/null +++ b/tests/unit/test_hybrid.py @@ -0,0 +1,238 @@ +"""Hybrid retrieval tests: RRF fusion math, vector hydration, fallbacks.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from orc.retrieval import retrieve, rrf_fuse, vector_search +from orc.retrieval.bm25 import RetrievedChunk, bm25_search +from orc.retrieval.embedder import set_embedder_factory +from orc.storage import embeddings_store +from orc.storage import workspace as ws_module +from orc.storage.db import open_connection +from tests._fake_embedder import FakeEmbedder + + +def _chunk(chunk_id: str, *, rank: int, bm25_score: float = 0.0) -> RetrievedChunk: + return RetrievedChunk( + chunk_id=chunk_id, + evidence_id=f"ev-{chunk_id}", + seq=0, + text=f"text {chunk_id}", + headings_path=None, + token_count=3, + rank=rank, + bm25_score=bm25_score, + evidence_title=None, + evidence_source_path="/x", + ) + + +def test_rrf_fuse_hand_computed_scores() -> None: + # k=60, 0-based ranks. Scores: + # A: 1/61 (bm25 rank 0) + # B: 1/62 + 1/61 (bm25 rank 1, vector rank 0) + # C: 1/62 (vector rank 1) + # B > A > C. 
def test_rrf_fuse_ties_order_by_chunk_id() -> None:
    """Chunks with equal RRF scores must come back ordered by chunk_id."""
    # B appears only in bm25 at rank 0, A only in vector at rank 0: equal RRF
    # scores. B is passed in the first list on purpose, so only a chunk_id
    # tie-break (not list order) can put A ahead of it. Determinism demands
    # the tie-break be chunk_id, not list order.
    fused = rrf_fuse([_chunk("B", rank=0)], [_chunk("A", rank=0)], k=60, limit=10)
    assert [c.chunk_id for c in fused] == ["A", "B"]
open_connection(workspace_db_path(ws.name)) as conn: + embeddings_store.load_vec_extension(conn) + embeddings_store.ensure_chunk_vec(conn, fake.dim) + missing = embeddings_store.chunks_missing_embeddings(conn) + if missing: + vectors = fake.embed_texts([m["text"] for m in missing]) + embeddings_store.store_chunk_embeddings( + conn, + [ + (m["chunk_id"], m["corpus_version"], v) + for m, v in zip(missing, vectors, strict=True) + ], + ) + return ws_module.resolve(ws.name) + + +def test_vector_search_hydrates_in_knn_order( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + from orc.paths import workspace_db_path + + ws = _setup_embedded_corpus(tmp_path, fake_embedder) + # Query the LATER-ingested doc so KNN order differs from insertion order. + [query_vec] = fake_embedder.embed_texts(["skills"]) + with open_connection(workspace_db_path(ws.name)) as conn: + embeddings_store.load_vec_extension(conn) + chunks = vector_search(conn, query_vec, limit=5, corpus_version=None) + assert chunks[0].evidence_title == "Skills API" + assert chunks[0].text.startswith("# Skills API") + assert [c.rank for c in chunks] == list(range(len(chunks))) + assert all(c.bm25_score == 0.0 for c in chunks) + + +def test_retrieve_uses_bm25_when_no_embedding_model( + orc_home: Path, tmp_path: Path +) -> None: + from orc.ingest.pipeline import ingest as do_ingest + from orc.paths import workspace_db_path + + ws = ws_module.create("plain") + corpus = tmp_path / "corpus" + corpus.mkdir() + (corpus / "a.md").write_text("# Doc\n\nPrompt caching has a 5-minute TTL.\n") + do_ingest(ws, str(corpus)) + + with open_connection(workspace_db_path(ws.name)) as conn: + res = retrieve(conn, "prompt caching", workspace=ws, limit=5) + expected = bm25_search(conn, "prompt caching", limit=5) + assert res.method == "bm25" + assert [c.chunk_id for c in res.chunks] == [c.chunk_id for c in expected] + assert res.candidates_considered == len(expected) + + +def 
test_retrieve_falls_back_when_vec_extension_missing( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder, monkeypatch: pytest.MonkeyPatch +) -> None: + pytest.importorskip("sqlite_vec") + from orc.paths import workspace_db_path + from orc.retrieval import hybrid as hybrid_module + + ws = _setup_embedded_corpus(tmp_path, fake_embedder) + monkeypatch.setattr(hybrid_module, "vec_extension_available", lambda: False) + with open_connection(workspace_db_path(ws.name)) as conn: # noqa: SIM117 + with pytest.warns(RuntimeWarning, match="orc workspace embed"): + res = retrieve(conn, "skills", workspace=ws, limit=5) + assert res.method == "bm25" + + +def test_retrieve_falls_back_when_embedder_missing( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder, monkeypatch: pytest.MonkeyPatch +) -> None: + pytest.importorskip("sqlite_vec") + from orc.paths import workspace_db_path + from orc.retrieval import embedder as embedder_module + + ws = _setup_embedded_corpus(tmp_path, fake_embedder) + # Drop the factory and make sentence-transformers look uninstalled. + set_embedder_factory(None) + monkeypatch.setattr(embedder_module, "find_spec", lambda name: None) + try: + with open_connection(workspace_db_path(ws.name)) as conn: # noqa: SIM117 + with pytest.warns(RuntimeWarning, match="orc workspace embed"): + res = retrieve(conn, "skills", workspace=ws, limit=5) + finally: + set_embedder_factory(lambda model_id: fake_embedder) + assert res.method == "bm25" + + +def test_retrieve_falls_back_when_chunk_vec_absent( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + from orc.ingest.pipeline import ingest as do_ingest + from orc.paths import workspace_db_path + + # Corpus ingested BEFORE embeddings were enabled: chunk_vec never created. + # Flipping the model flag afterwards must not break retrieval before + # `orc workspace embed` has been run. 
+ ws = ws_module.create("novec") + corpus = tmp_path / "corpus" + corpus.mkdir() + (corpus / "a.md").write_text("# Doc\n\nSkills are versioned capabilities.\n") + do_ingest(ws, str(corpus)) + with open_connection(workspace_db_path(ws.name)) as conn: + conn.execute( + "UPDATE workspace SET embedding_model = ? WHERE name = ?", + (fake_embedder.model_id, ws.name), + ) + with pytest.warns(RuntimeWarning, match="orc workspace embed"): + res = retrieve(conn, "skills", workspace=ws_module.resolve(ws.name), limit=5) + assert res.method == "bm25" + + +def test_search_evidence_skill_records_hybrid_method( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + from orc.directives.research.skills.search_evidence import search_evidence + from orc.runs import open_run + from orc.storage.trace_store import load_trace + + ws = _setup_embedded_corpus(tmp_path, fake_embedder) + with open_run(ws, directive="research", skill="search_evidence", inputs={}) as run: + result = search_evidence.run(workspace=ws, run=run, query="skills", k=5) + run.close(output=result) + + trace = load_trace(run.run_id) + assert trace["retrieval"]["method"] == "hybrid_rrf" + assert trace["retrieval"]["candidates_considered"] >= 1 + assert result["chunks"], "expected fused hits" + + +def test_retrieve_hybrid_fuses_and_reports_union( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + from orc.paths import workspace_db_path + + ws = _setup_embedded_corpus(tmp_path, fake_embedder) + with open_connection(workspace_db_path(ws.name)) as conn: + res = retrieve(conn, "skills", workspace=ws, limit=5) + bm25_ids = {c.chunk_id for c in bm25_search(conn, "skills", limit=5)} + [query_vec] = fake_embedder.embed_texts(["skills"]) + vec_ids = {c.chunk_id for c in vector_search(conn, query_vec, limit=5, corpus_version=None)} + assert res.method == "hybrid_rrf" + assert res.candidates_considered == len(bm25_ids 
| vec_ids) + # The semantically scripted doc must be in the fused result. + assert any(c.evidence_title == "Skills API" for c in res.chunks) + assert [c.rank for c in res.chunks] == list(range(len(res.chunks))) diff --git a/tests/unit/test_ingest_embeddings.py b/tests/unit/test_ingest_embeddings.py new file mode 100644 index 0000000..4f9865d --- /dev/null +++ b/tests/unit/test_ingest_embeddings.py @@ -0,0 +1,154 @@ +"""Embed-at-ingest and backfill tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from orc.errors import IngestError +from orc.ingest.pipeline import ingest as do_ingest +from orc.paths import workspace_db_path +from orc.retrieval.embedder import set_embedder_factory +from orc.storage import embeddings_store +from orc.storage import workspace as ws_module +from orc.storage.db import open_connection +from tests._fake_embedder import FakeEmbedder + + +def _write_doc(tmp_path: Path, name: str, text: str) -> Path: + doc = tmp_path / name + doc.write_text(text) + return doc + + +def test_ingest_embeds_chunks_atomically( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + ws = ws_module.create("demo", embedding_model=fake_embedder.model_id) + doc = _write_doc(tmp_path, "a.md", "# Doc A\n\nThe Skills API ships in October 2025.\n") + do_ingest(ws, str(doc)) + + with open_connection(workspace_db_path(ws.name)) as conn: + embeddings_store.load_vec_extension(conn) + chunk_count = conn.execute("SELECT COUNT(*) AS n FROM chunk").fetchone()["n"] + vec_rows = conn.execute( + "SELECT chunk_id, corpus_version FROM chunk_vec ORDER BY chunk_id" + ).fetchall() + evidence_cv = conn.execute("SELECT corpus_version FROM evidence").fetchone()[ + "corpus_version" + ] + assert chunk_count >= 1 + assert len(vec_rows) == chunk_count + assert all(row["corpus_version"] == evidence_cv for row in vec_rows) + + +def test_ingest_rolls_back_when_embedding_fails( + orc_home: Path, 
tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + + class _BoomError(RuntimeError): + pass + + def _explode(texts: list[str]) -> list[list[float]]: + raise _BoomError("embedding backend down") + + fake_embedder.embed_texts = _explode # type: ignore[method-assign] + ws = ws_module.create("demo", embedding_model=fake_embedder.model_id) + doc = _write_doc(tmp_path, "a.md", "# Doc A\n\nSome content.\n") + with pytest.raises(_BoomError): + do_ingest(ws, str(doc)) + + with open_connection(workspace_db_path(ws.name)) as conn: + assert conn.execute("SELECT COUNT(*) AS n FROM evidence").fetchone()["n"] == 0 + assert conn.execute("SELECT COUNT(*) AS n FROM chunk").fetchone()["n"] == 0 + + +def test_ingest_fails_loud_when_model_set_but_embedder_missing( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from orc.retrieval import embedder as embedder_module + + set_embedder_factory(None) + monkeypatch.setattr(embedder_module, "find_spec", lambda name: None) + ws = ws_module.create("demo", embedding_model="some-model") + doc = _write_doc(tmp_path, "a.md", "# Doc A\n\nSome content.\n") + with pytest.raises(IngestError, match=r'pip install "orc-ai\[embeddings\]"'): + do_ingest(ws, str(doc)) + + +def _seed_two_versions_unembedded(tmp_path: Path) -> ws_module.Workspace: + """Two ingests (corpus_version 1 and 2) into a workspace WITHOUT embeddings.""" + ws = ws_module.create("demo") + do_ingest(ws, str(_write_doc(tmp_path, "a.md", "# Doc A\n\nFirst document body.\n"))) + do_ingest(ws, str(_write_doc(tmp_path, "b.md", "# Doc B\n\nSecond document body.\n"))) + return ws_module.resolve(ws.name) + + +def test_backfill_preserves_original_corpus_versions( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + ws = _seed_two_versions_unembedded(tmp_path) + with open_connection(workspace_db_path(ws.name)) as conn: + embeddings_store.load_vec_extension(conn) + 
embeddings_store.ensure_chunk_vec(conn, fake_embedder.dim) + count = embeddings_store.backfill_embeddings(conn, fake_embedder) + rows = conn.execute( + "SELECT chunk_vec.corpus_version AS vec_cv, evidence.corpus_version AS ev_cv " + "FROM chunk_vec " + "JOIN chunk ON chunk.chunk_id = chunk_vec.chunk_id " + "JOIN evidence ON evidence.evidence_id = chunk.evidence_id" + ).fetchall() + assert count == len(rows) >= 2 + assert all(row["vec_cv"] == row["ev_cv"] for row in rows) + assert {row["ev_cv"] for row in rows} == {1, 2} + + +def test_backfill_is_idempotent( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + ws = _seed_two_versions_unembedded(tmp_path) + with open_connection(workspace_db_path(ws.name)) as conn: + embeddings_store.load_vec_extension(conn) + embeddings_store.ensure_chunk_vec(conn, fake_embedder.dim) + first = embeddings_store.backfill_embeddings(conn, fake_embedder) + second = embeddings_store.backfill_embeddings(conn, fake_embedder) + total = conn.execute("SELECT COUNT(*) AS n FROM chunk_vec").fetchone()["n"] + assert first >= 2 + assert second == 0 + assert total == first + + +def test_cli_ingest_prints_embeddings_line_when_active( + orc_home: Path, tmp_path: Path, fake_embedder: FakeEmbedder +) -> None: + pytest.importorskip("sqlite_vec") + from click.testing import CliRunner + + from orc.cli import main + + ws_module.create("demo", embedding_model=fake_embedder.model_id) + doc = _write_doc(tmp_path, "a.md", "# Doc A\n\nSome content.\n") + runner = CliRunner() + result = runner.invoke(main, ["ingest", str(doc), "--workspace", "demo"]) + assert result.exit_code == 0, result.output + assert f"embeddings: {fake_embedder.model_id}" in result.output + + +def test_cli_ingest_no_embeddings_line_for_plain_workspace( + orc_home: Path, tmp_path: Path +) -> None: + from click.testing import CliRunner + + from orc.cli import main + + ws_module.create("demo") + doc = _write_doc(tmp_path, "a.md", "# 
Doc A\n\nSome content.\n") + runner = CliRunner() + result = runner.invoke(main, ["ingest", str(doc), "--workspace", "demo"]) + assert result.exit_code == 0, result.output + assert "embeddings:" not in result.output diff --git a/tests/unit/test_replay.py b/tests/unit/test_replay.py index 8d7fa37..2d8b376 100644 --- a/tests/unit/test_replay.py +++ b/tests/unit/test_replay.py @@ -153,6 +153,69 @@ def test_replay_works_for_extract_claims_runs( assert new_trace["output"]["claims"], "extract_claims should have produced claims on replay" +def _seed_embedded_corpus(tmp_path: Path, fake) -> str: + """Workspace with embeddings enabled; ingest hook embeds via the fake.""" + fake.vocabulary.update({"skills": 1}) + ws = ws_module.create("demo", embedding_model=fake.model_id) + corpus = tmp_path / "v1" + corpus.mkdir() + (corpus / "a.md").write_text("# Doc A\n\nThe Skills API ships in October 2025.\n") + do_ingest(ws, str(corpus)) + return ws.name + + +def test_replay_frozen_hybrid_pins_corpus_version( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, fake_embedder +) -> None: + pytest.importorskip("sqlite_vec") + name = _seed_embedded_corpus(tmp_path, fake_embedder) + original = _verify_once(name, "skills api", monkeypatch) + original_trace = load_trace(original) + assert original_trace["retrieval"]["method"] == "hybrid_rrf" + + # Grow the corpus (also auto-embedded), bumping corpus_version. 
+ extra = tmp_path / "v2" + extra.mkdir() + (extra / "b.md").write_text("# Doc B\n\nMore skills content arriving later.\n") + do_ingest(ws_module.resolve(name), str(extra)) + + fake = FakeAnthropic(responses=[make_verdict_response(label="not_found", confidence=0.5)]) + monkeypatch.setattr(client_module, "_client", fake) + out = replay(original) + assert out["mode"] == "frozen" + + new_trace = load_trace(out["new_run_id"]) + assert new_trace["retrieval"]["method"] == "hybrid_rrf" + new_chunk_ids = {c["chunk_id"] for c in new_trace["retrieval"]["returned"]} + original_chunk_ids = {c["chunk_id"] for c in original_trace["retrieval"]["returned"]} + assert new_chunk_ids == original_chunk_ids + + +def test_replay_frozen_warns_on_retrieval_method_mismatch( + orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, fake_embedder +) -> None: + pytest.importorskip("sqlite_vec") + from orc.retrieval import embedder as embedder_module + from orc.retrieval.embedder import set_embedder_factory + + name = _seed_embedded_corpus(tmp_path, fake_embedder) + original = _verify_once(name, "skills api", monkeypatch) + assert load_trace(original)["retrieval"]["method"] == "hybrid_rrf" + + # Embedding deps vanish before the replay: retrieval falls back to BM25, + # and the replay engine must surface the method drift. 
+ set_embedder_factory(None) + monkeypatch.setattr(embedder_module, "find_spec", lambda name: None) + fake = FakeAnthropic(responses=[make_verdict_response(label="not_found", confidence=0.5)]) + monkeypatch.setattr(client_module, "_client", fake) + try: + with pytest.warns(RuntimeWarning, match="retrieval method"): + out = replay(original) + finally: + set_embedder_factory(lambda model_id: fake_embedder) + assert load_trace(out["new_run_id"])["retrieval"]["method"] == "bm25" + + def test_replay_records_lineage_in_inputs( orc_home: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/unit/test_workspace.py b/tests/unit/test_workspace.py index 69dbd76..2d98f9f 100644 --- a/tests/unit/test_workspace.py +++ b/tests/unit/test_workspace.py @@ -145,3 +145,96 @@ def test_cli_workspace_list_after_create(orc_home: Path) -> None: result = runner.invoke(main, ["workspace", "list"]) assert result.exit_code == 0 assert "demo" in result.output + + +def test_cli_workspace_create_embeddings_sets_default_model(orc_home: Path) -> None: + from orc.retrieval.embedder import DEFAULT_EMBEDDING_MODEL + + runner = CliRunner() + result = runner.invoke(main, ["workspace", "create", "demo", "--embeddings"]) + assert result.exit_code == 0, result.output + assert ws_module.resolve("demo").embedding_model == DEFAULT_EMBEDDING_MODEL + + +def test_cli_workspace_create_embeddings_custom_model(orc_home: Path) -> None: + runner = CliRunner() + result = runner.invoke( + main, + ["workspace", "create", "demo", "--embeddings", "--embedding-model", "my/model"], + ) + assert result.exit_code == 0, result.output + assert ws_module.resolve("demo").embedding_model == "my/model" + + +def test_cli_workspace_create_embeddings_warns_but_creates_when_deps_missing( + orc_home: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from orc.cli_commands import workspace as workspace_cli + + monkeypatch.setattr(workspace_cli, "embedder_available", lambda: False) + runner = CliRunner() + 
result = runner.invoke(main, ["workspace", "create", "demo", "--embeddings"]) + assert result.exit_code == 0, result.output + assert "orc-ai[embeddings]" in result.output + assert ws_module.resolve("demo").embedding_model is not None + + +def test_cli_workspace_create_embedding_model_requires_embeddings_flag( + orc_home: Path, +) -> None: + runner = CliRunner() + result = runner.invoke( + main, ["workspace", "create", "demo", "--embedding-model", "my/model"] + ) + assert result.exit_code != 0 + assert "--embeddings" in result.output + + +def test_cli_workspace_embed_backfills_and_sets_model(orc_home: Path, tmp_path: Path) -> None: + pytest.importorskip("sqlite_vec") + from orc.ingest.pipeline import ingest as do_ingest + from orc.retrieval.embedder import set_embedder_factory + from tests._fake_embedder import FakeEmbedder + + fake = FakeEmbedder(dim=8) + set_embedder_factory(lambda model_id: fake) + try: + ws = ws_module.create("demo") + doc = tmp_path / "a.md" + doc.write_text("# Doc A\n\nSome content to embed.\n") + do_ingest(ws, str(doc)) + + runner = CliRunner() + result = runner.invoke( + main, ["workspace", "embed", "demo", "--model", fake.model_id] + ) + assert result.exit_code == 0, result.output + assert f"chunk(s) with {fake.model_id}" in result.output + assert "Embedded" in result.output + assert ws_module.resolve("demo").embedding_model == fake.model_id + + with open_connection(workspace_db_path("demo")) as conn: + from orc.storage.embeddings_store import load_vec_extension + + load_vec_extension(conn) + n = conn.execute("SELECT COUNT(*) AS n FROM chunk_vec").fetchone()["n"] + assert n >= 1 + finally: + set_embedder_factory(None) + + +def test_cli_workspace_embed_conflicting_model_errors(orc_home: Path) -> None: + pytest.importorskip("sqlite_vec") + from orc.retrieval.embedder import set_embedder_factory + from tests._fake_embedder import FakeEmbedder + + fake = FakeEmbedder(dim=8) + set_embedder_factory(lambda model_id: fake) + try: + 
ws_module.create("demo", embedding_model="model-a") + runner = CliRunner() + result = runner.invoke(main, ["workspace", "embed", "demo", "--model", "model-b"]) + assert result.exit_code != 0 + assert "model-a" in result.output + finally: + set_embedder_factory(None) diff --git a/uv.lock b/uv.lock index 9b48075..2634679 100644 --- a/uv.lock +++ b/uv.lock @@ -908,6 +908,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/e9/1f9ada30cef7b05e74bb06f52127e7a724976c225f46adb65c37b1dadfb6/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f00d94b281174144d6532a04b66a12cb866cbdc47c3af3bfe2973677f9861a", size = 349613 }, ] +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071 }, +] + [[package]] name = "jsonschema" version = "4.26.0" @@ -1201,6 +1210,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477 }, ] +[[package]] +name = "narwhals" +version = "2.22.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/62/3c/c4ef2164a71c1a63d7f1ae411c4082c5fa872405106db60a4b7114989ad7/narwhals-2.22.1.tar.gz", hash = "sha256:d62920805a0a43b7ff8b54b0c0d3142d796f8a9301836ada37e573d6a33cbcd9", size = 
647493 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/ca/36339329c4604adbcc99c899b7eb1ce1a555c499b6a6860757dc9bfed36d/narwhals-2.22.1-py3-none-any.whl", hash = "sha256:60567d774edf77db53906f89d9fbd164e66e56d66d388e1e6990f17ac33cfb53", size = 454815 }, +] + [[package]] name = "networkx" version = "3.6.1" @@ -1470,8 +1488,10 @@ dev = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "ruff" }, + { name = "sqlite-vec" }, ] embeddings = [ + { name = "sentence-transformers" }, { name = "sqlite-vec" }, ] @@ -1492,6 +1512,8 @@ requires-dist = [ { name = "pyyaml", specifier = ">=6.0" }, { name = "rich", specifier = ">=13.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.5" }, + { name = "sentence-transformers", marker = "extra == 'embeddings'", specifier = ">=3.0" }, + { name = "sqlite-vec", marker = "extra == 'dev'", specifier = ">=0.1.6" }, { name = "sqlite-vec", marker = "extra == 'embeddings'", specifier = ">=0.1.6" }, { name = "tiktoken", specifier = ">=0.7" }, { name = "torch", marker = "extra == 'benchmarks'", specifier = ">=2.2" }, @@ -2352,6 +2374,141 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380 }, ] +[[package]] +name = "scikit-learn" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "narwhals" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fa/6f/37092bdb25f712817231799fc5674d8e704066a8a70c1d2d40517e18b4ab/scikit_learn-1.9.0.tar.gz", hash = "sha256:8833266989d3a5110178a9fae30783675460724d0e1efb13b14901d2c660c557", size = 7750767 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/f5/be/e844fd9586e66540a15b71924d17a6cbc1bb749e81ddd0a796bcdba4c055/scikit_learn-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9db6f4d34e68c8899e4cab27fdf8eafe6ed21f2ba52ceb25ea250cd237f8e47b", size = 8789686 }, + { url = "https://files.pythonhosted.org/packages/42/e2/ff880f62677a17d035817d543cb0fc8727d01eccbee81c5f7fc733a9d856/scikit_learn-1.9.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:f401448645a3e7bc115aa3c094097865155b34bff1cba8101857d9104e99074c", size = 8256782 }, + { url = "https://files.pythonhosted.org/packages/25/64/eb40435e1a508ab1b4e284ce43ae80f6a162e5be5e38ed5a6fab467a9ea4/scikit_learn-1.9.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fd3a8ef0c758555a3b23c03adaa858af32f7736785ded50ad5991f59c4ed03fa", size = 8992419 }, + { url = "https://files.pythonhosted.org/packages/8d/da/4810a28e473185429e45a57eebcc91fc991b33d889cc0676063e671db03d/scikit_learn-1.9.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f7e254636164090da847715a27f8e5478feb98c40a9e0ee90cbd277de9e5ceb8", size = 9281411 }, + { url = "https://files.pythonhosted.org/packages/3b/67/be3d369f40d8178ba3bd86635d132e08cb5329b023e4669d9426d84bc007/scikit_learn-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:5dc1818c77575d149e25fce9ef82dd7b7263ae372f03494158668ad632a69759", size = 8272736 }, + { url = "https://files.pythonhosted.org/packages/37/79/a733f02dc2118da7e77a134b34f39f40201a353311b011d20859d2db3556/scikit_learn-1.9.0-cp311-cp311-win_arm64.whl", hash = "sha256:366652351f092b219c248f1e72821e841960a63d8f358f1dcfd54dc1cbdbbc28", size = 7919564 }, + { url = "https://files.pythonhosted.org/packages/ac/20/75f915ff375d6249e6550ac740fdbbd66159a068fd3af1400ff62036b07a/scikit_learn-1.9.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2bd41b0d201bc81575531b96b713d3eb5e5f50fb0b82101ff0f92294fdc236ac", size = 8741122 }, + { url = 
"https://files.pythonhosted.org/packages/cc/d5/2b5148f2279196775e1db2aeb85d14b70ac80e7e32b3b28e7ebeafb0901d/scikit_learn-1.9.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5be45aa4a42a68a533913a6ed736cf309de2226411c79ef8d609a5456f1939b1", size = 8261512 }, + { url = "https://files.pythonhosted.org/packages/a0/ee/5adbc77656b71f9456a2f5a7a9fdb4bcf9207a6b962889f1c2f9323afa4e/scikit_learn-1.9.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e50ed4da51974e86e940690e9a3d82e729b62b5a49f7c9bac534d515d39d86f", size = 8837603 }, + { url = "https://files.pythonhosted.org/packages/6c/c2/63fdda36c56437eeb44aaf9493c8bcd62ce230ab1598924fc626ffbfa943/scikit_learn-1.9.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:056c92bb67ad4c28463c2f2653d9701449201e7e7a9e94e321be0f71c4fef2b8", size = 9132097 }, + { url = "https://files.pythonhosted.org/packages/83/a4/c8e67227c680e2259c8864ae72ff48b06e16a6f51253a22167aa02a8aa4e/scikit_learn-1.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4306775fad04cc4b472a1b15af1ae9cede1540fbfcc17fbce3767cd8dc7ae283", size = 8211173 }, + { url = "https://files.pythonhosted.org/packages/cf/fd/3c0863792e98e67e9184aa4029288a175935eb65443afcd30d4f143450cf/scikit_learn-1.9.0-cp312-cp312-win_arm64.whl", hash = "sha256:26e22435f63bcdcf396b574273f29f13dd531f5ea035801f5be10ba1540a4e60", size = 7867451 }, + { url = "https://files.pythonhosted.org/packages/3c/01/cf3310626b6d48d3e9be69a1223f9180360b5e6edb045f50fade723ce494/scikit_learn-1.9.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:80746d63bd4b6eaca54d36fe5feaf4d28bb38dc6f9470f81c7cad7c40155f119", size = 8705188 }, + { url = "https://files.pythonhosted.org/packages/3e/04/5acd7ae280c5f93b6ac5ef6cdec14eef4c8d1cd91d85b3292989c94d96b1/scikit_learn-1.9.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5b934c45c252844a91d69fda3a34cff5e7307e1db10d77cb10a3980312c74713", size = 8228299 }, + { url = 
"https://files.pythonhosted.org/packages/0c/39/ffe829a5b8ecb40a518724a997794657fdc354ada5e8fe8e64d998c0bac9/scikit_learn-1.9.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38c3dcb9a1ffb85505ec53d54c7b4aea0cff70050425a7760c2af661ac85df05", size = 8789690 }, + { url = "https://files.pythonhosted.org/packages/1f/88/8dab5de10c638c083772a6be83a3d8106ced492f74a928c8693638e5bb50/scikit_learn-1.9.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da76d09304a4706db7cc1e3ebaa3b6b98a67365cc11d2996c4f1e58ba47df714", size = 9087723 }, + { url = "https://files.pythonhosted.org/packages/20/3f/7917ca72464038f6240ec70c29f94862d08a34a74291ae4d4ec5eb8186a0/scikit_learn-1.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5808d98f15c6bf6d9d96d2348c1997392a5888ce7097e664105f930c4bca1277", size = 8184330 }, + { url = "https://files.pythonhosted.org/packages/78/c7/15739eb2f61fda3c54639e9942414e5a19ad8a8d1f5a3266afad7cb7df80/scikit_learn-1.9.0-cp313-cp313-win_arm64.whl", hash = "sha256:d77f54c017633791bc0225a43e2f8d03745fdcfe4880268fcc4df15f505dec2e", size = 7840653 }, + { url = "https://files.pythonhosted.org/packages/f4/7d/c9a35cf59b20a86fec24d306f1547b78dec194b08d367ce2a3e4854169d9/scikit_learn-1.9.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9656acd4e93f74e0b66c8a36c88830a99252dfa900044d36bc2212ae89a47162", size = 8713289 }, + { url = "https://files.pythonhosted.org/packages/3c/a7/552a7821597c632b907f7bfe8f36f9f572777af8ef8a48353041cf8e091a/scikit_learn-1.9.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:24360002ae845e7866522b0a5bbf690802e7bc388cac8663502e78aa98598aa2", size = 8245141 }, + { url = "https://files.pythonhosted.org/packages/7d/79/f4a0c4fe9711154cddabf913471153af79056382ddc612cfe5ee0ff4b72e/scikit_learn-1.9.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5162ad10a418c8a282dde04c9aa06965de3e9a65f33c1440c0ae69bb1a09d913", size = 8847671 }, + { url = 
"https://files.pythonhosted.org/packages/f0/af/4d72d9e475ac83719160c662619e4bf7b95c19507cd582e7d0167a3c3dae/scikit_learn-1.9.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fea2cc5677ab49d6f5bade978c866da44957b712d92e9635e8b4f723013c3cb", size = 9118104 }, + { url = "https://files.pythonhosted.org/packages/a2/d5/6a58eea2cb9abbb9b3f2bb8b2cfb3243d1152d69f442d256c7af71304769/scikit_learn-1.9.0-cp314-cp314-win_amd64.whl", hash = "sha256:64fa347efc1c839c487433e40c5144d38c336e8a2b59c81aa8660373945c2673", size = 8290674 }, + { url = "https://files.pythonhosted.org/packages/65/5b/d4c879cf358f1187141cf90ced473f087183489090244f50c124a2ee478b/scikit_learn-1.9.0-cp314-cp314-win_arm64.whl", hash = "sha256:1b944b6db288f6b926e3650026ddafb988929de95d11fc2cc5fa117773c9ba42", size = 7978807 }, + { url = "https://files.pythonhosted.org/packages/8a/43/bfae3121ec67ae09150d453c442c7c1cc166e9aefe056e6ab3b7728a5cfc/scikit_learn-1.9.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4ccacf04ca5f4b492158a5f28afe0ace43f81b2571e4b9a66d34848b46128949", size = 9031941 }, + { url = "https://files.pythonhosted.org/packages/75/b0/20a4546eb17f3b25d3c66df15810411c14ed5065bcfab50b53c96fb627b2/scikit_learn-1.9.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:ee1a8db2c18c08e34c7412d4b10be1cac214cd4ea7dc9715a6a327eb49a37c96", size = 8613528 }, + { url = "https://files.pythonhosted.org/packages/18/3c/e440e039bb82cd19004edaaad00acbde0fb9b461083c3ecf37941c557312/scikit_learn-1.9.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:147e9329ef0e39f75d4cffa02b2aa48d827832684926cd5210d9a2cb5c57246b", size = 8855050 }, + { url = "https://files.pythonhosted.org/packages/43/26/b341b8dab5998da6270a3a42c2152c578501354d36f944b5856757035ef8/scikit_learn-1.9.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bad8f8b9950321b54c965fdcbac6c6c55e79e16646b49977bcf3668d3870a1a", size = 9097190 }, + { url = 
"https://files.pythonhosted.org/packages/fb/de/b650b4d69b84468cfa2e28a3ff7b8103743029e6446ce1a97fe060ef688c/scikit_learn-1.9.0-cp314-cp314t-win_amd64.whl", hash = "sha256:78fc56eafd4edb9575d2d8950d1dd152061abb573341a1cb7e099fc40f6c6666", size = 8963204 }, + { url = "https://files.pythonhosted.org/packages/ee/f3/ff83d76d7418112e5a61326443cdda87be3545dd8d6599c95b2481a4419e/scikit_learn-1.9.0-cp314-cp314t-win_arm64.whl", hash = "sha256:051075bda8b7aab87b1906ab3d4740a1e1224a19d7b3781a576736edc94e76aa", size = 8222661 }, +] + +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675 }, + { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057 }, + { url = "https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032 }, + { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 
22709533 }, + { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057 }, + { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300 }, + { url = "https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333 }, + { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314 }, + { url = "https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512 }, + { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248 }, + { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954 }, + { url = 
"https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662 }, + { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366 }, + { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017 }, + { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842 }, + { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890 }, + { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557 }, + { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856 }, + { url = 
"https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682 }, + { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340 }, + { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199 }, + { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001 }, + { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719 }, + { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595 }, + { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429 }, + { url = 
"https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952 }, + { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063 }, + { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449 }, + { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943 }, + { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621 }, + { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708 }, + { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135 }, + { url = 
"https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977 }, + { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601 }, + { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667 }, + { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159 }, + { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771 }, + { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910 }, + { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980 }, + { url = 
"https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543 }, + { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510 }, + { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131 }, + { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032 }, + { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766 }, + { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007 }, + { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333 }, + { url = 
"https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066 }, + { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763 }, + { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984 }, + { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877 }, + { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750 }, + { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858 }, + { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723 }, + { url = 
"https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098 }, + { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397 }, + { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163 }, + { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291 }, + { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317 }, + { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327 }, + { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165 }, +] + +[[package]] +name = "sentence-transformers" +version = "5.5.1" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cf/d4/7ef93157485e978c016f49da05363c1e4e7237beb5343b64b5631101f0f1/sentence_transformers-5.5.1.tar.gz", hash = "sha256:02b7740dfc60bdbbcb6061625f5d97a5c1a4e2d3baac5f9391b912bb5eae2290", size = 445161 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/03/ee99a6b030e7a2e056547729f8a4709dd93e13d9c6f07590f74c395c4017/sentence_transformers-5.5.1-py3-none-any.whl", hash = "sha256:4fe11d433badc5282d32f7fc08bc714216b7a5aca426f9df77a45a554756deb7", size = 588887 }, +] + [[package]] name = "setuptools" version = "81.0.0" @@ -2438,6 +2595,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353 }, ] +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638 }, +] + [[package]] name = "tiktoken" version = "0.12.0"