Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,14 @@ dependencies = [
]

[project.optional-dependencies]
embeddings = ["sqlite-vec>=0.1.6"]
embeddings = ["sqlite-vec>=0.1.6", "sentence-transformers>=3.0"]
# sqlite-vec is a tiny wheel; including it in dev lets CI exercise the
# vector-store tests without pulling in torch via sentence-transformers.
dev = [
"pytest>=8.0",
"pytest-asyncio>=0.23",
"ruff>=0.5",
"sqlite-vec>=0.1.6",
]
# Heavyweight, optional. Required only by `benchmarks/faithfulness/` which
# downloads the HaluBench subsample and self-hosts HHEM-2.1-Open for the
Expand Down
2 changes: 2 additions & 0 deletions src/orc/cli_commands/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def ingest_command(source: str, workspace: str | None, no_recursive: bool) -> No
console.print(
f"[green]Ingested[/green] {len(ids)} evidence item(s) into [bold]{ws.name}[/bold]"
)
if ws.has_embeddings:
console.print(f" embeddings: {ws.embedding_model}")
for eid in ids[:10]:
console.print(f" [dim]{eid}[/dim]")
if len(ids) > 10:
Expand Down
2 changes: 1 addition & 1 deletion src/orc/cli_commands/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def search_command(query: str, workspace: str | None, k: int, as_json: bool) ->
console.print("[yellow]No chunks matched[/yellow]")
return

table = Table(title=f"BM25 results for '{query}'")
table = Table(title=f"Retrieval results for '{query}'")
table.add_column("Rank", justify="right")
table.add_column("Score", justify="right")
table.add_column("Title")
Expand Down
101 changes: 97 additions & 4 deletions src/orc/cli_commands/workspace.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,33 @@
"""`orc workspace ...` commands."""
"""`orc workspace ...` commands.

The embedding model is pinned in the workspace row — there is deliberately no
env var to override it at retrieval time, because the column is the
replay-pinned truth for which model embedded the corpus.
"""

from __future__ import annotations

import click
from rich.console import Console
from rich.markup import escape
from rich.table import Table

from orc.errors import WorkspaceExistsError
from orc.errors import EmbeddingsUnavailableError, WorkspaceExistsError, WorkspaceNotFoundError
from orc.paths import workspace_db_path
from orc.retrieval.embedder import DEFAULT_EMBEDDING_MODEL, embedder_available, get_embedder
from orc.storage import workspace as ws_module
from orc.storage.db import open_connection, transaction
from orc.storage.embeddings_store import (
backfill_embeddings,
ensure_chunk_vec,
load_vec_extension,
vec_extension_available,
)

console = Console()

_INSTALL_HINT = 'pip install "orc-ai[embeddings]"'


@click.group("workspace")
def workspace() -> None:
Expand All @@ -19,17 +36,93 @@ def workspace() -> None:

@workspace.command("create")
@click.argument("name")
def create_command(name: str) -> None:
@click.option(
"--embeddings",
"embeddings",
is_flag=True,
help="Enable hybrid (BM25 + vector) retrieval for this workspace.",
)
@click.option(
"--embedding-model",
"embedding_model",
default=None,
help=f"Embedding model id (default: {DEFAULT_EMBEDDING_MODEL}). Requires --embeddings.",
)
def create_command(name: str, embeddings: bool, embedding_model: str | None) -> None:
"""Create a new workspace."""
if embedding_model is not None and not embeddings:
raise click.UsageError("--embedding-model requires --embeddings")
model = (embedding_model or DEFAULT_EMBEDDING_MODEL) if embeddings else None

# Warn-but-create: the flag records intent in the workspace row; the user
# can install the extra and run `orc workspace embed` later.
if model is not None and not (embedder_available() and vec_extension_available()):
console.print(
"[yellow]Warning:[/yellow] embedding dependencies are not installed; "
f"ingest will fail until you run: {escape(_INSTALL_HINT)}"
)

try:
ws = ws_module.create(name)
ws = ws_module.create(name, embedding_model=model)
except WorkspaceExistsError as exc:
raise click.ClickException(str(exc)) from exc
except ValueError as exc:
raise click.ClickException(str(exc)) from exc
console.print(f"[green]Created workspace[/green] [bold]{ws.name}[/bold]")
console.print(f" schema_version = {ws.schema_version}")
console.print(f" created_at = {ws.created_at}")
if ws.has_embeddings:
console.print(f" embeddings = {ws.embedding_model}")


@workspace.command("embed")
@click.argument("name")
@click.option(
"--model",
default=None,
help="Embedding model id. Only needed when the workspace has none set yet.",
)
def embed_command(name: str, model: str | None) -> None:
"""Backfill vector embeddings for all unembedded chunks in a workspace."""
try:
ws = ws_module.resolve(name)
except WorkspaceNotFoundError as exc:
raise click.ClickException(str(exc)) from exc

if ws.embedding_model is None:
effective_model = model or DEFAULT_EMBEDDING_MODEL
elif model is not None and model != ws.embedding_model:
raise click.ClickException(
f"Workspace {ws.name!r} is pinned to embedding model "
f"{ws.embedding_model!r}; refusing to embed with {model!r}. "
"Vectors from different models cannot be mixed."
)
else:
effective_model = ws.embedding_model

if not vec_extension_available():
raise click.ClickException(
f"The sqlite-vec extension is unavailable; run: {_INSTALL_HINT}"
)
try:
embedder = get_embedder(effective_model)
except EmbeddingsUnavailableError as exc:
raise click.ClickException(str(exc)) from exc

with open_connection(workspace_db_path(ws.name)) as conn:
load_vec_extension(conn)
try:
ensure_chunk_vec(conn, embedder.dim)
except ValueError as exc:
raise click.ClickException(str(exc)) from exc
if ws.embedding_model is None:
with transaction(conn):
conn.execute(
"UPDATE workspace SET embedding_model = ? WHERE name = ?",
(effective_model, ws.name),
)
count = backfill_embeddings(conn, embedder)
console.print(f"[green]Embedded[/green] {count} chunk(s) with [bold]{effective_model}[/bold]")


@workspace.command("list")
Expand Down
12 changes: 7 additions & 5 deletions src/orc/directives/research/skills/research_topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from orc.llm.cache import build_verify_messages, format_corpus
from orc.llm.client import get_client, messages_create, resolve_model_for_provider
from orc.llm.models import resolve_research_model
from orc.retrieval import bm25_search
from orc.retrieval import retrieve
from orc.runs.runner import Run
from orc.storage.workspace import Workspace

Expand Down Expand Up @@ -73,11 +73,13 @@ def run(
raise ValueError("topic must be a non-empty string")

resolved_model = resolve_research_model(model)
pool = bm25_search(
run.conn, topic, limit=retrieval_pool, corpus_version=corpus_version
res = retrieve(
run.conn, topic, workspace=workspace, limit=retrieval_pool, corpus_version=corpus_version
)
candidates = res.chunks[:k]
run.record_retrieval(
candidates, method=res.method, candidates_considered=res.candidates_considered
)
candidates = pool[:k]
run.record_retrieval(candidates, method="bm25", candidates_considered=len(pool))

if not candidates:
return {
Expand Down
9 changes: 6 additions & 3 deletions src/orc/directives/research/skills/search_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from typing import Any

from orc.retrieval import bm25_search
from orc.retrieval import retrieve
from orc.runs.runner import Run
from orc.storage.workspace import Workspace

Expand All @@ -22,8 +22,11 @@ def run(
corpus_version: int | None = None,
**_unused: Any,
) -> dict[str, Any]:
chunks = bm25_search(run.conn, query, limit=k, corpus_version=corpus_version)
run.record_retrieval(chunks, method="bm25", candidates_considered=len(chunks))
res = retrieve(run.conn, query, workspace=workspace, limit=k, corpus_version=corpus_version)
chunks = res.chunks
run.record_retrieval(
chunks, method=res.method, candidates_considered=res.candidates_considered
)
return {
"query": query,
"k": k,
Expand Down
16 changes: 11 additions & 5 deletions src/orc/directives/research/skills/verify_claim.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from orc.llm.cache import build_verify_messages, format_corpus
from orc.llm.client import get_client, messages_create, resolve_model_for_provider
from orc.llm.models import resolve_verify_model
from orc.retrieval import bm25_search
from orc.retrieval import retrieve
from orc.runs.runner import Run
from orc.storage.workspace import Workspace

Expand Down Expand Up @@ -372,11 +372,17 @@ def run(
candidates, method=f"{mode}_all", candidates_considered=len(candidates)
)
else:
pool = bm25_search(
run.conn, claim, limit=retrieval_pool, corpus_version=corpus_version
res = retrieve(
run.conn,
claim,
workspace=workspace,
limit=retrieval_pool,
corpus_version=corpus_version,
)
candidates = res.chunks[:k]
run.record_retrieval(
candidates, method=res.method, candidates_considered=res.candidates_considered
)
candidates = pool[:k]
run.record_retrieval(candidates, method="bm25", candidates_considered=len(pool))

if not candidates:
return _make_not_found(claim=claim, model=resolved_model, run=run)
Expand Down
4 changes: 4 additions & 0 deletions src/orc/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ class TraceNotFoundError(OrcError):

class IngestError(OrcError):
pass


class EmbeddingsUnavailableError(OrcError):
"""Embeddings were requested but the optional dependencies are missing."""
65 changes: 63 additions & 2 deletions src/orc/ingest/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ def _ingest_one(workspace: Workspace, doc: LoadedDoc) -> list[str]:
# Chunk before any disk write so a chunker failure leaves nothing behind.
chunks = chunk_text(doc.text)

# Embed BEFORE the write transaction: model inference can be slow and
# must not hold the BEGIN IMMEDIATE write lock. The vectors are then
# inserted in the same transaction as the chunk rows (atomic).
embeddings = _embed_chunks_for_ingest(conn, workspace=workspace, chunks=chunks)

# Stage the evidence bytes to a temp file and only promote it into place
# once the DB transaction commits. A failure anywhere leaves neither an
# orphaned file nor a dangling row — the corpus stays consistent.
Expand All @@ -95,6 +100,7 @@ def _ingest_one(workspace: Workspace, doc: LoadedDoc) -> list[str]:
sha=sha,
doc=doc,
chunks=chunks,
embeddings=embeddings,
)
except BaseException:
tmp_path.unlink(missing_ok=True)
Expand All @@ -103,6 +109,49 @@ def _ingest_one(workspace: Workspace, doc: LoadedDoc) -> list[str]:
return [evidence_id]


def _embed_chunks_for_ingest(
conn: Any,
*,
workspace: Workspace,
chunks: list,
) -> list[list[float]] | None:
"""Embed chunk texts when the workspace opts into embeddings.

Fail-loud by design: a workspace with embedding_model set has promised
hybrid retrieval, so silently ingesting unembedded chunks would corrupt
that promise. Missing deps surface as IngestError with an install hint.
Also prepares chunk_vec (extension + table) before the write transaction.
"""
if workspace.embedding_model is None or not chunks:
return None

from orc.errors import EmbeddingsUnavailableError
from orc.retrieval.embedder import get_embedder
from orc.storage.embeddings_store import (
ensure_chunk_vec,
load_vec_extension,
vec_extension_available,
)

try:
if not vec_extension_available():
raise EmbeddingsUnavailableError(
"the sqlite-vec extension is unavailable; "
'run: pip install "orc-ai[embeddings]"'
)
embedder = get_embedder(workspace.embedding_model)
except EmbeddingsUnavailableError as exc:
raise IngestError(
f"Workspace {workspace.name!r} requires embeddings "
f"(embedding_model={workspace.embedding_model!r}) but they are "
f"unavailable: {exc}"
) from exc

load_vec_extension(conn)
ensure_chunk_vec(conn, embedder.dim)
return embedder.embed_texts([c.text for c in chunks])


def _commit_evidence(
conn: Any,
*,
Expand All @@ -112,6 +161,7 @@ def _commit_evidence(
sha: str,
doc: LoadedDoc,
chunks: list,
embeddings: list[list[float]] | None = None,
) -> None:
with transaction(conn):
conn.execute(
Expand All @@ -136,12 +186,13 @@ def _commit_evidence(
new_corpus_version,
),
)
for c in chunks:
chunk_ids = [new_chunk_id() for _ in chunks]
for chunk_id, c in zip(chunk_ids, chunks, strict=True):
conn.execute(
"INSERT INTO chunk(chunk_id, evidence_id, seq, text, token_count, "
"headings_path, start_offset, end_offset) VALUES (?,?,?,?,?,?,?,?)",
(
new_chunk_id(),
chunk_id,
evidence_id,
c.seq,
c.text,
Expand All @@ -151,6 +202,16 @@ def _commit_evidence(
c.end_offset,
),
)
if embeddings is not None:
from orc.storage.embeddings_store import store_chunk_embeddings

store_chunk_embeddings(
conn,
[
(chunk_id, new_corpus_version, vector)
for chunk_id, vector in zip(chunk_ids, embeddings, strict=True)
],
)


def _iter_files(root: Path, *, recursive: bool) -> Iterator[Path]:
Expand Down
10 changes: 9 additions & 1 deletion src/orc/retrieval/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
"""Retrieval primitives. Pure functions over a sqlite connection."""

from orc.retrieval.bm25 import RetrievedChunk, bm25_search
from orc.retrieval.hybrid import RetrievalResult, retrieve, rrf_fuse, vector_search

__all__ = ["RetrievedChunk", "bm25_search"]
__all__ = [
"RetrievalResult",
"RetrievedChunk",
"bm25_search",
"retrieve",
"rrf_fuse",
"vector_search",
]
Loading
Loading