Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,9 @@ func buildPageIndexStrategy(c config.RetrievalConfig, client llmgate.Client, sto
if c.PageIndex.PageContentLimit > 0 {
p.PageContentLimit = c.PageIndex.PageContentLimit
}
if c.PageIndex.MaxCitations > 0 {
p.MaxCitations = c.PageIndex.MaxCitations
}
p.ModelOverride = c.PageIndex.Model
return p
}
Expand Down
3 changes: 3 additions & 0 deletions cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,9 @@ func buildPageIndexStrategy(c enginecfg.RetrievalConfig, client llmgate.Client,
if c.PageIndex.PageContentLimit > 0 {
p.PageContentLimit = c.PageIndex.PageContentLimit
}
if c.PageIndex.MaxCitations > 0 {
p.MaxCitations = c.PageIndex.MaxCitations
}
p.ModelOverride = c.PageIndex.Model
return p
}
Expand Down
10 changes: 10 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,16 @@ retrieval:
# flagship model's context window. Higher values risk burning
# context budget on stray full-document fetches.
page_content_limit: 16000
# Cap on how many distinct page ranges the FINAL answer may
# cite. A confidence backstop, not a navigation limit: the
# model may read as many pages as max_hops allows, but the
# citation set it commits to is bounded. FinanceBench data
# shows the failure mode is "spray ~5 low-confidence ranges ->
# miss all" while a single confident pick scores perfectly;
# capping the final set tames the spray. 3 keeps a genuinely
# multi-location answer (e.g. a figure + its footnote) while
# cutting the low-confidence tail. Env: VLE_/VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS.
max_citations: 3
# Override the navigation-loop model; empty inherits the
# request's model (which itself falls back to the engine
# default). Most deployments leave this blank — navigation
Expand Down
42 changes: 22 additions & 20 deletions internal/api/pageindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@
"citations": citations,
"strategy": perReq.Name(),
"model": budget.ModelName,
"confidence": res.Confidence,
"hops_taken": res.HopsTaken,
"usage": map[string]any{
"input_tokens": res.Usage.InputTokens,
Expand Down Expand Up @@ -268,6 +269,7 @@
"citations": citations,
"strategy": strat.Name(),
"model": budget.ModelName,
"confidence": res.Confidence,
"hops_taken": res.HopsTaken,
"usage": map[string]any{
"input_tokens": res.Usage.InputTokens,
Expand Down Expand Up @@ -298,35 +300,35 @@
if res == nil {
return nil
}
// Build a citation per UNIQUE page range present in PagesRead.
// The set of pages the model "read" is a superset of what it
// cited — some get_pages calls don't end up in the final
// cited_pages list — but the union is the right cone of trust
// to surface as evidence. The trace token is computed over
// only the strictly-cited ranges, which the strategy already
// has, so citation drift doesn't break replay.
seen := make(map[[2]int]struct{}, len(res.PagesRead))
citations := make([]map[string]any, 0, len(res.PagesRead))

for _, pr := range res.PagesRead {
key := [2]int{pr.StartPage, pr.EndPage}
if _, dup := seen[key]; dup {
continue
// Source of truth for citations is the FINAL cited-range set
// (res.CitedPages) — already deduped and capped to MaxCitations by
// the strategy. This is what makes a confident single-pick answer
// surface ONE citation even when it skimmed several pages to find
// it: PagesRead is the navigation footprint (a superset), not the
// commitment. Fall back to PagesRead only when nothing was cited
// (refusal) or the run was hop-capped without a done — the trace
// token is keyed on the cited ranges either way, so this never
// breaks replay.
sources := retrieval.CitationSources(res)
citations := make([]map[string]any, 0, len(sources))

for _, src := range sources {
sectionIDs := src.SectionIDs
if sectionIDs == nil {
sectionIDs = retrieval.SectionIDsOverlapping(t, src.Start, src.End)
}
seen[key] = struct{}{}

c := map[string]any{
"start_page": pr.StartPage,
"end_page": pr.EndPage,
"section_ids": pr.SectionIDs,
"start_page": src.Start,
"end_page": src.End,
"section_ids": sectionIDs,
}

// Quote extraction is best-effort: an LLM blip or empty
// content returns no quote, which is a normal degradation
// path. We materialise the cited content from storage and
// run one SpanExtractor call per citation.
if d.LLM != nil {
content := d.materialiseCitedContent(ctx, t, pr.SectionIDs)
content := d.materialiseCitedContent(ctx, t, sectionIDs)
if strings.TrimSpace(content) != "" {
ext := d.pageIndexSpanExtractor(requestModel)
span, _, err := ext.Extract(ctx, content, query)
Expand Down Expand Up @@ -476,7 +478,7 @@
// the in-response trace_token against the canonical input set —
// kept here rather than exported from the retrieval package so
// the API layer owns its own input wiring.
func pageIndexTraceTokenFromCitations(docID tree.DocumentID, model string, ranges [][2]int) string {

Check failure on line 481 in internal/api/pageindex.go

View workflow job for this annotation

GitHub Actions / lint

func pageIndexTraceTokenFromCitations is unused (U1000)
strs := make([]string, 0, len(ranges))
for _, r := range ranges {
if r[0] == r[1] {
Expand Down
101 changes: 101 additions & 0 deletions internal/api/pageindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,107 @@ func TestHandleAnswerPageIndexReasoningTrace(t *testing.T) {
}
}

// TestHandleAnswerPageIndexDedupAndCapCitations is the end-to-end
// proof of the bench-facing fix: a done that sprays the SAME range
// five times plus extras must produce a citations[] array that is
// deduped and capped — no duplicate page ranges, at most three
// entries, and the model's confidence surfaced on the response.
// This is the API-layer mirror of the strategy's dedup test.
//
// Every type assertion on the decoded JSON is checked: a malformed
// response must fail this test with a message, not panic and take
// down the other parallel tests in the binary.
func TestHandleAnswerPageIndexDedupAndCapCitations(t *testing.T) {
	t.Parallel()

	// Scripted model turns: read one range, then a done that cites
	// [1,2] five times plus two more distinct ranges and a confidence.
	// The assertions below expect a citation cap of 3 (presumably the
	// default wired up by newTestDeps — confirm against that helper).
	deps, _, _ := newTestDeps(t,
		`{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"skim"}`,
		`{"tool":"done","answer":"sprayed","confidence":0.8,"cited_pages":[[1,2],[1,2],[1,2],[1,2],[1,2],[3,4],[8,9]]}`,
	)

	body := strings.NewReader(`{"document_id":"doc_x","query":"q"}`)
	req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body)
	rec := httptest.NewRecorder()
	pageIndexHandlerRouter(deps).ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}
	var resp map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	cits, ok := resp["citations"].([]any)
	if !ok {
		t.Fatalf("citations missing: %v", resp["citations"])
	}
	// Capped at 3, and every page range distinct.
	if len(cits) > 3 {
		t.Errorf("citations must be capped at 3, got %d", len(cits))
	}
	seenRange := map[[2]int]int{}
	for i, raw := range cits {
		c, isObj := raw.(map[string]any)
		if !isObj {
			t.Fatalf("citations[%d] is not an object: %v", i, raw)
		}
		// encoding/json decodes every JSON number into float64 when
		// the target is map[string]any.
		start, okStart := c["start_page"].(float64)
		end, okEnd := c["end_page"].(float64)
		if !okStart || !okEnd {
			t.Fatalf("citations[%d] has non-numeric page bounds: %v", i, c)
		}
		seenRange[[2]int{int(start), int(end)}]++
	}
	for key, n := range seenRange {
		if n > 1 {
			t.Errorf("citation page range %v appears %d times; must be deduped", key, n)
		}
	}
	// The three distinct ranges all survive (under the cap).
	for _, want := range [][2]int{{1, 2}, {3, 4}, {8, 9}} {
		if seenRange[want] == 0 {
			t.Errorf("expected a citation for range %v, citations: %v", want, cits)
		}
	}
	// confidence surfaces on the response.
	if conf, ok := resp["confidence"].(float64); !ok || conf != 0.8 {
		t.Errorf("response confidence = %v, want 0.8", resp["confidence"])
	}
}

// TestHandleAnswerPageIndexConfidentSingleCitation is the happy half
// at the API layer: a confident single-range done — even after the
// model skimmed several pages — surfaces exactly ONE citation, while
// pages_read keeps the full navigation footprint. It guards against
// the multi-page footprint leaking into citations[].
//
// The response decode error and every type assertion are checked so
// a malformed body is reported as such instead of surfacing as a
// misleading "wrong citation count" failure or a panic.
func TestHandleAnswerPageIndexConfidentSingleCitation(t *testing.T) {
	t.Parallel()

	// The model reads pages 1-2 AND 8-9 while searching, but commits
	// to a single cited range [8,9]. citations[] must be just [8,9].
	deps, _, _ := newTestDeps(t,
		`{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"check setup"}`,
		`{"tool":"get_pages","start_page":8,"end_page":9,"reasoning":"check debt"}`,
		`{"tool":"done","answer":"Debt is on 8-9.","confidence":0.95,"cited_pages":[[8,9]],"reasoning":"clear"}`,
	)

	body := strings.NewReader(`{"document_id":"doc_x","query":"debt?"}`)
	req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body)
	rec := httptest.NewRecorder()
	pageIndexHandlerRouter(deps).ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
	}
	var resp map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	cits, ok := resp["citations"].([]any)
	if !ok || len(cits) != 1 {
		t.Fatalf("confident single pick must yield exactly ONE citation, got %v", resp["citations"])
	}
	first, isObj := cits[0].(map[string]any)
	if !isObj {
		t.Fatalf("citations[0] is not an object: %v", cits[0])
	}
	// JSON numbers decode to float64; a missing or non-numeric bound
	// fails the comparison rather than panicking.
	start, okStart := first["start_page"].(float64)
	end, okEnd := first["end_page"].(float64)
	if !okStart || !okEnd || start != 8 || end != 9 {
		t.Errorf("citation = %v-%v, want 8-9", first["start_page"], first["end_page"])
	}
	// pages_read still records the full navigation footprint (both
	// reads) — only citations[] is tightened to the commitment.
	if pages, ok := resp["pages_read"].([]any); !ok || len(pages) != 2 {
		t.Errorf("pages_read must keep the full footprint (2 reads), got %v", resp["pages_read"])
	}
}

// TestHandleAnswerPageIndexReasoningTraceQueryParam: the
// ?reasoning=true query param is an alternative to the body field.
// Some clients prefer it for GET-friendliness when prototyping.
Expand Down
10 changes: 10 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,16 @@ func applyEnvOverrides(c *Config) {
if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" {
c.Engine.Ingest.Mode = v
}
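// PageIndex final-citation cap. Forwarded so the spray backstop can
// be tuned on the live server without a config edit (e.g. tighten
// to 1 to force single-citation answers, or relax for a doc set
// with many genuinely multi-location questions). VLS_ wins over
// VLE_. A garbled or negative value is ignored, leaving the value
// already in the config (engine default 3) untouched. An explicit 0
// IS accepted and stored; buildPageIndexStrategy only applies values
// > 0, so 0 defers to the strategy's own built-in cap.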
if v := firstEnv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n >= 0 {
c.Engine.Retrieval.PageIndex.MaxCitations = n
}
}
// Total-parse timeout (seconds). Forwarded so the deployed server
// honours a tuned parse deadline without a secret/config edit — the
// outermost robustness valve against a parse that hangs (pre-LLM,
Expand Down
54 changes: 54 additions & 0 deletions internal/config/config_pageindex_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package config

import (
"os"
"testing"
)

// TestForwardPageIndexMaxCitations covers the server-config
// pass-through for the PageIndex final-citation cap: the operator can
// set it via VLS_ or VLE_ and have it reach the embedded engine
// config, with VLS_ winning when both are present and a garbled value
// preserving the engine default.
//
// Environment handling goes through t.Setenv so the process
// environment is restored exactly when the test ends — including
// restoring "unset" as unset rather than as an empty string. Because
// it uses t.Setenv, this test must not call t.Parallel.
func TestForwardPageIndexMaxCitations(t *testing.T) {
	const (
		vlsKey = "VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"
		vleKey = "VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"
	)

	// unset removes key for the remainder of the test. The t.Setenv
	// call exists only to register the automatic restore of whatever
	// value (or absence) the key had before; os.Unsetenv then does the
	// actual removal.
	unset := func(key string) {
		t.Helper()
		t.Setenv(key, "")
		if err := os.Unsetenv(key); err != nil {
			t.Fatalf("unsetenv %s: %v", key, err)
		}
	}

	// Engine default is 3 with no env set.
	unset(vlsKey)
	unset(vleKey)
	cfg := Default()
	applyEnvOverrides(&cfg)
	if cfg.Engine.Retrieval.PageIndex.MaxCitations != 3 {
		t.Errorf("default max_citations = %d, want 3", cfg.Engine.Retrieval.PageIndex.MaxCitations)
	}

	// VLE_ alone forwards through.
	t.Setenv(vleKey, "2")
	cfg2 := Default()
	applyEnvOverrides(&cfg2)
	if cfg2.Engine.Retrieval.PageIndex.MaxCitations != 2 {
		t.Errorf("VLE_ forward: max_citations = %d, want 2", cfg2.Engine.Retrieval.PageIndex.MaxCitations)
	}

	// VLS_ wins over VLE_ (VLE_ is still "2" at this point).
	t.Setenv(vlsKey, "1")
	cfg3 := Default()
	applyEnvOverrides(&cfg3)
	if cfg3.Engine.Retrieval.PageIndex.MaxCitations != 1 {
		t.Errorf("VLS_ should win: max_citations = %d, want 1", cfg3.Engine.Retrieval.PageIndex.MaxCitations)
	}

	// Garbled value preserves the engine default (3), does not zero it.
	unset(vlsKey)
	t.Setenv(vleKey, "heaps")
	cfg4 := Default()
	applyEnvOverrides(&cfg4)
	if cfg4.Engine.Retrieval.PageIndex.MaxCitations != 3 {
		t.Errorf("garbage value should preserve default 3, got %d", cfg4.Engine.Retrieval.PageIndex.MaxCitations)
	}
}
35 changes: 19 additions & 16 deletions internal/handler/answer_pageindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ func (h *AnswerPageIndexHandler) HandleAnswerPageIndex(w http.ResponseWriter, r
"citations": citations,
"strategy": perReq.Name(),
"model": budget.ModelName,
"confidence": res.Confidence,
"hops_taken": res.HopsTaken,
"usage": map[string]any{
"input_tokens": res.Usage.InputTokens,
Expand Down Expand Up @@ -281,6 +282,7 @@ func (h *AnswerPageIndexHandler) serveStream(w http.ResponseWriter, r *http.Requ
"citations": citations,
"strategy": strat.Name(),
"model": budget.ModelName,
"confidence": res.Confidence,
"hops_taken": res.HopsTaken,
"usage": map[string]any{
"input_tokens": res.Usage.InputTokens,
Expand All @@ -296,32 +298,33 @@ func (h *AnswerPageIndexHandler) serveStream(w http.ResponseWriter, r *http.Requ
emitSSE("answer", final)
}

// buildCitations transforms the strategy's PagesRead + the section
// tree into the response's citations array: one citation per unique
// cited page range, each carrying the overlapping section IDs and a
// best-effort grounding quote extracted from the cited content.
// buildCitations transforms the strategy's FINAL cited ranges + the
// section tree into the response's citations array: one citation per
// cited page range (deduped + capped by the strategy), each carrying
// the overlapping section IDs and a best-effort grounding quote
// extracted from the cited content. Falls back to the PagesRead
// footprint when nothing was cited (refusal / hop-capped run) so the
// caller still sees where the model looked.
func (h *AnswerPageIndexHandler) buildCitations(ctx context.Context, t *tree.Tree, res *retrieval.Result, query, requestModel string) []map[string]any {
if res == nil {
return nil
}
seen := make(map[[2]int]struct{}, len(res.PagesRead))
citations := make([]map[string]any, 0, len(res.PagesRead))
sources := retrieval.CitationSources(res)
citations := make([]map[string]any, 0, len(sources))

for _, pr := range res.PagesRead {
key := [2]int{pr.StartPage, pr.EndPage}
if _, dup := seen[key]; dup {
continue
for _, src := range sources {
sectionIDs := src.SectionIDs
if sectionIDs == nil {
sectionIDs = retrieval.SectionIDsOverlapping(t, src.Start, src.End)
}
seen[key] = struct{}{}

c := map[string]any{
"start_page": pr.StartPage,
"end_page": pr.EndPage,
"section_ids": pr.SectionIDs,
"start_page": src.Start,
"end_page": src.End,
"section_ids": sectionIDs,
}

if h.llm != nil {
content := h.materialiseCitedContent(ctx, t, pr.SectionIDs)
content := h.materialiseCitedContent(ctx, t, sectionIDs)
if strings.TrimSpace(content) != "" {
ext := h.spanExtractor(requestModel)
span, _, err := ext.Extract(ctx, content, query)
Expand Down
14 changes: 14 additions & 0 deletions openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,20 @@ components:
enum: [pageindex]
model:
type: string
confidence:
type: number
format: float
minimum: 0
maximum: 1
description: |
The agent's self-reported confidence in the answer +
citation set, in [0,1], taken from the terminal `done`
tool call. 0 means the model did not report one (it is
NOT a "definitely wrong" signal). Surfaced so callers
can treat a low-confidence answer more cautiously; it
does not suppress the answer — even an unsure agent
still returns its single best citation rather than a
spray of hedged ones.
hops_taken:
type: integer
description: |
Expand Down
Loading
Loading