diff --git a/cmd/engine/main.go b/cmd/engine/main.go index edfc821..793238e 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -412,6 +412,9 @@ func buildPageIndexStrategy(c config.RetrievalConfig, client llmgate.Client, sto if c.PageIndex.PageContentLimit > 0 { p.PageContentLimit = c.PageIndex.PageContentLimit } + if c.PageIndex.MaxCitations > 0 { + p.MaxCitations = c.PageIndex.MaxCitations + } p.ModelOverride = c.PageIndex.Model return p } diff --git a/cmd/server/main.go b/cmd/server/main.go index 2430862..63191d2 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -488,6 +488,9 @@ func buildPageIndexStrategy(c enginecfg.RetrievalConfig, client llmgate.Client, if c.PageIndex.PageContentLimit > 0 { p.PageContentLimit = c.PageIndex.PageContentLimit } + if c.PageIndex.MaxCitations > 0 { + p.MaxCitations = c.PageIndex.MaxCitations + } p.ModelOverride = c.PageIndex.Model return p } diff --git a/config.example.yaml b/config.example.yaml index 6bec472..4a91048 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -286,6 +286,16 @@ retrieval: # flagship model's context window. Higher values risk burning # context budget on stray full-document fetches. page_content_limit: 16000 + # Cap on how many distinct page ranges the FINAL answer may + # cite. A confidence backstop, not a navigation limit: the + # model may read as many pages as max_hops allows, but the + # citation set it commits to is bounded. FinanceBench data + # shows the failure mode is "spray ~5 low-confidence ranges -> + # miss all" while a single confident pick scores perfectly; + # capping the final set tames the spray. 3 keeps a genuinely + # multi-location answer (e.g. a figure + its footnote) while + # cutting the low-confidence tail. Env: VLE_/VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS. + max_citations: 3 # Override the navigation-loop model; empty inherits the # request's model (which itself falls back to the engine # default). Most deployments leave this blank — navigation diff --git a/internal/api/pageindex.go b/internal/api/pageindex.go index 92fd540..541799a 100644 --- a/internal/api/pageindex.go +++ b/internal/api/pageindex.go @@ -175,6 +175,7 @@ func (d Deps) handleAnswerPageIndex(w http.ResponseWriter, r *http.Request) { "citations": citations, "strategy": perReq.Name(), "model": budget.ModelName, + "confidence": res.Confidence, "hops_taken": res.HopsTaken, "usage": map[string]any{ "input_tokens": res.Usage.InputTokens, @@ -268,6 +269,7 @@ func (d Deps) serveAnswerPageIndexStream(w http.ResponseWriter, r *http.Request, "citations": citations, "strategy": strat.Name(), "model": budget.ModelName, + "confidence": res.Confidence, "hops_taken": res.HopsTaken, "usage": map[string]any{ "input_tokens": res.Usage.InputTokens, @@ -298,27 +300,27 @@ func (d Deps) buildPageIndexCitations(ctx context.Context, t *tree.Tree, res *re if res == nil { return nil } - // Build a citation per UNIQUE page range present in PagesRead. - // The set of pages the model "read" is a superset of what it - // cited — some get_pages calls don't end up in the final - // cited_pages list — but the union is the right cone of trust - // to surface as evidence. The trace token is computed over - // only the strictly-cited ranges, which the strategy already - // has, so citation drift doesn't break replay. - seen := make(map[[2]int]struct{}, len(res.PagesRead)) - citations := make([]map[string]any, 0, len(res.PagesRead)) - - for _, pr := range res.PagesRead { - key := [2]int{pr.StartPage, pr.EndPage} - if _, dup := seen[key]; dup { - continue + // Source of truth for citations is the FINAL cited-range set + // (res.CitedPages) — already deduped and capped to MaxCitations by + // the strategy. This is what makes a confident single-pick answer + // surface ONE citation even when it skimmed several pages to find + // it: PagesRead is the navigation footprint (a superset), not the + // commitment. Fall back to PagesRead only when nothing was cited + // (refusal) or the run was hop-capped without a done — the trace + // token is keyed on the cited ranges either way, so this never + // breaks replay. + sources := retrieval.CitationSources(res) + citations := make([]map[string]any, 0, len(sources)) + + for _, src := range sources { + sectionIDs := src.SectionIDs + if sectionIDs == nil { + sectionIDs = retrieval.SectionIDsOverlapping(t, src.Start, src.End) } - seen[key] = struct{}{} - c := map[string]any{ - "start_page": pr.StartPage, - "end_page": pr.EndPage, - "section_ids": pr.SectionIDs, + "start_page": src.Start, + "end_page": src.End, + "section_ids": sectionIDs, } // Quote extraction is best-effort: an LLM blip or empty @@ -326,7 +328,7 @@ func (d Deps) buildPageIndexCitations(ctx context.Context, t *tree.Tree, res *re // path. We materialise the cited content from storage and // run one SpanExtractor call per citation. if d.LLM != nil { - content := d.materialiseCitedContent(ctx, t, pr.SectionIDs) + content := d.materialiseCitedContent(ctx, t, sectionIDs) if strings.TrimSpace(content) != "" { ext := d.pageIndexSpanExtractor(requestModel) span, _, err := ext.Extract(ctx, content, query) diff --git a/internal/api/pageindex_test.go b/internal/api/pageindex_test.go index d76bfa4..ca31cc7 100644 --- a/internal/api/pageindex_test.go +++ b/internal/api/pageindex_test.go @@ -261,6 +261,107 @@ func TestHandleAnswerPageIndexReasoningTrace(t *testing.T) { } } +// TestHandleAnswerPageIndexDedupAndCapCitations is the end-to-end +// proof of the bench-facing fix: a done that sprays the SAME range +// five times plus extras must produce a citations[] array that is +// deduped and capped at MaxCitations — no duplicate page ranges, no +// repeated section ids across citations, and confidence surfaced. +// This is the API-layer mirror of the strategy's dedup test and the +// reason precision@5 stops deflating. +func TestHandleAnswerPageIndexDedupAndCapCitations(t *testing.T) { + t.Parallel() + + // Read one range, then a done that cites [1,2] five times plus + // two more distinct ranges and a confidence. MaxCitations=3. + deps, _, _ := newTestDeps(t, + `{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"skim"}`, + `{"tool":"done","answer":"sprayed","confidence":0.8,"cited_pages":[[1,2],[1,2],[1,2],[1,2],[1,2],[3,4],[8,9]]}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"q"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + var resp map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + cits, ok := resp["citations"].([]any) + if !ok { + t.Fatalf("citations missing: %v", resp["citations"]) + } + // Capped at MaxCitations=3, and every page range distinct. + if len(cits) > 3 { + t.Errorf("citations must be capped at 3, got %d", len(cits)) + } + seenRange := map[[2]int]int{} + for _, raw := range cits { + c := raw.(map[string]any) + key := [2]int{int(c["start_page"].(float64)), int(c["end_page"].(float64))} + seenRange[key]++ + } + for key, n := range seenRange { + if n > 1 { + t.Errorf("citation page range %v appears %d times; must be deduped", key, n) + } + } + // The three distinct ranges all survive (under the cap). + for _, want := range [][2]int{{1, 2}, {3, 4}, {8, 9}} { + if seenRange[want] == 0 { + t.Errorf("expected a citation for range %v, citations: %v", want, cits) + } + } + // confidence surfaces on the response. + if conf, ok := resp["confidence"].(float64); !ok || conf != 0.8 { + t.Errorf("response confidence = %v, want 0.8", resp["confidence"]) + } +} + +// TestHandleAnswerPageIndexConfidentSingleCitation is the happy half +// at the API layer: a confident single-range done — even after the +// model skimmed several pages — surfaces exactly ONE citation. This +// is the f1=1.0 commit case, and the fix that stops a multi-page +// navigation footprint from leaking into citations[]. +func TestHandleAnswerPageIndexConfidentSingleCitation(t *testing.T) { + t.Parallel() + + // The model reads pages 1-2 AND 8-9 while searching, but commits + // to a single cited range [8,9]. citations[] must be just [8,9]. + deps, _, _ := newTestDeps(t, + `{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"check setup"}`, + `{"tool":"get_pages","start_page":8,"end_page":9,"reasoning":"check debt"}`, + `{"tool":"done","answer":"Debt is on 8-9.","confidence":0.95,"cited_pages":[[8,9]],"reasoning":"clear"}`, + ) + + body := strings.NewReader(`{"document_id":"doc_x","query":"debt?"}`) + req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body) + rec := httptest.NewRecorder() + pageIndexHandlerRouter(deps).ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + var resp map[string]any + _ = json.Unmarshal(rec.Body.Bytes(), &resp) + + cits, ok := resp["citations"].([]any) + if !ok || len(cits) != 1 { + t.Fatalf("confident single pick must yield exactly ONE citation, got %v", resp["citations"]) + } + first := cits[0].(map[string]any) + if first["start_page"].(float64) != 8 || first["end_page"].(float64) != 9 { + t.Errorf("citation = %v-%v, want 8-9", first["start_page"], first["end_page"]) + } + // pages_read still records the full navigation footprint (both + // reads) — only citations[] is tightened to the commitment. + if pages, ok := resp["pages_read"].([]any); !ok || len(pages) != 2 { + t.Errorf("pages_read must keep the full footprint (2 reads), got %v", resp["pages_read"]) + } +} + // TestHandleAnswerPageIndexReasoningTraceQueryParam: the // ?reasoning=true query param is an alternative to the body field. // Some clients prefer it for GET-friendliness when prototyping. diff --git a/internal/config/config.go b/internal/config/config.go index 8a2ed78..a44d5e8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -326,6 +326,16 @@ func applyEnvOverrides(c *Config) { if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" { c.Engine.Ingest.Mode = v } + // PageIndex final-citation cap. Forwarded so the spray backstop can + // be tuned on the live server without a config edit (e.g. tighten + // to 1 to force single-citation answers, or relax for a doc set + // with many genuinely multi-location questions). VLS_ wins over + // VLE_; a garbled or negative value leaves the engine default (3). + if v := firstEnv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Engine.Retrieval.PageIndex.MaxCitations = n + } + } // Total-parse timeout (seconds). Forwarded so the deployed server // honours a tuned parse deadline without a secret/config edit — the // outermost robustness valve against a parse that hangs (pre-LLM, diff --git a/internal/config/config_pageindex_test.go b/internal/config/config_pageindex_test.go new file mode 100644 index 0000000..326d8b9 --- /dev/null +++ b/internal/config/config_pageindex_test.go @@ -0,0 +1,54 @@ +package config + +import ( + "os" + "testing" +) + +// TestForwardPageIndexMaxCitations covers the server-config +// pass-through for the PageIndex final-citation cap: the operator can +// set it via VLS_ or VLE_ and have it reach the embedded engine +// config, with VLS_ winning when both are present and a garbled value +// preserving the engine default. +func TestForwardPageIndexMaxCitations(t *testing.T) { + prevVLS := os.Getenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + prevVLE := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + defer func() { + os.Setenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", prevVLS) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", prevVLE) + }() + + // Engine default is 3 with no env set. + os.Unsetenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + os.Unsetenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Engine.Retrieval.PageIndex.MaxCitations != 3 { + t.Errorf("default max_citations = %d, want 3", cfg.Engine.Retrieval.PageIndex.MaxCitations) + } + + // VLE_ alone forwards through. + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "2") + cfg2 := Default() + applyEnvOverrides(&cfg2) + if cfg2.Engine.Retrieval.PageIndex.MaxCitations != 2 { + t.Errorf("VLE_ forward: max_citations = %d, want 2", cfg2.Engine.Retrieval.PageIndex.MaxCitations) + } + + // VLS_ wins over VLE_. + os.Setenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "1") + cfg3 := Default() + applyEnvOverrides(&cfg3) + if cfg3.Engine.Retrieval.PageIndex.MaxCitations != 1 { + t.Errorf("VLS_ should win: max_citations = %d, want 1", cfg3.Engine.Retrieval.PageIndex.MaxCitations) + } + + // Garbled value preserves the engine default (3), does not zero it. + os.Unsetenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "heaps") + cfg4 := Default() + applyEnvOverrides(&cfg4) + if cfg4.Engine.Retrieval.PageIndex.MaxCitations != 3 { + t.Errorf("garbage value should preserve default 3, got %d", cfg4.Engine.Retrieval.PageIndex.MaxCitations) + } +} diff --git a/internal/handler/answer_pageindex.go b/internal/handler/answer_pageindex.go index fb207f0..3d890d5 100644 --- a/internal/handler/answer_pageindex.go +++ b/internal/handler/answer_pageindex.go @@ -200,6 +200,7 @@ func (h *AnswerPageIndexHandler) HandleAnswerPageIndex(w http.ResponseWriter, r "citations": citations, "strategy": perReq.Name(), "model": budget.ModelName, + "confidence": res.Confidence, "hops_taken": res.HopsTaken, "usage": map[string]any{ "input_tokens": res.Usage.InputTokens, @@ -281,6 +282,7 @@ func (h *AnswerPageIndexHandler) serveStream(w http.ResponseWriter, r *http.Requ "citations": citations, "strategy": strat.Name(), "model": budget.ModelName, + "confidence": res.Confidence, "hops_taken": res.HopsTaken, "usage": map[string]any{ "input_tokens": res.Usage.InputTokens, @@ -296,32 +298,33 @@ func (h *AnswerPageIndexHandler) serveStream(w http.ResponseWriter, r *http.Requ emitSSE("answer", final) } -// buildCitations transforms the strategy's PagesRead + the section -// tree into the response's citations array: one citation per unique -// cited page range, each carrying the overlapping section IDs and a -// best-effort grounding quote extracted from the cited content. +// buildCitations transforms the strategy's FINAL cited ranges + the +// section tree into the response's citations array: one citation per +// cited page range (deduped + capped by the strategy), each carrying +// the overlapping section IDs and a best-effort grounding quote +// extracted from the cited content. Falls back to the PagesRead +// footprint when nothing was cited (refusal / hop-capped run) so the +// caller still sees where the model looked. func (h *AnswerPageIndexHandler) buildCitations(ctx context.Context, t *tree.Tree, res *retrieval.Result, query, requestModel string) []map[string]any { if res == nil { return nil } - seen := make(map[[2]int]struct{}, len(res.PagesRead)) - citations := make([]map[string]any, 0, len(res.PagesRead)) + sources := retrieval.CitationSources(res) + citations := make([]map[string]any, 0, len(sources)) - for _, pr := range res.PagesRead { - key := [2]int{pr.StartPage, pr.EndPage} - if _, dup := seen[key]; dup { - continue + for _, src := range sources { + sectionIDs := src.SectionIDs + if sectionIDs == nil { + sectionIDs = retrieval.SectionIDsOverlapping(t, src.Start, src.End) } - seen[key] = struct{}{} - c := map[string]any{ - "start_page": pr.StartPage, - "end_page": pr.EndPage, - "section_ids": pr.SectionIDs, + "start_page": src.Start, + "end_page": src.End, + "section_ids": sectionIDs, } if h.llm != nil { - content := h.materialiseCitedContent(ctx, t, pr.SectionIDs) + content := h.materialiseCitedContent(ctx, t, sectionIDs) if strings.TrimSpace(content) != "" { ext := h.spanExtractor(requestModel) span, _, err := ext.Extract(ctx, content, query) diff --git a/openapi.yaml b/openapi.yaml index ddd8022..511ab1b 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -1118,6 +1118,20 @@ components: enum: [pageindex] model: type: string + confidence: + type: number + format: float + minimum: 0 + maximum: 1 + description: | + The agent's self-reported confidence in the answer + + citation set, in [0,1], taken from the terminal `done` + tool call. 0 means the model did not report one (it is + NOT a "definitely wrong" signal). Surfaced so callers + can treat a low-confidence answer more cautiously; it + does not suppress the answer — even an unsure agent + still returns its single best citation rather than a + spray of hedged ones. hops_taken: type: integer description: | diff --git a/pkg/config/config.go b/pkg/config/config.go index 8a6b30e..bfca023 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -458,6 +458,16 @@ type PageIndexBlock struct { // fetch from torching the model's context window. PageContentLimit int `yaml:"page_content_limit"` + // MaxCitations caps how many distinct page ranges the final + // answer may cite. Default: 3. Set to 0 to use the strategy's + // built-in default. This is a confidence backstop, not a + // navigation limit: the model may read as many pages as MaxHops + // allows, but the citation set it commits to is bounded. The cap + // mechanically tames the "spray many low-confidence ranges → + // miss all" failure mode FinanceBench surfaced, while still + // allowing a genuinely multi-location answer to cite two or three. + MaxCitations int `yaml:"max_citations"` + // Model overrides the LLM model used for the navigation loop. // Empty means use the request's model (which itself falls back // to the engine default). Useful when navigation should run on @@ -743,6 +753,7 @@ func Default() Config { Enabled: true, MaxHops: 8, PageContentLimit: 16000, + MaxCitations: 3, }, }, Ingest: IngestConfig{ @@ -1132,6 +1143,14 @@ func applyEnvOverrides(c *Config) { c.Retrieval.PageIndex.PageContentLimit = n } } + // MaxCitations accepts both the VLE_ and VLS_ prefixes so the + // deploy layer (which forwards VLS_*) and local VLE_* envs both + // reach it. A garbled or negative value preserves the default. + if v := firstEnv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Retrieval.PageIndex.MaxCitations = n + } + } if v := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MODEL"); v != "" { c.Retrieval.PageIndex.Model = v } @@ -1296,6 +1315,9 @@ func (c Config) Validate() error { if c.Retrieval.PageIndex.PageContentLimit < 0 { return fmt.Errorf("retrieval.pageindex.page_content_limit must be >= 0, got %d", c.Retrieval.PageIndex.PageContentLimit) } + if c.Retrieval.PageIndex.MaxCitations < 0 { + return fmt.Errorf("retrieval.pageindex.max_citations must be >= 0, got %d", c.Retrieval.PageIndex.MaxCitations) + } return nil } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 8830dde..5d4e920 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -684,6 +684,9 @@ func TestPageIndexDefaults(t *testing.T) { if cfg.Retrieval.PageIndex.PageContentLimit != 16000 { t.Errorf("page_content_limit = %d, want 16000", cfg.Retrieval.PageIndex.PageContentLimit) } + if cfg.Retrieval.PageIndex.MaxCitations != 3 { + t.Errorf("max_citations = %d, want 3", cfg.Retrieval.PageIndex.MaxCitations) + } if cfg.Retrieval.PageIndex.Model != "" { t.Errorf("model default should be empty (inherit), got %q", cfg.Retrieval.PageIndex.Model) } @@ -695,17 +698,20 @@ func TestPageIndexEnvOverride(t *testing.T) { prevEnabled := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED") prevHops := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS") prevLimit := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT") + prevCits := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") prevModel := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MODEL") defer func() { os.Setenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED", prevEnabled) os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", prevHops) os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", prevLimit) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", prevCits) os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MODEL", prevModel) }() os.Setenv("VLE_RETRIEVAL_PAGEINDEX_ENABLED", "false") os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", "12") os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", "32000") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "5") os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MODEL", "gemini-2.0-flash") cfg := Default() @@ -720,11 +726,43 @@ func TestPageIndexEnvOverride(t *testing.T) { if cfg.Retrieval.PageIndex.PageContentLimit != 32000 { t.Errorf("page_content_limit = %d, want 32000", cfg.Retrieval.PageIndex.PageContentLimit) } + if cfg.Retrieval.PageIndex.MaxCitations != 5 { + t.Errorf("max_citations = %d, want 5", cfg.Retrieval.PageIndex.MaxCitations) + } if cfg.Retrieval.PageIndex.Model != "gemini-2.0-flash" { t.Errorf("model = %q, want gemini-2.0-flash", cfg.Retrieval.PageIndex.Model) } } +// TestPageIndexMaxCitationsVLSAlias: the VLS_ prefix reaches +// MaxCitations too (the deploy layer forwards VLS_*), and VLE_ wins +// when both are set. +func TestPageIndexMaxCitationsVLSAlias(t *testing.T) { + prevVLE := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + prevVLS := os.Getenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + defer func() { + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", prevVLE) + os.Setenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", prevVLS) + }() + + // VLS_ alone reaches the field. + os.Unsetenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") + os.Setenv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "2") + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Retrieval.PageIndex.MaxCitations != 2 { + t.Errorf("VLS_ alias: max_citations = %d, want 2", cfg.Retrieval.PageIndex.MaxCitations) + } + + // VLE_ wins when both are set. + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "4") + cfg2 := Default() + applyEnvOverrides(&cfg2) + if cfg2.Retrieval.PageIndex.MaxCitations != 4 { + t.Errorf("VLE_ should win over VLS_: max_citations = %d, want 4", cfg2.Retrieval.PageIndex.MaxCitations) + } +} + // TestPageIndexEnvOverrideEnable: toggle on from an explicitly // disabled state. func TestPageIndexEnvOverrideEnable(t *testing.T) { @@ -745,13 +783,16 @@ func TestPageIndexEnvOverrideEnable(t *testing.T) { func TestPageIndexEnvOverrideRejectsBad(t *testing.T) { prevHops := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS") prevLimit := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT") + prevCits := os.Getenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS") defer func() { os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", prevHops) os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", prevLimit) + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", prevCits) }() os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_HOPS", "abc") os.Setenv("VLE_RETRIEVAL_PAGEINDEX_PAGE_CONTENT_LIMIT", "not-a-number") + os.Setenv("VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "lots") cfg := Default() applyEnvOverrides(&cfg) @@ -761,6 +802,9 @@ func TestPageIndexEnvOverrideRejectsBad(t *testing.T) { if cfg.Retrieval.PageIndex.PageContentLimit != 16000 { t.Errorf("garbage page_content_limit env should preserve default, got %d", cfg.Retrieval.PageIndex.PageContentLimit) } + if cfg.Retrieval.PageIndex.MaxCitations != 3 { + t.Errorf("garbage max_citations env should preserve default 3, got %d", cfg.Retrieval.PageIndex.MaxCitations) + } } // TestValidatePageIndexNegatives: negatives rejected by Validate. @@ -780,10 +824,18 @@ func TestValidatePageIndexNegatives(t *testing.T) { t.Error("negative page_content_limit should fail validation") } + cfgCits := Default() + cfgCits.Database.URL = "postgres://localhost/test" + cfgCits.Retrieval.PageIndex.MaxCitations = -1 + if err := cfgCits.Validate(); err == nil { + t.Error("negative max_citations should fail validation") + } + cfg3 := Default() cfg3.Database.URL = "postgres://localhost/test" cfg3.Retrieval.PageIndex.MaxHops = 0 cfg3.Retrieval.PageIndex.PageContentLimit = 0 + cfg3.Retrieval.PageIndex.MaxCitations = 0 if err := cfg3.Validate(); err != nil { t.Errorf("zero values should pass (defaults applied at runtime): %v", err) } diff --git a/pkg/retrieval/pageindex_strategy.go b/pkg/retrieval/pageindex_strategy.go index 6df2ed2..17411d2 100644 --- a/pkg/retrieval/pageindex_strategy.go +++ b/pkg/retrieval/pageindex_strategy.go @@ -73,6 +73,19 @@ type PageIndexStrategy struct { // easily blow past 200K chars otherwise. PageContentLimit int + // MaxCitations caps how many distinct page ranges the FINAL done + // action may cite. Zero means use defaultPageIndexMaxCitations. + // + // This is a confidence backstop, not a navigation limit: the + // model is free to read as many pages as MaxHops allows, but the + // citation set it commits to is bounded. FinanceBench data shows + // the loss mode is "spray ~5 low-confidence ranges → all miss" + // while a single confident pick scores f1=1.0; capping the final + // set mechanically tames the spray even when the prompt doesn't + // fully. When done returns more ranges than the cap, the top-N + // by confidence (ties broken by first-seen order) are kept. + MaxCitations int + // ModelOverride, if non-empty, replaces the budget's ModelName // for every turn. Useful for routing the navigation loop to a // cheaper or faster model than the rest of the engine. @@ -106,7 +119,13 @@ type PageIndexEvent struct { SectionIDs []tree.SectionID `json:"section_ids,omitempty"` Answer string `json:"answer,omitempty"` CitedPages [][2]int `json:"cited_pages,omitempty"` - Note string `json:"note,omitempty"` + // Confidence is the model's self-reported confidence in the FINAL + // answer + citation set, in [0,1]. Populated only on the "done" + // event. Surfaced so SSE/trace consumers can show how sure the + // agent was — and, downstream, so a low-confidence answer can be + // treated more cautiously than a confident one. + Confidence float64 `json:"confidence,omitempty"` + Note string `json:"note,omitempty"` } // defaultPageIndexMaxHops bounds the loop. Eight turns is enough for @@ -121,6 +140,16 @@ const defaultPageIndexMaxHops = 8 // excerpt. Matches PageIndex's reference behaviour. const defaultPageContentLimit = 16000 +// defaultPageIndexMaxCitations bounds the FINAL cited-range set. Three +// is generous for the answer-spans-one-place common case (where ONE is +// ideal) while still allowing a genuinely multi-location answer (e.g. a +// 10-K figure cross-referenced between the income statement and a +// footnote) to cite two or three distinct ranges. The FinanceBench +// signal that motivated the cap: confident single-pick = f1 1.0, +// 5-range spray = f1 0. Capping at 3 keeps the legitimate multi-range +// case while removing the long tail of low-confidence noise. +const defaultPageIndexMaxCitations = 3 + // strategyNamePageIndex is the stable identifier for config // (retrieval.strategy: pageindex) and telemetry. const strategyNamePageIndex = "pageindex" @@ -170,6 +199,7 @@ func NewPageIndexStrategy(client llmgate.Client) *PageIndexStrategy { LLM: client, MaxHops: defaultPageIndexMaxHops, PageContentLimit: defaultPageContentLimit, + MaxCitations: defaultPageIndexMaxCitations, } } @@ -214,6 +244,10 @@ func (s *PageIndexStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, qu if pageLimit <= 0 { pageLimit = defaultPageContentLimit } + maxCitations := s.MaxCitations + if maxCitations <= 0 { + maxCitations = defaultPageIndexMaxCitations + } // Pre-flatten the tree into an ordinal section list ordered by // page. The get_pages observation iterates this twice per call; @@ -280,17 +314,28 @@ func (s *PageIndexStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, qu case pageActionDone: finalAnswer = strings.TrimSpace(action.Answer) finalReasoning = strings.TrimSpace(action.Reasoning) - citedRanges = normaliseRanges(action.CitedPages, maxPage) + // Confidence-gate the final citation set: clamp + dedup the + // model's cited ranges, then cap to maxCitations keeping the + // highest-confidence (ties → first-seen) ranges. This is the + // fix for the "spray 5 low-confidence ranges, hit none" + // failure mode — a confident single pick survives untouched, + // a spray is collapsed to its best few. + citedRanges = selectCitedRanges(action.CitedPages, action.CitedConfidences, maxPage, maxCitations) + confidence := clampConfidence(action.Confidence) selectedIDs := sectionsOverlapping(sections, citedRanges) s.emit(PageIndexEvent{ Hop: hopsTaken, Type: pageActionDone, Reasoning: finalReasoning, Answer: finalAnswer, - CitedPages: action.CitedPages, + CitedPages: rangesToPairs(citedRanges), + Confidence: confidence, }) return &Result{ SelectedIDs: selectedIDs, + Confidences: confidenceMap(selectedIDs, confidence), + Confidence: confidence, + CitedPages: rangesToPairs(citedRanges), Reasoning: finalAnswer, // /v1/answer/pageindex reads this ModelUsed: model, Usage: totalUsage, @@ -373,12 +418,16 @@ func (s *PageIndexStrategy) SelectWithCost(ctx context.Context, t *tree.Tree, qu // ignores the rule, we return whatever pages have been read so // the caller at least sees the navigation footprint and an empty // answer rather than a 500. - finalAnswer, finalReasoning, citedRanges = s.forceDone(ctx, &msgs, &totalUsage, &hopsTaken, model, maxPage) + var forcedConfidence float64 + finalAnswer, finalReasoning, citedRanges, forcedConfidence = s.forceDone(ctx, &msgs, &totalUsage, &hopsTaken, model, maxPage, maxCitations) selectedIDs := sectionsOverlapping(sections, citedRanges) log.Printf("retrieval: pageindex strategy hit max_hops=%d; forced done", maxHops) _ = finalReasoning return &Result{ SelectedIDs: selectedIDs, + Confidences: confidenceMap(selectedIDs, forcedConfidence), + Confidence: forcedConfidence, + CitedPages: rangesToPairs(citedRanges), Reasoning: finalAnswer, ModelUsed: model, Usage: totalUsage, @@ -544,13 +593,14 @@ func (s *PageIndexStrategy) loadSectionBody(ctx context.Context, sec sectionPage // forceDone runs one final hop with a hard "emit done NOW" prompt so // the loop can exit gracefully on a stubborn model. Returns -// (answer, reasoning, cited_ranges). When the model still doesn't -// emit a valid done action, the empty values flow back and the -// caller sees a hop-capped Result. -func (s *PageIndexStrategy) forceDone(ctx context.Context, msgs *[]llmgate.Message, totalUsage *Usage, hopsTaken *int, model string, maxPage int) (string, string, []pageRange) { +// (answer, reasoning, cited_ranges, confidence). When the model still +// doesn't emit a valid done action, the empty values flow back and the +// caller sees a hop-capped Result. The forced citation set is gated +// through the same dedup + cap as the normal done path. +func (s *PageIndexStrategy) forceDone(ctx context.Context, msgs *[]llmgate.Message, totalUsage *Usage, hopsTaken *int, model string, maxPage, maxCitations int) (string, string, []pageRange, float64) { *msgs = append(*msgs, llmgate.Message{ Role: llmgate.RoleUser, - Content: "You have used your tool-call budget. Reply NOW with one JSON object: {\"tool\":\"done\",\"answer\":\" kept[idx].conf { + kept[idx].conf = conf + } continue } - seen[pr] = struct{}{} - out = append(out, pr) + seen[pr] = len(kept) + kept = append(kept, scored{r: pr, conf: conf, order: i}) } + if len(kept) == 0 { + return nil + } + + // Cap: when the distinct set overflows, keep the top-N by + // confidence, breaking ties by first-seen order so the result is + // deterministic and respects the model's own prioritisation. + if maxCitations > 0 && len(kept) > maxCitations { + sort.SliceStable(kept, func(i, j int) bool { + if kept[i].conf != kept[j].conf { + return kept[i].conf > kept[j].conf + } + return kept[i].order < kept[j].order + }) + kept = kept[:maxCitations] + } + + out := make([]pageRange, len(kept)) + for i, k := range kept { + out[i] = k.r + } + // Canonical page order for a stable trace token. sort.Slice(out, func(i, j int) bool { if out[i].Start != out[j].Start { return out[i].Start < out[j].Start @@ -789,6 +898,121 @@ func normaliseRanges(raw [][2]int, maxPage int) []pageRange { return out } +// clampConfidence coerces a model-emitted confidence into [0,1]. Out +// of range or NaN-ish values clamp to the nearest bound; a missing +// confidence arrives as 0 and stays 0 (treated as "unstated", not +// "certainly wrong" — the answer is still returned). +func clampConfidence(c float64) float64 { + if c < 0 { + return 0 + } + if c > 1 { + return 1 + } + return c +} + +// confidenceMap projects a single overall confidence across every +// selected section ID so the API layer's existing per-pick confidence +// surface (Result.Confidences) and abstention machinery can read it +// uniformly with the section-based strategies. Returns nil when there +// is no signal to surface (no IDs, or confidence unstated) so callers +// that gate on a non-empty map behave exactly as before for runs that +// don't carry confidence. +func confidenceMap(ids []tree.SectionID, confidence float64) map[tree.SectionID]float64 { + if len(ids) == 0 || confidence <= 0 { + return nil + } + m := make(map[tree.SectionID]float64, len(ids)) + for _, id := range ids { + m[id] = confidence + } + return m +} + +// rangesToPairs flattens []pageRange back to the [][2]int wire shape +// used by PageIndexEvent.CitedPages, so the done event reports the +// FINAL (deduped, capped) citation set rather than the raw model +// output. Consumers building citations from the event therefore never +// see the pre-dedup spray. +func rangesToPairs(ranges []pageRange) [][2]int { + if len(ranges) == 0 { + return nil + } + out := make([][2]int, len(ranges)) + for i, r := range ranges { + out[i] = [2]int{r.Start, r.End} + } + return out +} + +// CitationSource is one resolved citation the API layer renders into +// the response's citations[] array. Start/End are the inclusive page +// range; SectionIDs are the sections whose page range overlaps it +// (nil when the caller must compute them from the tree). It is the +// shared shape both /v1/answer/pageindex implementations build from so +// citation construction stays identical across the two API layers. +type CitationSource struct { + Start int + End int + SectionIDs []tree.SectionID +} + +// CitationSources resolves which page ranges a page-based Result +// should surface as citations, in render order. It prefers the FINAL +// cited-range set (res.CitedPages — deduped + capped by the strategy) +// so a confident single-pick answer yields ONE citation. When nothing +// was cited (a refusal, or a hop-capped run with no valid done) it +// falls back to the unique ranges in PagesRead so the caller still +// sees the navigation footprint rather than an empty citations array. +// +// In the cited-ranges path SectionIDs is left nil (the caller resolves +// it from the tree via SectionIDsOverlapping); in the PagesRead +// fallback the per-read SectionIDs the strategy already recorded are +// reused directly. +func CitationSources(res *Result) []CitationSource { + if res == nil { + return nil + } + if len(res.CitedPages) > 0 { + out := make([]CitationSource, 0, len(res.CitedPages)) + seen := make(map[[2]int]struct{}, len(res.CitedPages)) + for _, p := range res.CitedPages { + key := [2]int{p[0], p[1]} + if _, dup := seen[key]; dup { + continue + } + seen[key] = struct{}{} + out = append(out, CitationSource{Start: p[0], End: p[1]}) + } + return out + } + // Fallback: unique PagesRead ranges, first-seen order. + out := make([]CitationSource, 0, len(res.PagesRead)) + seen := make(map[[2]int]struct{}, len(res.PagesRead)) + for _, pr := range res.PagesRead { + key := [2]int{pr.StartPage, pr.EndPage} + if _, dup := seen[key]; dup { + continue + } + seen[key] = struct{}{} + out = append(out, CitationSource{Start: pr.StartPage, End: pr.EndPage, SectionIDs: pr.SectionIDs}) + } + return out +} + +// SectionIDsOverlapping returns, in document order, the IDs of every +// section in the tree whose [PageStart,PageEnd] overlaps the inclusive +// range [start,end]. It is the tree-facing companion to the internal +// sectionsOverlapping helper, exported so the API layers can resolve a +// cited page range to its sections without re-flattening logic. +func SectionIDsOverlapping(t *tree.Tree, start, end int) []tree.SectionID { + if t == nil || t.Root == nil { + return nil + } + return sectionsOverlapping(flattenSectionsByPage(t), []pageRange{{Start: start, End: end}}) +} + // sectionsOverlapping returns the IDs of every section whose // [PageStart, PageEnd] overlaps any of the cited ranges. Preserves // document order (because sections is page-sorted) and deduplicates. @@ -879,6 +1103,24 @@ type PageIndexAction struct { // single page can be expressed as [5,5]. CitedPages [][2]int `json:"cited_pages,omitempty"` + // Confidence is the model's self-reported confidence in the + // answer + citation set as a whole, in [0,1]. Optional on a done + // action. The loop surfaces it on the Result so callers can see + // how sure the agent was. A low overall confidence does NOT + // suppress the answer — the agent still returns its single best + // pick rather than nothing — it only annotates it. + Confidence float64 `json:"confidence,omitempty"` + + // CitedConfidences is an OPTIONAL per-range confidence aligned by + // index with CitedPages: CitedConfidences[i] is the model's + // confidence in CitedPages[i]. When present and the cited set + // exceeds the cap, it drives which ranges are kept (highest + // confidence first). When absent, the cap falls back to the + // model's own ordering (it is told to list its best range first). + // Parsed from the rich cited_pages shape + // [{"pages":[5,7],"confidence":0.9}] when the model emits it. + CitedConfidences []float64 `json:"-"` + // Reasoning is the per-call explanation the system prompt // asks the model to emit. Surfaced into the reasoning_trace // when the endpoint is called with ?reasoning=true. @@ -972,21 +1214,52 @@ func ParsePageIndexAction(raw string) (PageIndexAction, error) { if v, ok := fields["reasoning"]; ok { _ = json.Unmarshal(v, &a.Reasoning) } + // Overall answer confidence (0-1). Tolerant of a bare number or a + // numeric string; anything unparseable leaves it at 0 ("unstated"). + if v, ok := fields["confidence"]; ok && len(v) > 0 { + if err := json.Unmarshal(v, &a.Confidence); err != nil { + var cs string + if json.Unmarshal(v, &cs) == nil { + if f, perr := strconv.ParseFloat(strings.TrimSpace(cs), 64); perr == nil { + a.Confidence = f + } + } + } + } - // cited_pages: try the typed shape first ([[1,2],[5,7]]); fall - // back to the string-shape (["1-2","5-7"]) when the typed - // decode fails or is empty. + // cited_pages accepts three shapes, tried most-structured first: + // 1. [[1,2],[5,7]] (preferred) + // 2. [{"pages":[1,2],"confidence":0.9}, ...] (rich, per-range conf) + // 3. ["1-2","5-7"] (tolerated string form) + // The rich shape lets the model attach per-range confidence the + // cap can rank on; the other two leave CitedConfidences empty and + // the cap falls back to emission order. if v, ok := fields["cited_pages"]; ok && len(v) > 0 { if err := json.Unmarshal(v, &a.CitedPages); err != nil || len(a.CitedPages) == 0 { a.CitedPages = nil - var asStrings []string - if err := json.Unmarshal(v, &asStrings); err == nil { - for _, p := range asStrings { - s, e, ok := parsePageRangeString(p) + // Rich object form before string form: an array of objects + // fails [][2]int decode but carries the most signal. + var asObjs []citedRangeObj + if err := json.Unmarshal(v, &asObjs); err == nil && len(asObjs) > 0 { + for _, o := range asObjs { + s, e, ok := o.toRange() if !ok { continue } a.CitedPages = append(a.CitedPages, [2]int{s, e}) + a.CitedConfidences = append(a.CitedConfidences, o.Confidence) + } + } + if len(a.CitedPages) == 0 { + var asStrings []string + if err := json.Unmarshal(v, &asStrings); err == nil { + for _, p := range asStrings { + s, e, ok := parsePageRangeString(p) + if !ok { + continue + } + a.CitedPages = append(a.CitedPages, [2]int{s, e}) + } } } } @@ -1005,6 +1278,69 @@ func ParsePageIndexAction(raw string) (PageIndexAction, error) { return a, nil } +// citedRangeObj is the rich cited_pages element shape: +// {"pages":[5,7],"confidence":0.9}. It also tolerates explicit +// start/end keys ({"start":5,"end":7,"confidence":0.9}) and a +// "pages":"5-7" string, so a model that reaches for any of those +// keyings still produces a usable range + confidence. +type citedRangeObj struct { + Pages []int `json:"pages"` + PagesStr string `json:"-"` + Start int `json:"start"` + End int `json:"end"` + Confidence float64 `json:"confidence"` +} + +// UnmarshalJSON lets "pages" be either [5,7] or the "5-7" string form +// without tripping the array decode. Everything else uses the struct +// tags via an alias to avoid recursion. +func (o *citedRangeObj) UnmarshalJSON(b []byte) error { + type alias struct { + Pages json.RawMessage `json:"pages"` + Start int `json:"start"` + End int `json:"end"` + Confidence float64 `json:"confidence"` + } + var a alias + if err := json.Unmarshal(b, &a); err != nil { + return err + } + o.Start, o.End, o.Confidence = a.Start, a.End, a.Confidence + if len(a.Pages) > 0 { + if err := json.Unmarshal(a.Pages, &o.Pages); err != nil { + // Not an int array — try the "5-7" string form. + o.Pages = nil + _ = json.Unmarshal(a.Pages, &o.PagesStr) + } + } + return nil +} + +// toRange resolves the object's range, preferring the pages array, +// then the pages string, then explicit start/end. Returns ok=false +// when no usable range is present. +func (o citedRangeObj) toRange() (int, int, bool) { + if len(o.Pages) == 1 && o.Pages[0] > 0 { + return o.Pages[0], o.Pages[0], true + } + if len(o.Pages) >= 2 && o.Pages[0] > 0 && o.Pages[1] > 0 { + return o.Pages[0], o.Pages[1], true + } + if o.PagesStr != "" { + if s, e, ok := parsePageRangeString(o.PagesStr); ok { + return s, e, true + } + } + if o.Start > 0 { + end := o.End + if end <= 0 { + end = o.Start + } + return o.Start, end, true + } + return 0, 0, false +} + // parsePageRangeString parses "5", "5-7", or "5,7" (the loosest // shape the model is allowed to emit). Returns (start, end, true) // on success; (0, 0, false) otherwise. "5,7" is treated as @@ -1057,7 +1393,16 @@ func wrapPageObservation(tool, body string) string { // - Use tight page ranges; never fetch the whole document. // - Emit a one-sentence reason before each tool call. // - Answer only from tool output (no priors). -// - End with a done action carrying answer + cited_pages. +// - End with a done action carrying answer + cited_pages + confidence. +// +// CITATION DISCIPLINE is the load-bearing addition over the reference +// PageIndex prompt. FinanceBench measurements show the failure mode is +// not retrieval but commitment: when the model is sure it names ONE +// page range and scores f1=1.0; when unsure it lists ~5 hedged ranges +// and misses on all of them. The prompt therefore makes "commit to the +// single best range" the default and frames listing many uncertain +// ranges as the WORSE choice, and asks for an explicit confidence so a +// low-confidence answer is still a single committed pick, not a spray. const pageIndexSystemPrompt = `You are a document QA assistant navigating a paginated document. TOOL USE PROTOCOL: @@ -1065,19 +1410,28 @@ TOOL USE PROTOCOL: - Always call get_document_structure first to see titles + page ranges. - Call get_pages with TIGHT page ranges (e.g. {"tool":"get_pages","start_page":5,"end_page":7}). Never fetch the whole document. - Before each tool call, populate the "reasoning" field with ONE short sentence explaining why you're calling it. -- When you have enough evidence, emit done with the natural-language answer, the page ranges you relied on, and a one-line reasoning trace. +- When you have enough evidence, emit done with the natural-language answer, the page ranges you relied on, a confidence score, and a one-line reasoning trace. + +CITATION DISCIPLINE (read carefully — this determines whether your answer scores): +- Cite the FEWEST page ranges that actually contain the answer. Ideally cite exactly ONE range. Commit to your single best range rather than hedging. +- Listing many ranges because you are unsure is WORSE than committing to the one best range: every extra low-relevance citation hurts more than it helps. Do NOT pad cited_pages with maybes. +- Add a second or third range ONLY when the answer genuinely depends on evidence from physically separate parts of the document (e.g. a figure on one page and the footnote that defines it on another). If a single range already supports the answer, cite only that one. +- Even when your overall confidence is LOW, still return your SINGLE best range — never replace one uncertain pick with five. Report the low confidence in the "confidence" field instead. +- "confidence" is your honesty signal in [0,1] for how sure you are the answer is correct and grounded on the cited pages. Set it low when unsure; this never penalizes you — it only annotates the answer. RULES: - Answer based ONLY on tool output. Do not invent facts. - Cite by page range, not by section title. - Be concise. Single-paragraph answers when possible. -- If nothing in the document answers the query, emit done with answer="The document does not address this query." and an empty cited_pages array.` +- If nothing in the document answers the query, emit done with answer="The document does not address this query.", an empty cited_pages array, and confidence 0.` // pageIndexActionHelp is the one-shot reminder appended to the // initial user prompt so the model gets concrete examples without us -// needing to maintain a separate few-shot block. +// needing to maintain a separate few-shot block. The done example +// shows the SINGLE-range default (the multi-range form is allowed but +// deliberately not modelled here, so the example nudges toward one). const pageIndexActionHelp = `- {"tool":"get_document_structure","reasoning":"orient by titles"} — fetch the TOC tree (titles + page ranges, no body text) - {"tool":"get_pages","start_page":5,"end_page":7,"reasoning":"section on debt"} — fetch text covering pages 5-7 -- {"tool":"done","answer":"...","cited_pages":[[5,7],[12,12]],"reasoning":"the answer is grounded on these pages"} — final answer +- {"tool":"done","answer":"...","cited_pages":[[5,7]],"confidence":0.9,"reasoning":"the answer is grounded on these pages"} — final answer (cite ONE range when one suffices) Reply with ONLY the JSON object.` diff --git a/pkg/retrieval/pageindex_strategy_test.go b/pkg/retrieval/pageindex_strategy_test.go index 4d50b37..cbc569d 100644 --- a/pkg/retrieval/pageindex_strategy_test.go +++ b/pkg/retrieval/pageindex_strategy_test.go @@ -829,3 +829,321 @@ func indexOfSection(haystack []tree.SectionID, needle tree.SectionID) (int, bool } return -1, false } + +// distinctRangeCount counts the unique [start,end] pairs in a +// CitedPages slice. The whole point of the dedup work is that this +// equals len(CitedPages) — no pair appears twice. +func distinctRangeCount(pairs [][2]int) int { + seen := map[[2]int]struct{}{} + for _, p := range pairs { + seen[p] = struct{}{} + } + return len(seen) +} + +// TestPageIndexDedupCollapsesRepeatedRange is the core regression for +// the FinanceBench "same id ×5" miss (id_00499 returned sec_363... five +// times). A done that cites the SAME range five times, plus two more +// distinct ranges, must collapse to distinct ranges only AND respect +// the MaxCitations cap. With MaxCitations=3 the three distinct ranges +// survive; the four duplicate repeats are gone. +func TestPageIndexDedupCollapsesRepeatedRange(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() // pages 1-9 + llm := &pageScriptedLLM{ + replies: []string{ + // sec_a1 (pages 1-2) cited five times, then two distinct + // ranges. This is the spray-with-dupes pattern. + `{"tool":"done","answer":"sprayed","cited_pages":[[1,2],[1,2],[1,2],[1,2],[1,2],[3,4],[8,9]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + // Default MaxCitations is 3. + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + + // CitedPages must be deduped: the [1,2] repeat collapses to one, + // and the total distinct set is capped at 3. + if n := distinctRangeCount(res.CitedPages); n != len(res.CitedPages) { + t.Errorf("CitedPages contains duplicates: %v (distinct=%d, len=%d)", res.CitedPages, n, len(res.CitedPages)) + } + if len(res.CitedPages) > 3 { + t.Errorf("CitedPages must be capped at MaxCitations=3, got %d: %v", len(res.CitedPages), res.CitedPages) + } + // The three distinct ranges [1,2],[3,4],[8,9] all survive (each is + // under the cap), in canonical page order. + want := [][2]int{{1, 2}, {3, 4}, {8, 9}} + if len(res.CitedPages) != len(want) { + t.Fatalf("CitedPages = %v, want %v", res.CitedPages, want) + } + for i := range want { + if res.CitedPages[i] != want[i] { + t.Errorf("CitedPages[%d] = %v, want %v", i, res.CitedPages[i], want[i]) + } + } + + // SelectedIDs must also be deduped — sec_a1 must appear exactly + // once even though it was cited five times. This is the precision + // fix: no section id repeats. + count := map[tree.SectionID]int{} + for _, id := range res.SelectedIDs { + count[id]++ + } + for id, c := range count { + if c > 1 { + t.Errorf("section id %q appears %d times in SelectedIDs; must be deduped", id, c) + } + } +} + +// TestPageIndexCapKeepsHighestConfidence proves the cap is +// confidence-aware: when more than MaxCitations distinct ranges are +// cited WITH per-range confidence, the highest-confidence ranges win +// (not the first-listed). Here five ranges are cited; the cap is 2; +// the two with the top confidence must be the ones kept. +func TestPageIndexCapKeepsHighestConfidence(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() // pages 1-9 + // Rich cited_pages with per-range confidence. The two strongest + // are [8,9] (0.95) and [5,7] (0.9); the cap of 2 should keep those. + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"x","confidence":0.9,"cited_pages":[` + + `{"pages":[1,2],"confidence":0.1},` + + `{"pages":[3,4],"confidence":0.2},` + + `{"pages":[5,7],"confidence":0.9},` + + `{"pages":[8,9],"confidence":0.95},` + + `{"pages":[1,1],"confidence":0.05}]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + s.MaxCitations = 2 + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.CitedPages) != 2 { + t.Fatalf("CitedPages must be capped at 2, got %d: %v", len(res.CitedPages), res.CitedPages) + } + // Output is page-sorted, so [5,7] then [8,9]. + want := [][2]int{{5, 7}, {8, 9}} + for i := range want { + if res.CitedPages[i] != want[i] { + t.Errorf("CitedPages[%d] = %v, want %v (the two highest-confidence ranges)", i, res.CitedPages[i], want[i]) + } + } + // Overall confidence (0.9) must surface on the Result. + if res.Confidence != 0.9 { + t.Errorf("Result.Confidence = %v, want 0.9", res.Confidence) + } +} + +// TestPageIndexConfidentSinglePreserved is the happy half of the +// signal: a confident single citation must pass through untouched and +// the confidence must surface on both Result.Confidence and the +// per-section Confidences map (so the abstain machinery can read it). +func TestPageIndexConfidentSinglePreserved(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"Install on pages 1-2.","cited_pages":[[1,2]],"confidence":0.92,"reasoning":"clear"}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + + res, err := s.SelectWithCost(context.Background(), tr, "how do I install?", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.CitedPages) != 1 || res.CitedPages[0] != [2]int{1, 2} { + t.Errorf("single confident citation must be preserved, got %v", res.CitedPages) + } + if res.Confidence != 0.92 { + t.Errorf("Result.Confidence = %v, want 0.92", res.Confidence) + } + if len(res.Confidences) == 0 { + t.Fatal("Confidences map must surface the answer confidence per selected section") + } + // sec_a1 overlaps pages 1-2 and must carry the confidence. + if got := res.Confidences["sec_a1"]; got != 0.92 { + t.Errorf("Confidences[sec_a1] = %v, want 0.92", got) + } +} + +// TestPageIndexLowConfidenceStillCommitsSingle guards the over- +// suppression risk: even when the model reports LOW confidence, it must +// still return its single best pick — never an empty citation set. A +// low confidence annotates the answer; it does not delete it. +func TestPageIndexLowConfidenceStillCommitsSingle(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"Probably debt on 8-9.","cited_pages":[[8,9]],"confidence":0.15,"reasoning":"unsure"}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.CitedPages) != 1 || res.CitedPages[0] != [2]int{8, 9} { + t.Errorf("low-confidence answer must still commit to its single pick, got %v", res.CitedPages) + } + if res.Confidence != 0.15 { + t.Errorf("Result.Confidence = %v, want 0.15 (low but surfaced)", res.Confidence) + } + if len(res.SelectedIDs) == 0 { + t.Error("low confidence must not empty the selection") + } +} + +// TestPageIndexConfidenceClamped: out-of-range confidence values clamp +// into [0,1] rather than propagating absurd numbers. +func TestPageIndexConfidenceClamped(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"x","cited_pages":[[1,2]],"confidence":7.5}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if res.Confidence != 1.0 { + t.Errorf("confidence 7.5 must clamp to 1.0, got %v", res.Confidence) + } +} + +// TestPageIndexCapConfigurable: a custom MaxCitations is honoured. Six +// distinct ranges cited, cap=1 → exactly one survives. +func TestPageIndexCapConfigurable(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"x","cited_pages":[[1,1],[2,2],[3,3],[4,4],[5,5],[6,6]]}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + s.MaxCitations = 1 + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.CitedPages) != 1 { + t.Errorf("MaxCitations=1 must keep exactly one range, got %d: %v", len(res.CitedPages), res.CitedPages) + } + // With no per-range confidence the cap falls back to emission + // order: the first-listed range [1,1] wins. + if res.CitedPages[0] != [2]int{1, 1} { + t.Errorf("cap with no confidence should keep first-listed range [1,1], got %v", res.CitedPages[0]) + } +} + +// TestPageIndexEmptyCitationsNoConfidence: a refusal (empty cited_pages) +// leaves CitedPages nil, Confidences nil, and Confidence 0 — so the +// API layer's abstain check (which fires only on a non-empty +// Confidences map) behaves exactly as before for refusals. +func TestPageIndexEmptyCitationsNoConfidence(t *testing.T) { + t.Parallel() + + tr := buildPagedTree() + llm := &pageScriptedLLM{ + replies: []string{ + `{"tool":"done","answer":"The document does not address this query.","cited_pages":[],"confidence":0}`, + }, + } + s := retrieval.NewPageIndexStrategy(llm) + s.PageLoader = pageMapLoader{data: map[string]string{}} + + res, err := s.SelectWithCost(context.Background(), tr, "q", retrieval.ContextBudget{MaxTokens: 100000}) + if err != nil { + t.Fatalf("SelectWithCost: %v", err) + } + if len(res.CitedPages) != 0 { + t.Errorf("refusal must cite nothing, got %v", res.CitedPages) + } + if len(res.Confidences) != 0 { + t.Errorf("no confidence map on a zero-confidence refusal, got %v", res.Confidences) + } + if len(res.SelectedIDs) != 0 { + t.Errorf("refusal must select nothing, got %v", res.SelectedIDs) + } +} + +// TestParsePageIndexConfidenceAndRichCitations covers the new parser +// surfaces: a top-level confidence number, and the rich cited_pages +// object form carrying per-range confidence. +func TestParsePageIndexConfidenceAndRichCitations(t *testing.T) { + t.Parallel() + + t.Run("top_level_confidence", func(t *testing.T) { + a, err := retrieval.ParsePageIndexAction(`{"tool":"done","answer":"x","cited_pages":[[1,2]],"confidence":0.8}`) + if err != nil { + t.Fatalf("parse: %v", err) + } + if a.Confidence != 0.8 { + t.Errorf("Confidence = %v, want 0.8", a.Confidence) + } + }) + + t.Run("confidence_as_string", func(t *testing.T) { + a, err := retrieval.ParsePageIndexAction(`{"tool":"done","answer":"x","cited_pages":[[1,2]],"confidence":"0.6"}`) + if err != nil { + t.Fatalf("parse: %v", err) + } + if a.Confidence != 0.6 { + t.Errorf("Confidence = %v, want 0.6 (string form)", a.Confidence) + } + }) + + t.Run("rich_cited_pages_objects", func(t *testing.T) { + a, err := retrieval.ParsePageIndexAction(`{"tool":"done","answer":"x","cited_pages":[{"pages":[5,7],"confidence":0.9},{"pages":[12,12],"confidence":0.4}]}`) + if err != nil { + t.Fatalf("parse: %v", err) + } + want := [][2]int{{5, 7}, {12, 12}} + if len(a.CitedPages) != len(want) { + t.Fatalf("CitedPages = %v, want %v", a.CitedPages, want) + } + for i := range want { + if a.CitedPages[i] != want[i] { + t.Errorf("CitedPages[%d] = %v, want %v", i, a.CitedPages[i], want[i]) + } + } + }) + + t.Run("rich_cited_pages_start_end", func(t *testing.T) { + a, err := retrieval.ParsePageIndexAction(`{"tool":"done","answer":"x","cited_pages":[{"start":3,"end":4,"confidence":0.7}]}`) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(a.CitedPages) != 1 || a.CitedPages[0] != [2]int{3, 4} { + t.Errorf("CitedPages = %v, want [[3 4]]", a.CitedPages) + } + }) +} diff --git a/pkg/retrieval/strategy.go b/pkg/retrieval/strategy.go index a3db98a..0aa428d 100644 --- a/pkg/retrieval/strategy.go +++ b/pkg/retrieval/strategy.go @@ -97,6 +97,24 @@ type Result struct { // cost/coverage debugging: a 10-K answer that read pages 50-55 + // 102-104 leaves a concrete page footprint behind. PagesRead []PageReadEntry `json:"pages_read,omitempty"` + + // CitedPages is the FINAL set of page ranges the answer commits + // to — the model's cited_pages after dedup and the confidence cap, + // NOT every page it read (that is PagesRead). Page-based strategies + // populate this; it is the authoritative input for building the + // response's citations[] so a confident single-pick answer surfaces + // ONE citation even when it skimmed several pages to find it. Nil + // when the answer cited nothing (refusal) or the run was hop-capped + // without a valid done — callers then fall back to PagesRead. + CitedPages [][2]int `json:"cited_pages,omitempty"` + + // Confidence is the model's self-reported confidence in the answer + // as a whole, in [0,1], for page-based strategies that ask for it + // on the done action. Zero means unstated (the model didn't report + // one) — it is NOT a "definitely wrong" signal, and the API layer + // must not abstain on a zero here. Per-pick confidence for section + // strategies still lives in Confidences. + Confidence float64 `json:"confidence,omitempty"` } // PageReadEntry is one get_pages tool call that materialised during a