hallelx2 · hallelx2 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/cmd/engine/main.go b/cmd/engine/main.go
@@ -412,6 +412,9 @@ func buildPageIndexStrategy(c config.RetrievalConfig, client llmgate.Client, sto
 	if c.PageIndex.PageContentLimit > 0 {
 		p.PageContentLimit = c.PageIndex.PageContentLimit
 	}
+	if c.PageIndex.MaxCitations > 0 {
+		p.MaxCitations = c.PageIndex.MaxCitations
+	}
 	p.ModelOverride = c.PageIndex.Model
 	return p
 }

diff --git a/cmd/server/main.go b/cmd/server/main.go
@@ -488,6 +488,9 @@ func buildPageIndexStrategy(c enginecfg.RetrievalConfig, client llmgate.Client,
 	if c.PageIndex.PageContentLimit > 0 {
 		p.PageContentLimit = c.PageIndex.PageContentLimit
 	}
+	if c.PageIndex.MaxCitations > 0 {
+		p.MaxCitations = c.PageIndex.MaxCitations
+	}
 	p.ModelOverride = c.PageIndex.Model
 	return p
 }

diff --git a/config.example.yaml b/config.example.yaml
@@ -286,6 +286,16 @@ retrieval:
     # flagship model's context window. Higher values risk burning
     # context budget on stray full-document fetches.
     page_content_limit: 16000
+    # Cap on how many distinct page ranges the FINAL answer may
+    # cite. A confidence backstop, not a navigation limit: the
+    # model may read as many pages as max_hops allows, but the
+    # citation set it commits to is bounded. FinanceBench data
+    # shows the failure mode is "spray ~5 low-confidence ranges ->
+    # miss all" while a single confident pick scores perfectly;
+    # capping the final set tames the spray. 3 keeps a genuinely
+    # multi-location answer (e.g. a figure + its footnote) while
+    # cutting the low-confidence tail. Env: VLE_/VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS.
+    max_citations: 3
     # Override the navigation-loop model; empty inherits the
     # request's model (which itself falls back to the engine
     # default). Most deployments leave this blank — navigation

diff --git a/internal/api/pageindex.go b/internal/api/pageindex.go
@@ -175,6 +175,7 @@
 		"citations":   citations,
 		"strategy":    perReq.Name(),
 		"model":       budget.ModelName,
+		"confidence":  res.Confidence,
 		"hops_taken":  res.HopsTaken,
 		"usage": map[string]any{
 			"input_tokens":  res.Usage.InputTokens,
@@ -268,6 +269,7 @@
 		"citations":   citations,
 		"strategy":    strat.Name(),
 		"model":       budget.ModelName,
+		"confidence":  res.Confidence,
 		"hops_taken":  res.HopsTaken,
 		"usage": map[string]any{
 			"input_tokens":  res.Usage.InputTokens,
@@ -298,35 +300,35 @@
 	if res == nil {
 		return nil
 	}
-	// Build a citation per UNIQUE page range present in PagesRead.
-	// The set of pages the model "read" is a superset of what it
-	// cited — some get_pages calls don't end up in the final
-	// cited_pages list — but the union is the right cone of trust
-	// to surface as evidence. The trace token is computed over
-	// only the strictly-cited ranges, which the strategy already
-	// has, so citation drift doesn't break replay.
-	seen := make(map[[2]int]struct{}, len(res.PagesRead))
-	citations := make([]map[string]any, 0, len(res.PagesRead))
-
-	for _, pr := range res.PagesRead {
-		key := [2]int{pr.StartPage, pr.EndPage}
-		if _, dup := seen[key]; dup {
-			continue
+	// Source of truth for citations is the FINAL cited-range set
+	// (res.CitedPages) — already deduped and capped to MaxCitations by
+	// the strategy. This is what makes a confident single-pick answer
+	// surface ONE citation even when it skimmed several pages to find
+	// it: PagesRead is the navigation footprint (a superset), not the
+	// commitment. Fall back to PagesRead only when nothing was cited
+	// (refusal) or the run was hop-capped without a done — the trace
+	// token is keyed on the cited ranges either way, so this never
+	// breaks replay.
+	sources := retrieval.CitationSources(res)
+	citations := make([]map[string]any, 0, len(sources))
+
+	for _, src := range sources {
+		sectionIDs := src.SectionIDs
+		if sectionIDs == nil {
+			sectionIDs = retrieval.SectionIDsOverlapping(t, src.Start, src.End)
 		}
-		seen[key] = struct{}{}
-
 		c := map[string]any{
-			"start_page":  pr.StartPage,
-			"end_page":    pr.EndPage,
-			"section_ids": pr.SectionIDs,
+			"start_page":  src.Start,
+			"end_page":    src.End,
+			"section_ids": sectionIDs,
 		}
 
 		// Quote extraction is best-effort: an LLM blip or empty
 		// content returns no quote, which is a normal degradation
 		// path. We materialise the cited content from storage and
 		// run one SpanExtractor call per citation.
 		if d.LLM != nil {
-			content := d.materialiseCitedContent(ctx, t, pr.SectionIDs)
+			content := d.materialiseCitedContent(ctx, t, sectionIDs)
 			if strings.TrimSpace(content) != "" {
 				ext := d.pageIndexSpanExtractor(requestModel)
 				span, _, err := ext.Extract(ctx, content, query)
@@ -476,7 +478,7 @@
 // the in-response trace_token against the canonical input set —
 // kept here rather than exported from the retrieval package so
 // the API layer owns its own input wiring.
 func pageIndexTraceTokenFromCitations(docID tree.DocumentID, model string, ranges [][2]int) string {
 	strs := make([]string, 0, len(ranges))
 	for _, r := range ranges {
 		if r[0] == r[1] {

diff --git a/internal/api/pageindex_test.go b/internal/api/pageindex_test.go
@@ -261,6 +261,107 @@ func TestHandleAnswerPageIndexReasoningTrace(t *testing.T) {
 	}
 }
 
+// TestHandleAnswerPageIndexDedupAndCapCitations is the end-to-end
+// proof of the bench-facing fix: a done that sprays the SAME range
+// five times plus extras must produce a citations[] array that is
+// deduped and capped at 3 — no duplicate page ranges, every distinct
+// range still present, and confidence surfaced (section ids are not
+// asserted here). This is the API-layer mirror of the strategy's
+// dedup test and the reason precision@5 stops deflating.
+func TestHandleAnswerPageIndexDedupAndCapCitations(t *testing.T) {
+	t.Parallel()
+
+	// Read one range, then a done citing [1,2] five times plus two
+	// more ranges and a confidence; cap of 3 presumably the default.
+	deps, _, _ := newTestDeps(t,
+		`{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"skim"}`,
+		`{"tool":"done","answer":"sprayed","confidence":0.8,"cited_pages":[[1,2],[1,2],[1,2],[1,2],[1,2],[3,4],[8,9]]}`,
+	)
+
+	body := strings.NewReader(`{"document_id":"doc_x","query":"q"}`)
+	req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", body)
+	rec := httptest.NewRecorder()
+	pageIndexHandlerRouter(deps).ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
+	}
+	var resp map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+
+	cits, ok := resp["citations"].([]any)
+	if !ok {
+		t.Fatalf("citations missing: %v", resp["citations"])
+	}
+	// Capped at 3, every page range distinct (JSON numbers are float64).
+	if len(cits) > 3 {
+		t.Errorf("citations must be capped at 3, got %d", len(cits))
+	}
+	seenRange := map[[2]int]int{}
+	for _, raw := range cits {
+		c := raw.(map[string]any)
+		key := [2]int{int(c["start_page"].(float64)), int(c["end_page"].(float64))}
+		seenRange[key]++
+	}
+	for key, n := range seenRange {
+		if n > 1 {
+			t.Errorf("citation page range %v appears %d times; must be deduped", key, n)
+		}
+	}
+	// The three distinct ranges all survive (under the cap).
+	for _, want := range [][2]int{{1, 2}, {3, 4}, {8, 9}} {
+		if seenRange[want] == 0 {
+			t.Errorf("expected a citation for range %v, citations: %v", want, cits)
+		}
+	}
+	// confidence surfaces on the response.
+	if conf, ok := resp["confidence"].(float64); !ok || conf != 0.8 {
+		t.Errorf("response confidence = %v, want 0.8", resp["confidence"])
+	}
+}
+
+// TestHandleAnswerPageIndexConfidentSingleCitation is the happy half
+// at the API layer: a confident single-range done — even after the
+// model skimmed several pages — surfaces exactly ONE citation. This
+// is the f1=1.0 commit case, and the fix that stops a multi-page
+// navigation footprint from leaking into citations[].
+func TestHandleAnswerPageIndexConfidentSingleCitation(t *testing.T) {
+	t.Parallel()
+
+	// The model reads pages 1-2 AND 8-9 while searching, but commits
+	// to a single cited range [8,9]. citations[] must be just [8,9].
+	deps, _, _ := newTestDeps(t,
+		`{"tool":"get_pages","start_page":1,"end_page":2,"reasoning":"check setup"}`,
+		`{"tool":"get_pages","start_page":8,"end_page":9,"reasoning":"check debt"}`,
+		`{"tool":"done","answer":"Debt is on 8-9.","confidence":0.95,"cited_pages":[[8,9]],"reasoning":"clear"}`,
+	)
+
+	req := httptest.NewRequest(http.MethodPost, "/v1/answer/pageindex", strings.NewReader(`{"document_id":"doc_x","query":"debt?"}`))
+	rec := httptest.NewRecorder()
+	pageIndexHandlerRouter(deps).ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String())
+	}
+	var resp map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+
+	cits, ok := resp["citations"].([]any)
+	if !ok || len(cits) != 1 {
+		t.Fatalf("confident single pick must yield exactly ONE citation, got %v", resp["citations"])
+	}
+	first := cits[0].(map[string]any)
+	if first["start_page"].(float64) != 8 || first["end_page"].(float64) != 9 {
+		t.Errorf("citation = %v-%v, want 8-9", first["start_page"], first["end_page"])
+	}
+	// pages_read keeps the full footprint (both reads); only citations[] is tightened.
+	if pages, ok := resp["pages_read"].([]any); !ok || len(pages) != 2 {
+		t.Errorf("pages_read must keep the full footprint (2 reads), got %v", resp["pages_read"])
+	}
+}
+
 // TestHandleAnswerPageIndexReasoningTraceQueryParam: the
 // ?reasoning=true query param is an alternative to the body field.
 // Some clients prefer it for GET-friendliness when prototyping.

diff --git a/internal/config/config.go b/internal/config/config.go
@@ -326,6 +326,16 @@ func applyEnvOverrides(c *Config) {
 	if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" {
 		c.Engine.Ingest.Mode = v
 	}
+	// PageIndex final-citation cap. Forwarded so the spray backstop can
+	// be tuned on the live server without a config edit (e.g. tighten to
+	// 1 for single-citation answers, or relax for doc sets with many
+	// multi-location questions). VLS_ wins over VLE_; a garbled or
+	// negative value is ignored; 0 falls back to the strategy default.
+	if v := firstEnv("VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS", "VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n >= 0 {
+			c.Engine.Retrieval.PageIndex.MaxCitations = n
+		}
+	}
 	// Total-parse timeout (seconds). Forwarded so the deployed server
 	// honours a tuned parse deadline without a secret/config edit — the
 	// outermost robustness valve against a parse that hangs (pre-LLM,

diff --git a/internal/config/config_pageindex_test.go b/internal/config/config_pageindex_test.go
@@ -0,0 +1,54 @@
+package config
+
+import (
+	"os"
+	"testing"
+)
+
+// TestForwardPageIndexMaxCitations covers the server-config
+// pass-through for the PageIndex final-citation cap: the operator can
+// set it via VLS_ or VLE_ and have it reach the embedded engine
+// config, with VLS_ winning when both are present and a garbled value
+// preserving the engine default.
+func TestForwardPageIndexMaxCitations(t *testing.T) {
+	// t.Setenv registers cleanups that restore the exact pre-test state
+	// (a previously unset variable ends up unset again, not set to "").
+	const vls = "VLS_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"
+	const vle = "VLE_RETRIEVAL_PAGEINDEX_MAX_CITATIONS"
+	t.Setenv(vls, "")
+	t.Setenv(vle, "")
+
+	// Engine default is 3 with no env set.
+	os.Unsetenv(vls)
+	os.Unsetenv(vle)
+	cfg := Default()
+	applyEnvOverrides(&cfg)
+	if cfg.Engine.Retrieval.PageIndex.MaxCitations != 3 {
+		t.Errorf("default max_citations = %d, want 3", cfg.Engine.Retrieval.PageIndex.MaxCitations)
+	}
+
+	// VLE_ alone forwards through.
+	t.Setenv(vle, "2")
+	cfg2 := Default()
+	applyEnvOverrides(&cfg2)
+	if cfg2.Engine.Retrieval.PageIndex.MaxCitations != 2 {
+		t.Errorf("VLE_ forward: max_citations = %d, want 2", cfg2.Engine.Retrieval.PageIndex.MaxCitations)
+	}
+
+	// VLS_ wins over VLE_.
+	t.Setenv(vls, "1")
+	cfg3 := Default()
+	applyEnvOverrides(&cfg3)
+	if cfg3.Engine.Retrieval.PageIndex.MaxCitations != 1 {
+		t.Errorf("VLS_ should win: max_citations = %d, want 1", cfg3.Engine.Retrieval.PageIndex.MaxCitations)
+	}
+
+	// Garbled value preserves the engine default (3), does not zero it.
+	os.Unsetenv(vls)
+	t.Setenv(vle, "heaps")
+	cfg4 := Default()
+	applyEnvOverrides(&cfg4)
+	if cfg4.Engine.Retrieval.PageIndex.MaxCitations != 3 {
+		t.Errorf("garbage value should preserve default 3, got %d", cfg4.Engine.Retrieval.PageIndex.MaxCitations)
+	}
+}
diff --git a/internal/handler/answer_pageindex.go b/internal/handler/answer_pageindex.go
@@ -200,6 +200,7 @@ func (h *AnswerPageIndexHandler) HandleAnswerPageIndex(w http.ResponseWriter, r
 		"citations":   citations,
 		"strategy":    perReq.Name(),
 		"model":       budget.ModelName,
+		"confidence":  res.Confidence,
 		"hops_taken":  res.HopsTaken,
 		"usage": map[string]any{
 			"input_tokens":  res.Usage.InputTokens,
@@ -281,6 +282,7 @@ func (h *AnswerPageIndexHandler) serveStream(w http.ResponseWriter, r *http.Requ
 		"citations":   citations,
 		"strategy":    strat.Name(),
 		"model":       budget.ModelName,
+		"confidence":  res.Confidence,
 		"hops_taken":  res.HopsTaken,
 		"usage": map[string]any{
 			"input_tokens":  res.Usage.InputTokens,
@@ -296,32 +298,33 @@ func (h *AnswerPageIndexHandler) serveStream(w http.ResponseWriter, r *http.Requ
 	emitSSE("answer", final)
 }
 
-// buildCitations transforms the strategy's PagesRead + the section
-// tree into the response's citations array: one citation per unique
-// cited page range, each carrying the overlapping section IDs and a
-// best-effort grounding quote extracted from the cited content.
+// buildCitations transforms the strategy's FINAL cited ranges + the
+// section tree into the response's citations array: one citation per
+// cited page range (deduped + capped by the strategy), each carrying
+// the overlapping section IDs and a best-effort grounding quote
+// extracted from the cited content. Falls back to the PagesRead
+// footprint when nothing was cited (refusal / hop-capped run) so the
+// caller still sees where the model looked.
 func (h *AnswerPageIndexHandler) buildCitations(ctx context.Context, t *tree.Tree, res *retrieval.Result, query, requestModel string) []map[string]any {
 	if res == nil {
 		return nil
 	}
-	seen := make(map[[2]int]struct{}, len(res.PagesRead))
-	citations := make([]map[string]any, 0, len(res.PagesRead))
+	sources := retrieval.CitationSources(res)
+	citations := make([]map[string]any, 0, len(sources))
 
-	for _, pr := range res.PagesRead {
-		key := [2]int{pr.StartPage, pr.EndPage}
-		if _, dup := seen[key]; dup {
-			continue
+	for _, src := range sources {
+		sectionIDs := src.SectionIDs
+		if sectionIDs == nil {
+			sectionIDs = retrieval.SectionIDsOverlapping(t, src.Start, src.End)
 		}
-		seen[key] = struct{}{}
-
 		c := map[string]any{
-			"start_page":  pr.StartPage,
-			"end_page":    pr.EndPage,
-			"section_ids": pr.SectionIDs,
+			"start_page":  src.Start,
+			"end_page":    src.End,
+			"section_ids": sectionIDs,
 		}
 
 		if h.llm != nil {
-			content := h.materialiseCitedContent(ctx, t, pr.SectionIDs)
+			content := h.materialiseCitedContent(ctx, t, sectionIDs)
 			if strings.TrimSpace(content) != "" {
 				ext := h.spanExtractor(requestModel)
 				span, _, err := ext.Extract(ctx, content, query)

diff --git a/openapi.yaml b/openapi.yaml
@@ -1118,6 +1118,20 @@ components:
           enum: [pageindex]
         model:
           type: string
+        confidence:
+          type: number
+          format: float
+          minimum: 0
+          maximum: 1
+          description: |
+            The agent's self-reported confidence in the answer +
+            citation set, in [0,1], taken from the terminal `done`
+            tool call. 0 means the model did not report one (it is
+            NOT a "definitely wrong" signal). Surfaced so callers
+            can treat a low-confidence answer more cautiously; it
+            does not suppress the answer — even an unsure agent
+            still returns its single best citation rather than a
+            spray of hedged ones.
         hops_taken:
           type: integer
           description: |