From 856b0f4f33928c921814d2a62903803dd3999afd Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Thu, 28 May 2026 17:39:43 +0100
Subject: [PATCH] fix(parser): time-box pdftable table extraction (un-hang
 parse)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

3M's 1.27MB 10-K sat in `parsing` for 20+ minutes during a bench run:
pdftable's geometry-based table finder has a pathological case on dense
financial pages (a balance sheet's hundreds of ruling segments blow up
the intersection-grid construction). Parse is pure-geometry / pre-LLM,
so the ingest per-LLM-call timeout (the earlier fix) doesn't cover it,
and there was no other bound — the document hung indefinitely.

Two bounds, both in pkg/parser/pdf.go:
  - safeExtractTables now runs page.ExtractTables on a goroutine and
    abandons it after tableExtractPageTimeout (15s). The slow page is
    skipped (text-only); a single bad page can't stall the parse.
  - extractPDFTables tracks a doc-wide tableExtractDocBudget (90s);
    once spent, remaining pages are skipped and ingest proceeds.

A document that loses its table sections still indexes fine — the page
text (including the table's text) lives in the prose sections, which is
exactly what the page-based retrieval strategy reads. The proper fix is
upstream in pdftable's finder; this keeps ingest robust meanwhile.

go build/vet/test green.
---
 pkg/parser/pdf.go | 84 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go
index 7c9bb46..bf120dd 100644
--- a/pkg/parser/pdf.go
+++ b/pkg/parser/pdf.go
@@ -10,6 +10,7 @@ import (
 	"regexp"
 	"sort"
 	"strings"
+	"time"
 
 	"github.com/hallelx2/pdftable"
 	pdflib "github.com/ledongthuc/pdf"
@@ -1256,7 +1257,21 @@ func extractPDFTables(doc pdftable.Document, opts *TableOpts) []Section {
 	minCols := opts.MinTableCols
 
 	var sections []Section
+	// Total table-extraction budget across the whole document. pdftable's
+	// finder is geometry-heavy and has pathological cases on dense
+	// financial pages (a balance sheet with hundreds of ruling segments
+	// can blow up the intersection grid). Once the doc-wide budget is
+	// spent we stop extracting tables on the remaining pages and let
+	// ingest proceed text-only — a doc with no table sections still
+	// indexes fine (the page text, including the table's text, is in the
+	// prose sections).
+	deadline := time.Now().Add(tableExtractDocBudget)
 	for n := 1; n <= doc.NumPages(); n++ {
+		if time.Now().After(deadline) {
+			slog.Warn("pdf: table-extraction doc budget exhausted; skipping remaining pages",
+				"pages_done", n-1, "total_pages", doc.NumPages())
+			break
+		}
 		page, err := doc.Page(n)
 		if err != nil {
 			continue
@@ -1298,27 +1313,60 @@ func extractPDFTables(doc pdftable.Document, opts *TableOpts) []Section {
 	return sections
 }
 
-// safeExtractTables wraps page.ExtractTables in a recover() so a bug
-// deep inside pdftable can never take down the engine's ingest
-// pipeline. Errors and panics are logged at warn level (not error —
-// the document still gets ingested, just without its tables).
-func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) (tables []*pdftable.Table) {
-	defer func() {
-		if r := recover(); r != nil {
-			slog.Warn("pdf: table extraction panicked",
-				"page", pageNum,
-				"panic", fmt.Sprintf("%v", r))
-			tables = nil
-		}
+// Table-extraction time bounds. pdftable's finder is pure geometry (no
+// LLM, so the ingest LLM-call timeout doesn't cover it) and has
+// pathological cases on dense financial pages where the
+// intersection-grid construction blows up. A 92-page 10-K was observed
+// stuck in `parsing` for 20+ minutes on a single such page. These
+// bounds keep table extraction strictly time-boxed:
+//   - tableExtractPageTimeout caps any single page.
+//   - tableExtractDocBudget caps the whole document (checked in the
+//     page loop); once spent, remaining pages are skipped text-only.
+const (
+	tableExtractPageTimeout = 15 * time.Second
+	tableExtractDocBudget   = 90 * time.Second
+)
+
+// safeExtractTables wraps page.ExtractTables so a bug — or a pathological
+// O(n^2) page — deep inside pdftable can never take down (or hang) the
+// engine's ingest pipeline. It runs the extraction on a goroutine and
+// abandons it if it exceeds tableExtractPageTimeout. Errors, panics, and
+// timeouts are logged at warn level; the document still ingests, just
+// without that page's tables.
+func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) []*pdftable.Table {
+	type result struct {
+		tables []*pdftable.Table
+		err    error
+	}
+	done := make(chan result, 1)
+	go func() {
+		defer func() {
+			if r := recover(); r != nil {
+				slog.Warn("pdf: table extraction panicked",
+					"page", pageNum, "panic", fmt.Sprintf("%v", r))
+				done <- result{}
+			}
+		}()
+		t, err := page.ExtractTables(settings)
+		done <- result{tables: t, err: err}
 	}()
-	tables, err := page.ExtractTables(settings)
-	if err != nil {
-		slog.Warn("pdf: table extraction failed",
-			"page", pageNum,
-			"err", err)
+
+	select {
+	case <-time.After(tableExtractPageTimeout):
+		// The goroutine is abandoned, not cancelled: it keeps running until
+		// ExtractTables returns (the buffered channel lets that late send
+		// succeed unread). A slow page is not worth stalling the whole ingest.
+		slog.Warn("pdf: table extraction timed out; skipping page",
+			"page", pageNum, "timeout", tableExtractPageTimeout)
 		return nil
+	case res := <-done:
+		if res.err != nil {
+			slog.Warn("pdf: table extraction failed",
+				"page", pageNum, "err", res.err)
+			return nil
+		}
+		return res.tables
 	}
-	return tables
 }
 
 // normaliseTableRows trims whitespace per cell and pads short rows out