hallelx2 · hallelx2 · May 28, 2026 · May 28, 2026 · sourcery-ai · May 28, 2026
diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go
@@ -10,6 +10,7 @@
 	"regexp"
 	"sort"
 	"strings"
+	"time"
 
 	"github.com/hallelx2/pdftable"
 	pdflib "github.com/ledongthuc/pdf"
@@ -1174,7 +1175,7 @@
 // isEncryptedPDFError reports whether the given error from
 // ledongthuc/pdf indicates the document is encrypted. The library
 // has no proper error type for this, so we match on the message.
 func isEncryptedPDFError(err error) bool {
 	if err == nil {
 		return false
 	}
@@ -1256,7 +1257,21 @@
 	minCols := opts.MinTableCols
 
 	var sections []Section
+	// Total table-extraction budget across the whole document. pdftable's
+	// finder is geometry-heavy and has pathological cases on dense
+	// financial pages (a balance sheet with hundreds of ruling segments
+	// can blow up the intersection grid). Once the doc-wide budget is
+	// spent we stop extracting tables on the remaining pages and let
+	// ingest proceed text-only — a doc with no table sections still
+	// indexes fine (the page text, including the table's text, is in the
+	// prose sections).
+	deadline := time.Now().Add(tableExtractDocBudget)
 	for n := 1; n <= doc.NumPages(); n++ {
+		if time.Now().After(deadline) {
+			slog.Warn("pdf: table-extraction doc budget exhausted; skipping remaining pages",
+				"pages_done", n-1, "total_pages", doc.NumPages())
+			break
+		}
 		page, err := doc.Page(n)
 		if err != nil {
 			continue
@@ -1298,27 +1313,60 @@
 	return sections
 }
 
-// safeExtractTables wraps page.ExtractTables in a recover() so a bug
-// deep inside pdftable can never take down the engine's ingest
-// pipeline. Errors and panics are logged at warn level (not error —
-// the document still gets ingested, just without its tables).
-func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) (tables []*pdftable.Table) {
-	defer func() {
-		if r := recover(); r != nil {
-			slog.Warn("pdf: table extraction panicked",
-				"page", pageNum,
-				"panic", fmt.Sprintf("%v", r))
-			tables = nil
-		}
+// Table-extraction time bounds. pdftable's finder is pure geometry (no
+// LLM, so the ingest LLM-call timeout doesn't cover it) and has
+// pathological cases on dense financial pages where the
+// intersection-grid construction blows up. A 92-page 10-K was observed
+// stuck in `parsing` for 20+ minutes on a single such page. These
+// bounds cap how long ingest waits on table extraction:
+//   - tableExtractPageTimeout caps the wait on any single page.
+//   - tableExtractDocBudget caps the whole document (checked once per page,
+//     so it can overrun by a page); once spent, remaining pages go text-only.
+const (
+	tableExtractPageTimeout = 15 * time.Second
+	tableExtractDocBudget   = 90 * time.Second
+)
+
+// safeExtractTables wraps page.ExtractTables so a bug — or a pathological
+// O(n^2) page — deep inside pdftable can never take down (or hang) the
+// engine's ingest pipeline. It runs the extraction on a goroutine and
+// abandons it if it exceeds tableExtractPageTimeout. Errors, panics, and
+// timeouts are logged at warn level; the document still ingests, just
+// without that page's tables.
+func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) []*pdftable.Table {
+	type result struct {
+		tables []*pdftable.Table
+		err    error
+	}
+	done := make(chan result, 1)
+	go func() {
+		defer func() {
+			if r := recover(); r != nil {
+				slog.Warn("pdf: table extraction panicked",
+					"page", pageNum, "panic", fmt.Sprintf("%v", r))
+				done <- result{}
+			}
+		}()
+		t, err := page.ExtractTables(settings)
+		done <- result{tables: t, err: err}
 	}()
-	tables, err := page.ExtractTables(settings)
-	if err != nil {
-		slog.Warn("pdf: table extraction failed",
-			"page", pageNum,
-			"err", err)
+
+	select {
+	case <-time.After(tableExtractPageTimeout):
+		// The goroutine is abandoned, not cancelled: it runs until ExtractTables
+		// returns, then sends on the buffered channel without blocking and exits.
+		// A single slow page is not worth stalling the whole ingest.
+		slog.Warn("pdf: table extraction timed out; skipping page",
+			"page", pageNum, "timeout", tableExtractPageTimeout)
 		return nil
+	case res := <-done:
+		if res.err != nil {
+			slog.Warn("pdf: table extraction failed",
+				"page", pageNum, "err", res.err)
+			return nil
+		}
+		return res.tables
 	}
-	return tables
 }
 
 // normaliseTableRows trims whitespace per cell and pads short rows out