Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 66 additions & 18 deletions pkg/parser/pdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"regexp"
"sort"
"strings"
"time"

"github.com/hallelx2/pdftable"
pdflib "github.com/ledongthuc/pdf"
Expand Down Expand Up @@ -1174,7 +1175,7 @@
// isEncryptedPDFError reports whether the given error from
// ledongthuc/pdf indicates the document is encrypted. The library
// has no proper error type for this, so we match on the message.
func isEncryptedPDFError(err error) bool {

Check failure on line 1178 in pkg/parser/pdf.go

View workflow job for this annotation

GitHub Actions / lint

func isEncryptedPDFError is unused (U1000)
if err == nil {
return false
}
Expand Down Expand Up @@ -1256,7 +1257,21 @@
minCols := opts.MinTableCols

var sections []Section
// Total table-extraction budget across the whole document. pdftable's
// finder is geometry-heavy and has pathological cases on dense
// financial pages (a balance sheet with hundreds of ruling segments
// can blow up the intersection grid). Once the doc-wide budget is
// spent we stop extracting tables on the remaining pages and let
// ingest proceed text-only — a doc with no table sections still
// indexes fine (the page text, including the table's text, is in the
// prose sections).
deadline := time.Now().Add(tableExtractDocBudget)
for n := 1; n <= doc.NumPages(); n++ {
if time.Now().After(deadline) {
slog.Warn("pdf: table-extraction doc budget exhausted; skipping remaining pages",
"pages_done", n-1, "total_pages", doc.NumPages())
break
}
page, err := doc.Page(n)
if err != nil {
continue
Expand Down Expand Up @@ -1298,27 +1313,60 @@
return sections
}

// safeExtractTables wraps page.ExtractTables in a recover() so a bug
// deep inside pdftable can never take down the engine's ingest
// pipeline. Errors and panics are logged at warn level (not error —
// the document still gets ingested, just without its tables).
func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) (tables []*pdftable.Table) {
defer func() {
if r := recover(); r != nil {
slog.Warn("pdf: table extraction panicked",
"page", pageNum,
"panic", fmt.Sprintf("%v", r))
tables = nil
}
// Table-extraction time bounds. pdftable's finder is pure geometry (no
// LLM, so the ingest LLM-call timeout doesn't cover it) and has
// pathological cases on dense financial pages where the
// intersection-grid construction blows up. A 92-page 10-K was observed
// stuck in `parsing` for 20+ minutes on a single such page. These
// bounds keep table extraction strictly time-boxed:
// - tableExtractPageTimeout caps any single page.
// - tableExtractDocBudget caps the whole document (checked in the
// page loop); once spent, remaining pages are skipped text-only.
const (
tableExtractPageTimeout = 15 * time.Second
tableExtractDocBudget = 90 * time.Second
)

// safeExtractTables wraps page.ExtractTables so a bug — or a pathological
// O(n^2) page — deep inside pdftable can never take down (or hang) the
// engine's ingest pipeline. It runs the extraction on a goroutine and
// abandons it if it exceeds tableExtractPageTimeout. Errors, panics, and
// timeouts are logged at warn level; the document still ingests, just
// without that page's tables.
func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) []*pdftable.Table {
type result struct {
tables []*pdftable.Table
err error
}
done := make(chan result, 1)
go func() {
defer func() {
if r := recover(); r != nil {
slog.Warn("pdf: table extraction panicked",
"page", pageNum, "panic", fmt.Sprintf("%v", r))
done <- result{}
}
}()
t, err := page.ExtractTables(settings)
Comment on lines +1341 to +1350
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (performance): Current timeout approach abandons the goroutine instead of canceling it, which could tie up resources longer than necessary.

Since the goroutine keeps running after the select times out, page.ExtractTables can still consume CPU and memory for its full worst-case duration, and many such timeouts could lead to a buildup of in-flight goroutines. If pdftable supports contexts, consider wrapping the call in a context.WithTimeout and passing it through so the work is actually canceled. If not, consider limiting concurrent extractions (e.g., worker pool or semaphore) so callers can’t create unbounded abandoned goroutines on problematic documents.

done <- result{tables: t, err: err}
}()
Comment on lines +1341 to 1352
tables, err := page.ExtractTables(settings)
if err != nil {
slog.Warn("pdf: table extraction failed",
"page", pageNum,
"err", err)

select {
case <-time.After(tableExtractPageTimeout):
// The goroutine is abandoned (it will finish on its own and the
// buffered channel lets it send without blocking, then get GC'd).
// A single slow page is not worth stalling the whole ingest.
slog.Warn("pdf: table extraction timed out; skipping page",
"page", pageNum, "timeout", tableExtractPageTimeout)
return nil
case res := <-done:
if res.err != nil {
slog.Warn("pdf: table extraction failed",
"page", pageNum, "err", res.err)
return nil
}
return res.tables
}
Comment on lines +1354 to 1369
return tables
}

// normaliseTableRows trims whitespace per cell and pads short rows out
Expand Down
Loading