Silverfin-Engineering · vandric · Apr 20, 2026 · peter-toth-silverfin · May 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 *.gem
 .DS_Store
+.idea
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,50 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Commands
+
+```bash
+# Install dependencies (open3_safe is fetched from GitHub at a pinned commit ref — see Gemfile)
+bundle install
+
+# Run all tests
+bundle exec rake test
+
+# Run a single test file
+bundle exec ruby test/unit/test_extract_text.rb
+
+# Build and install the gem locally
+bundle exec rake gem:install
+```
+
+## External Dependencies
+
+The following system tools must be installed for full functionality:
+- `gm` (GraphicsMagick) — image extraction and OCR pre-processing
+- `pdftotext`, `pdfinfo`, `pdftk` — text extraction and PDF metadata
+- `tesseract` — OCR (with optional `osd` language pack for orientation detection)
+- `java` + JODConverter (vendored in `vendor/`) — non-PDF document conversion
+- `open3_safe` gem (a Ruby dependency rather than a system tool; fetched by `bundle install`) — pinned to a specific GitHub commit ref in `Gemfile`; `Gemfile.lock` must be regenerated after changing the ref
+
+## Architecture
+
+`lib/docsplit.rb` is the public API entry point. It defines the `Docsplit` module, checks `PATH` for dependencies at load time, and delegates to extractor classes.
+
+**Extractor classes** (`lib/docsplit/`):
+- `TextExtractor` — extracts text via `pdftotext`, falls back to Tesseract OCR for pages below `MIN_TEXT_PER_PAGE` (100 bytes)
+- `ImageExtractor` — rasterizes PDF pages via GraphicsMagick (`gm convert`/`gm mogrify`)
+- `PdfExtractor` — converts non-PDF documents to PDF using LibreOffice or JODConverter (Java)
+- `InfoExtractor` — parses `pdfinfo` output for metadata
+- `PageExtractor` — bursts PDFs into single-page PDFs via `pdftk`/`pdftailor`
+
+**`ExternalProcess` module** (`external_process.rb`) is mixed into extractor classes. Its `run` method wraps `Open3Safe.capture3_safe` to execute subprocesses with:
+- timeout (SIGTERM → SIGKILL after 5s)
+- optional RSS memory limit via `max_rss:`
+- stdout+stderr merged, blank lines and consecutive duplicate lines filtered (guards against memory bloat from corrupt PDFs — silverfin/issues/1998)
+
+**Timeout-aware public API**: `extract_text_with_timeouts` and `extract_images_with_timeouts` accept `timeout` (overall) and `item_timeout` (per page/file); `extract_pdf_with_timeout` accepts only `timeout`. RSS caps are not part of the public API — each extractor hardcodes its own `MAX_RSS` constant (`TextExtractor`/`ImageExtractor`: 512 MiB, `TextExtractor::TESSERACT_MAX_RSS`: 1 GiB, `PdfExtractor`: 2 GiB) and passes it into `run(..., max_rss:)` internally. The plain `extract_*` variants have no timeouts.
+
+## Test Structure
+
+Tests live in `test/unit/`, use Minitest, and write output to `test/output/` (cleaned up in `teardown`). Fixtures are in `test/fixtures/` — a mix of PDFs, Office docs, and edge-case files (encrypted, unicode, spaces/quotes in filenames).
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,6 @@
+source 'https://rubygems.org'
+
+gemspec
+
+gem "open3_safe", github: "Silverfin-Engineering/open3_safe", ref: "5badbe14e94bd01d4420fc13ee9a1f129d78b182"
+gem "minitest", "~> 6.0"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,58 @@
+GIT
+  remote: https://github.com/Silverfin-Engineering/open3_safe.git
+  revision: 5badbe14e94bd01d4420fc13ee9a1f129d78b182
+  ref: 5badbe14e94bd01d4420fc13ee9a1f129d78b182
+  specs:
+    open3_safe (0.1.0)
+      get_process_mem
+
+PATH
+  remote: .
+  specs:
+    docsplit (0.7.6)
+      open3_safe
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    bigdecimal (4.1.1)
+    drb (2.2.3)
+    ffi (1.17.4)
+    ffi (1.17.4-aarch64-linux-gnu)
+    ffi (1.17.4-aarch64-linux-musl)
+    ffi (1.17.4-arm-linux-gnu)
+    ffi (1.17.4-arm-linux-musl)
+    ffi (1.17.4-arm64-darwin)
+    ffi (1.17.4-x86-linux-gnu)
+    ffi (1.17.4-x86-linux-musl)
+    ffi (1.17.4-x86_64-darwin)
+    ffi (1.17.4-x86_64-linux-gnu)
+    ffi (1.17.4-x86_64-linux-musl)
+    get_process_mem (1.0.0)
+      bigdecimal (>= 2.0)
+      ffi (~> 1.0)
+    minitest (6.0.4)
+      drb (~> 2.0)
+      prism (~> 1.5)
+    prism (1.9.0)
+
+PLATFORMS
+  aarch64-linux-gnu
+  aarch64-linux-musl
+  arm-linux-gnu
+  arm-linux-musl
+  arm64-darwin
+  ruby
+  x86-linux-gnu
+  x86-linux-musl
+  x86_64-darwin
+  x86_64-linux-gnu
+  x86_64-linux-musl
+
+DEPENDENCIES
+  docsplit!
+  minitest (~> 6.0)
+  open3_safe!
+
+BUNDLED WITH
+   2.6.6
diff --git a/docsplit.gemspec b/docsplit.gemspec
@@ -22,4 +22,6 @@ Gem::Specification.new do |s|
 
   s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                 'docsplit.gemspec', 'LICENSE', 'README']
+
+  s.add_dependency 'open3_safe'
 end
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
@@ -2,6 +2,10 @@
 require 'fileutils'
 require 'shellwords'
 
+# Loaded ahead of the module body so the load-time OSD probe can use
+# ExternalProcess.run instead of a raw backtick.
+require File.expand_path(File.dirname(__FILE__) + '/docsplit/external_process')
+
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
 
@@ -21,6 +25,13 @@ module Docsplit
   PEMRISSIONS_PATTERN = /(?<=\().+?(?=\))/
   DEFAULT_PERMISSION = {"print"=>true, "copy"=>true, "change"=>true, "addNotes"=>true}
 
+  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # broken.
+  class ExtractionFailed < StandardError; end
+
+  # Raise a TimeoutError when an external tool times out (ExternalProcess.run also raises it when the result is flagged oom_killed).
+  class TimeoutError < StandardError; end
+
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
   DEPENDENCIES.each_key do |dep|
@@ -34,18 +45,11 @@ module Docsplit
 
   # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
   if DEPENDENCIES[:tesseract]
-    # osd will be listed in tesseract --listlangs
-    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    # osd will be listed in tesseract --list-langs
+    val = ExternalProcess.run("tesseract --list-langs") rescue ""
     DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
   end
 
-    # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
-  # broke.
-  class ExtractionFailed < StandardError; end
-
-  # Raise an TimeoutError when running external tool timeouts.
-  class TimeoutError < StandardError; end
-
   # Use the ExtractPages Java class to burst a PDF into single pages.
   def self.extract_pages(pdfs, opts={})
     pdfs = ensure_pdfs(pdfs)
@@ -144,7 +148,6 @@ def self.normalize_value(value)
 
 end
 
-require "#{Docsplit::ROOT}/lib/docsplit/external_process"
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"

diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb
@@ -1,35 +1,46 @@
+require 'shellwords'
+require 'open3_safe'
+
 module Docsplit
   module ExternalProcess
+    extend self
+
+    # Seconds to wait after SIGTERM before escalating to SIGKILL.
+    KILL_AFTER = 5
+
+    # Default RSS byte cap (512 MiB), used when the caller omits max_rss:; an explicit nil is forwarded to Open3Safe as-is.
+    DEFAULT_MAX_RSS = 512 * 1024 * 1024
+
     # Run an external process and raise an exception if it fails.
-    def run(command, env = "", timeout = nil)
-      # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
-      # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
-      # sure we exit at some point.
-      #
-      # - See https://github.com/GetSilverfin/silverfin/issues/1998
-      # - Add timeout so a stuck process doesn't block our Ruby process forever
-      # - Remove blank lines
-      # - Remove duplicate lines
-      run_command = "#{env} #{timeout_prefix(timeout)} #{command} | grep -v \"^$\" | uniq"
-
-      # - Run through bash so we can use PIPESTATUS
-      # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq`
-      result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp
-      exit_code = $?.exitstatus
-
-      raise TimeoutError, run_command if exit_code == 137
-      raise ExtractionFailed, result if exit_code != 0
-
-      result
-    end
+    #
+    # command  - command line as one string; split into argv with Shellwords (any 2>&1 is stripped first)
+    # env      - Hash of extra environment variables (default: {})
+    # timeout  - seconds before the process is sent SIGTERM (nil = no timeout)
+    # max_rss: - RSS byte threshold; defaults to DEFAULT_MAX_RSS
+    def run(command, env = {}, timeout = nil, max_rss: DEFAULT_MAX_RSS)
+      # Strip 2>&1 redirects — stderr is captured separately by Open3Safe.
+      cmd = Shellwords.split(command.gsub(/\s*2>&1\s*/, ' ').strip)
 
-    def timeout_prefix(timeout)
-      timeout ? "#{timeout_bin} #{timeout}" : ""
-    end
+      opts = { signal: :TERM, kill_after: KILL_AFTER, max_rss: max_rss }
+      opts[:timeout] = timeout if timeout
+
+      args = env.empty? ? [*cmd, opts] : [env, *cmd, opts]
+      result = Open3Safe.capture3_safe(*args)
+
+      # Combine stdout+stderr (previously merged via 2>&1 in callers), then drop blank lines
+      # and consecutive duplicates: corrupt PDFs can emit endless identical warnings (see
+      # silverfin/issues/1998). Filtering runs after capture, so it only trims the result.
+      output = (result[:stdout] + result[:stderr])
+                 .lines
+                 .reject { |l| l.chomp.empty? }
+                 .each_with_object([]) { |l, acc| acc << l unless acc.last == l }
+                 .join
+                 .chomp
+
+      raise TimeoutError, command if result[:timeout] || result[:oom_killed]
+      raise ExtractionFailed, output if result[:status].exitstatus != 0
 
-    def timeout_bin
-      # gtimeout on Mac
-      `which timeout` != "" ? "timeout --signal=KILL" : "gtimeout --signal=KILL"
+      output
     end
   end
-end
+end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -8,6 +8,7 @@ class ImageExtractor
     MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     DEFAULT_DENSITY = '150'
+    MAX_RSS = 512 * 1024 * 1024
 
     def initialize(timeout = nil, item_timeout = nil)
       @timeout = timeout
@@ -42,20 +43,16 @@ def convert(pdf, size, format, previous=nil)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exist?(directory)
-      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+      env = { "MAGICK_TMPDIR" => tempdir, "OMP_NUM_THREADS" => "2" }
       common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
 
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
-        # By filtering these we avoid memory bloat when the executing process tries to capture stdout.
-        # See https://github.com/GetSilverfin/silverfin/issues/1998
-
-        run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env, @timeout)
+        run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\"", env, @timeout, max_rss: MAX_RSS)
       else
         page_list(pages).each do |page|
           out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env, @item_timeout)
+          run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file}", env, @item_timeout, max_rss: MAX_RSS)
         end
       end
     ensure

diff --git a/lib/docsplit/info_extractor.rb b/lib/docsplit/info_extractor.rb
@@ -2,6 +2,7 @@ module Docsplit
 
   # Delegates to **pdfinfo** in order to extract information about a PDF file.
   class InfoExtractor
+    include ExternalProcess
 
     # Regex matchers for different bits of information.
     MATCHERS = {
@@ -24,9 +25,7 @@ def extract(key, pdfs, opts)
 
     def extract_all(pdfs, opts)
       pdf = [pdfs].flatten.first
-      cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
-      result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
+      result = run("pdfinfo #{ESCAPE[pdf]}")
       # ruby  1.8 (iconv) and 1.9 (String#encode) :
       if String.method_defined?(:encode)
         result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?

diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
@@ -3,6 +3,7 @@ module Docsplit
   # Delegates to **pdftk** in order to create bursted single pages from
   # a PDF document.
   class PageExtractor
+    include ExternalProcess
 
     # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
     def extract(pdfs, opts)
@@ -11,16 +12,17 @@ def extract(pdfs, opts)
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
         FileUtils.mkdir_p @output unless File.exist?(@output)
-        
+
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
-          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
+          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]}"
         else
-          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
+          "pdftk #{ESCAPE[pdf]} burst output #{page_path}"
+        end
+        begin
+          run(cmd)
+        ensure
+          FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         end
-        result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
-        raise ExtractionFailed, result if $? != 0
-        result
       end
     end