From 59c73e621d00bd6dee7ef3b8d58a32aeab93e0e3 Mon Sep 17 00:00:00 2001 From: Vladimir Andric Date: Mon, 20 Apr 2026 09:17:14 +0200 Subject: [PATCH] Replace shell-based process runner with open3_safe Replaces the bash pipe + system timeout binary approach in ExternalProcess#run with Open3Safe.capture3_safe, which provides proper stdout/stderr capture, timeout handling, and RSS-based memory limiting without spawning a shell. - Add open3_safe gem dependency - Add max_rss: keyword arg to ExternalProcess#run; extractors pass their own hardcoded caps (not exposed in the public API) - Convert env strings to Hashes for Open3 compatibility - Remove 2>&1 redirects (stderr now captured separately) - Duplicate/blank-line filtering preserved in the new implementation Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 1 + CLAUDE.md | 50 ++++++++++++++++++++++++ Gemfile | 6 +++ Gemfile.lock | 58 +++++++++++++++++++++++++++ docsplit.gemspec | 2 + lib/docsplit.rb | 23 ++++++----- lib/docsplit/external_process.rb | 67 +++++++++++++++++++------------- lib/docsplit/image_extractor.rb | 11 ++---- lib/docsplit/info_extractor.rb | 5 +-- lib/docsplit/page_extractor.rb | 16 ++++---- lib/docsplit/pdf_extractor.rb | 21 +++++----- lib/docsplit/text_extractor.rb | 27 +++++++------ 12 files changed, 210 insertions(+), 77 deletions(-) create mode 100644 CLAUDE.md create mode 100644 Gemfile create mode 100644 Gemfile.lock diff --git a/.gitignore b/.gitignore index f1d7f5e..b3f02c4 100755 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.gem .DS_Store +.idea diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..92fc3e2 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,50 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
+ +## Commands + +```bash +# Install dependencies (open3_safe is fetched from GitHub at a pinned commit ref — see Gemfile) +bundle install + +# Run all tests +bundle exec rake test + +# Run a single test file +bundle exec ruby test/unit/test_extract_text.rb + +# Build and install the gem locally +bundle exec rake gem:install +``` + +## External Dependencies + +The following system tools must be installed for full functionality: +- `gm` (GraphicsMagick) — image extraction and OCR pre-processing +- `pdftotext`, `pdfinfo`, `pdftk` — text extraction and PDF metadata +- `tesseract` — OCR (with optional `osd` language pack for orientation detection) +- `java` + JODConverter (vendored in `vendor/`) — non-PDF document conversion +- `open3_safe` gem — pinned to a specific GitHub commit ref in `Gemfile`; `Gemfile.lock` must be regenerated after changing the ref + +## Architecture + +`lib/docsplit.rb` is the public API entry point. It defines the `Docsplit` module, checks `PATH` for dependencies at load time, and delegates to extractor classes. + +**Extractor classes** (`lib/docsplit/`): +- `TextExtractor` — extracts text via `pdftotext`, falls back to Tesseract OCR for pages below `MIN_TEXT_PER_PAGE` (100 bytes) +- `ImageExtractor` — rasterizes PDF pages via GraphicsMagick (`gm convert`/`gm mogrify`) +- `PdfExtractor` — converts non-PDF documents to PDF using LibreOffice or JODConverter (Java) +- `InfoExtractor` — parses `pdfinfo` output for metadata +- `PageExtractor` — bursts PDFs into single-page PDFs via `pdftk`/`pdftailor` + +**`ExternalProcess` module** (`external_process.rb`) is mixed into extractor classes. 
Its `run` method wraps `Open3Safe.capture3_safe` to execute subprocesses with: +- timeout (SIGTERM → SIGKILL after 5s) +- RSS memory limit via `max_rss:` (defaults to `DEFAULT_MAX_RSS`, 512 MiB); when Open3Safe reports `oom_killed`, `run` raises `TimeoutError`, the same as for a timeout +- stdout+stderr merged, blank lines and consecutive duplicate lines filtered (guards against memory bloat from corrupt PDFs — silverfin/issues/1998) + +**Timeout-aware public API**: `extract_text_with_timeouts` and `extract_images_with_timeouts` accept `timeout` (overall) and `item_timeout` (per page/file); `extract_pdf_with_timeout` accepts only `timeout`. RSS caps are not part of the public API — `TextExtractor`, `ImageExtractor` and `PdfExtractor` hardcode their own constants (`TextExtractor::MAX_RSS`/`ImageExtractor::MAX_RSS`: 512 MiB, `TextExtractor::TESSERACT_MAX_RSS`: 1 GiB, `PdfExtractor::MAX_RSS`: 2 GiB) and pass them into `run(..., max_rss:)` internally; `InfoExtractor` and `PageExtractor` call `run` without `max_rss:` and so use `ExternalProcess::DEFAULT_MAX_RSS` (512 MiB). The plain `extract_*` variants have no timeouts. + +## Test Structure + +Tests live in `test/unit/`, use Minitest, and write output to `test/output/` (cleaned up in `teardown`). Fixtures are in `test/fixtures/` — a mix of PDFs, Office docs, and edge-case files (encrypted, unicode, spaces/quotes in filenames). \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..e854bfc --- /dev/null +++ b/Gemfile @@ -0,0 +1,6 @@ +source 'https://rubygems.org' + +gemspec + +gem "open3_safe", github: "Silverfin-Engineering/open3_safe", ref: "5badbe14e94bd01d4420fc13ee9a1f129d78b182" +gem "minitest", "~> 6.0" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..6262972 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,58 @@ +GIT + remote: https://github.com/Silverfin-Engineering/open3_safe.git + revision: 5badbe14e94bd01d4420fc13ee9a1f129d78b182 + ref: 5badbe14e94bd01d4420fc13ee9a1f129d78b182 + specs: + open3_safe (0.1.0) + get_process_mem + +PATH + remote: . 
+ specs: + docsplit (0.7.6) + open3_safe + +GEM + remote: https://rubygems.org/ + specs: + bigdecimal (4.1.1) + drb (2.2.3) + ffi (1.17.4) + ffi (1.17.4-aarch64-linux-gnu) + ffi (1.17.4-aarch64-linux-musl) + ffi (1.17.4-arm-linux-gnu) + ffi (1.17.4-arm-linux-musl) + ffi (1.17.4-arm64-darwin) + ffi (1.17.4-x86-linux-gnu) + ffi (1.17.4-x86-linux-musl) + ffi (1.17.4-x86_64-darwin) + ffi (1.17.4-x86_64-linux-gnu) + ffi (1.17.4-x86_64-linux-musl) + get_process_mem (1.0.0) + bigdecimal (>= 2.0) + ffi (~> 1.0) + minitest (6.0.4) + drb (~> 2.0) + prism (~> 1.5) + prism (1.9.0) + +PLATFORMS + aarch64-linux-gnu + aarch64-linux-musl + arm-linux-gnu + arm-linux-musl + arm64-darwin + ruby + x86-linux-gnu + x86-linux-musl + x86_64-darwin + x86_64-linux-gnu + x86_64-linux-musl + +DEPENDENCIES + docsplit! + minitest (~> 6.0) + open3_safe! + +BUNDLED WITH + 2.6.6 diff --git a/docsplit.gemspec b/docsplit.gemspec index 0a147e9..0f45c57 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -22,4 +22,6 @@ Gem::Specification.new do |s| s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*', 'docsplit.gemspec', 'LICENSE', 'README'] + + s.add_dependency 'open3_safe' end \ No newline at end of file diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 5267c58..ff948ac 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -2,6 +2,10 @@ require 'fileutils' require 'shellwords' +# Loaded ahead of the module body so the load-time OSD probe can use +# ExternalProcess.run instead of a raw backtick. +require File.expand_path(File.dirname(__FILE__) + '/docsplit/external_process') + # The Docsplit module delegates to the Java PDF extractors. module Docsplit @@ -21,6 +25,13 @@ module Docsplit PEMRISSIONS_PATTERN = /(?<=\().+?(?=\))/ DEFAULT_PERMISSION = {"print"=>true, "copy"=>true, "change"=>true, "addNotes"=>true} + # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise + # broke. 
+ class ExtractionFailed < StandardError; end + + # Raise a TimeoutError when an external tool times out. + class TimeoutError < StandardError; end + # Check for all dependencies, and note their absence. dirs = ENV['PATH'].split(File::PATH_SEPARATOR) DEPENDENCIES.each_key do |dep| @@ -34,18 +45,11 @@ module Docsplit # if tesseract is found check for the osd plugin so that we can do orientation independent OCR. if DEPENDENCIES[:tesseract] - # osd will be listed in tesseract --listlangs - val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ] + # osd will be listed in tesseract --list-langs + val = ExternalProcess.run("tesseract --list-langs") rescue "" DEPENDENCIES[:osd] = true if val =~ /\bosd\b/ end - # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise - # broke. - class ExtractionFailed < StandardError; end - - # Raise an TimeoutError when running external tool timeouts. - class TimeoutError < StandardError; end - # Use the ExtractPages Java class to burst a PDF into single pages. def self.extract_pages(pdfs, opts={}) pdfs = ensure_pdfs(pdfs) @@ -144,7 +148,6 @@ def self.normalize_value(value) end -require "#{Docsplit::ROOT}/lib/docsplit/external_process" require "#{Docsplit::ROOT}/lib/docsplit/image_extractor" require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs" require "#{Docsplit::ROOT}/lib/docsplit/text_extractor" diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb index 2d77a2a..8621fe0 100644 --- a/lib/docsplit/external_process.rb +++ b/lib/docsplit/external_process.rb @@ -1,35 +1,46 @@ +require 'shellwords' +require 'open3_safe' + module Docsplit module ExternalProcess + extend self + + # Seconds to wait after SIGTERM before escalating to SIGKILL. + KILL_AFTER = 5 + + # Default RSS byte cap (512 MiB) used when the caller omits max_rss; an explicit nil is forwarded to Open3Safe as-is. + DEFAULT_MAX_RSS = 512 * 1024 * 1024 + # Run an external process and raise an exception if it fails. 
- def run(command, env = "", timeout = nil) - # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). - # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes - # sure we exit at some point. - # - # - See https://github.com/GetSilverfin/silverfin/issues/1998 - # - Add timeout so a stuck process doesn't block our Ruby process forever - # - Remove blank lines - # - Remove duplicate lines - run_command = "#{env} #{timeout_prefix(timeout)} #{command} | grep -v \"^$\" | uniq" - - # - Run through bash so we can use PIPESTATUS - # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq` - result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp - exit_code = $?.exitstatus - - raise TimeoutError, run_command if exit_code == 137 - raise ExtractionFailed, result if exit_code != 0 - - result - end + # + # command - command-line string, tokenized with Shellwords and run without a shell (2>&1 is stripped; no shell expands pipes, globs or other redirects) + # env - Hash of extra environment variables (default: {}) + # timeout - seconds before the process is sent SIGTERM (nil = no timeout) + # max_rss: - RSS byte threshold; defaults to DEFAULT_MAX_RSS + def run(command, env = {}, timeout = nil, max_rss: DEFAULT_MAX_RSS) + # Strip 2>&1 redirects — stderr is captured separately by Open3Safe. + cmd = Shellwords.split(command.gsub(/\s*2>&1\s*/, ' ').strip) - def timeout_prefix(timeout) - timeout ? "#{timeout_bin} #{timeout}" : "" - end + opts = { signal: :TERM, kill_after: KILL_AFTER, max_rss: max_rss } + opts[:timeout] = timeout if timeout + + args = env.empty? ? [*cmd, opts] : [env, *cmd, opts] + result = Open3Safe.capture3_safe(*args) + + # Combine stdout+stderr (previously unified via 2>&1 in callers), then filter blank + # lines and consecutive duplicates. Corrupt PDFs can emit an endless stream of identical + # warnings (silverfin/issues/1998); filtering runs after capture, so it only compacts the returned output. 
+ output = (result[:stdout] + result[:stderr]) + .lines + .reject { |l| l.chomp.empty? } + .each_with_object([]) { |l, acc| acc << l unless acc.last == l } + .join + .chomp + + raise TimeoutError, command if result[:timeout] || result[:oom_killed] + raise ExtractionFailed, output if result[:status].exitstatus != 0 - def timeout_bin - # gtimeout on Mac - `which timeout` != "" ? "timeout --signal=KILL" : "gtimeout --signal=KILL" + output end end -end +end \ No newline at end of file diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 2988a33..62ed1d7 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -8,6 +8,7 @@ class ImageExtractor MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" DEFAULT_FORMAT = :png DEFAULT_DENSITY = '150' + MAX_RSS = 512 * 1024 * 1024 def initialize(timeout = nil, item_timeout = nil) @timeout = timeout @@ -42,20 +43,16 @@ def convert(pdf, size, format, previous=nil) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exist?(directory) - env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2" + env = { "MAGICK_TMPDIR" => tempdir, "OMP_NUM_THREADS" => "2" } common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) - # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). - # By filtering these we avoid memory bloat when the executing process tries to capture stdout. 
- # See https://github.com/GetSilverfin/silverfin/issues/1998 - - run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env, @timeout) + run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\"", env, @timeout, max_rss: MAX_RSS) else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env, @item_timeout) + run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file}", env, @item_timeout, max_rss: MAX_RSS) end end ensure diff --git a/lib/docsplit/info_extractor.rb b/lib/docsplit/info_extractor.rb index 1af5502..97a6f2d 100644 --- a/lib/docsplit/info_extractor.rb +++ b/lib/docsplit/info_extractor.rb @@ -2,6 +2,7 @@ module Docsplit # Delegates to **pdfinfo** in order to extract information about a PDF file. class InfoExtractor + include ExternalProcess # Regex matchers for different bits of information. MATCHERS = { @@ -24,9 +25,7 @@ def extract(key, pdfs, opts) def extract_all(pdfs, opts) pdf = [pdfs].flatten.first - cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1" - result = `#{cmd}`.chomp - raise ExtractionFailed, result if $? != 0 + result = run("pdfinfo #{ESCAPE[pdf]}") # ruby 1.8 (iconv) and 1.9 (String#encode) : if String.method_defined?(:encode) result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding? diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb index 0216ec5..e746b27 100644 --- a/lib/docsplit/page_extractor.rb +++ b/lib/docsplit/page_extractor.rb @@ -3,6 +3,7 @@ module Docsplit # Delegates to **pdftk** in order to create bursted single pages from # a PDF document. class PageExtractor + include ExternalProcess # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`. 
def extract(pdfs, opts) @@ -11,16 +12,17 @@ def extract(pdfs, opts) pdf_name = File.basename(pdf, File.extname(pdf)) page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf" FileUtils.mkdir_p @output unless File.exist?(@output) - + cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability - "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1" + "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]}" else - "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1" + "pdftk #{ESCAPE[pdf]} burst output #{page_path}" + end + begin + run(cmd) + ensure + FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt') end - result = `#{cmd}`.chomp - FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt') - raise ExtractionFailed, result if $? != 0 - result end end diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb index a261466..08fce42 100644 --- a/lib/docsplit/pdf_extractor.rb +++ b/lib/docsplit/pdf_extractor.rb @@ -7,6 +7,8 @@ class PdfExtractor @@executable = nil @@version_string = nil + MAX_RSS = 2 * 1024 * 1024 * 1024 + def initialize(timeout = nil) @timeout = timeout end @@ -27,10 +29,9 @@ def linux? # of the office software to be used for extraction. def version_string unless @@version_string - null = windows? ? 
"NUL" : "/dev/null" - @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first + @@version_string = (run("#{office_executable} -h") rescue '').split("\n").first if !!@@version_string.to_s.match(/[0-9]*/) - @@version_string = `#{office_executable} --version`.split("\n").first + @@version_string = (run("#{office_executable} --version") rescue '').split("\n").first end end @@version_string @@ -129,8 +130,8 @@ def extract(docs, opts) basename = File.basename(doc, ext) escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE) - if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0]) - `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf` + if GM_FORMATS.include?(run("file -b --mime #{ESCAPE[doc]}").strip.split(/[:;]\s+/)[0]) + run("gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", {}, @timeout, max_rss: MAX_RSS) else if libre_office? # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other. @@ -138,9 +139,8 @@ def extract(docs, opts) ENV['SYSUSERCONFIG']="file://#{tmp_sys_dir}" options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}" - cmd = "#{office_executable} #{options} 2>&1" - result = run(cmd, "", @timeout) - raise ExtractionFailed, result if $? != 0 + cmd = "#{office_executable} #{options}" + run(cmd, {}, @timeout, max_rss: MAX_RSS) end true else # open office presumably, rely on JODConverter to figure it out. @@ -165,9 +165,8 @@ def run_jod(command, pdfs, opts, return_output=false) pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ') office = osx? ? "-Doffice.home=#{office_path}" : office_path - cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1" - result = run(cmd, "", @timeout) - raise ExtractionFailed, result if $? 
!= 0 + cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs}" + result = run(cmd, {}, @timeout, max_rss: MAX_RSS) return return_output ? (result.empty? ? nil : result) : true end diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 09b5962..e2176a1 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -17,12 +17,14 @@ module Docsplit class TextExtractor include ExternalProcess - NO_TEXT_DETECTED = /---------\n\Z/ + NO_TEXT_DETECTED = /---------\z/ OCR_FLAGS = '-density 400x400 -colorspace GRAY' MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB' MIN_TEXT_PER_PAGE = 100 # in bytes + MAX_RSS = 512 * 1024 * 1024 + TESSERACT_MAX_RSS = 1024 * 1024 * 1024 def initialize(timeout = nil, item_timeout = nil) @pages_to_ocr = [] @@ -52,8 +54,11 @@ def extract(pdfs, opts) # Does a PDF have any text embedded? def contains_text?(pdf) - fonts = `pdffonts #{ESCAPE[pdf]} 2>&1` - !fonts.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').match(NO_TEXT_DETECTED) + fonts = run("pdffonts #{ESCAPE[pdf]}", max_rss: MAX_RSS) + .encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') + !fonts.match(NO_TEXT_DETECTED) + rescue ExtractionFailed + false end # Extract a page range worth of text from a PDF, directly. @@ -67,25 +72,25 @@ def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) escaped_pdf = ESCAPE[pdf] - psm = @detect_orientation ? "-psm 1" : "" - env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2" + psm = @detect_orientation ? 
"--psm 1" : "" + env = { "MAGICK_TMPDIR" => tempdir, "OMP_NUM_THREADS" => "2" } if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" - run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env, @item_timeout) - run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1", "", @item_timeout) + run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff}", env, @item_timeout, max_rss: MAX_RSS) + run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm}", {}, @item_timeout, max_rss: TESSERACT_MAX_RSS) clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] - run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env, @timeout) + run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff}", env, @timeout, max_rss: MAX_RSS) #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1", "", @timeout) + run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm}", {}, @timeout, max_rss: TESSERACT_MAX_RSS) clean_text(base_path + '.txt') if @clean_ocr end ensure @@ -107,14 +112,14 @@ def clean_text(file) # Extract the full contents of a pdf as a single file, directly. def extract_full(pdf) text_path = File.join(@output, "#{@pdf_name}.txt") - run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1", "", @timeout + run("pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]}", {}, @timeout, max_rss: MAX_RSS) end # Extract the contents of a single page of text, directly, adding it to # the `@pages_to_ocr` list if the text length is inadequate. 
def extract_page(pdf, page) text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") - run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1", "", @item_timeout + run("pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]}", {}, @item_timeout, max_rss: MAX_RSS) unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE end