Skip to content

Added in more options for watcher #35

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
9 changes: 9 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ Folder monitoring:

--> Every time a pdf file is added to `watch_directory` it will be OCR'ed

pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf

--> Every time a pdf file is added to `watch_directory` it will be OCR'ed. The original is renamed with the `_orig.pdf` suffix and the
OCR'ed version takes over the original file name.

pypdfocr -w watch_directory --archive --archive_suffix _orig.pdf --initial_scan
--> Every time a pdf file is added to `watch_directory` it will be OCR'ed. The original is renamed with the `_orig.pdf` suffix and the
OCR'ed version takes over the original file name. All PDFs already in the folder will also be scanned and OCR'ed if they have not been processed already.

Automatic filing:
~~~~~~~~~~~~~~~~~

Expand Down
47 changes: 36 additions & 11 deletions pypdfocr/pypdfocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ def get_options(self, argv):
default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')


#--------------
# Watch Options
#--------------
p.add_argument('--archive', action='store_true',
dest='archive', help='Move the source document to an archive')
p.add_argument('--initial_scan', action='store_true',
dest='initial_scan', help='Include PDF documents already in folder if not processed')
p.add_argument('--archive_suffix',
dest='archive_suffix', help='Include PDF documents already in folder if not processed', default='_orig.pdf')


# Add flow option to single mode extract_images,preprocess,ocr,write

args = p.parse_args(argv)
Expand All @@ -173,6 +184,10 @@ def get_options(self, argv):
self.match_using_filename = args.match_using_filename
self.skip_preprocess = args.skip_preprocess

self.archive = args.archive
self.archive_suffix = args.archive_suffix
self.initial_scan = args.initial_scan

if self.debug:
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

Expand Down Expand Up @@ -320,7 +335,11 @@ def run_conversion(self, pdf_filename):
"""
print ("Starting conversion of %s" % pdf_filename)
# Make the images for Tesseract
img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
try:
img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
except Exception, e:
print "Exception occurred in processing %s: %s" % (pdf_filename, e)
return

fns = glob.glob(glob_img_filename)

Expand All @@ -337,7 +356,8 @@ def run_conversion(self, pdf_filename):

# Generate new pdf with overlayed text
#ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename)
ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)
ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename,
archive=self.archive, archive_suffix=self.archive_suffix)

# Clean up the files
if not self.debug:
Expand Down Expand Up @@ -426,13 +446,15 @@ def go(self, argv):
if self.watch:
while True: # Make sure the watcher doesn't terminate
try:
py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'))
py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'),
archive=self.archive, initial_scan=self.initial_scan,
archive_suffix=self.archive_suffix)
for pdf_filename in py_watcher.start():
self._convert_and_file_email(pdf_filename)
except KeyboardInterrupt:
break
except Exception as e:
print traceback.print_exc(e)
traceback.print_exc(e)
py_watcher.stop()

else:
Expand All @@ -442,14 +464,17 @@ def _convert_and_file_email(self, pdf_filename):
"""
Helper function to run the conversion, then do the optional filing, and optional emailing.
"""
ocr_pdffilename = self.run_conversion(pdf_filename)
if self.enable_filing:
filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
else:
filing = "None"
try:
ocr_pdffilename = self.run_conversion(pdf_filename)
if self.enable_filing:
filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
else:
filing = "None"

if self.enable_email:
self._send_email(pdf_filename, ocr_pdffilename, filing)
if self.enable_email:
self._send_email(pdf_filename, ocr_pdffilename, filing)
except Exception, e:
print traceback.print_exc(e)

def main(): # pragma: no cover
script = PyPDFOCR()
Expand Down
10 changes: 5 additions & 5 deletions pypdfocr/pypdfocr_gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,12 @@ def _run_gs(self, options, output_filename, pdf_filename):
out = subprocess.check_output(cmd, shell=True)

except subprocess.CalledProcessError as e:
print e.output
print "Exception running Ghostscript:\n\n", e.output

if "undefined in .getdeviceparams" in e.output:
error(self.msgs['GS_OUTDATED'])
raise(self.msgs['GS_OUTDATED'])
else:
error (self.msgs['GS_FAILED'])

raise(self.msgs['GS_FAILED'])

def make_img_from_pdf(self, pdf_filename):
self._get_dpi(pdf_filename) # No need to bother anymore
Expand All @@ -189,7 +189,6 @@ def make_img_from_pdf(self, pdf_filename):

filename, filext = os.path.splitext(pdf_filename)


# Create ancillary jpeg files to use later to calculate image dpi etc
# We no longer use these for the final image. Instead the text is merged
# directly with the original PDF. Yay!
Expand All @@ -213,6 +212,7 @@ def make_img_from_pdf(self, pdf_filename):
options = ' '.join(self.gs_options[self.img_format][1]) % {'dpi':self.output_dpi}
output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
self._run_gs(options, output_filename, pdf_filename)

for fn in glob.glob(globable_filename):
logging.info("Created image %s" % fn)
return (self.output_dpi, globable_filename)
Expand Down
22 changes: 21 additions & 1 deletion pypdfocr/pypdfocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
ctm[1][0], ctm[1][1],
ctm[2][0], ctm[2][1]])

def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename, archive=False, archive_suffix="_orig.pdf"):

logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
# Sort the hocr_filenames into natural keys!
Expand All @@ -87,6 +87,7 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
basename = os.path.splitext(pdf_basename)[0]
pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))


text_pdf_filenames = []
for img_filename, hocr_filename in hocr_filenames:
text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
Expand All @@ -96,6 +97,16 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):

writer = PdfFileWriter()
orig = open(orig_pdf_filename, 'rb')
orig_reader = PdfFileReader(orig)

# Save the properties
pdf_info = orig_reader.getDocumentInfo()
if pdf_info is not None:
writer.addMetadata(pdf_info)

writer.addMetadata({ '/PyPDFOCR': 'True' })

# Loop through the pages
for orig_pg, text_pg_filename in zip(self.iter_pdf_page(orig), text_pdf_filenames):
text_file = open(text_pg_filename, 'rb')
text_pg = self.iter_pdf_page(text_file).next()
Expand Down Expand Up @@ -123,6 +134,15 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
for fn in text_pdf_filenames:
os.remove(fn)

print "Done on conversion: ", orig_pdf_filename
if archive:
original_filename = os.path.join(pdf_dir, "%s%s" % (basename, archive_suffix))
ocr_filename = orig_pdf_filename
print "Archiving PDF %s -> %s, %s -> %s" % (orig_pdf_filename, original_filename, pdf_filename, ocr_filename)
os.rename(orig_pdf_filename, original_filename)
os.rename(pdf_filename, ocr_filename)


logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
return pdf_filename

Expand Down
61 changes: 57 additions & 4 deletions pypdfocr/pypdfocr_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,42 @@
import logging
import glob
import functools
import signal

from multiprocessing import Pool

TIMEOUT = 500

# Ugly hack to pass in object method to the multiprocessing library
# From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
# Basically gets passed in a pair of (self, arg), and calls the method
def unwrap_self(arg, **kwarg):
    """Module-level trampoline so a method can be handed to ``Pool.map``.

    ``arg`` is the ``(instance, in_filename)`` pair built by the caller; it is
    unpacked so that the instance becomes ``self`` for
    ``PyPreprocess._run_preprocess``.
    """
    positional = tuple(arg)
    return PyPreprocess._run_preprocess(*positional, **kwarg)

class TimeoutError(Exception):
    """Raised by the alarm signal handler when a command runs past ``TIMEOUT``."""


def handler(signum, frame):
    """Signal handler that turns a delivered signal into a ``TimeoutError``.

    Both arguments are supplied by the ``signal`` machinery and are ignored.
    """
    timeout = TimeoutError()
    raise timeout

def which(program):
    """Locate an executable, in the manner of the shell's ``which``.

    If ``program`` contains a directory component it is checked exactly as
    given.  Otherwise each directory listed in the ``PATH`` environment
    variable is searched in order.

    Returns the path of the first candidate that is a regular file with
    execute permission, or ``None`` when nothing matches.
    """
    import os

    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    directory, _ = os.path.split(program)
    if directory:
        # An explicit path is never looked up on PATH.
        return program if is_exe(program) else None

    # PATH can be absent (e.g. under a stripped-down service environment);
    # fall back to the platform default instead of raising KeyError.
    search_path = os.environ.get("PATH", os.defpath)
    for entry in search_path.split(os.pathsep):
        # Some environments keep literal quotes around PATH entries.
        candidate = os.path.join(entry.strip('"'), program)
        if is_exe(candidate):
            return candidate

    return None

class PyPreprocess(object):
"""Class to wrap all the ImageMagick convert calls"""
Expand All @@ -51,12 +78,31 @@ def cmd(self, cmd_list):
cmd_list = ' '.join(cmd_list)
logging.debug("Running cmd: %s" % cmd_list)
try:
out = subprocess.check_output(cmd_list, stderr=subprocess.STDOUT, shell=True)
signal.signal(signal.SIGALRM, handler)
signal.alarm(TIMEOUT)
proc = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
pid = proc.pid
(out, error) = proc.communicate()
signal.alarm(0)
logging.debug(out)
return out
except subprocess.CalledProcessError as e:
print e.output
self._warn("Could not run command %s" % cmd_list)
except TimeoutError, te:
print "Timeout exceeded PID", pid, cmd_list
os.killpg(pid, signal.SIGTERM)
# os.kill(pid, signal.SIGTERM)
finally:
signal.alarm(0)

if proc:
proc.terminate()
proc.kill()
print "Killing processes"

return None



def _run_preprocess(self, in_filename):
Expand All @@ -69,7 +115,8 @@ def _run_preprocess(self, in_filename):
else:
backslash = '\\'

c = ['convert',
convert = which('convert');
c = [convert,
'"%s"' % in_filename,
'-respect-parenthesis',
#'\\( $setcspace -colorspace gray -type grayscale \\)',
Expand All @@ -86,17 +133,23 @@ def _run_preprocess(self, in_filename):
]
logging.info("Preprocessing image %s for better OCR" % in_filename)
res = self.cmd(c)

if res is None:
return in_filename
else:
return out_filename
# Make sure the convert process did not die on us
if os.path.isfile(out_filename):
print "Filename does not exist: ", out_filename, " using ", in_filename
return out_filename

return in_filename

def preprocess(self, in_filenames):
fns = in_filenames

pool = Pool(processes=self.threads)
logging.info("Starting preprocessing parallel execution")
preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
preprocessed_filenames = pool.map(unwrap_self, zip([self]*len(fns),fns))
pool.close()
pool.join()
logging.info ("Completed preprocessing")
Expand Down
11 changes: 7 additions & 4 deletions pypdfocr/pypdfocr_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import os, sys
import logging
import subprocess
import signal
import glob
from subprocess import CalledProcessError
from multiprocessing import Pool
Expand All @@ -36,6 +37,9 @@ def error(text):
def unwrap_self(arg, **kwarg):
    """Module-level trampoline so a method can be handed to ``Pool.map``.

    ``arg`` is the ``(instance, img_filename)`` pair built by the caller; it
    is unpacked so that the instance becomes ``self`` for
    ``PyTesseract.make_hocr_from_pnm``.
    """
    positional = tuple(arg)
    return PyTesseract.make_hocr_from_pnm(*positional, **kwarg)

def init_worker():
    """Pool initializer: make the worker process ignore SIGINT."""
    ignored_signal = signal.SIGINT
    signal.signal(ignored_signal, signal.SIG_IGN)

class PyTesseract(object):
"""Class to wrap all the tesseract calls"""
def __init__(self, config):
Expand All @@ -44,7 +48,7 @@ def __init__(self, config):
"""
self.lang = 'eng'
self.required = "3.02.02"
self.threads = config.get('threads',4)
self.threads = config.get('threads', 4)

if "binary" in config: # Override location of binary
binary = config['binary']
Expand Down Expand Up @@ -129,12 +133,11 @@ def make_hocr_from_pnms(self, fns):

# Glob it
#fns = glob.glob(img_filename)
pool = Pool(processes=self.threads)
print("Making pool")
pool = Pool(processes=self.threads, initializer=init_worker)
hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns))
pool.close()
pool.join()
return zip(fns,hocr_filenames)
return zip(fns, hocr_filenames)


def make_hocr_from_pnm(self, img_filename):
Expand Down
Loading