Skip to content

Commit

Permalink
Merge pull request #57 from kba/pdf-xml-parsing
Browse files (browse the repository at this point in the history)
  • Loading branch information
zuphilip authored Sep 17, 2016
2 parents edfb38e + fb994c3 commit b482964
Showing 1 changed file with 6 additions and 19 deletions.
25 changes: 6 additions & 19 deletions hocr-pdf
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,7 +29,7 @@ from PIL import Image
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from xml.etree.ElementTree import ElementTree, ParseError
from lxml import etree, html

class StdoutWrapper:
"""
Expand Down Expand Up @@ -70,31 +70,18 @@ def add_text_layer(pdf, image, height, dpi):
p1 = re.compile('bbox((\s+\d+){4})')
p2 = re.compile('baseline((\s+[\d\.\-]+){2})')
hocrfile = os.path.splitext(image)[0] + ".hocr"
hocr = ElementTree()
hocr.parse(hocrfile)
for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"):
if line.attrib['class'] != 'ocr_line':
continue
hocr = etree.parse(hocrfile, html.XHTMLParser())
for line in hocr.xpath('//*[@class="ocr_line"]'):
linebox = p1.search(line.attrib['title']).group(1).split()
try:
baseline = p2.search(line.attrib['title']).group(1).split()
except AttributeError:
baseline = [ 0, 0 ]
linebox = [float(i) for i in linebox]
baseline = [float(i) for i in baseline]
for word in line:
if word.attrib['class'] != 'ocrx_word':
continue
if word.text is not None:
rawtext = word.text.strip()
else:
try:
innerword = word[0]
if innerword.text is not None:
rawtext = innerword.text.strip()
else:
continue
except:
for word in line.xpath('.//*[@class="ocrx_word"]'):
rawtext = word.text_content().strip()
if rawtext == '':
continue
font_width = pdf.stringWidth(rawtext, 'invisible', 8)
if font_width <= 0:
Expand Down

0 comments on commit b482964

Please sign in to comment.