Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD script to create a simplified version of hocr-files #152

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
126 changes: 126 additions & 0 deletions hocr-simplify
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python

# Create a simplified hocr-version by:
# change level of typesetting
# remove properties
# remove attributes
# remove empty contents
# remove character alternatives (choices)


from __future__ import print_function
import argparse
import re
import os
from io import open
import sys

from lxml import etree, html

# Command-line parser. The description fragments are adjacent string
# literals, so each one needs its own trailing space to keep the words
# apart in the generated help text.
parser = argparse.ArgumentParser(
    description=('change level of typesetting and/or '
                 'remove properties to create '
                 'a simplified hocr-version'))

# Names of hOCR properties that can occur in a title attribute. In the code
# visible here the list is only used to build the --remove-properties help.
properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image',
              'imagemd5', 'lpageno', 'ppageno', 'nlp', 'order', 'poly',
              'scan_res', 'textangle', 'x_bboxes', 'x_font', 'x_fsize',
              'x_confs', 'x_scanner', 'x_source', 'x_wconf']

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have also an option to delete id and/or dir parameter, but they are on their own.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing attributes is now implemented

# Typesetting classes, listed from the word level up to the page level.
typesettings = ['ocrx_word', 'ocr_line', 'ocr_par', 'ocr_carea', 'ocr_page']

# Comma-separated listings embedded in the help texts below.
typesetting_listing = ','.join(typesettings)
property_listing = ','.join(properties)

# Positional input; falls back to the sys.stdin object when omitted.
parser.add_argument('file', nargs='?', default=sys.stdin)

# Options selecting what gets simplified.
parser.add_argument(
    '-t', '--typesetting',
    type=str,
    choices=typesettings,
    help=('Sets a new minimum typesetting level.\n'
          'List of typesetting: {}').format(typesetting_listing))
parser.add_argument(
    '-a', '--remove-attributes',
    nargs='+',
    help='Removes attributes, e.g. id')
parser.add_argument(
    '-c', '--remove-choices',
    action='store_true',
    help='Removes character alternatives (tesseract outputs only)')
parser.add_argument(
    '-e', '--remove-empty-contents',
    action='store_true',
    help='Removes contents which are empty or contains whitespaces only')
parser.add_argument(
    '-p', '--remove-properties',
    nargs='+',
    help='List of properties: {}'.format(property_listing))

# Optional positional output path, followed by the verbosity switch.
parser.add_argument(
    'fileout',
    nargs='?',
    help="Output path, default: print to terminal")
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='Verbose, default: %(default)s')

args = parser.parse_args()

# Parse the input document.
# ``args.file`` is either a path from the command line or, when the
# positional argument is omitted, the ``sys.stdin`` object itself. open()
# cannot take a file object, so stdin is reopened through its file
# descriptor to get the same UTF-8 text decoding as a regular path.
reading_stdin = args.file is sys.stdin
source = sys.stdin.fileno() if reading_stdin else args.file
# closefd=False leaves the process-wide stdin descriptor open afterwards.
with open(source, "r", encoding="utf-8", closefd=not reading_stdin) as f:
    doc = html.parse(f)

# Drop every element whose id attribute contains "lstm_choices".
if args.remove_choices:
    choice_nodes = doc.xpath('.//*[contains(@id,"lstm_choices")]')
    for choice in choice_nodes:
        parent = choice.getparent()
        parent.remove(choice)

# change level of typesetting
if args.typesetting:
    # Update the meta content: cut the advertised capabilities off right
    # after the selected level.
    # The leading "." keeps the path relative to the root element; lxml
    # issues a FutureWarning for ElementTree.find() paths starting with "/".
    node = doc.find(".//*[@name='ocr-capabilities']")
    if node is not None:
        content = node.get("content")
        if content is not None and args.typesetting in content:
            node.set("content",
                     content.split(args.typesetting)[0] + args.typesetting)
            if args.verbose:
                print(node.get("content"))

    # Apply the new level: walk the levels from the start of
    # ``typesettings`` up to and including the selected one, replacing the
    # children of each matching element with its flattened text.
    for typesetting in typesettings:
        # The separator depends only on the level, so pick it once per level.
        if "word" in typesetting:
            separator = ""
        elif "line" in typesetting:
            separator = " "
        else:
            separator = "\n"
        is_target = typesetting == args.typesetting

        for node in doc.xpath("//*[@class='{}']".format(typesetting)):
            text_content = node.text_content()
            if args.verbose and is_target:
                print(re.sub(r'\s+', '\x20', text_content).strip())
            # NOTE: whitespace-only lines are always dropped here. The
            # previous filter expression also mentioned
            # args.remove_empty_contents, but it always evaluated to this
            # same test, so that option does not influence this step.
            pieces = [text.strip() for text in text_content.splitlines()
                      if text.strip()]
            node.text = separator.join(pieces)
            # Copy the child list first: removing while iterating the live
            # element would skip children.
            for child in list(node):
                node.remove(child)

        if is_target:
            break

# remove properties
if args.remove_properties:
    unwanted = set(args.remove_properties)
    for node in doc.xpath("//*[@title]"):
        title = node.get("title")
        kept = []
        for prop in title.split(";"):
            # The first whitespace-separated token is the property name.
            fields = prop.split(None, 1)
            # An empty segment (e.g. after a trailing ";") has no name;
            # skip it instead of failing on fields[0].
            if fields and fields[0] not in unwanted:
                # Replace double quotation marks with single
                kept.append(prop.replace("\"", "'"))
        node.set('title', ';'.join(kept))
        if args.verbose:
            print("Replaced :{}".format(title))
else:
    # Replace double quotation marks with single
    for node in doc.xpath("//*[@title]"):
        node.set("title", node.get("title").replace("\"", "'"))

# Strip each requested attribute from every element that carries it.
if args.remove_attributes:
    for attr in args.remove_attributes:
        carriers = doc.xpath("//*[@{}]".format(attr))
        for element in carriers:
            element.attrib.pop("{}".format(attr))

# if no output path is given, print to terminal
if args.fileout is None:
    # Python 2 prints the UTF-8 encoded result; on Python 3 the str type is
    # passed as the encoding so that tostring() yields text for print().
    encoding = str if sys.version_info[0] > 2 else "utf-8"
    print(etree.tostring(doc, pretty_print=True, encoding=encoding))
else:
    # create output path if needed
    # dirname() is "" for a bare file name, and os.makedirs("") raises, so
    # only create a directory when the path actually names one.
    out_dir = os.path.dirname(args.fileout)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # write new hocr file
    with open(args.fileout, "wb") as f:
        f.write(etree.tostring(doc, pretty_print=True, encoding="utf-8"))
17 changes: 17 additions & 0 deletions test/hocr-simplify/hocr-simplify.tsht
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env tsht
# Runs hocr-simplify on three sample files and compares the size of each
# result with a known byte count.
TESTDATA="../testdata"
SIMPLEFILE="./tess.simple.hocr"

plan 3

# Cleanup: delete the generated file.
# NOTE(review): presumably tsht calls `after` as a teardown hook - confirm.
after () {
    rm -f "$SIMPLEFILE"
}

# The size is read with `wc -c` rather than by cutting a column out of
# `ls -l`, whose space-padded columns make the field position unreliable.
hocr-simplify "$TESTDATA/tess.hocr" -t ocr_page > "$SIMPLEFILE" || fail 'hocr-simplify'
equals 3268 $(wc -c < "$SIMPLEFILE") 'filesize == 3268'

hocr-simplify "$TESTDATA/tess_choices.hocr" -c -t ocr_line > "$SIMPLEFILE" || fail 'hocr-simplify'
equals 9691 $(wc -c < "$SIMPLEFILE") 'filesize == 9691'

hocr-simplify "$TESTDATA/tess_choices_charboxes.hocr" -c -t ocrx_word > "$SIMPLEFILE" || fail 'hocr-simplify'
equals 58622 $(wc -c < "$SIMPLEFILE") 'filesize == 58622'
2 changes: 1 addition & 1 deletion test/smoke.tsht
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env tsht

for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split;do
for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split simplify;do
exec_ok "hocr-$f" "--help"
exec_ok "hocr-$f" "-h"
done
Empty file added test/testdata/kraken.hocr
Empty file.
Empty file added test/testdata/ocropus.hocr
Empty file.
Loading