-
Notifications
You must be signed in to change notification settings - Fork 6
/
merge_boxes.py
executable file
·71 lines (58 loc) · 2.73 KB
/
merge_boxes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#TODO: Handle multiple boxfiles at once
from Tesseract3Box import Tesseract3Box
from utils import parse_boxfile, separation_x, merge_two_boxes
import codecs
import optparse
def main():
    """Command-line entry point: merge nearby boxes from a single box file.

    Parses the command line, reads the box file named by the single positional
    argument with ``parse_boxfile``, merges boxes with ``merge_nearby_boxes``
    and then either prints the merge statistics (``--dry``) or writes the
    resulting boxes, one per line and UTF-8 encoded, to the ``--output`` file.

    Returns:
        0 after printing the help text when the number of positional
        arguments is not exactly one; otherwise None.
    """
    parser = optparse.OptionParser(usage="Usage: %prog [-t threshold] boxfile")
    parser.add_option(
        '-t', '--threshold', dest='threshold', action='store',
        type='int', default=1,
        # Only the horizontal gap is compared against the threshold, so it is
        # the *vertical* separation that is ignored (hence the warning about
        # boxes on different lines being merged).
        help='Adjacent boxes separated horizontally by THRESHOLD or fewer '
             'pixels will be merged. Vertical separation is ignored. Note '
             'that this means that boxes located on different lines might '
             'be merged in certain (rare) circumstances. Defaults to 1 '
             '(boxes are adjacent).')
    parser.add_option(
        '-o', '--output', dest='output', action='store',
        type='str', default='merged.box',
        help='Output file. Will overwrite existing files.')
    parser.add_option(
        '-d', '--dry', dest='dry', action='store_true',
        help='Perform a dry run. No files will be written, info about '
             'number of merged boxes will be output to the command line.')
    (opts, args) = parser.parse_args()

    # Exactly one box file is supported (see the TODO at the top of the file).
    if len(args) != 1:
        parser.print_help()
        return 0

    boxes = parse_boxfile(args[0])
    (merged, stats) = merge_nearby_boxes(opts, boxes)

    if opts.dry:
        # Single-argument parenthesised form prints identically whether
        # ``print`` is a statement (Python 2) or a function.
        print("Merged %d out of %d boxes. Outputting %d boxes."
              % (stats["num_merged"], stats["total_in"], stats["total_out"]))
    else:
        with codecs.open(opts.output, mode='wb', encoding='utf-8') as outfile:
            for box in merged:
                # ``unicode`` is the Python 2 builtin; the box objects are
                # serialised through their unicode representation.
                outfile.write(unicode(box) + u'\n')
def merge_nearby_boxes(opts, boxes):
    """Merge runs of consecutive boxes that are horizontally close together.

    Walks ``boxes`` in order while keeping an accumulator box (the result of
    all merges so far, usually just the previous box). A box whose horizontal
    separation from the accumulator, as reported by ``separation_x``, is less
    than or equal to ``opts.threshold`` is folded into the accumulator with
    ``merge_two_boxes``. Any other box causes the accumulator to be emitted
    unchanged and then takes its place.

    Args:
        opts: Object with a ``threshold`` attribute; it is only read when
            ``boxes`` holds at least two elements.
        boxes: Sequence of box objects, in the order they should be compared.
            The sequence is iterated, not consumed, so the caller's list is
            left intact.

    Returns:
        A tuple ``(output, stats)`` where ``output`` is the list of resulting
        boxes and ``stats`` is a dict with the keys ``"total_in"`` (number of
        input boxes), ``"total_out"`` (number of output boxes) and
        ``"num_merged"`` (number of merge operations performed).
    """
    stats = {"total_in": len(boxes), "total_out": 0, "num_merged": 0}
    output = []
    newbox = None

    # Plain iteration instead of repeated ``boxes.pop(0)``: avoids the
    # quadratic cost of popping from the front of a list and does not empty
    # the caller's list as a side effect.
    for pivot in boxes:
        if newbox is None:
            # Nothing accumulated yet: start a new run with this box.
            newbox = pivot
        elif separation_x(newbox, pivot) <= opts.threshold:
            # Close enough horizontally: extend the current run.
            newbox = merge_two_boxes(newbox, pivot)
            stats["num_merged"] += 1
        else:
            # Too far apart: the current run is finished, emit it.
            output.append(newbox)
            newbox = pivot

    # Flush the last accumulated box, if any input was seen at all.
    if newbox is not None:
        output.append(newbox)

    stats["total_out"] = len(output)
    return (output, stats)
# Run the command-line interface only when this file is executed as a script
# (not when it is imported as a module).
if __name__ == "__main__":
    main()