-
Notifications
You must be signed in to change notification settings - Fork 79
/
hocr-wordfreq
executable file
·65 lines (61 loc) · 1.61 KB
/
hocr-wordfreq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
import argparse
import re
import sys
from collections import Counter

from lxml import html
def build_parser():
    """Build the command-line parser for the hOCR word-frequency tool.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``-i``, ``-s``, ``-y``,
        ``-n`` and the optional positional ``hocr_in`` file (standard
        input when omitted).
    """
    parser = argparse.ArgumentParser(
        description='Calculate word frequency in an hOCR file')
    parser.add_argument(
        '-i',
        '--case-insensitive',
        action='store_true',
        default=False,
        help="ignore case")
    parser.add_argument(
        '-s',
        '--spaces',
        action='store_true',
        default=False,
        help="split on spaces only")
    parser.add_argument(
        '-y',
        '--dehyphenate',
        action='store_true',
        default=False,
        help="try to dehyphenate the text before analysis")
    parser.add_argument(
        '-n',
        '--max',
        type=int,
        default=10,
        help="number of hits (default: %(default)s)")
    parser.add_argument(
        'hocr_in',
        help="hOCR file to count frequency for (default: standard input)",
        type=argparse.FileType('r'),
        nargs='?',
        default=sys.stdin)
    return parser


def dehyphenate(text):
    """Join words split by a hyphen at a line end and flatten line breaks.

    Args:
        text: The text to process.

    Returns:
        The text with blank lines removed, every ``-`` + line break
        deleted (gluing the two word halves together) and every remaining
        line break replaced by a single space.
    """
    # Delete blank lines.  re.MULTILINE is required: without it ``^`` only
    # anchors at the very start of the string, so interior blank lines
    # would survive and keep a hyphenated word apart from its continuation.
    text = re.sub(r"^\s*$\r?\n", "", text, flags=re.MULTILINE)
    # Dehyphenate: drop the hyphen together with the line break after it.
    text = re.sub(r"-\r?\n", "", text)
    # Replace the remaining line breaks with a space.
    return re.sub(r"\r?\n", " ", text)


def count_words(text, case_insensitive=False, spaces=False,
                dehyphenate_text=False):
    """Count how often each word occurs in ``text``.

    Args:
        text: The text to analyse.
        case_insensitive: Lower-case the text before counting.
        spaces: Split on whitespace only instead of on every run of
            non-word characters.
        dehyphenate_text: Run :func:`dehyphenate` on the text first.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences, with keys in order of first appearance.
    """
    if case_insensitive:
        text = text.lower()
    if dehyphenate_text:
        text = dehyphenate(text)
    separators = re.compile(r'\s+' if spaces else r'\W+', re.UNICODE)
    # split() yields '' when the text starts or ends with a separator;
    # those are not words and must not be counted.
    return Counter(word for word in separators.split(text) if word)


def top_words(counts, limit):
    """Return at most ``limit`` ``(word, count)`` pairs, most frequent first.

    Words with equal counts keep their first-seen order.  A ``limit`` of
    zero or less yields an empty list.
    """
    if limit <= 0:
        return []
    return Counter(counts).most_common(limit)


def main(argv=None):
    """Print the most frequent words of an hOCR document.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    doc = html.parse(args.hocr_in)
    text = doc.find('//body').text_content().strip()
    counts = count_words(
        text,
        case_insensitive=args.case_insensitive,
        spaces=args.spaces,
        dehyphenate_text=args.dehyphenate)
    # Exactly args.max rows are printed (the former ``idx > max`` test let
    # one extra row through).
    for word, count in top_words(counts, args.max):
        print("%-5d\t%s" % (count, word))


if __name__ == '__main__':
    main()