prepare_ann.py
import os.path
import re
import typing as tg

usage = """Prepares files for annotation.
Reads and converts text files such as 'abc.txt' one-by-one and writes their
converted version to path 'outputdir/abc.txt'.
Breaks lines after each sentence (using simple heuristics to determine
the end of sentences) and inserts empty annotation braces {{}}
on the next line.
"""


def configure_argparser(p_prepare_ann):
    """Adds this command's arguments to the given argparse (sub)parser."""
    p_prepare_ann.add_argument('outputdir',
                               help="Directory where prepared files will be placed")
    p_prepare_ann.add_argument('textfile', nargs='+',
                               help="*.txt file into which to line-split by sentence and insert empty {{}}")


def prepare_annotations(outputdir: str, inputfiles: tg.Sequence[str]):
    """Prepares each input file and writes the result into outputdir."""
    for inputfile in inputfiles:
        prepare_one_file(inputfile, outputdir)
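

# A minimal sketch, assuming this module is driven by an argparse-based command
# line that hands its (sub)parser to configure_argparser; the driver below and
# its name _example_main are hypothetical, shown only to illustrate how the two
# functions above fit together.
def _example_main():
    import argparse
    parser = argparse.ArgumentParser(description=usage)
    configure_argparser(parser)
    args = parser.parse_args()  # e.g. invoked as: prepare_ann outdir a.txt b.txt
    prepare_annotations(args.outputdir, args.textfile)  # 'textfile' is a list because of nargs='+'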


def prepare_one_file(inputfile: str, outputdir: str):
    """Reads inputfile, prepares it, and writes the result to outputdir (unless present there already)."""
    with open(inputfile, mode='rt', encoding="utf8") as f:
        inputstring = f.read()
    inputfilename = os.path.basename(inputfile)
    outputpathname = f"{outputdir}/{inputfilename}"
    if os.path.exists(outputpathname):
        print(f"#### '{outputpathname}' exists! SKIPPED.")
        return
    print(f"---- writing '{outputpathname}'")
    with open(outputpathname, mode='wt', encoding="utf8") as f:
        f.write(prepared(inputstring))


def prepared(txt: str) -> str:
    """
    Splits into sentences and inserts '{{}}' pairs.
    Possible sentence ends are . : ? ! followed by a blank,
    but instead of a blank there can be '\n' or end-of-file.
    Replacements enforce '\n' there and another after the '{{}}'.
    """
    possible_end = r'[.:?!]\s*[ \n$]'
    txt2 = with_protection(txt)  # save non-sentence-ends from being treated like sentence ends
    result = ""
    # ----- process abstract text:
    while len(txt2) > 0:
        end_match = re.search(possible_end, txt2)
        if end_match:
            # print(f"## match '{end_match.group()}'")
            endpos = end_match.end()
            candidate = txt2[0:endpos]
            txt2 = txt2[endpos:]
            result += replacement_for(candidate)
        else:  # no further sentence end at all
            # print("## remainder")
            result += replacement_for(txt2)
            txt2 = ""
    return unprotect(result)  # put back protected possible_ends
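

# A small worked example, assuming a typical plain-text input: prepared() puts
# each detected sentence on its own line and follows it with an empty "{{}}"
# annotation line. The helper name _example_prepared is hypothetical.
def _example_prepared():
    text = "One sentence. Another one?\nNo end here"
    expected = ("One sentence.\n{{}}\n"
                "Another one?\n{{}}\n"
                "No end here\n{{}}\n")
    assert prepared(text) == expected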


protection_replacements = [('.', '\u2059'), (':', '\u205a'), ('?', '\u2056'), ('!', '\u205e'), ]


def protect(txt: str) -> str:
    """Replace characters that could indicate a sentence end by super-rare ones."""
    for char, rplcmnt in protection_replacements:
        txt = txt.replace(char, rplcmnt)
    return txt


def unprotect(txt: str) -> str:
    """Replace the replacement characters back by the original characters."""
    for char, rplcmnt in protection_replacements:
        txt = txt.replace(rplcmnt, char)
    return txt


def with_protection(txt: str) -> str:
    """
    Knows some kinds of sentence-end-lookalikes that are not really sentence ends
    and protects them by replacing the sentence-end-indicator characters by dummies.
    """
    non_sentenceends = (r"[Ee]\.g\.\s",
                        r"et ?al\.\s",
                        r"https?:\s",
                        r"[Ii]\.e\.\s",
                        r"vs\.\s",
                        r"\n# \d\d?\.?\s.+(?=\n)",  # heading with dotted number or pseudo-end in title
                        )
    mark_as_sentence = (r"\n(\d\d?\.){0,3}\d\d?\.?\s.+(?=\n)",  # e.g. "2.3.4. Acro: The Design Phase!"
                        )
    result = txt
    for non_end in non_sentenceends:
        result = re.sub(non_end, lambda mm: protect(mm.group()), result)
    for sentence_structure in mark_as_sentence:
        result = re.sub(sentence_structure, lambda mm: protect(mm.group()) + ".", result)
    return result
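

# A brief illustration, assuming input that contains abbreviations such as
# "et al." and "e.g.": with_protection() hides their dots behind rare Unicode
# characters, so prepared() does not split inside them. The name
# _example_protection is hypothetical.
def _example_protection():
    text = "We build on Smith et al. here, e.g. for the lexer.\n"
    assert prepared(text) == "We build on Smith et al. here, e.g. for the lexer.\n{{}}\n"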


def replacement_for(candidate: str) -> str:
    """
    Replaces a sentence by what should appear in the coding file for it:
    replaces the trailing whitespace by "\n{{}}\n".
    """
    result = re.sub(r"\s*$", r"", candidate)  # remove trailing whitespace
    return result + "\n{{}}\n"
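

# A final sketch of replacement_for() on one already-isolated candidate sentence
# (normally it arrives with its trailing blank or newline still attached); the
# function _example_replacement is likewise hypothetical.
def _example_replacement():
    assert replacement_for("A short sentence. ") == "A short sentence.\n{{}}\n"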