-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
171 lines (134 loc) · 5.01 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import cgi, re
# Load the plain-text dump of the Act and decode it to unicode.  The file is
# opened in a `with` block so the handle is closed as soon as it is read.
with open("source_docs/home_rule_act_july2012.txt") as source_file:
    home_rule_act = source_file.read().decode("utf8")

# recode page numbering: a run of digits followed by a newline and a form
# feed is replaced by a bracketed form feed on a line of its own.
home_rule_act = re.sub(u"(\\d+)\n(\u000C)", u"\n [\u000C]\n ", home_rule_act)  # brackets prevent para collapse

# incorrect encoding of en-dashes: " B" followed by whitespace becomes an
# en dash, keeping the whitespace character that followed it.
home_rule_act = re.sub(r" B(\s)", u" \u2013 \\1", home_rule_act)

# incorrect encoding of section symbol: " Code '" becomes " Code \xa7".
home_rule_act = re.sub(r" Code '", u" Code \xa7", home_rule_act)
# combine lines into paragraphs
#
# A line that starts with a space, or that is blank, begins a new paragraph;
# any other line is a continuation of the paragraph before it.  Two
# consecutive all-caps lines are merged into one.
paragraphs = []
for line in home_rule_act.split("\n"):
    previous = paragraphs[-1] if paragraphs else None
    if (previous is not None
            and re.search(r"\w", line) and re.search(r"\w", previous)
            and line == line.upper() and previous == previous.upper()):
        # Collapse two all-caps lines in a row.
        paragraphs[-1] += " " + line.strip()
    elif previous is None or line.startswith(" ") or line.strip() == "":
        # `previous is None` covers a first line that is neither indented
        # nor blank; there is no earlier paragraph to append it to, so it
        # starts one instead of raising IndexError.
        paragraphs.append(line.lstrip())
    else:
        paragraphs[-1] += " " + line
# Split the paragraph list into three parts using two marker paragraphs:
#   * front matter (through the Table of Contents) is everything before the
#     first title heading;
#   * back matter starts at the Organic and Amendatory History heading;
#   * the main body is whatever lies between the two markers.
# list.index raises ValueError if either marker paragraph is missing.
main_start = paragraphs.index('TITLE I - SHORT TITLE, PURPOSES, AND DEFINITIONS')
back_start = paragraphs.index("DISTRICT OF COLUMBIA HOME RULE ACT ORGANIC AND AMENDATORY HISTORY")
front_paragraphs, back_paragraphs, paragraphs = (
    paragraphs[:main_start],
    paragraphs[back_start:],
    paragraphs[main_start:back_start],
)
# Process main body paragraphs
#
# Each paragraph string becomes a dict with "indent" and "text", then gets
# classified as one of:
#   * a TITLE / PART / Subpart heading -> "heading-type", "num", "heading";
#   * the start of a section ("SEC. n.") -> "heading-type" == "section",
#     "num", "heading", "dc_code_cite";
#   * an ordinary paragraph -> optional "para_num" and "heading".
paragraphs = [{"indent": 0, "text": p} for p in paragraphs]

# Compiled once instead of on every loop iteration.
_HEADING_RE = re.compile(r"(?:(TITLE|PART|Subpart)\ ([\w\-]+))\s[\s\-]*(.*)")
# Groups: 1 = whole "SEC. n. " prefix, 2 = bare section number,
# 3 = bracketed D.C. Code citation, 4 = run of "(x)" list markers,
# 5 = heading text that precedes " -- ".
_PREFIX_RE = re.compile(
    r"(SEC\. (\w+)\.? )?"
    r"(\[D\.C\. (?:Official )?Code [^\]]*\]\.? )?"
    r"((?:\(\S+\)\s*)*)"
    r"(?:(.*\S) --\s*)?",
    re.I)

for p in paragraphs:
    m_heading = _HEADING_RE.match(p["text"])
    if m_heading:
        p["heading-type"] = m_heading.group(1).lower()
        p["num"] = m_heading.group(2)
        p["heading"] = m_heading.group(3)
        continue

    # Every part of the pattern is optional, so this always matches
    # (possibly the empty string).
    m = _PREFIX_RE.match(p["text"])
    section_prefix, section_num, dc_code_cite, paragraph_heads, heading = m.groups()

    # chop off the section head and citation info
    p["text"] = p["text"][m.end():]

    # starts a new section
    if section_prefix:
        p["heading-type"] = "section"
        # The number is taken from its own capture group, so it is clean
        # whatever the case of "SEC." and whether or not a period follows it.
        p["num"] = section_num
        p["heading"] = heading
        p["dc_code_cite"] = dc_code_cite
        # NOTE(review): any "(a)"-style markers matched on the same line as
        # "SEC. n." are stripped from the text above but not stored on the
        # paragraph -- confirm that this is intended.
        continue

    if paragraph_heads:
        p["para_num"] = paragraph_heads
    # Stored even without list markers so that a "Heading -- text" paragraph
    # keeps the heading that was just chopped off its text (None when the
    # paragraph has no heading).
    p["heading"] = heading
# Compute indentation levels within each section.
def assign_indentation(section_paragraphs):
    """Set "indent" on the numbered paragraphs of one section, in place.

    section_paragraphs is a list of paragraph dicts.  The "(x)" markers found
    in every paragraph's "para_num" are flattened into a single list and
    handed to infer_list_indentation.  For each paragraph that has a
    "para_num" and is not itself a section heading, "indent" is set to the
    level inferred for its first marker, plus one.

    Returns None.  Nothing is modified when no paragraph has list markers or
    when infer_list_indentation returns None.
    """
    # Get a flat list of symbols.
    para_symbols = []
    for p in section_paragraphs:
        if p.get("para_num"):
            para_symbols.extend(re.findall(r"\((.*?)\)", p["para_num"]))
    if not para_symbols:
        return

    # Solve indentation level.
    from infer_list_indentation import infer_list_indentation
    indents = infer_list_indentation(para_symbols)
    if indents is None:
        return  # could not figure it out

    # Apply.  `cursor` is the position in `indents` of the current
    # paragraph's first marker; it advances by the number of markers each
    # paragraph carries.  Only element [0] of each entry is read here
    # (presumably the nesting level -- confirm against
    # infer_list_indentation).
    cursor = 0
    for p in section_paragraphs:
        if not p.get("para_num"):
            continue
        if p.get("heading-type") != "section":
            p["indent"] = indents[cursor][0] + 1
        cursor += len(re.findall(r"\((.*?)\)", p["para_num"]))
# Group the paragraphs into runs that each begin at a section heading and
# compute indentation one run at a time.  Only a "section" heading starts a
# new run; every other paragraph (other heading types included) is added to
# the run in progress.
cur_section = []
for p in paragraphs:
    if p.get("heading-type") == "section":
        # Flush the run collected so far before starting the next one.
        if cur_section:
            assign_indentation(cur_section)
        cur_section = []
    cur_section.append(p)

# The final run has no following section heading to trigger its flush.
if cur_section:
    assign_indentation(cur_section)
# Emit the document as nested <level> elements on standard output.


def _escape(value):
    """Return value HTML-escaped with cgi.escape and encoded as UTF-8."""
    return cgi.escape(value).encode("utf8")


# The front matter is copied through verbatim; `with` closes the handle.
# NOTE(review): presumably front_matter.xml opens the element that the final
# "</level>" printed at the bottom closes -- confirm against that file.
with open("front_matter.xml") as front_matter_file:
    print(front_matter_file.read())

# Structural headings, outermost first.  big_stack holds the heading types
# whose <level> is currently open; little_stack counts the open <level>
# elements belonging to numbered/headed paragraphs inside the current heading.
level_types = ('title', 'part', 'subpart', 'section')
big_stack = []
little_stack = 0

for p in paragraphs:
    if p["text"] == u"[\u000C]":
        # Page-break marker: not written to the output.
        #print "<hr>"
        continue

    #if "ref" in p:
    #    ref = "--".join(p["ref"])
    #    ref = cgi.escape(ref).encode("utf8")
    #    print "<a name='" + ref + "'> </a>"

    if p.get("heading-type"):
        # A structural heading closes every open paragraph level ...
        while little_stack > 0:
            print("</level>")
            little_stack -= 1
        # ... and every open heading of the same or a deeper type.
        while big_stack and level_types.index(big_stack[-1]) >= level_types.index(p["heading-type"]):
            print("</level>")
            big_stack.pop()
        big_stack.append(p["heading-type"])

        print("""
<level>
<type>toc</type>
<prefix>%s</prefix>
<num>%s</num>
""" % (p["heading-type"].title(), _escape(p["num"])))
        if p.get("dc_code_cite"):
            print("\t<dc-code-parallel-citation>%s</dc-code-parallel-citation>" % _escape(p["dc_code_cite"]))
        if p.get("heading"):
            print("\t<heading>%s</heading>" % _escape(p["heading"]))
        if p.get("text"):
            print("\t<text>%s</text>" % _escape(p["text"]))

    elif p.get("para_num") or p.get("heading"):
        # Close paragraph levels at the same or a deeper indent.  The
        # `little_stack > 0` guard matters when "indent" is still 0 (no list
        # markers, or indentation could not be inferred): without it one
        # extra "</level>" is printed, which closes the enclosing heading and
        # leaves the counter at -1.
        while little_stack > 0 and little_stack >= p["indent"]:
            print("</level>")
            little_stack -= 1
        little_stack += 1

        print("<level>")
        if p.get("para_num"):
            print("\t<num>%s</num>" % _escape(p["para_num"]).strip())
        if p.get("heading"):
            print("\t<heading>%s</heading>" % _escape(p["heading"]))
        print("<text>%s</text>" % _escape(p["text"]))

    else:
        # Plain paragraph: written inside whatever level is open; blank
        # paragraphs are skipped.
        if p["text"].strip() == "":
            continue
        print("""
<text>%s</text>
""" % _escape(p["text"]))

# Close whatever is still open, innermost first.
while little_stack > 0:
    print("</level>")
    little_stack -= 1
while big_stack:
    print("</level>")
    big_stack.pop()

print("""
</level>
""")