-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathTreeBuild.py
164 lines (121 loc) · 5.11 KB
/
TreeBuild.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# coding: utf-8
# In[1]:
import re
import bisect
from collections import defaultdict
# example of autovivification
def tree(): return defaultdict(tree)
# In[ ]:
def _leadingSpaces_(target):
return len(target) - len(target.lstrip())
def _findParent_(curIndent, parid, treeRef):
tmpid = parid
while (curIndent <= treeRef[tmpid]['indent']):
tmpid = treeRef[tmpid]['parid']
return tmpid
# In[ ]:
def _generateTree_(rawTokens, treeRef):
# ----------------------------- CLEARED ---------------------------------#
# (token
REGEX_OPEN = r"^\s*\(([a-zA-Z0-9_']*)\s*$"
# (token (tok1 tok2) (tok3 tok4) .... (tokx toky))
REGEX_COMP = r"^\s*\(([a-zA-Z0-9_']+)\s*((?:[(]([a-zA-Z0-9_;.,?'!]+)\s*([a-zA-Z0-9_;\.,?!']+)[)]\s*)+)"
# (, ,) as stand-alone. Used for match() not search()
REGEX_PUNC = r"^\s*\([,!?.'\"]\s*[,!?.'\"]\)"
# (tok1 tok2) as stand-alone
REGEX_SOLO_PAIR = r"^\s*\(([a-zA-Z0-9_']+)\s*([a-zA-Z0-9_']+)\)"
# (tok1 tok2) used in search()
REGEX_ISOL_IN_COMP = r"\(([a-zA-Z0-9_;.,?!']+)\s*([a-zA-Z0-9_;.,?!']+)\)"
# (punc punc) used in search()
REGEX_PUNC_SOLO = r"\([,!?.'\"]\s*[,!?.'\"]\)"
# manually insert Root token
treeRef[len(treeRef)] = {'curid':0,
'parid':-1,
'posOrTok':'ROOT',
'indent':0,
'children':[],
'childrenTok':[]}
ID_CTR = 1
for tok in rawTokens[1:]:
curIndent = _leadingSpaces_(tok) # the current indent level
parid = _findParent_(curIndent, ID_CTR-1, treeRef) # determine parid
# CHECK FOR COMPOSITE TOKENS
checkChild = re.match(REGEX_COMP, tok)
if (checkChild):
treeRef[ID_CTR] = {'curid':ID_CTR,
'parid':parid,
'posOrTok':checkChild.group(1),
'indent':curIndent,
'children':[],
'childrenTok':[]}
upCTR = ID_CTR
ID_CTR += 1
# Eliminate further punctuation
subCheck = re.sub(REGEX_PUNC_SOLO,'',checkChild.group(2))
subs = re.findall(REGEX_ISOL_IN_COMP, subCheck)
for ch in subs:
# THE INDENTING IS WRONG HERE - THE HEIRARCHY IS MESSED UP - check test output
treeRef[ID_CTR] = {'curid':ID_CTR,
'parid':upCTR,
'posOrTok':ch[0],
'indent':curIndent+2,
'children':[],
'childrenTok':[]}
ID_CTR += 1
treeRef[ID_CTR] = {'curid':ID_CTR,
'parid':ID_CTR-1,
'posOrTok':ch[1],
'indent':curIndent+2,
'children':[],
'childrenTok':[]}
ID_CTR += 1
continue
checkSingle = re.match(REGEX_SOLO_PAIR, tok)
if (checkSingle):
treeRef[ID_CTR] = {'curid':ID_CTR,
'parid':parid,
'posOrTok':checkSingle.group(1),
'indent':curIndent+2,
'children':[],
'childrenTok':[]}
ID_CTR += 1
treeRef[ID_CTR] = {'curid':ID_CTR,
'parid':ID_CTR-1,
'posOrTok':checkSingle.group(2),
'indent':curIndent+2,
'children':[],
'childrenTok':[]}
ID_CTR += 1
continue
checkPunc = re.match(REGEX_PUNC, tok)
if (checkPunc): # ignore punctuation
continue
checkMatch = re.match(REGEX_OPEN, tok)
if (checkMatch):
treeRef[ID_CTR] = {'curid':ID_CTR,
'parid':parid,
'posOrTok':checkMatch.group(1),
'indent':curIndent,
'children':[],
'childrenTok':[]}
ID_CTR += 1
continue
return
# In[ ]:
'''
_generateTree_() method only provides tree (dict representation) listing parents.
This is a naive method to add a "children" field to the tree - necessary for optimal Tree Kernel methods.
'''
# Switching to 2-pass O(N)
def _flipTree_(treeRef):
# Pass 1 fill in children
for k,v in treeRef.items():
if (k > 0):
bisect.insort(treeRef[v['parid']]['children'], k)
# Pass 2 map children to tokens
for k,v in treeRef.items():
if (k > 0):
treeRef[k]['childrenTok'] = [treeRef[ch]['posOrTok'] for ch in treeRef[k]['children']]
treeRef[0]['childrenTok'] = treeRef[1]['posOrTok']
# In[ ]:
# In[ ]: