-
Notifications
You must be signed in to change notification settings - Fork 0
/
mammotab_filter.py
176 lines (150 loc) · 6.45 KB
/
mammotab_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.14.1
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# +
# # +
from email import header
import numpy as np
from tqdm import tqdm
from utilities.column_classifier import ColumnClassifier
import gzip, json
import sys
import os
from collections import Counter
import numpy as np
import sys
from utilities.utils import clean_links
# -
# ---------------------------------------------------------------------------
# MammoTab table filter.
#
# Reads gzipped JSON dictionaries ("diz_*" files) of Wikipedia tables from
# <source_folder>/<folder_name>, drops low-quality columns (CO1-CO4 rules)
# and rows (TR1-TR4 rules), discards tables with too few links, and writes
# the survivors to <output_folder_name>/<folder_name> under the same names.
#
# Usage: python mammotab_filter.py SOURCE_FOLDER FOLDER_NAME OUTPUT_FOLDER
# ---------------------------------------------------------------------------
source_folder = sys.argv[1]
folder_name = sys.argv[2]
output_folder_name = sys.argv[3]

# Minimum number of links in a single column for a table to be in the dataset.
min_links_number = 3

all_diz = []
kept = 0    # tables written to the output folder
total = 0   # tables examined

# Vectorised so the cleaner can be applied to a whole link matrix at once.
clean_links_v = np.vectorize(clean_links)

# Cell values treated as "empty" by the CO1/TR1 rules.
EMPTY_MARKERS = {'', ' ', '-', 'None'}

for f_name in tqdm(os.listdir(os.path.join(source_folder, folder_name))):
    if 'diz_' not in f_name:
        continue
    with gzip.open(os.path.join(source_folder, folder_name, f_name), 'rt') as f:
        diz = json.load(f)
    tables_to_keep = set()
    for tabcode, tab in diz['tables'].items():
        text_mat = np.array(tab['text'])
        header_mat = np.array(tab['header'])
        link_mat = np.array(tab['link'])
        cells_mat = np.array(tab['cells'])
        cell_types = np.array(tab['cell_types'])
        tab['cell_types_dict'] = {}
        tab['col_by_row'] = {}
        tab['tags'] = {}

        # ------------------------- column filtering -------------------------
        col_to_remove = set()
        for col_id, col in enumerate(text_mat.T):
            # Drop service columns (images, help pages, wiki-project pages).
            if 'IMAGE' in col or 'HELP_PAGE' in col or 'WIKI_PROJ_PAGE' in col:
                col_to_remove.add(col_id)
                continue
            # Column body: everything below the header rows.
            current_cell_types_col = cell_types[len(header_mat):, col_id]
            frequency_dict = {str(key): value
                              for key, value in Counter(current_cell_types_col).items()}
            current_col = col[len(header_mat):]
            if current_col.size == 0:
                col_to_remove.add(col_id)
                continue
            tab['cell_types_dict'][col_id] = frequency_dict
            tab['col_by_row'][col_id] = current_col.tolist()
            # CO1 RULE: at least half the cells are empty markers.
            empty_cells_count = sum(1 for cell in current_col if cell in EMPTY_MARKERS)
            if empty_cells_count >= len(current_col) / 2:
                col_to_remove.add(col_id)
            # CO2 RULE: a single repeated value.
            elif len(set(current_col)) == 1:
                col_to_remove.add(col_id)
            # CO3 RULE: every cell is a bare Wikidata Q-id.
            elif all(c.startswith('Q') and c[1:].isnumeric() for c in current_col):
                col_to_remove.add(col_id)
            # CO4 RULE: intended to strip a first word repeated down the column.
            # NOTE(review): rebinding the loop variable `c` never modifies
            # current_col, so this rule is currently a no-op; kept as-is
            # pending review.  TODO CONTROLLARE E RIVEDERE
            first = False
            for c in current_col:
                if first is False:
                    first = c.split(' ')[0]
                elif c.split(' ')[0] == first:
                    c = c.replace(first, '', 1)  # no effect (local rebind)
                    break

        # sorted() keeps the original column order deterministic; iterating a
        # plain set before fancy-indexing could permute the columns.
        col_to_keep = sorted(set(range(text_mat.shape[1])) - col_to_remove)
        text_mat = text_mat[:, col_to_keep]
        if header_mat.size > 0:
            header_mat = header_mat[:, col_to_keep]
        link_mat = link_mat[:, col_to_keep]
        # 'cells' stores two entries per column (indices 2*i and 2*i+1);
        # keep the original layout: all even indices first, then all odd.
        cell_col_to_keep = [c * 2 for c in col_to_keep] + [c * 2 + 1 for c in col_to_keep]
        cells_mat = cells_mat[:, cell_col_to_keep]

        # -------------------------- row filtering ---------------------------
        row_to_remove = set()
        for row_id, row in enumerate(text_mat):
            empty_cells_count = sum(1 for cell in row if cell in EMPTY_MARKERS)
            # TR1 RULE: at least half the cells are empty markers.
            if empty_cells_count >= len(row) / 2:
                row_to_remove.add(row_id)
            # TR2 RULE: a single repeated value across the row.
            # BUGFIX: the original tested `col[len(row):]`, a stale variable
            # leaked from the column loop (NameError on zero-column tables,
            # wrong data otherwise); the intended check is on the current row.
            # The len(row) > 1 guard keeps single-column tables alive.
            elif len(row) > 1 and len(set(row)) == 1:
                row_to_remove.add(row_id)
            # TR3 RULE: 'total' appears in at least two cells (summary rows).
            elif sum('total' in cell.lower() for cell in row) >= 2:
                row_to_remove.add(row_id)
            # TR4 RULE: every cell contains 'none'.
            elif all('none' in cell.lower() for cell in row):
                row_to_remove.add(row_id)

        row_to_keep = set(range(text_mat.shape[0])) - row_to_remove
        # Header rows are a prefix of the table, so they share row indices.
        header_row_to_keep = sorted(row_to_keep & set(range(header_mat.shape[0])))
        row_to_keep = sorted(row_to_keep)  # deterministic row order (see above)
        text_mat = text_mat[row_to_keep]
        if header_mat.size > 0:
            header_mat = header_mat[header_row_to_keep]
        link_mat = link_mat[row_to_keep]
        cells_mat = cells_mat[row_to_keep]

        # Normalise the surviving links.
        if link_mat.size > 0:
            link_mat = clean_links_v(link_mat)

        # Sanity checks: all matrices must stay aligned after filtering.
        assert text_mat.shape == link_mat.shape
        assert text_mat.shape[0] == cells_mat.shape[0]
        assert text_mat.shape[1] * 2 == cells_mat.shape[1]
        if header_mat.size > 0:
            assert text_mat.shape[1] == header_mat.shape[1]
            assert text_mat.shape[0] >= header_mat.shape[0]

        tab['text'] = text_mat.tolist()
        tab['header'] = header_mat.tolist()
        tab['link'] = link_mat.tolist()
        tab['cells'] = cells_mat.tolist()

        # Semantic tags per column, computed by the project classifier.
        cfier = ColumnClassifier(tab['col_by_row'], tab['cell_types_dict'])
        tab['tags'] = cfier.get_columns_tags()

        # Keep only non-empty tables whose best column has enough links.
        if text_mat.size > 0:
            link_sum = np.array(np.sum(link_mat != '', axis=0))  # links per column
            assert link_sum.shape[0] == text_mat.shape[1]
            if np.amax(link_sum) >= min_links_number:
                tables_to_keep.add(tabcode)

    total += len(diz['tables'])
    diz['tables'] = {code: table for code, table in diz['tables'].items()
                     if code in tables_to_keep}
    # Skip pages left with no tables.
    if diz['tables']:
        kept += len(diz['tables'])
        os.makedirs(os.path.join(output_folder_name, folder_name), exist_ok=True)
        with gzip.open(os.path.join(output_folder_name, folder_name, f_name), 'wt') as f:
            json.dump(diz, f)

print('kept', kept, 'of', total)