-
Notifications
You must be signed in to change notification settings - Fork 0
/
gui.py
369 lines (290 loc) · 14.6 KB
/
gui.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
import logging
import os.path
import sys
from collections import Counter
import numpy as np
from qtpy import QtCore, uic
from qtpy.QtCore import Qt
from qtpy.QtWidgets import (qApp, QApplication, QFileDialog, QMainWindow, QMessageBox,
QTreeWidgetItem, QHeaderView, QTreeWidgetItemIterator)
from sources.doctopic import (MyCorpus, load_from_folder, read_file, data_to_query,
build_index, update_indices, TEMP_FOLDER)
from sources.utils import make_model_archive, move_from_temp
from sources.log import logger
# Echo every record of the shared project logger to stdout.
# NOTE(review): addHandler() adds to whatever handlers sources.log has
# already attached to `logger`; it does not replace them.
log_formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(log_formatter)
logger.addHandler(console_handler)
# Most verbose standard level (numeric value 10).
logger.setLevel(logging.DEBUG)
class MyWindow(QMainWindow):
    """Main window of the DocTopic application.

    The widgets are loaded from ``GUI/doctopic.ui``. Most slots are static
    methods that operate on the module-level window instance ``w`` created in
    the ``__main__`` block, so they are only usable once ``w`` exists.
    """

    # Layout shared by every similarity table written to a text output.
    _SECTIONS = (' LSI Similarity ', ' Most Prominent LSI topic ', ' tf-idf Similarity ')
    _HEADERS = '{:<5s}{:>8s}{:>20s}{:>20s}{:>27s}'.format('RANK', 'SIM', 'CLIENT', 'PROJECT', 'FILE')
    _ROW = '{:<10d}{: 8.3f}\t{:>14.12}{:>14.12}\t{:<12s}'
    _LINE_WIDTH = 120
    # Number of hits shown per table for a document query.
    _MAX_QUERY_HITS = 10
    # Names handed to load_from_folder(); each result becomes an attribute of `w`.
    _MODEL_PARAMS = ('lsi', 'dictionary', 'tfidf', 'tfidf_index', 'lsi_index', 'labels', 'corpus')

    def __init__(self):
        """Build the UI and restore the model folder stored in the settings."""
        super().__init__()
        self.init_ui()
        self.settings = QtCore.QSettings("DocFirm Inc.", "docfind")
        # Restore the folder first, then connect, so that restoring does not
        # immediately write the same value back to the settings.
        save_folder = self.settings.value("savedFolder", "")
        self.input_line_edit_model.setText(save_folder)
        self.input_line_edit_model.textChanged.connect(self.new_folder_changed)
        # NOTE: persisting the last query document ("savedDoc") is not implemented.

    def new_folder_changed(self, newFolder):
        """Store the model folder path in the settings.

        Args:
            newFolder: New content of the model line edit, as delivered by its
                ``textChanged`` signal.
        """
        # Use the signal payload instead of reading the text back through the
        # module-level window, which only exists when run as a script.
        self.settings.setValue("savedFolder", newFolder)

    def init_ui(self):
        """Load the widgets from the .ui file and show the window."""
        # The path is relative to the current working directory.
        uic.loadUi(os.path.join('GUI', 'doctopic.ui'), self)
        self.setWindowTitle('DocTopic')
        self.setContentsMargins(7, 0, 7, 0)
        self.statusBar().showMessage('Ready')
        self.show()

    @staticmethod
    def open_model():
        """Open a directory dialog and write the chosen path to the model line edit."""
        dirpath = QFileDialog.getExistingDirectory()
        w.input_line_edit_model.setText(dirpath)

    @staticmethod
    def open_folder():
        """Open a directory dialog and write the chosen path to the training line edit."""
        dirpath = QFileDialog.getExistingDirectory()
        w.input_line_edit_train.setText(dirpath)

    @staticmethod
    def open_doc():
        """Open a file dialog and replace the query list with the chosen file paths."""
        w.query_list_widget.clear()
        # getOpenFileNames returns (paths, selected_filter); only the paths are used.
        paths = QFileDialog.getOpenFileNames(filter='TXT-Datei (*.txt)')[0]
        w.query_list_widget.addItems(paths)

    @staticmethod
    def load_model():
        """Load the model files from the model folder and attach them to ``w``.

        Nothing is loaded when ``w`` already has an ``lsi`` attribute; see
        ``reset_model`` for how a reload is forced.
        """
        if hasattr(w, 'lsi'):
            return
        logger.info('Loading model parameters')
        loaded = load_from_folder(list(MyWindow._MODEL_PARAMS), w.input_line_edit_model.text())
        for name, value in loaded.items():
            setattr(w, name, value)

    @staticmethod
    def _rule(title=''):
        """Return *title* centred in a dashed line of ``_LINE_WIDTH`` characters."""
        return '{:-^{width}}'.format(title, width=MyWindow._LINE_WIDTH)

    @staticmethod
    def _append_similarity_table(output, title, ranked, labels):
        """Append one ranked similarity table to a text widget.

        Args:
            output: Widget providing ``append(str)``.
            title: Section title shown in the dashed heading line.
            ranked: Iterable of ``(document_index, score)`` pairs, best first.
            labels: Mapping from ``str(document_index)`` to a label sequence whose
                first, second and last items are printed as client, project and file.
        """
        output.append(MyWindow._rule(title))
        output.append(MyWindow._HEADERS)
        output.append(MyWindow._rule())
        for rank, (doc_index, score) in enumerate(ranked, start=1):
            label = labels[str(doc_index)]
            output.append(MyWindow._ROW.format(rank, score, label[0], label[1], label[-1]))
        output.append('\n')

    def run_index_query(self):
        """Run a cross-comparison between the files checked in the tree view.

        For every checked file, the LSI and tf-idf similarities to all checked
        files are written to ``textOutput_index``, best match first.
        """
        # NOTE: when this slot is connected through the class
        # (``MyWindow.run_index_query``), Qt passes the button's `checked` flag
        # as `self`, so the window is reached through the module-level `w`.
        w.textOutput_index.clear()
        w.load_model()
        # Sorted index numbers of the files checked in the tree view.
        idx = w.get_checked()
        if not idx:
            # Nothing to compare; skip materialising the similarity rows.
            w.textOutput_index.append('No files checked. Please check at least one file.')
            return
        # Boolean mask selecting the checked documents from a similarity row.
        msk = np.zeros(len(w.labels), dtype='bool')
        msk[idx] = True
        # Materialise each index once instead of once per checked file.
        lsi_rows = list(w.lsi_index)
        tfidf_rows = list(w.tfidf_index)
        tables = ((MyWindow._SECTIONS[0], lsi_rows), (MyWindow._SECTIONS[2], tfidf_rows))
        for i in idx:
            w.textOutput_index.append('Printing results for {}'.format(w.labels[str(i)]))
            for title, rows in tables:
                # The masked scores line up with `idx` because both are in
                # ascending index order.
                ranked = sorted(zip(idx, rows[i][msk]), key=lambda pair: -pair[1])
                MyWindow._append_similarity_table(w.textOutput_index, title, ranked, w.labels)

    @staticmethod
    def run_query():
        """Run every file of the query list against the model and print the results."""
        w.textOutput.clear()
        w.load_model()
        # TODO: Implement batch querying for improved performance
        paths = [str(w.query_list_widget.item(row).text())
                 for row in range(w.query_list_widget.count())]
        for path in paths:
            basename = os.path.basename(path)
            w.textOutput.append('Printing results for {}'.format(basename))
            if not os.path.isfile(path):
                # Append rather than setText so the results already printed
                # for earlier files are not wiped.
                w.textOutput.append('{} not found. Please select a valid file.'.format(basename))
                continue
            data = read_file(path)
            # Apply the tf-idf transformation, then convert the vector to LSI space.
            # Note: the index must be built with the same transformation.
            vec_bow = w.tfidf[data_to_query(data, w.dictionary)]
            vec_lsi = w.lsi[vec_bow]
            if len(vec_lsi) == 0:
                # An empty vector has no most prominent topic; max() below would raise.
                # NOTE(review): presumably the file shares no terms with the dictionary.
                w.textOutput.append('{} could not be matched against the model.'.format(basename))
                continue
            # Query the LSI index and sort the (index, similarity) pairs, best first.
            sims_lsi = sorted(enumerate(w.lsi_index[vec_lsi]), key=lambda item: -item[1])
            # Topic with the largest absolute weight in the query vector.
            topic = w.lsi.print_topic(max(vec_lsi, key=lambda item: abs(item[1]))[0])
            # Query the tf-idf index the same way.
            sims_tfidf = sorted(enumerate(w.tfidf_index[vec_bow]), key=lambda item: -item[1])
            w.print_details(sims_lsi, sims_tfidf, topic)

    def load_tree(self):
        """Build the client / project / file tree view from the model labels.

        Every value of ``w.labels`` is read as ``(client, project, file, ...)``.
        File items carry the label key in column 2, which ``get_checked``
        converts back to an index number.
        """
        # NOTE: when this slot is connected through the class
        # (``MyWindow.load_tree``), Qt passes the button's `checked` flag as
        # `self`, so the window is reached through the module-level `w`.
        w.treeWidget.clear()
        w.treeWidget.header().setSectionResizeMode(QHeaderView.ResizeToContents)
        # Load model parameters including the dictionary with corpus metadata.
        w.load_model()
        item_flags = (Qt.ItemIsUserCheckable | Qt.ItemIsEnabled
                      | Qt.ItemIsEditable | Qt.ItemIsSelectable)
        # Number of files per client and per (client, project) pair.
        client_counts = Counter(v[0] for v in w.labels.values())
        project_counts = Counter((v[0], v[1]) for v in w.labels.values())
        client_items, project_items = {}, {}
        # Top level: one item per client.
        for client, count in client_counts.items():
            item = QTreeWidgetItem(w.treeWidget, [client, str(count)])
            item.setFlags(item_flags)
            item.setExpanded(False)
            client_items[client] = item
        # Second level: projects below their client.
        for (client, project), count in project_counts.items():
            item = QTreeWidgetItem(client_items[client], [project, str(count)])
            item.setFlags(item_flags)
            project_items[(client, project)] = item
        # Third level: files below their project; only these get a check box.
        for key, label in w.labels.items():
            item = QTreeWidgetItem(project_items[(label[0], label[1])], [label[2], "", key])
            item.setFlags(item_flags)
            item.setCheckState(0, Qt.Unchecked)

    @staticmethod
    def get_checked():
        """Return the sorted index numbers of all files checked in the tree view."""
        iterator = QTreeWidgetItemIterator(w.treeWidget)
        idx = []
        while iterator.value():
            item = iterator.value()
            if item.checkState(0) == Qt.Checked:
                # Column 2 holds the label key as a string.
                idx.append(int(item.text(2)))
            iterator += 1
        return sorted(idx)

    def print_details(self, sims_lsi, sims_tfidf, topic):
        """Print the results of one document query to ``textOutput``.

        Args:
            sims_lsi: List of ``(document_index, similarity)`` pairs from the
                LSI index, best first.
            sims_tfidf: Same for the tf-idf index.
            topic: Printable description of the most prominent LSI topic.
        """
        top = self._MAX_QUERY_HITS
        self._append_similarity_table(self.textOutput, self._SECTIONS[0], sims_lsi[:top], self.labels)
        self.textOutput.append(self._rule(self._SECTIONS[1]))
        self.textOutput.append(topic)
        self.textOutput.append('\n')
        self._append_similarity_table(self.textOutput, self._SECTIONS[2], sims_tfidf[:top], self.labels)

    @staticmethod
    def reset_model():
        """Drop the ``lsi`` attribute so that the next ``load_model`` reloads from disk."""
        if hasattr(w, 'lsi'):
            del w.lsi
            logger.info('Model reset')

    @staticmethod
    def train_model():
        """Train a model on the training folder and store the results in TEMP_FOLDER.

        The "Save Project" button is enabled afterwards so the files can be
        moved to the model folder with ``save_model``.
        """
        src = w.input_line_edit_train.text()
        lsi_path, tfidf_path = (os.path.join(TEMP_FOLDER, name)
                                for name in ['lsi.index', 'tfidf.index'])
        if os.path.isdir(src):
            logger.info('Starting training…')
            corpus = MyCorpus(src)
            corpus.save_to_temp()
            num_features = len(corpus.dictionary)
            tfidf, corpus_tfidf, lsi, corpus_lsi = corpus.build_lsi(topics=300)
            # Build the LSI index and save it to the temp folder.
            lsi_index = build_index(lsi_path, corpus_lsi, num_features)
            lsi_index.save(lsi_path)
            # Build the tf-idf index and save it to the temp folder.
            tfidf_index = build_index(tfidf_path, corpus_tfidf, num_features)
            tfidf_index.save(tfidf_path)
            w.textOutput_train.setText('Training complete! Click "Save Project" to save parameters to model folder.')
            w.save_model_button.setEnabled(True)
        else:
            w.textOutput_train.setText('Please select a valid training folder')
        w.reset_model()

    @staticmethod
    def add_docs():
        """Add the documents of the training folder to the indices in the model folder."""
        dst = w.input_line_edit_model.text()
        src = w.input_line_edit_train.text()
        if not (os.path.isdir(dst) and os.path.isdir(src)):
            w.textOutput_train.setText('Please select a valid folder.')
            return
        # Prompt the user before updating the indices.
        answer = QMessageBox.question(w, 'DocTopic', 'This operation will change your search indices. Continue?',
                                      QMessageBox.Save | QMessageBox.Cancel, QMessageBox.Cancel)
        if answer == QMessageBox.Save:
            cnt = update_indices(src, dst)
            w.textOutput_train.setText('{} documents added!'.format(cnt))
            w.save_model_button.setDisabled(True)
            # Force a reload of the changed indices on next use.
            w.reset_model()

    @staticmethod
    def save_model():
        """Move the training output from TEMP_FOLDER to the model folder.

        A non-empty model folder is archived first.
        """
        dst = w.input_line_edit_model.text()
        if not os.path.isdir(dst):
            # os.listdir() would raise on an empty or non-existent path.
            w.textOutput_train.append('Please select a valid folder.')
            return
        # Back up existing files in the model folder before adding new ones.
        if os.listdir(dst):
            archive = make_model_archive(dst)
            w.textOutput_train.append(f'Backup of model folder created here: {archive}')
        # Move the model files to the model folder.
        cnt = move_from_temp(TEMP_FOLDER, dst)
        w.textOutput_train.append(f'{cnt} files saved')
        w.save_model_button.setDisabled(True)
if __name__ == '__main__':
    app = QApplication(sys.argv)
    # The slots of MyWindow reach the window through this module-level name,
    # so it must be called `w` and exist before any signal fires.
    w = MyWindow()

    # Static slots: connected through the class.
    w.add_documents_button.clicked.connect(MyWindow.add_docs)
    w.open_model_button.clicked.connect(MyWindow.open_model)
    w.open_doc_button.clicked.connect(MyWindow.open_doc)
    w.open_folder_button.clicked.connect(MyWindow.open_folder)
    w.run_query_button.clicked.connect(MyWindow.run_query)
    w.actionOpen_Model.triggered.connect(MyWindow.open_model)
    w.actionExit.triggered.connect(qApp.quit)
    w.input_line_edit_model.textChanged.connect(MyWindow.reset_model)
    w.train_model_button.clicked.connect(MyWindow.train_model)
    w.save_model_button.clicked.connect(MyWindow.save_model)

    # Instance slots: connected as bound methods. Connecting them through the
    # class would make Qt pass the button's `checked` flag as `self`.
    w.load_tree_button.clicked.connect(w.load_tree)
    w.compare_checked_button.clicked.connect(w.run_index_query)

    sys.exit(app.exec_())