-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscorer.py
54 lines (43 loc) · 1.83 KB
/
scorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import nltk
import codecs
from fcutils import tokenizeDocText
class Scorer(object):
    """Base class for VSM scorers.

    Subclasses are expected to override ``constructModel`` so that it
    populates ``self.dictionary``, ``self.model`` and
    ``self.similarityModel``; the base implementation leaves all three as
    ``None``, so ``calculate_score`` cannot be used on a bare ``Scorer``.

    NOTE(review): the ``doc2bow`` / ``model[bow]`` / ``index[vec]`` call
    pattern looks like the gensim API -- confirm against the subclasses.
    """

    def __init__(self, docTexts):
        """Initialise empty model state, then build the model from docTexts."""
        self.keywords = []
        self.dictionary = None
        self.model = None
        self.similarityModel = None
        self.constructModel(docTexts)

    def constructModel(self, docTexts):
        """Given document texts, construct the VSM and similarity models.

        The base implementation is a deliberate no-op (it is invoked from
        ``__init__``); subclasses override it.
        """
        return

    def calculate_score(self, docText):
        """Given document text, return its relevancy score.

        The text is tokenized with ``tokenizeDocText``, converted to a
        bag-of-words through ``self.dictionary``, transformed by
        ``self.model`` and compared through ``self.similarityModel``; the
        largest resulting similarity is returned.

        Raises whatever ``max`` raises when the similarity list is empty.
        """
        docTokens = tokenizeDocText(docText)
        # Transform the document into the model's vector space.
        doc_bow = self.dictionary.doc2bow(docTokens)
        vec = self.model[doc_bow]
        # The score is the best match among all similarities.
        simList = self.similarityModel[vec]
        return max(simList)

    def calculateScore(self, docText):
        """camelCase alias of ``calculate_score``.

        ``labelDocs`` calls this spelling, which previously did not exist on
        the class.  It delegates through ``self`` so that a subclass
        overriding either spelling is honoured.
        """
        return self.calculate_score(docText)

    def labelDocs(self, docNames, minSize, irrelThresh, relThresh):
        """Label documents as relevant or non-relevant by score.

        Each file named in ``docNames`` is read and scored.  A score
        ``<= irrelThresh`` labels the document non-relevant (checked first);
        otherwise a score ``>= relThresh`` labels it relevant.  Documents
        scoring strictly between the two thresholds are left unlabeled.

        Returns a ``(relevantDocNames, irrelevantDocNames)`` tuple of lists,
        each preserving the order of ``docNames``.

        Raises ``Exception`` when either list ends up with fewer than
        ``minSize`` entries.
        """
        relevantDocNames = []
        irrelevantDocNames = []
        for docName in docNames:
            # Context manager guarantees the handle is released even when
            # reading (or decoding) fails.
            with codecs.open(docName, "r") as f:
                text = f.read()
            score = self.calculateScore(text)
            if score <= irrelThresh:
                irrelevantDocNames.append(docName)
            elif score >= relThresh:
                relevantDocNames.append(docName)
        if len(relevantDocNames) < minSize:
            raise Exception("Not enough relevant documents")
        if len(irrelevantDocNames) < minSize:
            raise Exception("Not enough irrelevant documents")
        return relevantDocNames, irrelevantDocNames