#!/usr/bin/env python3
import operator
from pprint import pformat

from textprocessor import TokenProcessor


class Document:
    def __init__(self, id):
        self.id = id
        self.terms = []
        self.frequency_map = {}
        self.text = ""
        self.smoothing = False

    def add_text(self, text):
        """
        Append text to the document.
        """
        self.text += "\n" + text
        self.text = self.text.strip()

    def load_from_file(self, filename):
        """
        Append the contents of a file, line by line.
        """
        with open(filename, 'r') as f:
            for line in f:
                self.add_text(line)
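
    # Note: open() above uses the platform's default encoding; passing
    # encoding="utf-8" explicitly may be safer for shared corpora.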

    def get_total_term_count(self):
        """
        Return the total number of terms in the document.
        """
        return len(self.terms)

    def get_term_count(self, term, smoothing=False):
        """
        Return the total number of occurrences of the given term in the
        document.
        """
        # `smoothing` is accepted for API compatibility but not used yet.
        return self.frequency_map.get(term, 0)
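
    # A minimal sketch of how the unused `smoothing` flag could apply
    # add-one (Laplace) smoothing, assuming that is its intent:
    #
    #     if smoothing:
    #         return self.frequency_map.get(term, 0) + 1
    #     return self.frequency_map.get(term, 0)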

    def generate_frequency_map(self):
        """
        Generate a term-frequency map for the document, e.g.:
        {
            "a": 12,
            "i": 1
        }
        """
        for term in self.terms:
            self.frequency_map[term] = self.frequency_map.get(term, 0) + 1

    def extract_terms(self, tokenizer):
        """
        Extract individual terms from the text using the given tokenizer.
        """
        self.terms = tokenizer.tokenize(self.text)
        self.terms = tokenizer.remove_stopwords(self.terms)
        self.terms = tokenizer.remove_shortwords(self.terms)
        self.terms = tokenizer.lemmatize(self.terms)

    def get_frequent_terms(self, size=5):
        """
        Return a map of the `size` most frequent terms in the document,
        sorted in descending order of frequency.
        """
        # items() gives (term, count) pairs; sort by count, descending
        sorted_map = sorted(self.frequency_map.items(),
                            key=operator.itemgetter(1), reverse=True)
        size = min(size, len(sorted_map))
        # dicts preserve insertion order in Python 3.7+, so the result
        # stays sorted by frequency
        return dict(sorted_map[:size])
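
    # For reference, the standard library can do the same two steps
    # (counting plus top-k) in one call:
    #
    #     from collections import Counter
    #     dict(Counter(self.terms).most_common(size))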

    def display(self):
        """
        Print a formatted dump of the document's attributes.
        """
        print(self)

    def __str__(self):
        # pformat returns the pretty-printed string (pprint only prints)
        return pformat(vars(self))


def main():
    token_processor = TokenProcessor()
    doc = Document(1)
    doc.load_from_file("data/test")
    doc.extract_terms(token_processor)
    doc.generate_frequency_map()
    print(doc.get_frequent_terms())


if __name__ == "__main__":
    main()
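
# Illustrative usage with raw text instead of a file (hypothetical input;
# TokenProcessor's methods are assumed from extract_terms above):
#
#     doc = Document(2)
#     doc.add_text("the quick brown fox jumps over the lazy dog")
#     doc.extract_terms(TokenProcessor())
#     doc.generate_frequency_map()
#     print(doc.get_term_count("fox"))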