-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument_analyzer.py
132 lines (101 loc) · 3.41 KB
/
document_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import math
import re
from collections import Counter

import html2text
import requests

import bing_search
# TF RELATED
def process_document(URL):
    """Fetch *URL* and return its visible text content as UTF-8 bytes.

    Network or HTTP-level failures are deliberately treated as an empty
    document (best-effort), so callers always receive bytes.
    """
    session = requests.session()
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0'}
    try:
        # timeout so one unresponsive server cannot stall the whole analysis
        response = session.get(URL, timeout=10)
        raw_text = response.text
    except requests.RequestException:
        # best-effort: an unreachable page contributes an empty document
        # (narrowed from a bare except, which also swallowed KeyboardInterrupt)
        raw_text = ""
    # strip HTML markup; downstream tokenization operates on UTF-8 bytes
    return html2text.html2text(raw_text).encode('utf-8')
def generate_term_count_dict(document):
    """Count term occurrences in *document* (UTF-8 bytes).

    Terms are maximal runs of ASCII letters; runs shorter than two
    characters are dropped and the rest are lower-cased before counting.
    Returns a Counter (a dict subclass) mapping bytes term -> count.
    """
    terms_list = re.findall(rb"[a-zA-Z]+", document)
    # Counter replaces the hand-rolled "check key, then increment" loop
    return Counter(term.lower() for term in terms_list if len(term) >= 2)
def sum_values_in_dict(counts):
    """Return the sum of all values in *counts*; 0 for an empty dict.

    The parameter was renamed from ``dict`` (it shadowed the builtin,
    as did the local ``sum``); all callers pass it positionally.
    """
    return sum(counts.values())
def calculate_tf(term_count, num_of_terms):
    """Term frequency: the term's share of all tokens in one document."""
    frequency = term_count / num_of_terms
    return frequency
# tf is specific to document for each word
def generate_term_tf_dict(term_count_dict):
    """Map each term in *term_count_dict* to its term frequency.

    The frequency is the term's count divided by the total number of
    counted tokens in the document.
    """
    total_tokens = float(sum(term_count_dict.values()))
    return {term: count / total_tokens
            for term, count in term_count_dict.items()}
# IDF RELATED
def generate_set_of_all_terms(URL_termcountdict_dict):
    """Collect the corpus vocabulary: every distinct term across all documents.

    Returns a list (first-seen order preserved, since downstream dict
    insertion order depends on it). A side `seen` set replaces the original
    `term not in list` membership test, which was O(n) per term and made
    vocabulary construction quadratic overall.
    """
    all_terms = []
    seen = set()
    for term_count_dict in URL_termcountdict_dict.values():
        for term in term_count_dict:
            if term not in seen:
                seen.add(term)
                all_terms.append(term)
    return all_terms
def calculate_idf(num_of_docs_containing_term, num_of_docs):
    """Inverse document frequency: log(N / df) for a corpus of N documents."""
    ratio = num_of_docs / float(num_of_docs_containing_term)
    return math.log(ratio)
# idf is general to all documents for each word
def generate_term_idf_dict(terms_set, URL_termcountdict_dict):
    """Compute the IDF of every term in *terms_set* over the whole corpus.

    For each term, the document frequency is how many per-URL count dicts
    contain it; the IDF is log(total documents / document frequency).
    """
    total_docs = len(URL_termcountdict_dict)
    term_idf_dict = {}
    for term in terms_set:
        doc_frequency = sum(
            1 for counts in URL_termcountdict_dict.values() if term in counts
        )
        term_idf_dict[term] = math.log(total_docs / float(doc_frequency))
    return term_idf_dict
def get_tfidf_scores(URLs):
    """Fetch every URL, then return a dict of term -> summed tf-idf score.

    Each document's term frequencies are weighted by the corpus-wide IDF,
    and a term's contributions from all documents are added together.
    """
    # per-URL raw counts and per-URL term frequencies
    URL_termcountdict_dict = {}
    URL_termtfdict_dict = {}
    for URL in URLs:
        counts = generate_term_count_dict(process_document(URL))
        URL_termcountdict_dict[URL] = counts
        URL_termtfdict_dict[URL] = generate_term_tf_dict(counts)
    # corpus vocabulary and per-term IDF
    terms_set = generate_set_of_all_terms(URL_termcountdict_dict)
    term_idf_dict = generate_term_idf_dict(terms_set, URL_termcountdict_dict)
    # accumulate tf * idf across documents
    term_tfidf_dict = {}
    for URL in URLs:
        tf_dict = URL_termtfdict_dict[URL]
        for term in terms_set:
            if term in tf_dict:  # term may be absent from this document
                contribution = tf_dict[term] * term_idf_dict[term]
                term_tfidf_dict[term] = term_tfidf_dict.get(term, 0) + contribution
    return term_tfidf_dict
def analyze_search_result(query, count, length):
    """Search Bing for *query*, score the top *count* result pages by tf-idf,
    and return the scores reformatted for terms of exactly *length* characters.
    """
    result_urls = bing_search.search(query, count)
    print(result_urls)
    scores = get_tfidf_scores(result_urls)
    return reformat_dictionary(scores, length)
def reformat_dictionary(old_dict, length):
    """Convert a term -> score dict into a list of {"word", "score"} records,
    keeping only terms whose length equals *length* (in input order).
    """
    return [
        {"word": word, "score": score}
        for word, score in old_dict.items()
        if len(word) == length
    ]