main.py~
import re
import csv
import string
import math
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
new_feature_list = []
number_of_documents = 4.0  # number of documents in the training set
def replaceTwoOrMore(s):
    # look for 2 or more repetitions of a character and replace with two occurrences
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
def getStopWords():
    stopwords = []
    with open('stopwords.txt', 'r') as sw:
        for line in sw:
            stopwords.append(line.strip())  # strip removes the trailing newline
    return stopwords
def getFeatureVector(tweet, stopwords):
    featureVector = []
    # split tweet into words
    words = tweet.split()
    for w in words:
        # strip punctuation
        w = w.translate(str.maketrans('', '', string.punctuation))
        # replace two or more repetitions with two occurrences
        #w = replaceTwoOrMore(w)
        # check if the word starts with a letter
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        # ignore stop words and tokens that are not alphanumeric words
        if w in stopwords or val is None:
            continue
        featureVector.append(stemmer.stem(w.lower()))
    return featureVector
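
# Illustrative example (made-up inputs, not from the shipped data files):
#   getFeatureVector('awesome world with rainbows', ['with'])
#   -> ['awesom', 'world', 'rainbow']   (Porter-stemmed, stop word 'with' dropped)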
def processTweet(tweet):
    # convert to lower case
    tweet = tweet.lower()
    # convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # collapse additional white space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim surrounding quotes
    tweet = tweet.strip('\'"')
    return tweet
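
# Illustrative example (made-up tweet):
#   processTweet('@Bob check www.x.com #cool')
#   -> 'AT_USER check URL cool'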
def extract_features(tweet):
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet, stopwords)
    features = {}
    for word in featureList:
        features['contains[%s]' % word] = word in featureVector
    return features
# count the frequency of every word in featureList and collect each distinct
# word once (words arrive already stemmed from getFeatureVector)
def get_global_vector(featureList):
    global_vector = {}
    for w in featureList:
        if w in global_vector:
            global_vector[w] += 1
        else:
            global_vector[w] = 1
            new_feature_list.append(w)  # record each word only on first occurrence
    print(global_vector)
    return global_vector
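
# Illustrative example (made-up tokens):
#   get_global_vector(['good', 'bad', 'good']) -> {'good': 2, 'bad': 1}
#   and extends new_feature_list with ['good', 'bad']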
# returns the tf-idf weight vector of an individual tweet
def calculate_weight(featureVector, featureList, global_vector):
    weight_vector = {}
    new_weight = []
    for w in featureList:
        weight_vector[w] = 0
    # term frequency: occurrences of each feature word in this tweet
    for w in featureVector:
        if w in featureList:
            weight_vector[w] += 1
    size_of_vector = len(featureVector)
    for w in featureList:
        value_of_weight = weight_vector[w] / float(size_of_vector)
        # idf uses the global occurrence count as the document frequency
        value_of_weight = value_of_weight * math.log10(number_of_documents / global_vector[w])
        if value_of_weight < 0:  # clamp words occurring more often than there are documents
            value_of_weight = 0
        new_weight.append(value_of_weight)
    return new_weight
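
# Worked example with assumed numbers (not taken from the training data): for
# a 5-token tweet containing 'good' once, with number_of_documents = 4.0 and
# global_vector['good'] = 2, the weight of 'good' is
#   (1 / 5) * log10(4.0 / 2) = 0.2 * 0.30103 ≈ 0.0602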
# main
fp = open('tweets.txt', 'r')
sampleTweet = fp.readline()  # read but not used further in this script
inp = csv.reader(open('training.csv', 'r', newline=''), delimiter=',')
stopwords = getStopWords()
tweets = []
featureVector = []
featureList = []
# build featureList and the per-tweet feature vectors
for row in inp:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet, stopwords)
    #tweets.append((featureVector, sentiment))
    tweets.append(featureVector)
    for w in featureVector:
        featureList.append(w)
global_vector = get_global_vector(featureList)
#print(new_feature_list)
#print(global_vector)
#print(len(global_vector))
#print(featureVector)
#print(featureList)
'''
print("enter data\n")
review = "This gibberish, flop, insufficient fool exaggerate. Bad ridiculous, absolutely raddi"
processedTweet = processTweet(review)
featureVector = getFeatureVector(processedTweet, stopwords)
print(calculate_weight(featureVector, new_feature_list, global_vector))
'''
for w in tweets:
    weight_of_tweet = calculate_weight(w, new_feature_list, global_vector)
    #print(weight_of_tweet)
#print(new_feature_list)
#print(tweets)
#print(featureList)
#print(extract_features("awesome world with rainbows"))
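
# Minimal smoke test of the preprocessing pipeline (a sketch, not part of the
# original script): the sample tweet below is made up, and this assumes the
# main block above has already populated `stopwords`. Only functions defined
# in this file are exercised.
demo = 'Check out www.example.com @someone #great day'
processedDemo = processTweet(demo)
print(processedDemo)  # -> 'check out URL AT_USER great day'
print(getFeatureVector(processedDemo, stopwords))  # output depends on stopwords.txt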