extracting_unique_words_from_all_emails.py
import os
import time
import string
import operator

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# The stopword list and WordNet data must be available locally; if they are not,
# run nltk.download('stopwords') and nltk.download('wordnet') once.

# Build the stopword set once instead of rebuilding the list for every word.
STOPWORDS = set(stopwords.words('english'))


def text_cleanup(text):
    """Strip punctuation, drop English stopwords, and lowercase the remaining words."""
    text_without_punctuation = ''.join(c for c in text if c not in string.punctuation)
    words_without_stopwords = [word for word in text_without_punctuation.split()
                               if word.lower() not in STOPWORDS]
    cleaned_text = [word.lower() for word in words_without_stopwords]
    return cleaned_text


start_time = time.time()
lmtzr = WordNetLemmatizer()
k = 0          # number of emails processed so far
count = {}     # word -> number of occurrences across all emails
directory_in_str = "emails/"

for file in os.listdir(directory_in_str):
    file_name = os.path.join(directory_in_str, file)
    with open(file_name, "r", encoding='utf-8', errors='ignore') as file_reading:
        words = text_cleanup(file_reading.read())

    # Count only non-numeric words longer than two characters, after lemmatizing.
    for word in words:
        if not word.isdigit() and len(word) > 2:
            word = lmtzr.lemmatize(word)
            count[word] = count.get(word, 0) + 1

    k += 1
    if k % 100 == 0:
        print("Done " + str(k))

# Sort the vocabulary by frequency, most common words first.
sorted_count = dict(sorted(count.items(), key=operator.itemgetter(1), reverse=True))

# Write every word that appears at least 100 times to a CSV file.
with open("wordslist.csv", "w") as f:
    f.write('word,count\n')
    for word, times in sorted_count.items():
        if times < 100:
            break
        f.write(str(word) + ',' + str(times) + '\n')

print('Time (in seconds) to pre-process the emails: ' + str(round(time.time() - start_time, 2)))