util.py
'''
Created on Apr 12, 2013

@author: MiguelAngel

Helpers for building gensim bag-of-words corpora from plain-text files:
sentence/word tokenization, preprocessing filters, and n-gram extraction.
'''
import nltk
import re
from gensim import corpora

class MyCorpus(object):
    '''Streaming corpus: yields one bag-of-words vector per file, so the
    whole collection never has to be loaded into memory at once.'''

    def __init__(self, dictionary, corpusDir, filenames, n=1, lemmatize='no',
                 removeStopwords='no', lowerCase='no', other='no',
                 wordnetDictionary='no'):
        self.dictionary = dictionary
        self.corpusDir = corpusDir
        self.filenames = filenames
        self.n = n
        self.lemmatize = lemmatize
        self.removeStopwords = removeStopwords
        self.lowerCase = lowerCase
        self.other = other
        self.wordnetDictionary = wordnetDictionary

    def __iter__(self):
        for filename in self.filenames:
            # Context manager closes the file even if tokenization fails;
            # the deprecated 'rU' mode is replaced with plain 'r'.
            with open(self.corpusDir + filename, 'r') as fid:
                text = fid.read()
            features = NGrams(PreProcesser(Tokenizer(text), self.lemmatize,
                                           self.removeStopwords, self.lowerCase,
                                           self.other, self.wordnetDictionary),
                              self.n)
            yield self.dictionary.doc2bow(features)

def DictionaryGenerator(corpusDir, filenames, n=1, lemmatize='no',
                        removeStopwords='no', lowerCase='no', other='no',
                        wordnetDictionary='no'):
    '''Builds a gensim Dictionary over every file, printing progress
    after each 10% of the collection is processed.'''
    dictionary = corpora.Dictionary()
    cont = 0
    num = 0
    # Integer division for Python 3; max(1, ...) keeps the progress
    # counter working when there are fewer than 10 files.
    lim = max(1, len(filenames) // 10)
    for filename in filenames:
        with open(corpusDir + filename, 'r') as fid:
            text = fid.read()
        dictionary.add_documents([NGrams(PreProcesser(Tokenizer(text),
                                                      lemmatize, removeStopwords,
                                                      lowerCase, other,
                                                      wordnetDictionary), n)])
        cont += 1
        if cont == lim:
            num += 1
            print(f'{num * 10}% ..... Done!')
            cont = 0
    return dictionary

def Tokenizer(text):
    '''Splits text into sentences, then each sentence into word tokens.'''
    sentDetector = nltk.data.load('tokenizers/punkt/english.pickle')
    # Groups are non-capturing: with capturing groups, regexp_tokenize
    # would return the group contents instead of the full matches.
    patternWord = r'''(?x)(?:[A-Z]\.)+|\w+(?:-\w+)*|\$?\d+(?:\.\d+)?%?'''
    sents = sentDetector.tokenize(text)
    return [nltk.regexp_tokenize(sent, patternWord) for sent in sents]
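
# For example, assuming the NLTK punkt model has been downloaded:
#   Tokenizer('The U.S.A. has 50 states.')
#   -> [['The', 'U.S.A.', 'has', '50', 'states']]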

def PreProcesser(sentsWords, lemmatize='no', removeStopwords='no',
                 lowerCase='no', other='no', wordnetDictionary='no'):
    '''Applies the selected filters to each tokenized sentence; every
    flag takes the string 'yes' or 'no'.'''
    Res = []
    wnl = nltk.WordNetLemmatizer()
    stopwords = ['the', 'of', 'and', 'a', 'in', 'to', 'is', 'was', 'it',
                 'for', 'with', 'he', 'be', 'on', 'i', 'that', 'by', 'at',
                 'you', "'s", 'are', 'not', 'his', 'this', 'from', 'but',
                 'had', 'which', 'she', 'they', 'or', 'an', 'were', 'we',
                 'their', 'been', 'has', 'have', 'will', 'would', 'her',
                 "'t", 'there', 'can', 'all', 'as', 'if', 'who', 'what',
                 'said']
    for words in sentsWords:
        textTemp = words
        if lowerCase == 'yes':
            textTemp = [word.lower() for word in textTemp]
        if removeStopwords == 'yes':
            textTemp = [word for word in textTemp if word not in stopwords]
        if lemmatize == 'yes':
            textTemp = [wnl.lemmatize(word) for word in textTemp]
        if other == 'yes':
            # Keep only tokens that start with a letter and are longer
            # than two characters (the original '(.)*' was redundant).
            textTemp = [word for word in textTemp
                        if re.match(r'[a-zA-Z]', word) and len(word) > 2]
        if wordnetDictionary == 'yes':
            # Keep only tokens that have at least one WordNet synset.
            textTemp = [word for word in textTemp
                        if nltk.corpus.wordnet.synsets(word)]
        Res.append(textTemp)
    return Res
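
# For example, assuming the NLTK wordnet data has been downloaded:
#   PreProcesser([['The', 'dogs', 'are', 'running']],
#                lemmatize='yes', lowerCase='yes')
#   -> [['the', 'dog', 'are', 'running']]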

def NGrams(text, n, divSent='no'):
    '''Extracts n-grams as tuples; with divSent='yes' they stay grouped
    by sentence instead of being flattened into one list.
    E.g. NGrams([['a', 'b', 'c']], 2) -> [('a', 'b'), ('b', 'c')].'''
    ng = []
    if divSent == 'no':
        # One flat list of n-grams across all sentences.
        for sent in text:
            for i in range(len(sent) - n + 1):
                ng.append(tuple(sent[i:i + n]))
    else:
        # One sub-list of n-grams per sentence.
        for sent in text:
            ngTemp = []
            for i in range(len(sent) - n + 1):
                ngTemp.append(tuple(sent[i:i + n]))
            ng.append(ngTemp)
    return ng
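

# --- Usage sketch ---
# A minimal, hypothetical example of how these helpers fit together:
# 'corpus/' and the file names are placeholders, and the punkt NLTK
# data must be downloaded first (nltk.download('punkt')).
if __name__ == '__main__':
    corpusDir = 'corpus/'                  # hypothetical directory
    filenames = ['doc1.txt', 'doc2.txt']   # hypothetical files
    # Build the vocabulary over lowercased, stopword-filtered bigrams.
    dictionary = DictionaryGenerator(corpusDir, filenames, n=2,
                                     removeStopwords='yes', lowerCase='yes')
    # Stream the same files back as bag-of-words vectors, one per document.
    corpus = MyCorpus(dictionary, corpusDir, filenames, n=2,
                      removeStopwords='yes', lowerCase='yes')
    for bow in corpus:
        print(bow)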