-
Notifications
You must be signed in to change notification settings - Fork 2
/
LSAModelBuilder.py
65 lines (60 loc) · 1.79 KB
/
LSAModelBuilder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
'''
Created on Apr 12, 2013
Generates and saves a gensim LSA model for further use, based on the documents in corpusDir (all *.txt documents in the folder)
@author: MiguelAngel
'''
from gensim import models,corpora
import time
import util
import pan2011corpus_util
from nltk.corpus import wordnet as wn
######### MAIN ########
# Builds, in order, a dictionary, a TF-IDF model and an LSI (LSA) model from
# the first `maxDocs` files returned by pan.fn_per_lang(lang), saves each
# artifact under `modelsDir`, and prints the elapsed time of every stage.
#
# The print calls use the single-argument form so the script parses the same
# way under Python 2 and Python 3 and emits the same text as before.
import os

# ---- Parameters ----
numOfTopics=300          # num_topics handed to models.LsiModel
n=1                      # forwarded to the util helpers; presumably the n-gram size - TODO confirm
# The 'yes' flags below are forwarded unchanged to util.DictionaryGenerator and
# util.MyCorpus; how they are interpreted is decided there.
lemmatize='yes'
removeStopwords='yes'
lowerCase='yes'
other='yes'
lang='en'                # language key passed to pan.fn_per_lang
wordnetDictionary='yes'
maxDocs=1000             # only the first maxDocs filenames are used for every stage
modelsDir='models'       # output folder for the saved dictionary and models
# ---- end Parameters ----

# Create the output folder up front: a missing folder would otherwise make the
# first save() fail only after the dictionary has already been built.
if not os.path.isdir(modelsDir):
    os.makedirs(modelsDir)

t0=time.time()
print('Getting filenames .....')
corpusDir='D:/CIC/Research visit Greece/pan-plagiarism-corpus-2011/external-detection-corpus/source-document/'
pan=pan2011corpus_util.pan2011corpus(corpusDir)
filenames=pan.fn_per_lang(lang)
print('Total of files: %d' % len(filenames))
t1=time.time()
print('Done!\tTime: %s' % (t1-t0))

# One shared slice keeps the dictionary and the corpus built from exactly the
# same documents.
trainingFiles=filenames[:maxDocs]

# Disabled alternative: build the dictionary from the WordNet lemma names
# instead of from the corpus documents:
#   dictionary=corpora.Dictionary([list(wn.all_lemma_names())])
print('Generating dictionary .....')
dictionary=util.DictionaryGenerator(corpusDir,trainingFiles,n,lemmatize,removeStopwords,lowerCase,other,wordnetDictionary)
dictionary.save(modelsDir+'/plagdict.dict')
print('Dictionary length:  %d' % len(dictionary))
t2=time.time()
print('Done!\tTime: %s' % (t2-t1))

print('Generating tfidf model .....')
corpus=util.MyCorpus(dictionary,corpusDir,trainingFiles,n,lemmatize,removeStopwords,lowerCase,other,wordnetDictionary)
tfidfModel=models.TfidfModel(corpus)
tfidfModel.save(modelsDir+'/tfidfmodel.tfidf')
t3=time.time()
print('Done!\tTime: %s' % (t3-t2))

print('Generating lsa model .....')
# The LSI model is trained on the TF-IDF-weighted view of the corpus.
lsaModel=models.LsiModel(tfidfModel[corpus],num_topics=numOfTopics,id2word=dictionary)
lsaModel.save(modelsDir+'/lsamodel.lsi')
t4=time.time()
# Labelled like the other stages (the original printed the bare number here).
print('Done!\tTime: %s' % (t4-t3))

t5=time.time()
print('Spend time: %s' % (t5-t0))