-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathLatent Semantic Analysis.py
69 lines (59 loc) · 2 KB
/
Latent Semantic Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import nltk
import string
import os
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from nltk.stem.porter import PorterStemmer
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD as SVD
# Root directory containing the student documents to compare for plagiarism.
path = './msc-plagiarism-assigment'
# Maps filename -> cleaned document text (every non-letter replaced by a space).
token_dict = {}
def tokenize(text):
    """Tokenize *text* with NLTK and Porter-stem each token.

    Used as the ``tokenizer`` callback of :class:`TfidfVectorizer`.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # Create the stemmer once per call; the original built a fresh
    # PorterStemmer for every single token, which is pure overhead.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]
# Matches every character that is not an ASCII letter; compiled once
# instead of re-parsing the pattern for every file.
_NON_ALPHA = re.compile("[^A-Za-z]")

def _read_text(fname):
    """Read *fname* as text, retrying with UTF-8 if the platform's
    default encoding fails to decode it."""
    try:
        with open(fname) as fh:
            return fh.read()
    except UnicodeDecodeError:
        with open(fname, encoding="utf8") as fh:
            return fh.read()

# Walk the corpus directory and load every file, replacing all
# non-letters with spaces so the tokenizer sees only words.
for dirpath, dirs, files in os.walk(path):
    for f in files:
        fname = os.path.join(dirpath, f)
        print("fname=", fname)
        # NOTE(review): keying by bare filename means two files with the
        # same name in different subdirectories overwrite each other —
        # confirm the corpus layout guarantees unique names.
        token_dict[f] = _NON_ALPHA.sub(" ", _read_text(fname))
# Build the stop-word list: NLTK's English stop words plus domain terms
# that occur in every assignment and therefore carry no signal.
# (Renamed from `stopwords` so it no longer shadows the imported
# nltk.corpus.stopwords module.)
stop_words = stopwords.words("english")
stop_words.extend(['search', 'engine', 'web', 'internet'])

# TF-IDF over the cleaned documents.
# NOTE(review): the stop words are not stemmed while the document tokens
# are, so stemmed variants of stop words may survive filtering — confirm
# this is intended (sklearn emits a warning about it).
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words)
tfs = tfidf.fit_transform(token_dict.values())

# Latent Semantic Analysis: truncated SVD projects the TF-IDF matrix
# into a 4-dimensional latent "topic" space.
lsa = SVD(n_components=4, n_iter=100)
doc_top = lsa.fit_transform(tfs)
# L2-normalise the document vectors so cosine similarity is meaningful.
doc_top = Normalizer(copy=False).fit_transform(doc_top)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the supported replacement.
terms = tfidf.get_feature_names_out()

# Print the five highest-loading terms for each latent component.
for i, comp in enumerate(lsa.components_):
    top_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:5]
    print("Topic %d:" % i)
    for term, _weight in top_terms:
        print(term)
    print(" ")
## Optional: 2-D visualisation of the LSA topic space with UMAP
## (requires the third-party `umap-learn` package; left disabled).
##import umap
##X_topics = lsa.fit_transform(tfs)
##embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)
##
##plt.figure(figsize=(7,5))
##plt.scatter(embedding[:, 0], embedding[:, 1],
##c = tfidf.get_feature_names(),  # NOTE(review): this has vocabulary length,
##                                # not the number of documents — it would
##                                # raise a length-mismatch error if enabled
##s = 10, # size
##edgecolor='none'
##)
##plt.show()