-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdm2c.py
90 lines (73 loc) · 2.53 KB
/
dm2c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import re
import string

import nltk
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer
# Root directory holding the documents to cluster.
path = './msc-plagiarism-assigment'
# Maps file basename -> letters-only document text; filled by the os.walk loop below.
token_dict = {}
def tokenize(text):
    """Tokenize *text* with NLTK and Porter-stem every token.

    Passed as the ``tokenizer`` of the TfidfVectorizer below.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list[str]
        Stemmed tokens, in original order.
    """
    # Create the stemmer once — the original built a new PorterStemmer
    # for every single token, which is pure overhead.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]
# Read every file under `path` and store its letters-only text in token_dict.
# NOTE(review): the key is the basename `f`, so files with the same name in
# different subdirectories overwrite each other — confirm names are unique.
for dirpath, dirs, files in os.walk(path):
    for f in files:
        fname = os.path.join(dirpath, f)
        print ("fname=", fname)
        # Keep only the reads inside try/except; the original duplicated the
        # whole read-and-clean logic in both branches.
        try:
            with open(fname) as pearl:
                text = pearl.read()
        except UnicodeDecodeError:
            # Default-encoding read failed; retry assuming UTF-8.
            with open(fname, encoding="utf8") as pearl:
                text = pearl.read()
        # Strip everything but ASCII letters so only words reach the vectorizer.
        token_dict[f] = re.sub("[^A-Za-z]", " ", text)
# Build the stop-word list: NLTK English stop words plus domain terms that
# dominate this corpus. Use a fresh name — the original rebound `stopwords`,
# shadowing the imported nltk.corpus.stopwords module.
stop_list = stopwords.words("english")
stop_list.extend(['search', 'engine', 'web', 'internet'])
# NOTE(review): tokens are Porter-stemmed by `tokenize` but the stop words are
# not, so some stop words may survive stemming — confirm this is acceptable.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_list)
# Document-term tf-idf matrix over every file read above.
tfs = tfidf.fit_transform(token_dict.values())
# Reduce the tf-idf matrix to 4 latent topics via truncated SVD (LSA).
lsa = SVD(n_components=4, n_iter=100)
doc_top = lsa.fit_transform(tfs)
# L2-normalize the document-topic rows so cosine similarity behaves well.
doc_top = Normalizer(copy=False).fit_transform(doc_top)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() when available, falling back for old versions.
terms = (tfidf.get_feature_names_out()
         if hasattr(tfidf, "get_feature_names_out")
         else tfidf.get_feature_names())
# Print the five highest-weighted terms of each latent topic.
for topic_idx, component in enumerate(lsa.components_):
    weighted_terms = list(zip(terms, component))
    weighted_terms.sort(key=lambda pair: pair[1], reverse=True)
    print ("Topic %d:" % topic_idx)
    for term, _weight in weighted_terms[:5]:
        print (term)
    print (" ")
# Pairwise cosine similarity between documents in topic space.
cos = cosine_similarity(doc_top)
#Dendrogram
plt.figure(figsize=(10, 7))
plt.title("dendrogram")
# Hierarchical clustering needs *distances*, not similarities. The original
# computed 1-cos but never used it, and fed the square similarity matrix to
# linkage(), which SciPy interprets as raw observations. Convert to a
# condensed distance vector instead.
distanceMatrix = 1 - cos
np.fill_diagonal(distanceMatrix, 0.0)  # scrub float noise off the diagonal
condensed = squareform(distanceMatrix, checks=False)
Z = linkage(condensed, method='complete')
dend = dendrogram(Z, 5, orientation = 'top',
                  color_threshold=10,
                  leaf_font_size=10, show_leaf_counts=True)
plt.show()
# Flat clustering on the same precomputed *distances* (affinity='precomputed'
# expects distances, not the similarity matrix the original passed).
# NOTE(review): n_clusters=10 is hard-coded — confirm it suits the corpus size.
cluster = AgglomerativeClustering(n_clusters=10, affinity='precomputed', linkage='complete')
cluster.fit_predict(distanceMatrix)
##import umap
##X_topics = lsa.fit_transform(tfs)
##embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)
##
##plt.figure(figsize=(7,5))
##plt.scatter(embedding[:, 0], embedding[:, 1],
##c = tfidf.get_feature_names(),
##s = 10, # size
##edgecolor='none'
##)
##plt.show()