tdmMM.py (forked from MODELS2019-AURORA/AURORA) · 125 lines (107 loc) · 3.78 KB
import os
import string

import textmining
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Init: output locations for the term-document matrices and the doc-id map.
rootOutput = 'TDMsMM_BIGRAMS2/'
TEMP_MATRIX_FILE = rootOutput + 'matrixMM.csv'
RESULT_MATRIX_FILE = rootOutput + 'matrixLabeled.csv'
MAP_FILE = rootOutput + "map.txt"
INPUT_PATH = '/home/juri/PycharmProjects/mergeDB/demo/'

# NLTK resources: tokenizer models, the stop-word list, and WordNet
# (the latter is required by WordNetLemmatizer below).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

printable = set(string.printable)

def no_tokenizer(document):
    # Lowercase the whole document and split on whitespace; passed to
    # textmining in place of its default tokenizer so terms are kept as-is.
    document = document.lower()
    return document.strip().split()
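
# Illustrative example (not from the original source):
#   no_tokenizer("Hello World  ")  ->  ['hello', 'world']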

def myIR(text):
    # Keep only tokens that contain exactly one '.', run the IR pipeline
    # (stemming, lemmatization, stop-word removal) on each half, and rejoin
    # the halves with the dot. Tokens without a dot are dropped.
    text = text.lower()
    splits = text.split(" ")
    result = ''
    for s in splits:
        if len(s.split(".")) == 2:
            p1, p2 = s.split(".")
            line = ir_text(p1).strip() + "." + ir_text(p2).strip()
            result = result + " " + line
    return result
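
# Illustrative example (not from the original source):
#   myIR("deep.learning other")  ->  " deep.learn"
#   (only 'deep.learning' contains a dot; 'other' is discarded)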

def termdocumentmatrix(input_path=INPUT_PATH, cutoff=1, ir=True):
    # Build a term-document matrix over every file under input_path,
    # normalize the counts into [0, 1], and label each row with a
    # 3-character slice of its file name.
    listFile = walking_dir(input_path)
    count = 1
    reverseMap = {}
    tdm = textmining.TermDocumentMatrix(tokenizer=no_tokenizer)
    for file in listFile:
        text = ''
        with open(file, 'r') as myfile:
            for curline in myfile:
                text = text + curline
        # Drop non-printable characters before tokenizing.
        text = ''.join(ch for ch in text if ch in printable)
        reverseMap[count] = file
        count = count + 1
        if ir:
            tdm.add_doc(ir_text(text))
        else:
            tdm.add_doc(myIR(text))
    tdm.write_csv(TEMP_MATRIX_FILE + str(cutoff), cutoff=cutoff)
    # First pass: find the largest count in the matrix (the header row of
    # terms is skipped) so every value can be scaled by it.
    maxValue = 0.0
    with open(TEMP_MATRIX_FILE + str(cutoff), 'r') as matrix:
        for cnt, line in enumerate(matrix):
            if cnt != 0:
                for value in line.split(","):
                    if maxValue < float(value):
                        maxValue = float(value)
    # Second pass: write the normalized matrix, appending each document's
    # label (characters 4-6 of its path relative to input_path).
    with open(TEMP_MATRIX_FILE + str(cutoff), 'r') as matrix:
        with open(RESULT_MATRIX_FILE + str(cutoff), "w") as matrixLabeled:
            for cnt, line in enumerate(matrix):
                if cnt != 0:
                    fileName = reverseMap[cnt].replace(INPUT_PATH, "")[4:7]
                    normalizedRow = normalize(line.rstrip(), maxValue)
                    matrixLabeled.write(normalizedRow + "," + fileName + "\n")
    # Persist the document-id -> file-path map for this cutoff.
    with open(MAP_FILE + str(cutoff), "w") as writer:
        for key, value in reverseMap.items():
            writer.write(str(key) + "\t" + str(value) + "\n")
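
# Illustrative note (file names follow from the constants above): a call such
# as termdocumentmatrix(INPUT_PATH, 2, ir=False) writes
# TDMsMM_BIGRAMS2/matrixMM.csv2, TDMsMM_BIGRAMS2/matrixLabeled.csv2 and
# TDMsMM_BIGRAMS2/map.txt2; the output directory must already exist.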

def normalize(row, maxValue):
    # Scale every comma-separated value in the row by 1/maxValue.
    result = ''
    for part in row.split(","):
        result = result + str(float(part) / float(maxValue)) + ","
    return result[:-1]  # drop the trailing comma
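
# Illustrative example (not from the original source):
#   normalize("2,4,8", 8.0)  ->  "0.25,0.5,1.0"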

def stoppingwords(text):
    # Remove English stop words; the surviving tokens are rejoined with
    # single spaces (the result keeps a leading space).
    stop_words = set(stopwords.words('english'))
    filtered_sentence = ''
    for w in word_tokenize(text):
        if w not in stop_words:
            filtered_sentence = filtered_sentence + ' ' + w
    return filtered_sentence
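
# Illustrative example (not from the original source):
#   stoppingwords("this is a test")  ->  " test"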

def stemming(text):
    # Reduce every token to its Porter stem (output is lowercased).
    porter_stemmer = PorterStemmer()
    result = ''
    for w in nltk.word_tokenize(text):
        result = result + ' ' + porter_stemmer.stem(w)
    return result
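
# Illustrative example (not from the original source):
#   stemming("models running")  ->  " model run"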

def lemmatization(text):
    # Map every token to its WordNet lemma (treated as a noun by default).
    wordnet_lemmatizer = WordNetLemmatizer()
    result = ''
    for w in nltk.word_tokenize(text):
        result = result + ' ' + wordnet_lemmatizer.lemmatize(w)
    return result
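
# Illustrative example (not from the original source):
#   lemmatization("corpora models")  ->  " corpus model"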

def ir_text(text):
    # Full IR pipeline: stem, then lemmatize, then drop stop words.
    return stoppingwords(lemmatization(stemming(text)))
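
# Illustrative example (not from the original source):
#   ir_text("The models are running")  ->  " model run"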

def walking_dir(rootdir):
    # Collect the full path of every file under rootdir, recursively.
    fileList = []
    for root, subdirs, files in os.walk(rootdir):
        for name in files:
            fileList.append(os.path.join(root, name))
    return fileList
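
# Illustrative example (paths hypothetical):
#   walking_dir('/home/juri/docs/')
#     ->  ['/home/juri/docs/a.txt', '/home/juri/docs/sub/b.txt', ...]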

# Build matrices for cutoffs 1 through 3, using the dot-token variant (myIR).
for i in range(1, 4):
    print('iteration ' + str(i))
    termdocumentmatrix(INPUT_PATH, i, ir=False)