-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTP-code.py
291 lines (252 loc) · 11.6 KB
/
TP-code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""
Created on Sat May 5 01:12:48 2019
@author: sudhirsingh
This is the NLP term project code for identification of similar languages and varieties.
"""
import sys
# import nltk
import string
# from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# from sklearn.utils.multiclass import unique_labels
from sklearn import metrics
# from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
def read_file_and_preprocess(file_name):
    """
    Read a tab-separated data file and clean the sentence on every line.

    Each useful line is expected to hold a sentence, a tab, and a class label.
    The sentence is cleaned by removing the ``#NE#`` placeholder token, ASCII
    digits and ASCII punctuation, and then stripping surrounding whitespace.
    Blank lines and lines without a label column are skipped.

    :param file_name: path of the file to read (decoded as UTF-8).
    :return: a ``(texts, labels)`` tuple of equally long lists holding the
        cleaned sentences and their class labels.
    """
    # One translation table removes digits and punctuation in a single pass;
    # string.punctuation already contains '.' and '"'.
    strip_table = str.maketrans('', '', string.digits + string.punctuation)
    texts, labels = [], []
    # The context manager closes the handle even if reading fails part-way.
    with open(file_name, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.rstrip("\n").split("\t")
            # Skip blank lines and lines that carry no label column.
            if len(fields) < 2 or not any(fields):
                continue
            # Remove the placeholder as a whole token *before* punctuation is
            # stripped. Deleting the characters '#', 'N' and 'E' one by one
            # would also erase every capital N and E inside ordinary words.
            sentence = fields[0].replace("#NE#", "")
            sentence = sentence.translate(strip_table).strip()
            texts.append(sentence)
            labels.append(fields[1])
    return texts, labels
def unique_class_label(y_train):
    """
    Return the distinct class labels found in a sequence of labels.

    The result is sorted so that it, and anything derived from it such as the
    axis order of a confusion matrix, is identical on every run instead of
    depending on set iteration order.

    :param y_train: iterable of class labels (may contain repeats).
    :return: sorted list of the unique class labels.
    """
    return sorted(set(y_train))
def print_line_each_language(X, y):
    """
    Print one sample sentence for every language code.

    Sentences and codes are walked in parallel; the first sentence seen for
    each code is printed as "<sentence><TAB><code>" and later sentences with
    the same code are ignored.

    :param X: sentences.
    :param y: class labels (language codes), aligned with ``X``.
    :return: na
    """
    printed_codes = set()
    for sentence, code in zip(X, y):
        # Only the first occurrence of each language is shown.
        if code in printed_codes:
            continue
        printed_codes.add(code)
        print(sentence + "\t" + code)
def feature_extraction_and_modeling(X_train, X_test, feature="tf-idf", ngram=1):
    """
    Build feature matrices for the train and test sentences.

    With ``feature="tf-idf"`` the sentences are turned into character n-gram
    counts (``char_wb`` analyzer, n from 1 to ``ngram``) and re-weighted with
    tf-idf. Any other value of ``feature`` yields plain word n-gram counts.
    In both modes the vocabulary and weights are learned from ``X_train``
    only and then applied unchanged to ``X_test``.

    :param X_train: the train sentences.
    :param X_test: the test sentences.
    :param feature: ``"tf-idf"`` for tf-idf weighted character n-grams,
        anything else for raw word n-gram counts.
    :param ngram: the largest n-gram size to extract.
    :return: train and test feature matrix
    """
    if feature == "tf-idf":
        # Character n-gram counts; the vocabulary is fitted on train only.
        char_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, ngram), max_df=0.95, max_features=None)
        train_counts = char_vectorizer.fit_transform(X_train)
        # Learn the IDF weights from the training counts.
        tf_idf_transformer = TfidfTransformer()
        X_train_feature_matrix = tf_idf_transformer.fit_transform(train_counts)
        # Reuse the *fitted* transformer for the test counts. Fitting a new
        # transformer on the test data would weight the test features with
        # different IDF values than the ones the classifier was trained on.
        test_counts = char_vectorizer.transform(X_test)
        X_test_feature_matrix = tf_idf_transformer.transform(test_counts)
        return X_train_feature_matrix, X_test_feature_matrix

    # Plain word n-gram counts, again fitted on the training sentences only.
    count_vector = CountVectorizer(analyzer='word', ngram_range=(1, ngram), max_df=0.95, max_features=None)
    X_train_feature_matrix = count_vector.fit_transform(X_train)
    X_test_feature_matrix = count_vector.transform(X_test)
    return X_train_feature_matrix, X_test_feature_matrix
def train_model_naive_bayes(X, y):
    """
    Fit a Multinomial Naive Bayes classifier on randomly over-sampled data.

    :param X: the train feature matrix.
    :param y: the class labels belonging to ``X``.
    :return: the Naive Bayes classifier model
    """
    # Over-sample with RandomOverSampler before fitting.
    balanced_X, balanced_y = RandomOverSampler(random_state=None).fit_resample(X, y)
    return MultinomialNB(alpha=1.0, fit_prior=True).fit(balanced_X, balanced_y)
def train_model_LinearSVC(X, y):
    """
    Fit a Linear SVM classifier on randomly over-sampled data.

    :param X: the train feature matrix.
    :param y: the class labels belonging to ``X``.
    :return: the Linear SVM classifier model
    """
    # Over-sample with RandomOverSampler before fitting.
    balanced_X, balanced_y = RandomOverSampler(random_state=None).fit_resample(X, y)
    return LinearSVC(random_state=None).fit(balanced_X, balanced_y)
def train_model_LogisticRegression(X, y):
    """
    Fit a Logistic regression classifier on randomly over-sampled data.

    :param X: the train feature matrix.
    :param y: the class labels belonging to ``X``.
    :return: the Logistic regression classifier model
    """
    # Over-sample with RandomOverSampler before fitting.
    balanced_X, balanced_y = RandomOverSampler(random_state=None).fit_resample(X, y)
    classifier = LogisticRegression(
        n_jobs=1,
        random_state=0,
        C=1e5,
        solver='lbfgs',
        max_iter=10000,
        multi_class='multinomial',
    )
    return classifier.fit(balanced_X, balanced_y)
def test_data_with_model(model, X, y):
    """
    Predict labels with a fitted classifier and measure its accuracy.

    :param model: fitted classifier exposing ``predict``.
    :param X: feature matrix of the data set to evaluate.
    :param y: the actual class labels belonging to ``X``.
    :return: the prediction and accuracy of the model provided on the data set.
    """
    prediction = model.predict(X)
    # Score the predictions that were just computed. For scikit-learn
    # classifiers model.score(X, y) is the same accuracy, but it would run a
    # second, identical prediction pass over X to get it.
    accuracy = metrics.accuracy_score(y, prediction)
    return prediction, accuracy
def matrix_and_report(y_test, y_pred, unique_class):
    """
    Show the confusion matrix as a heatmap and print it with a classification report.

    :param y_test: the actual class labels.
    :param y_pred: the predicted class labels.
    :param unique_class: the class labels to include, in display order.
    :return: na
    """
    separator = "--------------------------------------------------------------------------"
    # Computed once and reused for both the heatmap and the printed matrix.
    confusion = metrics.confusion_matrix(y_test, y_pred, labels=unique_class)

    _, axis = plt.subplots(figsize=(10, 10))
    axis.set_title('Confusion matrix')
    sns.heatmap(confusion, annot=True, fmt='d', xticklabels=unique_class, yticklabels=unique_class)
    # scikit-learn puts the actual labels on the rows and the predicted labels
    # on the columns, so rows (y axis) are "Actual" and columns (x axis) are
    # "Predicted".
    axis.set_ylabel('Actual')
    axis.set_xlabel('Predicted')
    plt.show()

    print(separator)
    print("Confusion matrix, without normalization")
    print(confusion)
    print(separator)
    print("Classification Report")
    print(metrics.classification_report(y_test, y_pred, labels=unique_class))
    print(separator)
def perform_operations(file_train, file_dev_test, file_test):
    """
    Train on the train file, then evaluate on the dev test and test files.

    operation performed:
    - read file and pre-processing
    - extract unique class labels
    - feature extraction (tf-idf, n-grams up to 9)
    - model creation, Naive Bayes & Linear SVM
    - model prediction and accuracy on the dev test set, then on the test set
    - print model accuracy
    - print confusion matrix & classification report on screen.

    :param file_train: the train data set file name
    :param file_dev_test: the development test data set file name
    :param file_test: the test data set file name
    :return: na
    """
    # ---- Training and evaluation on the dev test data ----
    # Read and preprocess the files.
    X_train, y_train = read_file_and_preprocess(file_train)
    X_dev_test, y_dev_test = read_file_and_preprocess(file_dev_test)
    # All unique class labels (unique languages) seen in training.
    train_unique_class = unique_class_label(y_train)
    # Feature extraction for the train and dev test data.
    X_train_tf_idf_ngram, X_dev_test_tf_idf_ngram = feature_extraction_and_modeling(
        X_train, X_dev_test, feature="tf-idf", ngram=9)

    # Naive Bayes: train, predict on the dev test data, report.
    nb_model = train_model_naive_bayes(X_train_tf_idf_ngram, y_train)
    nb_prediction, nb_accuracy = test_data_with_model(nb_model, X_dev_test_tf_idf_ngram, y_dev_test)
    print("Multinomial Naive Bayes classifier accuracy score for dev test set=%0.4f" % nb_accuracy)
    matrix_and_report(y_dev_test, nb_prediction, train_unique_class)

    # Linear SVM: train, predict on the dev test data, report.
    svm_model = train_model_LinearSVC(X_train_tf_idf_ngram, y_train)
    svm_prediction, svm_accuracy = test_data_with_model(svm_model, X_dev_test_tf_idf_ngram, y_dev_test)
    print("Linear SVM classifier accuracy score for dev test set=%0.4f" % svm_accuracy)
    matrix_and_report(y_dev_test, svm_prediction, train_unique_class)

    # ---- Evaluation on the test data ----
    # Read and preprocess the file.
    test_texts, test_classes = read_file_and_preprocess(file_test)
    # All unique class labels (unique languages) present in the test data.
    test_unique_class = unique_class_label(test_classes)
    # Feature extraction for the test data; the train matrix returned here is
    # not needed again because the models are already fitted.
    _, test_tf_idf_ngram = feature_extraction_and_modeling(
        X_train, test_texts, feature="tf-idf", ngram=9)

    # Naive Bayes: predict on the test data and report the *test* accuracy
    # (not the dev accuracy computed earlier).
    nb_test_prediction, nb_test_accuracy = test_data_with_model(nb_model, test_tf_idf_ngram, test_classes)
    print("Multinomial Naive Bayes classifier accuracy score for test set=%0.4f" % nb_test_accuracy)
    matrix_and_report(test_classes, nb_test_prediction, test_unique_class)

    # Linear SVM: predict on the test data and report.
    svm_test_prediction, svm_test_accuracy = test_data_with_model(svm_model, test_tf_idf_ngram, test_classes)
    print("Linear SVM classifier accuracy score for test set=%0.4f" % svm_test_accuracy)
    matrix_and_report(test_classes, svm_test_prediction, test_unique_class)
def main():
    """
    Collect the three data file names and start the processing.

    The names are taken from the command line when exactly three arguments
    are given (train, dev test, test); otherwise the user is prompted for
    each of them. Processing only starts when none of the names is empty.

    :return: na
    """
    if len(sys.argv) != 4:
        # Not exactly three arguments: ask for the names interactively.
        file_train = input("Enter train file name:")
        file_dev_test = input("Enter dev test file name:")
        file_test = input("Enter test file name:")
    else:
        _, file_train, file_dev_test, file_test = sys.argv

    if '' in (file_train, file_dev_test, file_test):
        print("Empty file name(s).")
        return
    perform_operations(file_train, file_dev_test, file_test)


if __name__ == '__main__':
    main()