Skip to content

Commit 15df85f

Browse files
Add files via upload
1 parent 23c233a commit 15df85f

17 files changed

+6731
-0
lines changed

Business.png

180 KB
Loading

EvaluationMetric_10fold.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Statistic Measure Naive Bayes Random Forest SVM KNN My Method
2+
Accuracy 0.961437 0.96437 0.965024 0.923281 0.967469
3+
Precision 0.957688 0.962305 0.96238 0.925392 0.964766
4+
Recall 0.959811 0.961534 0.962631 0.918338 0.965858
5+
F-Measure 0.958599 0.96183 0.962347 0.918607 0.965192

Film.png

189 KB
Loading

Football.png

179 KB
Loading

Politics.png

201 KB
Loading

Technology.png

159 KB
Loading

beat_the_benchmark.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
"""Text-classification pipeline ("beat the benchmark").

Reads the train/test CSVs, Porter-stems the article bodies, builds
TF-IDF features with up-weighted titles, L2-normalizes, reduces with
LSA (truncated SVD), evaluates an RBF SVM with 10-fold stratified CV
(appending the scores to EvaluationMetric_10fold.csv) and finally
writes the predicted test-set categories to testSet_categories.csv.
"""
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_validate
from nltk import PorterStemmer
from sklearn.model_selection import StratifiedKFold

# Show 6 decimal places.  NOTE: the bare 'precision' alias was removed
# in pandas 2.x; 'display.precision' is the full, supported option key.
pd.set_option('display.precision', 6)

# Porter stemmer shared by the train- and test-set preprocessing #
stemmer = PorterStemmer()


def _stem_series(texts):
    """Porter-stem every whitespace-separated token of each document.

    Replaces the two copy-pasted stemming loops of the original script.
    Returns a fresh pd.Series named "Content" (default 0..n-1 index,
    matching the index of a freshly read CSV).
    """
    return pd.Series(
        [" ".join(stemmer.stem(tok) for tok in doc.split(" ")) for doc in texts],
        name="Content",
    )


# 10-fold stratified CV.  shuffle=True is required for random_state to
# have any effect; recent scikit-learn raises a ValueError when a
# random_state is passed with shuffle=False.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Read data #
train_data = pd.read_csv('dataSets/train_set.csv', encoding="utf-8", sep="\t")
test_data = pd.read_csv('dataSets/test_set.csv', encoding="utf-8", sep="\t")

# Drop useless columns #
train_data = train_data.drop(['RowNum', 'Id'], axis=1)

y_train = train_data["Category"]
X_train = train_data["Content"]
X_test = test_data["Content"]
X_title = train_data["Title"]
Y_title = test_data["Title"]

# Perform stemming on both content columns #
X_train = _stem_series(X_train)
X_test = _stem_series(X_test)

# Encode category labels as integers #
le_train = preprocessing.LabelEncoder()
X_train_le = le_train.fit_transform(y_train)

# Create matrix of TF-IDF features.
# Concatenating the title 5 times onto the body up-weights title terms
# in the term counts ("use title efficiently").
tfidf_vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train + (5 * X_title))
X_test_tfidf = tfidf_vectorizer.transform(X_test + (5 * Y_title))

# L2-normalize each document vector #
norm = Normalizer()
X_train_tfidf = norm.fit_transform(X_train_tfidf)
X_test_tfidf = norm.transform(X_test_tfidf)

# Classifier #
clf = SVC(C=1, kernel="rbf", gamma=10)

# Use LSA (truncated SVD) for dimensionality reduction #
svd = TruncatedSVD(n_components=100, random_state=123)
X_train_reduced = svd.fit_transform(X_train_tfidf)
# Keep a distinct name: this is the reduced test matrix, no longer TF-IDF.
X_test_reduced = svd.transform(X_test_tfidf)

# Metrics #
scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']

# Evaluate my method with 10-fold cross-validation #
scores = cross_validate(clf, X_train_reduced, X_train_le, scoring=scoring, cv=kf)

# Append this method's mean scores to the shared metrics csv #
Evaluation_metric_df = pd.read_csv('EvaluationMetric_10fold.csv', sep="\t")
Evaluation_metric_df['My Method'] = [
    str(round(scores['test_accuracy'].mean(), 6)),
    str(round(scores['test_precision_macro'].mean(), 6)),
    str(round(scores['test_recall_macro'].mean(), 6)),
    str(round(scores['test_f1_macro'].mean(), 6)),
]
Evaluation_metric_df.to_csv("EvaluationMetric_10fold.csv", encoding='utf-8', index=False, sep="\t")

# Train on the full training set, then predict the test set #
clf.fit(X_train_reduced, X_train_le)
y_test = clf.predict(X_test_reduced)
# Map integer predictions back to category names #
y_cat = le_train.inverse_transform(y_test)

# Create csv of predicted categories: one (Id, Category) row per article.
# Building the DataFrame column-wise replaces the manual index loop.
pf = pd.DataFrame({'Id': test_data['Id'].values, 'Category': y_cat},
                  columns=['Id', 'Category'])
pf.to_csv("testSet_categories.csv", encoding="utf-8", sep="\t", index=False)

datasets/test_set.csv

Lines changed: 3068 additions & 0 deletions
Large diffs are not rendered by default.

grid_search_SVM.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
"""Grid-search SVM hyperparameters on TF-IDF + LSA features.

Reads the training CSV, builds TF-IDF features, reduces to 100 LSA
components and runs a 10-fold grid search over SVC hyperparameters,
printing the best parameter combination.
"""
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Read data #
train_data = pd.read_csv('dataSets/train_set.csv', encoding='utf-8', sep="\t")

# Drop useless columns #
train_data = train_data.drop(['RowNum', 'Id', 'Title'], axis=1)

y_train = train_data["Category"]
X_train = train_data["Content"]

# Encode category labels as integers #
le = preprocessing.LabelEncoder()
X_train_le = le.fit_transform(y_train)

# Create matrix of TF-IDF features #
tfidf_vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Use LSA (truncated SVD) for dimensionality reduction #
svd = TruncatedSVD(n_components=100, random_state=123)
X_train_reduced = svd.fit_transform(X_train_tfidf)

# 10-fold stratified CV.  shuffle=True is required for random_state to
# have any effect; recent scikit-learn raises a ValueError when a
# random_state is passed with shuffle=False.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Classifier #
clf = SVC()

# SVM #
# Note: Hyperparameters will be selected to #
# be the best based also on time to train the #
# model #
# Best hyperparameters found: #
#   Kernel: linear, C: 5, Gamma: auto #

# Hyperparameter grid #
# C: penalty for misclassifying a training example #
# kernel: separation algorithm #
# gamma: how far the influence of a single training example reaches #
parameters = {
    "C": [1.0, 5, 0.05],
    "kernel": ["rbf", "linear"],
    "gamma": ["auto", 50, 500],
    "random_state": [123],
}

# Use grid search with 10-fold cross validation #
gs_clf = GridSearchCV(clf, parameters, cv=kf)
gs_clf = gs_clf.fit(X_train_reduced, X_train_le)

# Print results #
print("Support Vector Machines best parameters: ")
print(gs_clf.best_params_)

grid_search_mnb.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
"""Grid-search Multinomial Naive Bayes hyperparameters on TF-IDF features.

Reads the training CSV, builds TF-IDF features and runs a 10-fold grid
search over MultinomialNB's alpha / fit_prior, printing the best
parameter combination.
"""
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Read data #
train_data = pd.read_csv('dataSets/train_set.csv', encoding='utf-8', sep="\t")

# Drop useless columns #
train_data = train_data.drop(['RowNum', 'Id', 'Title'], axis=1)

y_train = train_data["Category"]
X_train = train_data["Content"]

# Encode category labels as integers #
le = preprocessing.LabelEncoder()
X_train_le = le.fit_transform(y_train)

# Create matrix of TF-IDF features #
tfidf_vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# 10-fold stratified CV.  shuffle=True is required for random_state to
# have any effect; recent scikit-learn raises a ValueError when a
# random_state is passed with shuffle=False.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Classifier #
clf = MultinomialNB()

# MNB #
# Note: Hyperparameters will be selected to #
# be the best, based also on time to train the #
# model #
# Best hyperparameters found: #
#   alpha=0.02, fit_prior=True #

# Hyperparameter grid #
parameters = {
    "alpha": [50, 15, 10, 5, 1, 0.5, 0.3, 0.1, 0.05, 0.03, 0.02, 0.01, 0.001],
    "fit_prior": [True, False],
}

# Use grid search with 10-fold cross validation #
gs_clf = GridSearchCV(clf, parameters, cv=kf)
gs_clf = gs_clf.fit(X_train_tfidf, X_train_le)

# Print results #
print("MultinomialNB best parameters: ")
print(gs_clf.best_params_)

0 commit comments

Comments
 (0)