# Preprocessing.py
import pandas as pd
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import seaborn as sb

test_filename = 'data/test.csv'
train_filename = 'data/train.csv'
valid_filename = 'data/valid.csv'
train_news = pd.read_csv(train_filename)
test_news = pd.read_csv(test_filename)
valid_news = pd.read_csv(valid_filename)
# data observation
def data_obs():
    print("training dataset size:")
    print(train_news.shape)
    print(train_news.head(10))

    # the datasets below are used for testing and validation purposes
    print(test_news.shape)
    print(test_news.head(10))

    print(valid_news.shape)
    print(valid_news.head(10))

# inspect the data by calling the function below
# data_obs()
# distribution of classes for prediction
def create_distribution(dataFile):
    return sb.countplot(x='Label', data=dataFile, palette='hls')

# the calls below show that the training, test and validation data are fairly evenly distributed between the classes
create_distribution(train_news)
create_distribution(test_news)
create_distribution(valid_news)
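# Note (assumption, not part of the original script): when this file is run as a plain
# script rather than in a notebook, the countplots above only render once matplotlib's
# figure is shown, e.g.:
# import matplotlib.pyplot as plt
# plt.show()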
# data integrity check (missing label values)
# none of the datasets contains missing values, therefore no cleaning is required
def data_qualityCheck():
    print("Checking data qualities...")
    print(train_news.isnull().sum())
    train_news.info()

    # the test and validation datasets are checked the same way
    print(test_news.isnull().sum())
    test_news.info()

    print(valid_news.isnull().sum())
    valid_news.info()
    print("check finished.")

# run the function call below to see the quality check results
# data_qualityCheck()
# the stemmer and stopword set are needed by process_data() below
eng_stemmer = SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))  # requires nltk.download('stopwords') on first run
# Stemming: apply the given stemmer to every token
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed
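# a minimal usage sketch (the sample tokens are illustrative, not from the datasets):
# stem_tokens(['studies', 'studying'], eng_stemmer) -> ['studi', 'studi']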
# process the data: lowercase, optionally stem, optionally remove stopwords
def process_data(data, exclude_stopword=True, stem=True):
    tokens = [w.lower() for w in data]
    tokens_stemmed = tokens
    if stem:
        tokens_stemmed = stem_tokens(tokens_stemmed, eng_stemmer)
    if exclude_stopword:
        tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords]
    return tokens_stemmed
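# a minimal usage sketch (illustrative tokens): stemming maps 'markets' -> 'market' and
# 'falling' -> 'fall', then the stopwords 'the' and 'were' are dropped:
# process_data(['The', 'markets', 'were', 'falling']) -> ['market', 'fall']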
# creating ngrams
# unigram: the token list itself
def create_unigram(words):
    assert type(words) == list
    return words
# bigram: join adjacent tokens with a space; skip > 0 would also produce skip-grams
def create_bigrams(words):
    assert type(words) == list
    skip = 0
    join_str = " "
    Len = len(words)
    if Len > 1:
        lst = []
        for i in range(Len - 1):
            for k in range(1, skip + 2):
                if i + k < Len:
                    lst.append(join_str.join([words[i], words[i + k]]))
    else:
        # fall back to unigrams for a single-token input
        lst = create_unigram(words)
    return lst
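# a minimal usage sketch (illustrative tokens): with skip = 0 only adjacent pairs are produced:
# create_bigrams(['fake', 'news', 'detection']) -> ['fake news', 'news detection']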
porter = PorterStemmer()

# plain whitespace tokenizer
def tokenizer(text):
    return text.split()

# whitespace tokenizer followed by Porter stemming
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
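# a minimal usage sketch (illustrative input):
# tokenizer('runners like running')        -> ['runners', 'like', 'running']
# tokenizer_porter('runners like running') -> ['runner', 'like', 'run']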