-
Notifications
You must be signed in to change notification settings - Fork 4
/
utils.py
36 lines (32 loc) · 1.17 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
from text_processing import *
# Location of the Linux bug tracker dataset read by `load_linux_bug_data`.
# The path is relative, so it is resolved against the current working directory.
LINUX_BUGS_DATA_PATH = './linux_bugs_usage_ready.csv'
# Location of the Chromium bug dataset read by `load_chromium_bug_data`.
# Also relative to the current working directory.
CHROMIUM_BUGS_DATA_PATH = './chromium.csv'
def merge_title_and_message(data, message_col_name='message'):
    '''
    Merge the `title` column and a message column into one `text` column.

    The bug datasets keep their text in two feature columns: `title` and a
    body column whose name differs per dataset (`message` by default). The
    two are joined with a single space into a column called `text`, and the
    two source columns are removed.

    Args:
        data: DataFrame containing a `title` column and a column named
            `message_col_name`.
        message_col_name: name of the body column to merge with `title`.

    Returns:
        A new DataFrame with the remaining columns plus `text`. The frame
        passed in is not modified.

    Raises:
        KeyError: if `title` or `message_col_name` is not a column of `data`.

    Note:
        A missing value in either source column yields a missing `text`
        value for that row (pandas propagates NaN through `+`).
    '''
    # Compute the merged column up front instead of assigning it onto `data`,
    # so the caller's frame never gains a stray `text` column.
    merged_text = data['title'] + ' ' + data[message_col_name]
    remaining = data.drop(['title', message_col_name], axis=1)
    return remaining.assign(text=merged_text)
def load_linux_bug_data(path=None):
    '''
    Load the linux bugs dataset and apply the preprocessing pipeline.

    The file is read as tab-separated values, its `title` and `message`
    columns are merged into `text`, and the result is passed through
    `strip_punctuations` and then `remove_linux_garbage` (helpers not defined
    in this module; presumably provided by `text_processing`).

    Args:
        path: optional location of the dataset file. When omitted (or None),
            `LINUX_BUGS_DATA_PATH` is used.

    Returns:
        The preprocessed data, as returned by `remove_linux_garbage`.
    '''
    if path is None:
        # Looked up at call time so a reassigned module constant is honoured.
        path = LINUX_BUGS_DATA_PATH
    data = pd.read_csv(path, sep='\t')
    data = merge_title_and_message(data)
    data = strip_punctuations(data)
    # Stemming (`stemm_text`) is deliberately left out of the pipeline:
    # it has shown poor results.
    data = remove_linux_garbage(data)
    return data
def load_chromium_bug_data(path=None):
    '''
    Load the chromium bugs dataset and apply the preprocessing pipeline.

    The file is read as tab-separated values, its `title` and `description`
    columns are merged into `text`, and the result is passed through
    `strip_punctuations` (a helper not defined in this module; presumably
    provided by `text_processing`).

    Args:
        path: optional location of the dataset file. When omitted (or None),
            `CHROMIUM_BUGS_DATA_PATH` is used.

    Returns:
        The preprocessed data, as returned by `strip_punctuations`.
    '''
    if path is None:
        # Looked up at call time so a reassigned module constant is honoured.
        path = CHROMIUM_BUGS_DATA_PATH
    data = pd.read_csv(path, sep='\t')
    # This dataset stores the report body under `description`, not `message`.
    data = merge_title_and_message(data, message_col_name='description')
    data = strip_punctuations(data)
    return data