load_data.py
import os
import pickle
import re
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import tldextract
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()


class LoadData:
    """
    Class for loading and preprocessing the URL data.
    """
    def __init__(self, config):
        """
        Args:
            config: configuration object holding the data paths and hyperparameters
        """
        self.config = config
        self.df_data = self.load_zip(config.path_zip_data)
        self.dict_mapping = self.get_mapping_target()
        self.config.n_classes = len(self.dict_mapping)
        self.stopwords_list = self.load_stop_words_list()
    def load_zip(self, path_data):
        """
        Load every parquet file found in `path_data` into a single dataframe.
        """
        frames = [
            pd.read_parquet(os.path.join(path_data, filename), engine="pyarrow")
            for filename in os.listdir(path_data)
            if filename.endswith(".parquet")
        ]
        if not frames:
            return pd.DataFrame(columns=["url", "target", "day"])
        return pd.concat(frames, ignore_index=True)
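    # Note: each parquet file is expected to share the ["url", "target", "day"]
    # schema, where "target" appears to hold an array of class labels per url
    # (a multi-label setup, as suggested by get_new_target below).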
    def get_mapping_target(self):
        """
        Returns a dict mapping the original target labels to new ones with the
        property of being consecutive integers starting at 0.
        """
        full_labels = []
        for _, row in self.df_data.iterrows():
            full_labels += list(row["target"].astype("int64"))
        unique_labels = np.unique(full_labels)
        dict_mapping = dict(zip(unique_labels, range(len(unique_labels))))
        return dict_mapping
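    # For example, if the raw labels seen across the dataset are {3, 7, 12},
    # the mapping produced above is {3: 0, 7: 1, 12: 2}, so downstream code can
    # treat labels as indices into an n_classes-sized output layer.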
    def load_stop_words_list(self):
        """
        Load the stopwords list, one word per line.
        """
        with open(self.config.path_stopwords_list, "r") as f:
            lines = [line.rstrip("\n") for line in f]
        return lines
    @staticmethod
    def get_new_target(target, dict_mapping):
        """
        Map the original labels of one row to the new consecutive labels.
        """
        labels = target.astype("int64")
        return [dict_mapping[label] for label in labels]
    def parse_url(self, url):
        """
        Parse the url. Returns the domain name and the words constituting the
        path. We remove stopwords, words of length <= 2 and tokens containing
        digits, then stem the remaining words.
        """

        def removing_condition(token, stopwords):
            cond = (
                any(c.isdigit() for c in token)
                or len(token) <= 2
                or token in stopwords
            )
            return not cond

        domain_name = tldextract.extract(url)[1]  # extract the domain name
        full_path = urlparse(url).path
        # split the path on special characters
        first_tokens = re.split(r"[-_%:,/.+ ]", full_path)
        # strip the digits from mixed tokens, e.g. keep "awd" from "awd789"
        tokens = []
        for token in first_tokens:
            tokens += re.split(r"\d+", token)
        tokens = [
            stemmer.stem(token.lower())
            for token in tokens
            if removing_condition(token.lower(), self.stopwords_list)
        ]
        # filter again: stemming may produce stopwords or very short tokens
        tokens = [
            token for token in tokens if removing_condition(token, self.stopwords_list)
        ]
        # keep unique elements, preserving order
        final_sentence = list(dict.fromkeys([domain_name] + tokens))
        return " ".join(final_sentence)
    def preprocess(self):
        """
        Build the model inputs: remap the labels and turn each url into a
        cleaned text string.
        """
        ## first we preprocess the labels
        self.df_data["labels"] = self.df_data["target"].apply(
            lambda x: self.get_new_target(x, self.dict_mapping)
        )
        ## next we preprocess the urls
        self.df_data["text_url"] = self.df_data["url"].apply(self.parse_url)
    def save(self):
        """
        Pickle the whole object (dataframe, mapping and stopwords included).
        """
        with open(self.config.preprocessing_class_path, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(preprocessing_class_path):
        """
        Restore a previously saved LoadData object.
        """
        with open(preprocessing_class_path, "rb") as f:
            return pickle.load(f)
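

# Minimal usage sketch, assuming a config object exposing the attributes used
# above (path_zip_data, path_stopwords_list, preprocessing_class_path); the
# file names and the SimpleNamespace wrapper are illustrative, not part of
# this module.
if __name__ == "__main__":
    from types import SimpleNamespace

    config = SimpleNamespace(
        path_zip_data="data/",                     # directory of .parquet files
        path_stopwords_list="stopwords_fr.txt",    # one stopword per line
        preprocessing_class_path="load_data.pkl",  # where save() will pickle
    )
    loader = LoadData(config)
    loader.preprocess()
    loader.save()
    # later: loader = LoadData.load("load_data.pkl")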