|
@@ -1,6 +1,7 @@
 # Libraries imported.
 import re
 import os
+import io
 import tensorflow as tf
 import pandas as pd
 import nltk
|
|
@@ -15,20 +16,32 @@
 nltk.download('wordnet')
 
 class Dataset:
-    def __init__(self, data_path, vocab_size, data_classes):
+    def __init__(self, data_path, vocab_size, data_classes, vocab_folder):
         self.data_path = data_path
         self.vocab_size = vocab_size
         self.data_classes = data_classes
         self.sentences_tokenizer = None
         self.label_dict = None
-
+        self.vocab_folder = vocab_folder
+        self.save_tokenizer_path = '{}tokenizer.json'.format(self.vocab_folder)
+        self.save_label_path = 'label.json'
+
+        if os.path.isfile(self.save_tokenizer_path):
+            with open(self.save_tokenizer_path) as file:
+                data = json.load(file)
+                self.sentences_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)
+
+        if os.path.isfile(self.save_label_path):
+            with open(self.save_label_path) as file:
+                self.label_dict = json.load(file)
+
     def labels_encode(self, labels, data_classes):
         '''Encode labels to categorical'''
         labels.replace(data_classes, inplace=True)
 
         labels_target = labels.values
         labels_target = tf.keras.utils.to_categorical(labels_target)
-
+
         return labels_target
 
     def removeHTML(self, text):
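The reload path added to __init__ leans on the Keras tokenizer JSON round-trip. The following standalone sketch is not part of the commit; the file name, texts, and Tokenizer parameters are made up for illustration.

# Standalone sketch of the tokenizer JSON round-trip the new __init__ relies on
# (illustrative only: file name, texts, and parameters are made up).
import json
import tensorflow as tf

texts = ["a great movie", "a terrible movie"]
tok = tf.keras.preprocessing.text.Tokenizer(num_words=1000, oov_token='<OOV>')
tok.fit_on_texts(texts)

# to_json() returns a JSON string; json.dumps() wraps it once more, which is
# what the save step in load_dataset (next hunk) does as well.
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tok.to_json(), ensure_ascii=False))

# Reading it back mirrors the isfile() check added to __init__ above.
with open('tokenizer.json') as f:
    restored = tf.keras.preprocessing.text.tokenizer_from_json(json.load(f))

assert restored.texts_to_sequences(texts) == tok.texts_to_sequences(texts)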
|
@@ -105,22 +118,38 @@ def load_dataset(self, max_length, data_name, label_name):
|
         datastore = pd.read_csv(self.data_path)
         sentences = datastore[data_name]
         labels = datastore[label_name]
+        self.label_dict = dict((item, idx)
+                               for idx, item in enumerate(set(labels)))
 
         # Cleaning
         sentences, labels = self.data_processing(sentences, labels)
-
+
         # Tokenizing
         self.sentences_tokenizer = self.build_tokenizer(sentences, self.vocab_size)
         tensor = self.tokenize(
             self.sentences_tokenizer, sentences, max_length)
 
-        print("Done! Next to ... ")
         print(" ")
+        print("Save tokenizer ... ")
+
+        # Saving tokenizer
+        if not os.path.exists(self.vocab_folder):
+            try:
+                os.makedirs(self.vocab_folder)
+            except OSError as e:
+                raise IOError("Failed to create folders")
+
+        tokenizer_json = self.sentences_tokenizer.to_json()
+        with io.open(self.save_tokenizer_path, 'w', encoding='utf-8') as file:
+            file.write(json.dumps(tokenizer_json, ensure_ascii=False))
 
         # Saving label dict
         with open('label.json', 'w') as f:
-            json.dump(self.label_dict, f)
-
+            json.dump(self.label_dict, f)
+
+        print("Done! Next to ... ")
+        print(" ")
+
         return tensor, labels
 
     def build_dataset(self, max_length=128, test_size=0.2, buffer_size=128, batch_size=128, data_name='review', label_name='sentiment'):
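For context, a minimal usage sketch of the updated class (not part of the commit). It assumes the file is importable as a dataset module, that json is imported in the elided import block, that the CSV exposes the default 'review' and 'sentiment' columns seen in build_dataset, and that vocab_folder ends with a path separator, since save_tokenizer_path is built by plain string formatting rather than os.path.join.

# Usage sketch (hypothetical paths, values, and module name).
from dataset import Dataset

data_classes = {'negative': 0, 'positive': 1}     # assumed label mapping
ds = Dataset(data_path='IMDB.csv',                # hypothetical CSV path
             vocab_size=10000,
             data_classes=data_classes,
             vocab_folder='saved_vocab/')         # trailing '/' matters: the tokenizer
                                                  # path is '{}tokenizer.json'.format(...)

# First run: cleans and tokenizes the text, then writes
# saved_vocab/tokenizer.json and label.json next to the script.
tensor, labels = ds.load_dataset(max_length=128,
                                 data_name='review',
                                 label_name='sentiment')

# Later runs: __init__ sees the saved files and restores
# ds.sentences_tokenizer and ds.label_dict before any data is read.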
|
|