From ffa496ed5ee2d794a7749b03fac868d2d64ab465 Mon Sep 17 00:00:00 2001
From: jonasgabriel18
Date: Fri, 7 Jun 2024 13:46:45 -0300
Subject: [PATCH] fixed classification with unknown words

---
 api/DataProcesser.py   | 20 +++++++++++---------
 api/Neural_Network2.py | 36 +++++++++++++++++++++++-------------
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/api/DataProcesser.py b/api/DataProcesser.py
index 3d75d800..0ec55876 100644
--- a/api/DataProcesser.py
+++ b/api/DataProcesser.py
@@ -104,11 +104,15 @@ def pretrained_predict(self, df, pipeline, model_name = None):
         if model_name:
             label_map_filename = f"api/encoders/LabelMapping-{model_name.split('_')[0]}.joblib"
             label_encoder = joblib.load(label_map_filename)
+
         texts_to_predict = df['input_column']
         texts_to_predict = [str(text) for text in texts_to_predict]
+
         predictions = pipeline.predict(texts_to_predict)
         label_predictions = label_encoder.inverse_transform(predictions)
+
         df['output_column'] = label_predictions
+
         return df
 
     def load_weights_and_model(self, name):
@@ -123,6 +127,9 @@ def trained_predict(self, df, model_name):
         label_map_filename = f"api/encoders/LabelMapping-{model_name}.joblib"
         label_encoder = joblib.load(label_map_filename)
 
+        vocab_file = f"api/encoders/Vocab-{model_name}.joblib"
+        token2id = joblib.load(vocab_file)
+
         model = self.load_weights_and_model(model_name)
         model.eval()
 
@@ -147,17 +154,12 @@ def trained_predict(self, df, model_name):
             lambda tokens: any(token != '' for token in tokens),
         )]
 
-        vocab = sorted({
-            sublst for lst in df.tokens.tolist() for sublst in lst
-        })
-        self.token2idx = {token: idx for idx, token in enumerate(vocab)}
-
-        self.token2idx[''] = max(self.token2idx.values()) + 1
-
-        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
+        #token2id[''] = max(token2id.values()) + 1
+        if '' not in token2id:
+            token2id[''] = max(token2id.values()) + 1
 
         df['indexed_tokens'] = df.tokens.apply(
-            lambda tokens: [self.token2idx[token] for token in tokens],
+            lambda tokens: [token2id.get(token, 0) for token in tokens],
         )
 
         predictions = []
diff --git a/api/Neural_Network2.py b/api/Neural_Network2.py
index caa027a1..51b58dd2 100644
--- a/api/Neural_Network2.py
+++ b/api/Neural_Network2.py
@@ -17,7 +17,7 @@
 from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import Dataset, DataLoader
 from torch.utils.data.dataset import random_split
-from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
 from torchtext.vocab import build_vocab_from_iterator
 
 tqdm.pandas()
@@ -74,6 +74,11 @@ def __init__(self, df, max_vocab, max_len, name):
         # Add a padding idx
         self.token2idx[''] = max(self.token2idx.values()) + 1
 
+        vocab_filename = f"Vocab-{name}"
+        vocab_file = os.path.join("api", "encoders", f"{vocab_filename}.joblib")
+        os.makedirs(os.path.dirname(vocab_file), exist_ok=True)
+        joblib.dump(self.token2idx, vocab_file)
+
         self.idx2token = {idx: token for token, idx in self.token2idx.items()}
 
         df['indexed_tokens'] = df.tokens.apply(
@@ -92,7 +97,7 @@ def __init__(self, df, max_vocab, max_len, name):
         self.targets = df.encoded_labels.tolist()
 
     def __getitem__(self, i):
-        return self.sequences[i], self.targets[i], self.text[i]
+        return torch.tensor(self.sequences[i]), torch.tensor(self.targets[i]), self.text[i]
 
     def __len__(self):
         return len(self.sequences)
@@ -109,10 +114,11 @@ def split_train_valid_test(corpus, valid_ratio=0.1, test_ratio=0.1):
 
 
 def collate(batch):
-    inputs = [item[0] for item in batch]
-    target = torch.LongTensor([item[1] for item in batch])
-    text = [item[2] for item in batch]
-    return inputs, target, text
+    inputs, target, text = zip(*batch)
+
+    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
+
+    return padded_inputs, torch.tensor(target), text
 
 def pad_sequences(sequences, padding_val=0, pad_left=False):
     """Pad a list of sequences to the same length with a padding_val."""
@@ -140,6 +146,7 @@ def __init__(self, output_size, hidden_size, vocab_size, padding_idx,
         self.dropout_probability = dropout_probability
         self.device = device
         self.padding_idx = padding_idx
+        self.vocab_size = vocab_size
 
         # We need to multiply some layers by two if the model is bidirectional
         self.input_size_factor = 2 if bidirectional else 1
@@ -210,8 +217,8 @@ def forward(self, inputs, return_activations=False):
         lengths, permutation_indices = lengths.sort(0, descending=True)
 
         # Pad sequences so that they are all the same length
-        padded_inputs = pad_sequences(inputs, padding_val=self.padding_idx)
-        inputs = torch.LongTensor(padded_inputs)
+        #padded_inputs = pad_sequences(inputs, padding_val=self.padding_idx)
+        inputs = torch.LongTensor(inputs)
 
         # Sort inputs
         inputs = inputs[permutation_indices].to(self.device)
@@ -248,6 +255,9 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
     progress_bar = tqdm(train_loader, desc='Training', leave=False)
     num_iters = 0
     for inputs, target, text in progress_bar:
+        # print(inputs)
+        # print(target)
+        # print(text)
         target = target.to(device)
 
         # Check whether cancellation was requested at each batch
@@ -257,15 +267,15 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
             print("Training canceled during epoch:", curr_epoch)
             return total_loss / max(total, 1), True  # Return the average loss and the cancellation status
 
-        # Clean old gradients
-        optimizer.zero_grad()
-
         # Forwards pass
         output = model(inputs)
 
         # Calculate how wrong the model is
         loss = criterion(output, target)
 
+        # Clean old gradients
+        optimizer.zero_grad()
+
         # Perform gradient descent, backwards pass
         loss.backward()
 
@@ -311,7 +321,7 @@ def validate_epoch(model, valid_loader, criterion):
             total_loss += loss.item()
             total += len(target)
 
-    return total_loss / total
+    return total_loss / total if total != 0 else 0
 
 def create_and_train_rnn_model(df, name, epochs = 10, batch_size = 32, learning_rate = 0.001):
     # Initial setup and model preparation
@@ -340,7 +350,7 @@ def create_and_train_rnn_model(df, name, epochs = 10, batch_size = 32, learning_
         output_size=len(df['labels'].unique()),
         hidden_size=hidden_size,
         embedding_dimension=embedding_dimension,
-        vocab_size=len(dataset.token2idx),
+        vocab_size=len(dataset.token2idx)+1,
         padding_idx=dataset.token2idx[''],
         dropout_probability=dropout_probability,
         bidirectional=is_bidirectional,
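
The core of the fix is that the vocabulary built at training time is now persisted next to the label encoder and reloaded at prediction time, with out-of-vocabulary tokens mapped to a fallback index via dict.get() instead of raising a KeyError. The following standalone sketch illustrates that round-trip; the helper names and the demo path are illustrative only, not the project's actual API.

# Standalone illustration (hypothetical helper names, not the project's API):
# persist the token-to-index mapping built at training time with joblib, then
# reload it for inference and map unseen tokens to a fallback index with
# dict.get() instead of raising a KeyError.
import os
import joblib


def build_and_save_vocab(token_lists, path):
    vocab = sorted({token for tokens in token_lists for token in tokens})
    token2id = {token: idx for idx, token in enumerate(vocab)}
    token2id[''] = max(token2id.values()) + 1  # padding entry, as in the patch
    os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(token2id, path)
    return token2id


def encode_with_fallback(tokens, token2id, unk_idx=0):
    # Unknown words fall back to a fixed index rather than crashing prediction.
    return [token2id.get(token, unk_idx) for token in tokens]


if __name__ == "__main__":
    vocab_path = "encoders/Vocab-demo.joblib"  # hypothetical demo path
    build_and_save_vocab([["good", "movie"], ["bad", "plot"]], vocab_path)
    token2id = joblib.load(vocab_path)
    print(encode_with_fallback(["good", "unseen", "plot"], token2id))  # [1, 0, 3]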
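
The second part of the change moves padding into the DataLoader's collate function: __getitem__ now returns tensors, and pad_sequence batches variable-length sequences with a padding value of 0. Below is a small self-contained check of that behaviour; it mirrors the collate function added in the patch, and the sample batch is made up.

# Self-contained check of the padding-in-collate idea (mirrors the collate
# function added in the patch; the sample batch below is illustrative).
import torch
from torch.nn.utils.rnn import pad_sequence


def collate(batch):
    inputs, target, text = zip(*batch)
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    return padded_inputs, torch.tensor(target), text


batch = [
    (torch.tensor([5, 2, 7]), torch.tensor(1), "longer sample"),
    (torch.tensor([3]), torch.tensor(0), "short"),
]
padded, labels, texts = collate(batch)
print(padded)   # tensor([[5, 2, 7], [3, 0, 0]])
print(labels)   # tensor([1, 0])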