Commit ffa496e
fixed classification with unknown words
jonasgabriel18 authored and cmaloney111 committed Jun 8, 2024
1 parent a4cf82b commit ffa496e
Showing 2 changed files with 34 additions and 22 deletions.
20 changes: 11 additions & 9 deletions api/DataProcesser.py
@@ -104,11 +104,15 @@ def pretrained_predict(self, df, pipeline, model_name = None):
if model_name:
label_map_filename = f"api/encoders/LabelMapping-{model_name.split('_')[0]}.joblib"
label_encoder = joblib.load(label_map_filename)

texts_to_predict = df['input_column']
texts_to_predict = [str(text) for text in texts_to_predict]

predictions = pipeline.predict(texts_to_predict)
label_predictions = label_encoder.inverse_transform(predictions)

df['output_column'] = label_predictions

return df

def load_weights_and_model(self, name):
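
The hunk above coerces every input to `str` before calling `pipeline.predict`, then maps the integer predictions back to label names with the persisted encoder. A minimal, self-contained sketch of that round trip; the pipeline, training data, and file name below are illustrative assumptions, not the repo's actual artifacts:

```python
# Sketch of the predict / inverse_transform round trip used above.
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

train = pd.DataFrame({"input_column": ["good movie", "bad movie"],
                      "labels": ["pos", "neg"]})

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train["labels"])

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipeline.fit(train["input_column"], y)
joblib.dump(label_encoder, "LabelMapping-demo.joblib")  # persisted as in the diff

df = pd.DataFrame({"input_column": ["great movie", None]})
texts_to_predict = [str(text) for text in df["input_column"]]  # str() guards non-string rows
predictions = pipeline.predict(texts_to_predict)
df["output_column"] = label_encoder.inverse_transform(predictions)
print(df)
```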
@@ -123,6 +127,9 @@ def trained_predict(self, df, model_name):
label_map_filename = f"api/encoders/LabelMapping-{model_name}.joblib"
label_encoder = joblib.load(label_map_filename)

vocab_file = f"api/encoders/Vocab-{model_name}.joblib"
token2id = joblib.load(vocab_file)

model = self.load_weights_and_model(model_name)
model.eval()
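
This is the heart of the fix: instead of rebuilding a vocabulary from the prediction data, `trained_predict` now reloads the token-to-id mapping that was dumped at training time (the dump side is in the Neural_Network2.py hunk below). A minimal sketch of the round trip, with illustrative paths:

```python
# Round trip of the vocabulary fix: dump token2idx at training time,
# reload it at inference time. Paths here are illustrative.
import os
import joblib

# training side (mirrors the TextDataset change in Neural_Network2.py below)
token2idx = {"the": 0, "cat": 1, "sat": 2}
token2idx["<PAD>"] = max(token2idx.values()) + 1
os.makedirs("encoders", exist_ok=True)
joblib.dump(token2idx, "encoders/Vocab-demo.joblib")

# inference side (mirrors trained_predict above)
token2id = joblib.load("encoders/Vocab-demo.joblib")
assert token2id["cat"] == 1  # ids now match the ones the model was trained with
```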

@@ -147,17 +154,12 @@ def trained_predict(self, df, model_name):
lambda tokens: any(token != '<UNK>' for token in tokens),
)]

vocab = sorted({
sublst for lst in df.tokens.tolist() for sublst in lst
})
self.token2idx = {token: idx for idx, token in enumerate(vocab)}

self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

self.idx2token = {idx: token for token, idx in self.token2idx.items()}
#token2id['<PAD>'] = max(token2id.values()) + 1
if '<UNK>' not in token2id:
token2id['<UNK>'] = max(token2id.values()) + 1

df['indexed_tokens'] = df.tokens.apply(
lambda tokens: [self.token2idx[token] for token in tokens],
lambda tokens: [token2id.get(token, 0) for token in tokens],
)

predictions = []
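
The replaced lambda is where unknown words are actually handled: `dict.get(token, 0)` substitutes a fallback id instead of raising `KeyError` on tokens missing from the training vocabulary. A toy illustration:

```python
# Toy illustration of the unknown-word fix: index with dict.get so
# out-of-vocabulary tokens get a fallback id instead of raising KeyError.
token2id = {"the": 0, "cat": 1, "sat": 2, "<PAD>": 3}
if "<UNK>" not in token2id:
    token2id["<UNK>"] = max(token2id.values()) + 1  # id 4

tokens = ["the", "dog", "sat"]  # "dog" never appeared in training

# token2id["dog"] would raise KeyError; .get() falls back to 0 instead
indexed = [token2id.get(token, 0) for token in tokens]
print(indexed)  # [0, 0, 2]
```

Worth noting: the fallback id here is 0, the id of a real token, rather than `token2id['<UNK>']`; passing the `<UNK>` id as the default would keep unknown words distinct from whichever token happens to own index 0.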
36 changes: 23 additions & 13 deletions api/Neural_Network2.py
@@ -17,7 +17,7 @@
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torchtext.vocab import build_vocab_from_iterator

tqdm.pandas()
@@ -74,6 +74,11 @@ def __init__(self, df, max_vocab, max_len, name):
# Add a padding idx
self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

vocab_filename = f"Vocab-{name}"
vocab_file = os.path.join("api", "encoders", f"{vocab_filename}.joblib")
os.makedirs(os.path.dirname(vocab_file), exist_ok=True)
joblib.dump(self.token2idx, vocab_file)

self.idx2token = {idx: token for token, idx in self.token2idx.items()}

df['indexed_tokens'] = df.tokens.apply(
@@ -92,7 +97,7 @@ def __init__(self, df, max_vocab, max_len, name):
self.targets = df.encoded_labels.tolist()

def __getitem__(self, i):
return self.sequences[i], self.targets[i], self.text[i]
return torch.tensor(self.sequences[i]), torch.tensor(self.targets[i]), self.text[i]

def __len__(self):
return len(self.sequences)
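
Returning tensors from `__getitem__`, rather than plain lists, is what lets the new `collate` below hand the batch straight to `pad_sequence`, which expects a sequence of tensors. A minimal Dataset in the same shape (toy data, illustrative names):

```python
# Minimal Dataset whose __getitem__ yields tensors, as above, so a
# pad_sequence-based collate_fn can batch variable-length items.
import torch
from torch.utils.data import Dataset

class ToyTextDataset(Dataset):
    def __init__(self):
        self.sequences = [[1, 2, 3], [4, 5]]  # variable-length token ids
        self.targets = [0, 1]
        self.text = ["a b c", "d e"]

    def __getitem__(self, i):
        return torch.tensor(self.sequences[i]), torch.tensor(self.targets[i]), self.text[i]

    def __len__(self):
        return len(self.sequences)
```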
@@ -109,10 +114,11 @@ def split_train_valid_test(corpus, valid_ratio=0.1, test_ratio=0.1):


def collate(batch):
inputs = [item[0] for item in batch]
target = torch.LongTensor([item[1] for item in batch])
text = [item[2] for item in batch]
return inputs, target, text
inputs, target, text = zip(*batch)

padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)

return padded_inputs, torch.tensor(target), text
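
The rewritten `collate` delegates padding to `torch.nn.utils.rnn.pad_sequence`, which stacks variable-length 1-D tensors into a single `(batch, max_len)` tensor. A runnable sketch of what it produces:

```python
# What the new collate produces for a batch of unequal-length items.
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [
    (torch.tensor([1, 2, 3]), torch.tensor(0), "a b c"),
    (torch.tensor([4, 5]),    torch.tensor(1), "d e"),
]
inputs, target, text = zip(*batch)
padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
print(padded_inputs)
# tensor([[1, 2, 3],
#         [4, 5, 0]])   <- the shorter row is right-padded with 0
```

One design wrinkle: this pads with 0, while the model's `padding_idx` is the `<PAD>` id (`max + 1`); reserving id 0 for padding would make the two agree.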

def pad_sequences(sequences, padding_val=0, pad_left=False):
"""Pad a list of sequences to the same length with a padding_val."""
@@ -140,6 +146,7 @@ def __init__(self, output_size, hidden_size, vocab_size, padding_idx,
self.dropout_probability = dropout_probability
self.device = device
self.padding_idx = padding_idx
self.vocab_size = vocab_size

# We need to multiply some layers by two if the model is bidirectional
self.input_size_factor = 2 if bidirectional else 1
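
That factor of two exists because a bidirectional RNN concatenates the forward and backward hidden states, doubling the feature dimension downstream layers see; illustrated here with an LSTM as a stand-in for the repo's model:

```python
# The factor of two: a bidirectional RNN concatenates forward and
# backward hidden states, doubling the feature dimension.
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16, bidirectional=True, batch_first=True)
output, _ = lstm(torch.randn(1, 5, 8))
print(output.shape)  # torch.Size([1, 5, 32]) == hidden_size * 2
```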
@@ -210,8 +217,8 @@ def forward(self, inputs, return_activations=False):
lengths, permutation_indices = lengths.sort(0, descending=True)

# Pad sequences so that they are all the same length
padded_inputs = pad_sequences(inputs, padding_val=self.padding_idx)
inputs = torch.LongTensor(padded_inputs)
#padded_inputs = pad_sequences(inputs, padding_val=self.padding_idx)
inputs = torch.LongTensor(inputs)

# Sort inputs
inputs = inputs[permutation_indices].to(self.device)
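
`forward` sorts the batch by length before packing because `pack_padded_sequence` historically required descending-length input (newer PyTorch can sort internally via `enforce_sorted=False`). The sort-then-pack pattern in isolation, with toy data:

```python
# The sort-then-pack pattern from forward(), in isolation.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

inputs = torch.LongTensor([[4, 5, 0], [1, 2, 3]])  # already padded with 0
lengths = torch.LongTensor([2, 3])                 # true length of each row

lengths, permutation_indices = lengths.sort(0, descending=True)
inputs = inputs[permutation_indices]               # longest sequence first

packed = pack_padded_sequence(inputs, lengths, batch_first=True)
print(packed.batch_sizes)  # tensor([2, 2, 1])
```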
@@ -248,6 +255,9 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
progress_bar = tqdm(train_loader, desc='Training', leave=False)
num_iters = 0
for inputs, target, text in progress_bar:
# print(inputs)
# print(target)
# print(text)
target = target.to(device)

# Check whether cancellation was requested on each batch
@@ -257,15 +267,15 @@
print("Training canceled during epoch:", curr_epoch)
return total_loss / max(total, 1), True # Return the average loss and the cancellation status

# Clean old gradients
optimizer.zero_grad()

# Forward pass
output = model(inputs)

# Calculate how wrong the model is
loss = criterion(output, target)

# Clean old gradients
optimizer.zero_grad()

# Perform gradient descent, backwards pass
loss.backward()

@@ -311,7 +321,7 @@ def validate_epoch(model, valid_loader, criterion):
total_loss += loss.item()
total += len(target)

return total_loss / total
return total_loss / total if total != 0 else 0

def create_and_train_rnn_model(df, name, epochs = 10, batch_size = 32, learning_rate = 0.001):
# Initial configuration and model setup
@@ -340,7 +350,7 @@ def create_and_train_rnn_model(df, name, epochs = 10, batch_size = 32, learning_
output_size=len(df['labels'].unique()),
hidden_size=hidden_size,
embedding_dimension=embedding_dimension,
vocab_size=len(dataset.token2idx),
vocab_size=len(dataset.token2idx)+1,
padding_idx=dataset.token2idx['<PAD>'],
dropout_probability=dropout_probability,
bidirectional=is_bidirectional,
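
`vocab_size` is bumped to `len(dataset.token2idx) + 1`, presumably to leave room for the `<UNK>` id that `trained_predict` can append at `max + 1` after training: every index fed to the embedding must satisfy `idx < num_embeddings`. A sketch of the sizing rule with a toy vocabulary:

```python
# Why vocab_size is len(token2idx) + 1: every index fed to the embedding,
# including a <UNK> id appended after training, must be < num_embeddings.
import torch
import torch.nn as nn

token2idx = {"the": 0, "cat": 1, "sat": 2}
token2idx["<PAD>"] = max(token2idx.values()) + 1  # id 3, so len == 4
unk_id = max(token2idx.values()) + 1              # id 4, added at inference

embedding = nn.Embedding(
    num_embeddings=len(token2idx) + 1,  # 5 slots: ids 0..4 are all valid
    embedding_dim=8,
    padding_idx=token2idx["<PAD>"],
)
print(embedding(torch.tensor([0, 3, 4])).shape)  # torch.Size([3, 8])
```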
