canceling training between batches
GusttavoOliveira committed May 11, 2024
1 parent 9250bf0 commit 2b3d02b
Showing 1 changed file with 52 additions and 44 deletions.
96 changes: 52 additions & 44 deletions api/Neural_Network2.py
@@ -255,6 +255,13 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
     for inputs, target, text in progress_bar:
         target = target.to(device)
 
+        # Check whether cancellation was requested, once per batch
+        with open('training_progress.json', 'r') as f:
+            data = json.load(f)
+            if data.get('cancel_requested', False):
+                print("Training canceled during epoch:", curr_epoch)
+                return total_loss / max(total, 1), True  # Return the average loss and the cancellation status
+
         # Clean old gradients
         optimizer.zero_grad()
 
@@ -277,7 +284,6 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
         num_iters += 1
         if num_iters % 20 == 0:
             with open('training_progress.json', 'r+') as f:
-                data = json.load(f)
                 progress = 100 * (curr_epoch + num_iters / len(train_loader)) / num_total_epochs
                 data.update({
                     'training_progress': progress,
@@ -287,7 +293,7 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
                 json.dump(data, f)
                 f.truncate()
 
-    return total_loss / total
+    return total_loss / max(total, 1), False
 
 def validate_epoch(model, valid_loader, criterion):
     model.eval()
@@ -309,8 +315,16 @@ def validate_epoch(model, valid_loader, criterion):
 
     return total_loss / total
 
-def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate = 0.001):
+import os
+import json
+import torch
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from sklearn.preprocessing import LabelEncoder
+import joblib
+
+def create_and_train_model(df, name, epochs=10, batch_size=32, learning_rate=0.001):
+    # Initial settings and model preparation
     dropout_probability = 0.2
     n_rnn_layers = 1
     embedding_dimension = 128
@@ -321,20 +335,19 @@ def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate
     valid_ratio = 0.05
     test_ratio = 0.05
 
+    # Dataset preparation
     dataset = CustomDataset(df, max_vocab, max_len, name)
 
     train_dataset, valid_dataset, test_dataset = split_train_valid_test(
         dataset, valid_ratio=valid_ratio, test_ratio=test_ratio)
-    len(train_dataset), len(valid_dataset), len(test_dataset)
 
-
+    # DataLoader preparation
     train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate)
     valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate)
     test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate)
 
-
+    # Model initialization
     model = RNNClassifier(
-        output_size=len(df.labels),
+        output_size=len(df['labels'].unique()),
         hidden_size=hidden_size,
         embedding_dimension=embedding_dimension,
         vocab_size=len(dataset.token2idx),
@@ -343,54 +356,49 @@ def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate
         bidirectional=is_bidirectional,
         n_layers=n_rnn_layers,
         device=device,
-        batch_size=batch_size,
+        batch_size=batch_size
     )
-    model = model.to(device)
+    model.to(device)
 
-    criterion = nn.CrossEntropyLoss()
-    optimizer = optim.Adam(
-        filter(lambda p: p.requires_grad, model.parameters()),
-        lr=learning_rate,
-    )
-    scheduler = CosineAnnealingLR(optimizer, 1)
+    # Define the loss function and the optimizer
+    criterion = torch.nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 1)
 
-    n_epochs = 0
     train_losses, valid_losses = [], []
+    canceled = False
     for curr_epoch in range(epochs):
+        train_loss, canceled = train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch, epochs)
+        if canceled:
+            print(f"Training canceled during epoch {curr_epoch + 1}")
+            break
 
-        # Check if there was a request to cancel the training process
-        with open('training_progress.json', 'r') as file:
-            data = json.load(file)
-            if data.get('cancel_requested', False):
-                print("Training canceled!")
-                return
-
-        train_loss = train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch, epochs)
         valid_loss = validate_epoch(model, valid_loader, criterion)
 
         tqdm.write(
-            f'epoch #{n_epochs + 1:3d}\ttrain_loss: {train_loss:.2e}'
-            f'\tvalid_loss: {valid_loss:.2e}\n',
+            f'Epoch #{curr_epoch + 1:3d}\ttrain_loss: {train_loss:.2e}'
+            f'\tvalid_loss: {valid_loss:.2e}'
         )
 
         # Early stopping if the current valid_loss is greater than the last three valid losses
-        if len(valid_losses) > 2 and all(valid_loss >= loss
-                                         for loss in valid_losses[-3:]):
-            print('Stopping early')
+        if len(valid_losses) > 2 and all(valid_loss >= loss for loss in valid_losses[-3:]):
+            print('Stopping early due to lack of improvement in validation loss.')
             break
 
         train_losses.append(train_loss)
         valid_losses.append(valid_loss)
 
-        n_epochs += 1
-
-    model_path = os.path.join('api', 'models', name)
-    os.makedirs(os.path.dirname(model_path), exist_ok=True)
-    torch.save(model, model_path)
-
-    training_progress = {
-        'training_progress': 0,
-        'training_in_progress': True
-    }
-    with open('training_progress.json', 'w') as file:
-        json.dump(training_progress, file)
+    # Finalize and save the model only if training was not canceled
+    if not canceled:
+        model_path = os.path.join('api', 'models', name)
+        os.makedirs(os.path.dirname(model_path), exist_ok=True)
+        torch.save(model.state_dict(), model_path)
+
+        # Update and persist the final training state
+        training_progress = {
+            'training_progress': 100,
+            'training_in_progress': False,
+            'cancel_requested': False
+        }
+        with open('training_progress.json', 'w') as file:
+            json.dump(training_progress, file)
+
+        print("Training complete.")
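Note on the other side of this change: nothing in this file ever sets cancel_requested, so the flag has to be written by whatever handles the cancel request in the API layer. Below is a minimal sketch of that writer side; the Flask framework and the /cancel-training route are assumptions for illustration, not part of this commit.

# Hypothetical cancel endpoint: flips cancel_requested in the same JSON
# file that train_epoch() now polls once per batch. Framework (Flask) and
# route name are assumed for illustration.
import json
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/cancel-training', methods=['POST'])
def cancel_training():
    with open('training_progress.json', 'r+') as f:
        data = json.load(f)
        data['cancel_requested'] = True
        f.seek(0)           # rewind before rewriting in r+ mode
        json.dump(data, f)
        f.truncate()        # drop leftover bytes from the old content
    return jsonify(data)

Because the training loop re-reads the file at every batch, a cancel issued this way takes effect within one batch instead of waiting for the epoch boundary, which is the point of this commit.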
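The loop also assumes training_progress.json already exists before training starts: it is opened with 'r' and 'r+', which never create the file. A sketch of how a caller could initialize the file and then poll it, using only the keys visible in this diff (the data.update(...) inside the batch loop is truncated here, so any extra keys it writes are unknown):

# Sketch, under the assumptions above: create the progress file before
# create_and_train_model() runs, then poll it until training finishes.
import json
import time

def init_progress_file(path='training_progress.json'):
    state = {
        'training_progress': 0,
        'training_in_progress': True,
        'cancel_requested': False,
    }
    with open(path, 'w') as f:
        json.dump(state, f)

def poll_progress(path='training_progress.json', interval=2.0):
    while True:
        with open(path) as f:
            state = json.load(f)
        print(f"progress: {state.get('training_progress', 0):.1f}%")
        if not state.get('training_in_progress', True):
            break
        time.sleep(interval)

One caveat: reads and writes to a plain JSON file are not atomic, so a poller can occasionally catch the file mid-write and raise a JSONDecodeError. Writing to a temporary file and swapping it in with os.replace, or retrying failed reads, would make this scheme more robust.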