From bf0b3c9d684cac3f98340184ae2c298adcfac051 Mon Sep 17 00:00:00 2001
From: Luiz Gusttavo
Date: Wed, 17 Apr 2024 10:53:34 -0300
Subject: [PATCH 1/5] new route for cancelling and cancel checks

---
 api/Neural_Network2.py |  8 ++++++++
 api/app.py             | 21 ++++++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/api/Neural_Network2.py b/api/Neural_Network2.py
index 62abf20f..ff0118c4 100644
--- a/api/Neural_Network2.py
+++ b/api/Neural_Network2.py
@@ -354,6 +354,14 @@ def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate
     n_epochs = 0
     train_losses, valid_losses = [], []
     for curr_epoch in range(epochs):
+
+        # Check if there was a request to cancel the training process
+        with open('training_progress.json', 'r') as file:
+            data = json.load(file)
+            if data.get('cancel_requested', False):
+                print("Training canceled!")
+                return
+
         train_loss = train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch, epochs)
         valid_loss = validate_epoch(model, valid_loader, criterion)
 
diff --git a/api/app.py b/api/app.py
index e2cbcb1d..987f7b35 100644
--- a/api/app.py
+++ b/api/app.py
@@ -90,7 +90,8 @@ def train_model():
     # reseta status
     training_progress = {
         'training_progress': 0,
-        'training_in_progress': True
+        'training_in_progress': True,
+        'cancel_requested': False
     }
     with open('training_progress.json', 'w') as file:
         json.dump(training_progress, file)
@@ -120,9 +121,23 @@ def get_training_status():
             return jsonify({'training_in_progress': True, 'training_progress': 0})
         training_status = data.get('training_in_progress', False)
         progress = data.get('training_progress', 0)
-        return jsonify({'training_in_progress': training_status, 'training_progress': progress})
+        cancel_request = data.get('cancel_requested', False)
+        return jsonify({'training_in_progress': training_status, 'training_progress': progress, 'cancel_requested': cancel_request})
     except FileNotFoundError:
-        return jsonify({'training_in_progress': False, 'training_progress': 0})
+        return jsonify({'training_in_progress': False, 'training_progress': 0, 'cancel_requested': False})
+
+@app.route('/cancel-training', methods=['POST'])
+def cancel_training():
+    try:
+        with open('training_progress.json', 'r+') as file:
+            data = json.load(file)
+            data['cancel_requested'] = True
+            file.seek(0)
+            json.dump(data, file)
+            file.truncate()
+        return jsonify({'message': 'Cancellation requested.'}), 200
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
 
 
 if __name__ == '__main__':
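The patch above wires cancellation through the same training_progress.json file that already carries progress: the Flask route flips cancel_requested, and the training loop polls it between epochs. A minimal, self-contained sketch of that file contract (the helper names here are illustrative, not part of the project):

    import json

    STATUS_FILE = 'training_progress.json'

    def request_cancel(path=STATUS_FILE):
        # What /cancel-training does: flip the flag in place, keeping the other keys.
        with open(path, 'r+') as f:
            data = json.load(f)
            data['cancel_requested'] = True
            f.seek(0)
            json.dump(data, f)
            f.truncate()

    def cancel_requested(path=STATUS_FILE):
        # What the per-epoch check does: read the flag, defaulting to False.
        try:
            with open(path, 'r') as f:
                return json.load(f).get('cancel_requested', False)
        except FileNotFoundError:
            return False

    if __name__ == '__main__':
        with open(STATUS_FILE, 'w') as f:
            json.dump({'training_progress': 0, 'training_in_progress': True,
                       'cancel_requested': False}, f)
        request_cancel()
        print(cancel_requested())  # True

Because the flag lives on disk rather than in process state, the Flask request handler and the long-running training job can communicate without sharing memory; the cost is a small file read at every check.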
From a5349c124b00ffccd6106477d30bd5df96d0daf1 Mon Sep 17 00:00:00 2001
From: Luiz Gusttavo
Date: Wed, 17 Apr 2024 10:54:43 -0300
Subject: [PATCH 2/5] handle cancel training function

---
 src/pages/train.tsx | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/pages/train.tsx b/src/pages/train.tsx
index 91be7f58..b1571960 100644
--- a/src/pages/train.tsx
+++ b/src/pages/train.tsx
@@ -14,6 +14,7 @@ export default function Train() {
   const [selectedLabel, setSelectedLabel] = useState(0);
 
   const [isLoading, setIsLoading] = useState(false);
+  const [isCancelling, setIsCancelling] = useState(false);
 
   const handleChangeSelectedColumn = (event: any) => {
     setSelectedColumn(event.target.value);
@@ -23,6 +24,18 @@ export default function Train() {
     setSelectedLabel(event.target.value);
   };
 
+  const handleCancelTraining = async () => {
+    setIsCancelling(true); // Ativa o estado de cancelamento
+    try {
+      await axios.post('http://localhost:5000/cancel-training');
+      alert('Treinamento cancelado com sucesso!');
+    } catch (error) {
+      console.error('Erro ao cancelar o treinamento:', error);
+      alert('Falha ao cancelar o treinamento.');
+    }
+    setIsCancelling(false); // Desativa o estado de cancelamento
+  };
+
   const handleSubmit = async () => {
     setIsLoading(true);
     setLoadingProgress(0);

From 9250bf0c28edb123732f7965eafe073b71ef60cf Mon Sep 17 00:00:00 2001
From: Luiz Gusttavo
Date: Sat, 11 May 2024 12:56:32 -0300
Subject: [PATCH 3/5] canceling training between one epoch and another

---
 api/Neural_Network2.py | 13 ++++++++-----
 src/pages/train.tsx    | 10 ++++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/api/Neural_Network2.py b/api/Neural_Network2.py
index ff0118c4..d0a6d703 100644
--- a/api/Neural_Network2.py
+++ b/api/Neural_Network2.py
@@ -276,13 +276,16 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
         total += len(target)
         num_iters += 1
         if num_iters % 20 == 0:
-            with open('training_progress.json', 'w') as f:
-                progress = 100 * (curr_epoch + num_iters/len(train_loader)) / num_total_epochs
-                training_progress = {
+            with open('training_progress.json', 'r+') as f:
+                data = json.load(f)
+                progress = 100 * (curr_epoch + num_iters / len(train_loader)) / num_total_epochs
+                data.update({
                     'training_progress': progress,
                     'training_in_progress': True
-                }
-                json.dump(training_progress, f)
+                })
+                f.seek(0)
+                json.dump(data, f)
+                f.truncate()
 
     return total_loss / total
 
diff --git a/src/pages/train.tsx b/src/pages/train.tsx
index b1571960..73657fb0 100644
--- a/src/pages/train.tsx
+++ b/src/pages/train.tsx
@@ -344,6 +344,16 @@ export default function Train() {
           >
             {isLoading ? "Carregando..." : "Treinar"}
 
+
+          {isLoading && (
+
+          )}
 
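Patch 3 switches the progress writer from rewriting the whole file in 'w' mode to a read-modify-write in 'r+' mode, so the cancel_requested flag set by the endpoint is not clobbered between status updates. A small sketch of that pattern (the helper name is illustrative, not project code):

    import json

    def update_status(path, **fields):
        # Merge new fields into the JSON status file without dropping existing keys
        # such as cancel_requested.
        with open(path, 'r+') as f:
            data = json.load(f)
            data.update(fields)
            f.seek(0)          # rewind before rewriting
            json.dump(data, f)
            f.truncate()       # discard leftover bytes if the new content is shorter
        return data

    # Example (assumes the file already exists and holds valid JSON):
    # update_status('training_progress.json', training_progress=42.0, training_in_progress=True)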
From 2b3d02be63c317e31f474501d51c11e4eb36de4c Mon Sep 17 00:00:00 2001
From: Luiz Gusttavo
Date: Sat, 11 May 2024 13:14:11 -0300
Subject: [PATCH 4/5] canceling training between batches

---
 api/Neural_Network2.py | 96 +++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 44 deletions(-)

diff --git a/api/Neural_Network2.py b/api/Neural_Network2.py
index d0a6d703..8a293918 100644
--- a/api/Neural_Network2.py
+++ b/api/Neural_Network2.py
@@ -255,6 +255,13 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
     for inputs, target, text in progress_bar:
         target = target.to(device)
 
+        # Verifica se o cancelamento foi solicitado a cada batch
+        with open('training_progress.json', 'r') as f:
+            data = json.load(f)
+            if data.get('cancel_requested', False):
+                print("Training canceled during epoch:", curr_epoch)
+                return total_loss / max(total, 1), True # Retorna a perda média e o status de cancelamento
+
         # Clean old gradients
         optimizer.zero_grad()
 
@@ -277,7 +284,6 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
         num_iters += 1
         if num_iters % 20 == 0:
             with open('training_progress.json', 'r+') as f:
-                data = json.load(f)
                 progress = 100 * (curr_epoch + num_iters / len(train_loader)) / num_total_epochs
                 data.update({
                     'training_progress': progress,
@@ -287,7 +293,7 @@ def train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch
                 json.dump(data, f)
                 f.truncate()
 
-    return total_loss / total
+    return total_loss / max(total, 1), False
 
 def validate_epoch(model, valid_loader, criterion):
     model.eval()
@@ -309,8 +315,16 @@ def validate_epoch(model, valid_loader, criterion):
 
     return total_loss / total
 
-def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate = 0.001):
-
+import os
+import json
+import torch
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from sklearn.preprocessing import LabelEncoder
+import joblib
+
+def create_and_train_model(df, name, epochs=10, batch_size=32, learning_rate=0.001):
+    # Configurações iniciais e preparações do modelo
     dropout_probability = 0.2
     n_rnn_layers = 1
     embedding_dimension = 128
@@ -321,20 +335,19 @@ def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate
     valid_ratio = 0.05
     test_ratio = 0.05
 
+    # Preparação do dataset
     dataset = CustomDataset(df, max_vocab, max_len, name)
-
     train_dataset, valid_dataset, test_dataset = split_train_valid_test(
         dataset, valid_ratio=valid_ratio, test_ratio=test_ratio)
-    len(train_dataset), len(valid_dataset), len(test_dataset)
-
+    # Preparação dos DataLoader
     train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate)
     valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate)
     test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate)
-
+    # Inicialização do modelo
     model = RNNClassifier(
-        output_size=len(df.labels),
+        output_size=len(df['labels'].unique()),
         hidden_size=hidden_size,
         embedding_dimension=embedding_dimension,
         vocab_size=len(dataset.token2idx),
@@ -343,54 +356,49 @@ def create_and_train_model(df, name, epochs = 10, batch_size = 32, learning_rate
         bidirectional=is_bidirectional,
         n_layers=n_rnn_layers,
         device=device,
-        batch_size=batch_size,
+        batch_size=batch_size
     )
-    model = model.to(device)
+    model.to(device)
 
-    criterion = nn.CrossEntropyLoss()
-    optimizer = optim.Adam(
-        filter(lambda p: p.requires_grad, model.parameters()),
-        lr=learning_rate,
-    )
-    scheduler = CosineAnnealingLR(optimizer, 1)
+    # Definição da função de perda e otimizador
+    criterion = torch.nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 1)
 
-    n_epochs = 0
     train_losses, valid_losses = [], []
+    canceled = False
     for curr_epoch in range(epochs):
+        train_loss, canceled = train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch, epochs)
+        if canceled:
+            print(f"Training canceled during epoch {curr_epoch + 1}")
+            break
 
-        # Check if there was a request to cancel the training process
-        with open('training_progress.json', 'r') as file:
-            data = json.load(file)
-            if data.get('cancel_requested', False):
-                print("Training canceled!")
-                return
-
-        train_loss = train_epoch(model, optimizer, scheduler, train_loader, criterion, curr_epoch, epochs)
         valid_loss = validate_epoch(model, valid_loader, criterion)
-
         tqdm.write(
-            f'epoch #{n_epochs + 1:3d}\ttrain_loss: {train_loss:.2e}'
-            f'\tvalid_loss: {valid_loss:.2e}\n',
+            f'Epoch #{curr_epoch + 1:3d}\ttrain_loss: {train_loss:.2e}'
+            f'\tvalid_loss: {valid_loss:.2e}'
         )
 
-        # Early stopping if the current valid_loss is greater than the last three valid losses
-        if len(valid_losses) > 2 and all(valid_loss >= loss
-                                         for loss in valid_losses[-3:]):
-            print('Stopping early')
+        if len(valid_losses) > 2 and all(valid_loss >= loss for loss in valid_losses[-3:]):
+            print('Stopping early due to lack of improvement in validation loss.')
             break
 
         train_losses.append(train_loss)
         valid_losses.append(valid_loss)
-        n_epochs += 1
-
-    model_path = os.path.join('api', 'models', name)
-    os.makedirs(os.path.dirname(model_path), exist_ok=True)
-    torch.save(model, model_path)
-
-    training_progress = {
-        'training_progress': 0,
-        'training_in_progress': True
-    }
-    with open('training_progress.json', 'w') as file:
-        json.dump(training_progress, file)
+    # Finalizar e salvar o modelo se não foi cancelado
+    if not canceled:
+        model_path = os.path.join('api', 'models', name)
+        os.makedirs(os.path.dirname(model_path), exist_ok=True)
+        torch.save(model.state_dict(), model_path)
+
+        # Atualizar e salvar o estado de treinamento final
+        training_progress = {
+            'training_progress': 100,
+            'training_in_progress': False,
+            'cancel_requested': False
+        }
+        with open('training_progress.json', 'w') as file:
+            json.dump(training_progress, file)
+
+    print("Training complete.")
\ No newline at end of file
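With patch 4 the cancellation check moves inside the batch loop, and train_epoch reports back an (average loss, canceled) pair that the outer loop uses to break out. A framework-free sketch of that control flow, with made-up names and fake batch losses standing in for the real PyTorch loop:

    import json

    def cancel_requested(path='training_progress.json'):
        try:
            with open(path, 'r') as f:
                return json.load(f).get('cancel_requested', False)
        except FileNotFoundError:
            return False

    def run_epoch(batch_losses):
        total, seen = 0.0, 0
        for loss in batch_losses:          # stands in for the real batch loop
            if cancel_requested():         # checked on every batch, as in the patch
                return total / max(seen, 1), True
            total += loss
            seen += 1
        return total / max(seen, 1), False

    def train(epochs, batch_losses):
        for epoch in range(epochs):
            avg_loss, canceled = run_epoch(batch_losses)
            if canceled:
                print(f"Training canceled during epoch {epoch + 1}")
                break
            print(f"epoch {epoch + 1}: avg loss {avg_loss:.3f}")

    train(2, [0.9, 0.7, 0.5])

Reading the JSON file on every batch is simple but adds filesystem I/O to the hot path; checking only every N batches, as the progress writer already does with its num_iters % 20 guard, would be a cheaper variant.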
From 448c8d0ad70f2627fe63afd08eba5de505a14355 Mon Sep 17 00:00:00 2001
From: Luiz
Date: Fri, 17 May 2024 16:46:08 -0300
Subject: [PATCH 5/5] Remove unnecessary imports

---
 api/Neural_Network2.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/api/Neural_Network2.py b/api/Neural_Network2.py
index 8a293918..378a844f 100644
--- a/api/Neural_Network2.py
+++ b/api/Neural_Network2.py
@@ -315,13 +315,6 @@ def validate_epoch(model, valid_loader, criterion):
 
     return total_loss / total
 
-import os
-import json
-import torch
-from tqdm import tqdm
-from torch.utils.data import DataLoader
-from sklearn.preprocessing import LabelEncoder
-import joblib
 
 def create_and_train_model(df, name, epochs=10, batch_size=32, learning_rate=0.001):
     # Configurações iniciais e preparações do modelo
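For completeness, a rough client-side sketch of how the new endpoint is exercised (not part of the patches). The POST path comes from the route added in PATCH 1/5; the status path below is an assumption, since the GET route bound to get_training_status() is not shown in this series:

    import requests

    BASE = 'http://localhost:5000'

    def request_cancel():
        # Route added in PATCH 1/5; responds with {'message': 'Cancellation requested.'}.
        resp = requests.post(f'{BASE}/cancel-training', timeout=5)
        resp.raise_for_status()
        return resp.json()

    def read_status(path='/training-status'):
        # Placeholder path: the actual route string for get_training_status() is not
        # visible in these patches. The returned JSON carries training_in_progress,
        # training_progress and, after PATCH 1/5, cancel_requested.
        resp = requests.get(f'{BASE}{path}', timeout=5)
        resp.raise_for_status()
        return resp.json()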