diff --git a/src/templates/template-common/config.yaml b/src/templates/template-common/config.yaml index 0ca2db54..b0055c3e 100644 --- a/src/templates/template-common/config.yaml +++ b/src/templates/template-common/config.yaml @@ -1,7 +1,7 @@ seed: 777 data_path: ./ -batch_size: 32 -eval_batch_size: 32 +batch_size: 512 +eval_batch_size: 1024 num_workers: 4 max_epochs: 20 use_amp: false diff --git a/src/templates/template-vision-classification/config.yaml b/src/templates/template-vision-classification/config.yaml index 5918a3af..c93bcc54 100644 --- a/src/templates/template-vision-classification/config.yaml +++ b/src/templates/template-vision-classification/config.yaml @@ -1,3 +1,6 @@ #::= from_template_common ::# lr: 0.0001 model: resnet18 +momentum: 0.9 +weight_decay: 0.0001 +num_warmup_epochs: 4 diff --git a/src/templates/template-vision-classification/main.py b/src/templates/template-vision-classification/main.py index fb6f9d0e..31c4c27d 100644 --- a/src/templates/template-vision-classification/main.py +++ b/src/templates/template-vision-classification/main.py @@ -44,18 +44,23 @@ def run(local_rank: int, config: Any): # donwload datasets and create dataloaders dataloader_train, dataloader_eval = setup_data(config) - # model, optimizer, loss function, device + # model, optimizer, loss function, device, lr_scheduler device = idist.device() model = idist.auto_model(setup_model(config.model)) - optimizer = idist.auto_optim(optim.Adam(model.parameters(), lr=config.lr)) + optimizer = idist.auto_optim( + optim.SGD( + model.parameters(), lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay, nesterov=True + ) + ) loss_fn = nn.CrossEntropyLoss().to(device=device) + le = len(dataloader_train) milestones_values = [ (0, 0.0), ( - len(dataloader_train), + le * config.num_warmup_epochs, config.lr, ), - (config.max_epochs * len(dataloader_train), 0.0), + (config.max_epochs * le, 0.0), ] lr_scheduler = PiecewiseLinear(optimizer, "lr", 
milestones_values=milestones_values) diff --git a/src/templates/template-vision-classification/trainers.py b/src/templates/template-vision-classification/trainers.py index d4e8cea6..bd254139 100644 --- a/src/templates/template-vision-classification/trainers.py +++ b/src/templates/template-vision-classification/trainers.py @@ -3,7 +3,7 @@ import ignite.distributed as idist import torch from ignite.engine import DeterministicEngine, Engine, Events -from torch.cuda.amp import autocast +from torch.cuda.amp import autocast, GradScaler from torch.nn import Module from torch.optim import Optimizer from torch.utils.data import DistributedSampler, Sampler @@ -27,9 +27,10 @@ def train_function(engine: Union[Engine, DeterministicEngine], batch: Any): outputs = model(samples) loss = loss_fn(outputs, targets) - loss.backward() - optimizer.step() optimizer.zero_grad() + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() train_loss = loss.item() engine.state.metrics = { @@ -45,6 +46,8 @@ def train_function(engine: Union[Engine, DeterministicEngine], batch: Any): trainer = Engine(train_function) #::: } :::# + + scaler = GradScaler(enabled=config.use_amp) # set epoch for distributed sampler @trainer.on(Events.EPOCH_STARTED)