
Commit 3883cd4

committed Nov 23, 2020
half precision training, cleanup
1 parent 7182ea0 commit 3883cd4

17 files changed (+233 −233 lines)
 

‎.gitignore

+5
@@ -7,6 +7,11 @@ data/
 exps/
 core.*
 
+# NSML related
+.nsmlignore
+*.nsml.py
+setup.py
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

‎DatasetLoader.py

+16-14
@@ -92,7 +92,6 @@ def additive_noise(self, noisecat, audio):
 
         return numpy.sum(numpy.concatenate(noises,axis=0),axis=0,keepdims=True) + audio
 
-
     def reverberate(self, audio):
 
         rir_file = random.choice(self.rir_files)
@@ -103,18 +102,6 @@ def reverberate(self, audio):
 
         return signal.convolve(audio, rir, mode='full')[:,:self.max_audio]
 
-    def speed_up(self, audio):
-
-        audio = audio[0].astype(numpy.int16)
-
-        return numpy.expand_dims(self.speedup.build_array(input_array=audio, sample_rate_in=16000),0).astype(numpy.float)[:,:self.max_audio]
-
-    def slow_down(self, audio):
-
-        audio = audio[0].astype(numpy.int16)
-
-        return numpy.expand_dims(self.slowdown.build_array(input_array=audio, sample_rate_in=16000),0).astype(numpy.float)[:,:self.max_audio]
-
 
 class voxceleb_loader(Dataset):
     def __init__(self, dataset_file_name, augment, musan_path, rir_path, max_frames, train_path):
@@ -182,6 +169,22 @@ def __len__(self):
         return len(self.data_list)
 
 
+
+class test_dataset_loader(Dataset):
+    def __init__(self, test_list, test_path, eval_frames, num_eval, **kwargs):
+        self.max_frames = eval_frames;
+        self.num_eval   = num_eval
+        self.test_path  = test_path
+        self.test_list  = test_list
+
+    def __getitem__(self, index):
+        audio = loadWAV(os.path.join(self.test_path,self.test_list[index]), self.max_frames, evalmode=True, num_eval=self.num_eval)
+        return torch.FloatTensor(audio), self.test_list[index]
+
+    def __len__(self):
+        return len(self.test_list)
+
+
 class voxceleb_sampler(torch.utils.data.Sampler):
     def __init__(self, data_source, nPerSpeaker, max_seg_per_spk, batch_size):
 
@@ -228,7 +231,6 @@ def __len__(self):
         return len(self.data_source)
 
 
-
 def get_data_loader(dataset_file_name, batch_size, augment, musan_path, rir_path, max_frames, max_seg_per_spk, nDataLoaderThread, nPerSpeaker, train_path, **kwargs):
 
     train_dataset = voxceleb_loader(dataset_file_name, augment, musan_path, rir_path, max_frames, train_path)
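The new `test_dataset_loader` is consumed through a standard PyTorch `DataLoader`; `evaluateFromList` in `SpeakerNet.py` below does essentially this. A minimal usage sketch, assuming a hypothetical file list and data path:

```
import torch
from DatasetLoader import test_dataset_loader

# Hypothetical evaluation utterances, relative to the test path
setfiles = ['id00001/clip001/00001.wav', 'id00001/clip001/00002.wav']

test_dataset = test_dataset_loader(setfiles, 'data/voxceleb1', eval_frames=400, num_eval=10)
test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4)

for audio, filename in test_loader:
    # after batching, audio is (1, num_eval, num_samples); filename is a list of one string
    print(filename[0], audio[0].shape)
```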

‎README.md

+43-33
@@ -1,25 +1,6 @@
 # VoxCeleb trainer
 
-This repository contains the framework for training speaker recognition models described in 'In defence of metric learning for speaker recognition.'
-
-
-### Distributed training
-
-This branch contains experimental code for distributed training. It will be merged into `master` in the future.
-
-- GPU indices should be set using the command `export CUDA_VISIBLE_DEVICES=0,1,2,3`.
-
-- Evaluation is not performed between epochs during training.
-
-- Use `--distributed` flag to enable distributed training.
-
-- At every epoch, the whole dataset is passed through **each** GPU once. Therefore `test_interval` and `max_epochs` must be divided by the number of GPUs for the same number of forward passes as single GPU training. For example, `--test_interval 10` using 1 GPU should be equivalent to `--test_interval 2` using 5 GPUs.
-
-- If you run more than one distributed training session, you need to change the port.
-
-- The code only works on Linux systems with CUDA 9.2 or later.
-
-If you have any suggestions for improvement, please raise it as an issue.
+This repository contains the framework for training speaker recognition models described in the paper '_In defence of metric learning for speaker recognition_'.
 
 ### Dependencies
 ```
@@ -47,32 +28,32 @@ In addition to the Python dependencies, `wget` and `ffmpeg` must be installed on
 
 - AM-Softmax:
 ```
-python ./trainSpeakerNet.py --model ResNetSE34L --log_input True --encoder_type SAP --trainfunc amsoftmax --save_path exps/exp1 --nClasses 5994 --batch_size 200 --scale 30 --margin 0.3 --train_list train_list.txt --test_list test_list.txt
+python ./trainSpeakerNet.py --model ResNetSE34L --log_input True --encoder_type SAP --trainfunc amsoftmax --save_path exps/exp1 --nClasses 5994 --batch_size 200 --scale 30 --margin 0.3
 ```
 
 - Angular prototypical:
 ```
-python ./trainSpeakerNet.py --model ResNetSE34L --log_input True --encoder_type SAP --trainfunc angleproto --save_path exps/exp2 --nPerSpeaker 2 --batch_size 200 --train_list train_list.txt --test_list test_list.txt
+python ./trainSpeakerNet.py --model ResNetSE34L --log_input True --encoder_type SAP --trainfunc angleproto --save_path exps/exp2 --nPerSpeaker 2 --batch_size 200
 ```
 
 The arguments can also be passed as `--config path_to_config.yaml`. Note that the configuration file overrides the arguments passed via command line.
 
 ### Pretrained models
 
-A pretrained model can be downloaded from [here](http://www.robots.ox.ac.uk/~joon/data/baseline_lite_ap.model).
+A pretrained model, described in [1], can be downloaded from [here](http://www.robots.ox.ac.uk/~joon/data/baseline_lite_ap.model).
 
 You can check that the following script returns: `EER 2.1792`. You will be given an option to save the scores.
 
 ```
-python ./trainSpeakerNet.py --eval --model ResNetSE34L --log_input True --trainfunc angleproto --save_path exps/test --eval_frames 400 --test_list test_list.txt --initial_model baseline_lite_ap.model
+python ./trainSpeakerNet.py --eval --model ResNetSE34L --log_input True --trainfunc angleproto --save_path exps/test --eval_frames 400 --initial_model baseline_lite_ap.model
 ```
 
-A larger model trained with data augmentation can be downloaded from [here](http://www.robots.ox.ac.uk/~joon/data/baseline_v2_ap.model).
+A larger model trained with online data augmentation, described in [2], can be downloaded from [here](http://www.robots.ox.ac.uk/~joon/data/baseline_v2_ap.model).
 
-The following script should return: `EER 1.1771`.
+The following script should return: `EER 1.1771`.
 
 ```
-python ./trainSpeakerNet.py --eval --model ResNetSE34V2 --log_input True --encoder_type ASP --n_mels 64 --trainfunc softmaxproto --save_path exps/test --eval_frames 400 --test_list test_list.txt --initial_model baseline_v2_ap.model
+python ./trainSpeakerNet.py --eval --model ResNetSE34V2 --log_input True --encoder_type ASP --n_mels 64 --trainfunc softmaxproto --save_path exps/test --eval_frames 400 --initial_model baseline_v2_ap.model
 ```
 
 ### Implemented loss functions
@@ -88,16 +69,33 @@ Angular Prototypical (angleproto)
 
 ### Implemented models and encoders
 ```
-ResNetSE34 (SAP)
 ResNetSE34L (SAP, ASP)
 ResNetSE34V2 (SAP, ASP)
 VGGVox40 (SAP, TAP, MAX)
 ```
 
+### Data augmentation
+
+`--augment True` enables online data augmentation, described in [2].
+
 ### Adding new models and loss functions
 
 You can add new models and loss functions to `models` and `loss` directories respectively. See the existing definitions for examples.
 
+### Accelerating training
+
+- Use `--mixedprec` flag to enable mixed precision training. This is recommended for Tesla V100, GeForce RTX 20 series or later models.
+
+- Use `--distributed` flag to enable distributed training.
+
+- GPU indices should be set using the command `export CUDA_VISIBLE_DEVICES=0,1,2,3`.
+
+- Evaluation is not performed between epochs during training.
+
+- If you are running more than one distributed training session, you need to change the port.
+
+- At every epoch, the whole dataset is passed through **each** GPU once. Therefore `test_interval` and `max_epochs` must be divided by the number of GPUs for the same number of forward passes as single GPU training.
+
 ### Data
 
 The [VoxCeleb](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/) datasets are used for these experiments.
@@ -114,9 +112,10 @@ test list for VoxCeleb1 from [here](http://www.robots.ox.ac.uk/~vgg/data/voxcele
 ### Replicating the results from the paper
 
 1. Model definitions
-  - `VGG-M-40` in the paper is `VGGVox` in the code.
-  - `Thin ResNet-34` is in the paper `ResNetSE34` in the code.
-  - `Fast ResNet-34` is in the paper `ResNetSE34L` in the code.
+  - `VGG-M-40` in [1] is `VGGVox` in the repository.
+  - `Thin ResNet-34` in [1] is `ResNetSE34` in the repository.
+  - `Fast ResNet-34` in [1] is `ResNetSE34L` in the repository.
+  - `H / ASP` in [2] is `ResNetSE34V2` in the repository.
 
 2. For metric learning objectives, the batch size in the paper is `nPerSpeaker` multiplied by `batch_size` in the code. For the batch size of 800 in the paper, use `--nPerSpeaker 2 --batch_size 400`, `--nPerSpeaker 3 --batch_size 266`, etc.
 
@@ -125,13 +124,14 @@ test list for VoxCeleb1 from [here](http://www.robots.ox.ac.uk/~vgg/data/voxcele
 4. You can get a good balance between speed and performance using the configuration below.
 
 ```
-python ./trainSpeakerNet.py --model ResNetSE34L --trainfunc angleproto --batch_size 400 --nPerSpeaker 2 --train_list train_list.txt --test_list test_list.txt
+python ./trainSpeakerNet.py --model ResNetSE34L --trainfunc angleproto --batch_size 400 --nPerSpeaker 2
 ```
 
 ### Citation
 
-Please cite the following if you make use of the code. Please see [here](References.md) for the full list of methods used in this trainer.
+Please cite [1] if you make use of the code. Please see [here](References.md) for the full list of methods used in this trainer.
 
+[1] _In defence of metric learning for speaker recognition_
 ```
 @inproceedings{chung2020in,
   title={In defence of metric learning for speaker recognition},
@@ -141,6 +141,16 @@ Please cite the following if you make use of the code. Please see [here](Referen
 }
 ```
 
+[2] _Clova baseline system for the VoxCeleb Speaker Recognition Challenge 2020_
+```
+@article{heo2020clova,
+  title={Clova baseline system for the {VoxCeleb} Speaker Recognition Challenge 2020},
+  author={Heo, Hee Soo and Lee, Bong-Jin and Huh, Jaesung and Chung, Joon Son},
+  journal={arXiv preprint arXiv:2009.14153},
+  year={2020}
+}
+```
+
 ### License
 ```
 Copyright (c) 2020-present NAVER Corp.

‎SpeakerNet.py

+45-32
@@ -7,7 +7,9 @@
 import numpy, math, pdb, sys, random
 import time, os, itertools, shutil, importlib
 from tuneThreshold import tuneThresholdfromScore
-from DatasetLoader import loadWAV
+from DatasetLoader import test_dataset_loader
+
+from torch.cuda.amp import autocast, GradScaler
 
 class WrappedModel(nn.Module):
 
@@ -43,15 +45,17 @@ def forward(self, data, label=None):
             return outp
 
         else:
+
             outp = outp.reshape(self.nPerSpeaker,-1,outp.size()[-1]).transpose(1,0).squeeze(1)
+
             nloss, prec1 = self.__L__.forward(outp,label)
 
             return nloss, prec1
 
 
 class ModelTrainer(object):
 
-    def __init__(self, speaker_model, optimizer, scheduler, gpu, **kwargs):
+    def __init__(self, speaker_model, optimizer, scheduler, gpu, mixedprec, **kwargs):
 
         self.__model__ = speaker_model
 
@@ -61,8 +65,12 @@ def __init__(self, speaker_model, optimizer, scheduler, gpu, **kwargs):
         Scheduler = importlib.import_module('scheduler.'+scheduler).__getattribute__('Scheduler')
         self.__scheduler__, self.lr_step = Scheduler(self.__optimizer__, **kwargs)
 
+        self.scaler = GradScaler()
+
         self.gpu = gpu
 
+        self.mixedprec = mixedprec
+
         assert self.lr_step in ['epoch', 'iteration']
 
 # ## ===== ===== ===== ===== ===== ===== ===== =====
@@ -90,15 +98,24 @@ def train_network(self, loader, verbose):
 
             label = torch.LongTensor(data_label).cuda()
 
-            nloss, prec1 = self.__model__(data, label)
+            if self.mixedprec:
+                with autocast():
+                    nloss, prec1 = self.__model__(data, label)
+                self.scaler.scale(nloss).backward();
+                self.scaler.step(self.__optimizer__);
+                self.scaler.update();
+            else:
+                nloss, prec1 = self.__model__(data, label)
+                nloss.backward();
+                self.__optimizer__.step();
+
 
             loss    += nloss.detach().cpu();
-            top1    += prec1
+            top1    += prec1.detach().cpu()
             counter += 1;
             index   += stepsize;
 
-            nloss.backward();
-            self.__optimizer__.step();
+
             telapsed = time.time() - tstart
             tstart = time.time()
@@ -121,7 +138,7 @@ def train_network(self, loader, verbose):
     ## Evaluate from list
     ## ===== ===== ===== ===== ===== ===== ===== =====
 
-    def evaluateFromList(self, listfilename, print_interval=100, test_path='', num_eval=10, eval_frames=None):
+    def evaluateFromList(self, test_list, test_path, nDataLoaderThread, print_interval=100, num_eval=10, **kwargs):
 
         self.__model__.eval();
 
@@ -131,34 +148,30 @@ def evaluateFromList(self, listfilename, print_interval=100, test_path='', num_e
         tstart = time.time()
 
         ## Read all lines
-        with open(listfilename) as listfile:
-            while True:
-                line = listfile.readline();
-                if (not line):
-                    break;
-
-                data = line.split();
-
-                ## Append random label if missing
-                if len(data) == 2: data = [random.randint(0,1)] + data
-
-                files.append(data[1])
-                files.append(data[2])
-                lines.append(line)
+        with open(test_list) as f:
+            lines = f.readlines()
 
+        ## Get a list of unique file names
+        files = sum([x.strip().split()[-2:] for x in lines],[])
         setfiles = list(set(files))
         setfiles.sort()
 
-        ## Save all features to file
-        for idx, file in enumerate(setfiles):
-
-            inp1 = torch.FloatTensor(loadWAV(os.path.join(test_path,file), eval_frames, evalmode=True, num_eval=num_eval)).cuda()
-
-            ref_feat = self.__model__(inp1).detach().cpu()
-
-            feats[file] = ref_feat
-
-            telapsed = time.time() - tstart
+        ## Define test data loader
+        test_dataset = test_dataset_loader(setfiles, test_path, num_eval=num_eval, **kwargs)
+        test_loader = torch.utils.data.DataLoader(
+            test_dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=nDataLoaderThread,
+            drop_last=False,
+        )
+
+        ## Extract features for every image
+        for idx, data in enumerate(test_loader):
+            inp1 = data[0][0].cuda()
+            ref_feat = self.__model__(inp1).detach().cpu()
+            feats[data[1][0]] = ref_feat
+            telapsed = time.time() - tstart
 
             if idx % print_interval == 0:
                 sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d"%(idx,len(setfiles),idx/telapsed,ref_feat.size()[1]));
@@ -197,7 +210,7 @@ def evaluateFromList(self, listfilename, print_interval=100, test_path='', num_e
                 sys.stdout.write("\rComputing %d of %d: %.2f Hz"%(idx,len(lines),idx/telapsed));
                 sys.stdout.flush();
 
-        print('\n')
+        print('')
 
         return (all_scores, all_labels, all_trials);
 
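The training-loop change above follows the standard `torch.cuda.amp` recipe introduced in PyTorch 1.6. A minimal self-contained sketch of the same pattern, with a placeholder model and optimizer rather than the trainer's actual objects (requires a CUDA device):

```
import torch
from torch.cuda.amp import autocast, GradScaler

model     = torch.nn.Linear(40, 10).cuda()    # placeholder network
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler    = GradScaler()

data  = torch.randn(8, 40).cuda()
label = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()
with autocast():                      # forward pass runs in half precision where safe
    loss = torch.nn.functional.cross_entropy(model(data), label)
scaler.scale(loss).backward()         # scale the loss to avoid fp16 gradient underflow
scaler.step(optimizer)                # unscales gradients; skips the step if they overflowed
scaler.update()                       # adjusts the loss scale for the next iteration
```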

‎loss/aamsoftmax.py

+3-3
@@ -39,7 +39,7 @@ def forward(self, x, label=None):
         # cos(theta)
         cosine = F.linear(F.normalize(x), F.normalize(self.weight))
         # cos(theta + m)
-        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
+        sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
         phi = cosine * self.cos_m - sine * self.sin_m
 
         if self.easy_margin:
@@ -53,6 +53,6 @@ def forward(self, x, label=None):
         output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
         output = output * self.s
 
-        loss = self.ce(output, label)
-        prec1, _ = accuracy(output.detach().cpu(), label.detach().cpu(), topk=(1, 5))
+        loss = self.ce(output, label)
+        prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0]
         return loss, prec1
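For reference, the `sine` and `phi` lines above implement the angle-addition identity behind the additive angular margin: sin(theta) = sqrt(1 - cos^2(theta)) and cos(theta + m) = cos(theta)*cos(m) - sin(theta)*sin(m). The switch from `torch.pow(cosine, 2)` to `torch.mul(cosine, cosine)` computes the same cos^2(theta); a plausible motivation, consistent with this commit's theme, is that autocast runs `pow` in float32 while an elementwise multiply can stay in half precision.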

‎loss/amsoftmax.py

+2-2
@@ -39,7 +39,7 @@ def forward(self, x, label=None):
         if x.is_cuda: delt_costh = delt_costh.cuda()
         costh_m = costh - delt_costh
         costh_m_s = self.s * costh_m
-        loss = self.ce(costh_m_s, label)
-        prec1, _ = accuracy(costh_m_s.detach().cpu(), label.detach().cpu(), topk=(1, 5))
+        loss = self.ce(costh_m_s, label)
+        prec1 = accuracy(costh_m_s.detach(), label.detach(), topk=(1,))[0]
         return loss, prec1
 

‎loss/angleproto.py

+3-3
@@ -32,8 +32,8 @@ def forward(self, x, label=None):
         torch.clamp(self.w, 1e-6)
         cos_sim_matrix = cos_sim_matrix * self.w + self.b
 
-        label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda()
-        nloss = self.criterion(cos_sim_matrix, label)
-        prec1, _ = accuracy(cos_sim_matrix.detach().cpu(), label.detach().cpu(), topk=(1, 5))
+        label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda()
+        nloss = self.criterion(cos_sim_matrix, label)
+        prec1 = accuracy(cos_sim_matrix.detach(), label.detach(), topk=(1,))[0]
 
         return nloss, prec1

‎loss/ge2e.py

+1-1
@@ -48,6 +48,6 @@ def forward(self, x, label=None):
 
         label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda()
         nloss = self.criterion(cos_sim_matrix.view(-1,stepsize), torch.repeat_interleave(label,repeats=gsize,dim=0).cuda())
-        prec1, _ = accuracy(cos_sim_matrix.view(-1,stepsize).detach().cpu(), torch.repeat_interleave(label,repeats=gsize,dim=0).detach().cpu(), topk=(1, 5))
+        prec1 = accuracy(cos_sim_matrix.view(-1,stepsize).detach(), torch.repeat_interleave(label,repeats=gsize,dim=0).detach(), topk=(1,))[0]
 
         return nloss, prec1

‎loss/proto.py

+4-4
@@ -28,9 +28,9 @@ def forward(self, x, label=None):
         out_positive = x[:,0,:]
         stepsize = out_anchor.size()[0]
 
-        output = -1 * (F.pairwise_distance(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))**2)
-        label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda()
-        nloss = self.criterion(output, label)
-        prec1, _ = accuracy(output.detach().cpu(), label.detach().cpu(), topk=(1, 5))
+        output = -1 * (F.pairwise_distance(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))**2)
+        label = torch.from_numpy(numpy.asarray(range(0,stepsize))).cuda()
+        nloss = self.criterion(output, label)
+        prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0]
 
         return nloss, prec1

‎loss/softmax.py

+1-1
@@ -22,6 +22,6 @@ def forward(self, x, label=None):
 
         x = self.fc(x)
         nloss = self.criterion(x, label)
-        prec1, _ = accuracy(x.detach().cpu(), label.detach().cpu(), topk=(1, 5))
+        prec1 = accuracy(x.detach(), label.detach(), topk=(1,))[0]
 
         return nloss, prec1

‎models/ResNetSE34.py

-112
This file was deleted.

‎models/ResNetSE34L.py

+5-3
@@ -76,9 +76,11 @@ def new_parameter(self, *size):
 
     def forward(self, x):
 
-        x = self.torchfb(x)+1e-6
-        if self.log_input: x = x.log()
-        x = self.instancenorm(x).unsqueeze(1).detach()
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(enabled=False):
+                x = self.torchfb(x)+1e-6
+                if self.log_input: x = x.log()
+                x = self.instancenorm(x).unsqueeze(1).detach()
 
         x = self.conv1(x)
         x = self.bn1(x)
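The same wrapping is applied to `ResNetSE34V2` and `VGGVox` below: the filterbank, log and instance-norm front-end is excluded from autocast so that `log()` never sees small half-precision values, which could underflow. A minimal sketch of the mechanism itself, unrelated to the speaker models (requires a CUDA device; the linear layer is a stand-in for the front-end):

```
import torch

layer = torch.nn.Linear(4, 4).cuda()
x = torch.randn(2, 4).cuda()

with torch.cuda.amp.autocast():
    y = layer(x)
    print(y.dtype)                         # torch.float16 inside the autocast region
    with torch.cuda.amp.autocast(enabled=False):
        z = (layer(x).abs() + 1e-6).log()  # nested region: ops run in full precision
        print(z.dtype)                     # torch.float32
```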

‎models/ResNetSE34V2.py

old mode 100755
new mode 100644
+4-3

@@ -87,9 +87,10 @@ def new_parameter(self, *size):
     def forward(self, x):
 
         with torch.no_grad():
-            x = self.torchfb(x)+1e-6
-            if self.log_input: x = x.log()
-            x = self.instancenorm(x).unsqueeze(1)
+            with torch.cuda.amp.autocast(enabled=False):
+                x = self.torchfb(x)+1e-6
+                if self.log_input: x = x.log()
+                x = self.instancenorm(x).unsqueeze(1)
 
         x = self.conv1(x)
         x = self.relu(x)

‎models/VGGVox.py

+5-3
@@ -71,9 +71,11 @@ def new_parameter(self, *size):
 
     def forward(self, x):
 
-        x = self.torchfb(x)+1e-6
-        if self.log_input: x = x.log()
-        x = self.instancenorm(x).unsqueeze(1).detach()
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(enabled=False):
+                x = self.torchfb(x)+1e-6
+                if self.log_input: x = x.log()
+                x = self.instancenorm(x).unsqueeze(1)
 
         x = self.netcnn(x);
 
‎requirements.txt

+3-3
@@ -1,7 +1,7 @@
-torch>=1.5.0
-torchaudio>=0.5.0
+torch>=1.6.0
+torchaudio>=0.6.0
 numpy
 scipy
 scikit-learn
 tqdm
-pyyaml
+pyyaml

‎trainSpeakerNet.py

+35-12
@@ -15,6 +15,10 @@
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
+# ## ===== ===== ===== ===== ===== ===== ===== =====
+# ## Parse arguments
+# ## ===== ===== ===== ===== ===== ===== ===== =====
+
 parser = argparse.ArgumentParser(description = "SpeakerNet");
 
 parser.add_argument('--config', type=str, default=None, help='Config YAML file');
@@ -42,18 +46,18 @@
 ## Loss functions
 parser.add_argument("--hard_prob", type=float, default=0.5, help='Hard negative mining probability, otherwise random, only for some loss functions');
 parser.add_argument("--hard_rank", type=int, default=10, help='Hard negative mining rank in the batch, only for some loss functions');
-parser.add_argument('--margin', type=float, default=1, help='Loss margin, only for some loss functions');
-parser.add_argument('--scale', type=float, default=15, help='Loss scale, only for some loss functions');
+parser.add_argument('--margin', type=float, default=0.1, help='Loss margin, only for some loss functions');
+parser.add_argument('--scale', type=float, default=30, help='Loss scale, only for some loss functions');
 parser.add_argument('--nPerSpeaker', type=int, default=1, help='Number of utterances per speaker per batch, only for metric learning based losses');
 parser.add_argument('--nClasses', type=int, default=5994, help='Number of speakers in the softmax layer, only for softmax-based losses');
 
 ## Load and save
 parser.add_argument('--initial_model', type=str, default="", help='Initial model weights');
-parser.add_argument('--save_path', type=str, default="./data/exp1", help='Path for model and logs');
+parser.add_argument('--save_path', type=str, default="exps/exp1", help='Path for model and logs');
 
 ## Training and test data
-parser.add_argument('--train_list', type=str, default="", help='Train list');
-parser.add_argument('--test_list', type=str, default="", help='Evaluation list');
+parser.add_argument('--train_list', type=str, default="data/train_list.txt", help='Train list');
+parser.add_argument('--test_list', type=str, default="data/test_list.txt", help='Evaluation list');
 parser.add_argument('--train_path', type=str, default="data/voxceleb2", help='Absolute path to the train set');
 parser.add_argument('--test_path', type=str, default="data/voxceleb1", help='Absolute path to the test set');
 parser.add_argument('--musan_path', type=str, default="data/musan_split", help='Absolute path to the test set');
@@ -69,9 +73,10 @@
 ## For test only
 parser.add_argument('--eval', dest='eval', action='store_true', help='Eval only')
 
-## Distributed training
+## Distributed and mixed precision training
 parser.add_argument('--port', type=str, default="8888", help='Port for distributed training, input as text');
 parser.add_argument('--distributed', dest='distributed', action='store_true', help='Enable distributed training')
+parser.add_argument('--mixedprec', dest='mixedprec', action='store_true', help='Enable mixed precision training')
 
 args = parser.parse_args();
 
@@ -120,13 +125,10 @@ def main_worker(gpu, ngpus_per_node, args):
     else:
         s = WrappedModel(s).cuda(args.gpu)
 
-    prevloss = float("inf");
-    sumloss = 0;
-    min_eer = [100];
     it = 1
 
     ## Write args to scorefile
-    scorefile = open(args.result_save_path+"/scores.txt", "a+");
+    scorefile = open(args.result_save_path+"/scores.txt", "a+");
 
     ## Initialise trainer and data loader
     trainLoader = get_data_loader(args.train_list, **vars(args));
@@ -150,12 +152,24 @@ def main_worker(gpu, ngpus_per_node, args):
     ## Evaluation code - must run on single GPU
     if args.eval == True:
 
+        pytorch_total_params = sum(p.numel() for p in s.module.__S__.parameters())
+
+        print('Total parameters: ',pytorch_total_params)
+        print('Test list',args.test_list)
+
         assert args.distributed == False
 
-        sc, lab, _ = trainer.evaluateFromList(args.test_list, print_interval=100, test_path=args.test_path, eval_frames=args.eval_frames)
+        sc, lab, _ = trainer.evaluateFromList(**vars(args))
         result = tuneThresholdfromScore(sc, lab, [1, 0.1]);
 
-        print(result[1])
+        p_target = 0.05
+        c_miss = 1
+        c_fa = 1
+
+        fnrs, fprs, thresholds = ComputeErrorRates(sc, lab)
+        mindcf, threshold = ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa)
+
+        print('EER %2.4f MinDCF %.5f'%(result[1],mindcf))
         quit();
 
     ## Save training code and params
@@ -182,6 +196,14 @@ def main_worker(gpu, ngpus_per_node, args):
 
         if it % args.test_interval == 0 and args.gpu == 0:
 
+            ## Perform evaluation only in single GPU training
+            if not args.distributed:
+                sc, lab, _ = trainer.evaluateFromList(**vars(args))
+                result = tuneThresholdfromScore(sc, lab, [1, 0.1]);
+
+                print("IT %d, VEER %2.4f"%(it, result[1]));
+                scorefile.write("IT %d, VEER %2.4f\n"%(it, result[1]));
+
             trainer.saveParameters(args.model_save_path+"/model%09d.model"%it);
 
         print(time.strftime("%Y-%m-%d %H:%M:%S"), "TEER/TAcc %2.2f, TLOSS %f"%( traineer, loss));
@@ -214,6 +236,7 @@ def main():
     print('Python Version:', sys.version)
    print('PyTorch Version:', torch.__version__)
     print('Number of GPUs:', torch.cuda.device_count())
+    print('Save path:',args.save_path)
 
     if args.distributed:
         mp.spawn(main_worker, nprocs=n_gpus, args=(n_gpus, args))

‎tuneThreshold.py

+58-4
@@ -8,14 +8,12 @@
 from sklearn import metrics
 import numpy
 import pdb
+from operator import itemgetter
 
 def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None):
 
     fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
     fnr = 1 - tpr
-
-    fnr = fnr*100
-    fpr = fpr*100
 
     tunedThreshold = [];
     if target_fr:
@@ -28,6 +26,62 @@ def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None):
         tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
 
     idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
-    eer = max(fpr[idxE],fnr[idxE])
+    eer = max(fpr[idxE],fnr[idxE])*100
 
     return (tunedThreshold, eer, fpr, fnr);
+
+# Creates a list of false-negative rates, a list of false-positive rates
+# and a list of decision thresholds that give those error-rates.
+def ComputeErrorRates(scores, labels):
+
+    # Sort the scores from smallest to largest, and also get the corresponding
+    # indexes of the sorted scores. We will treat the sorted scores as the
+    # thresholds at which the error-rates are evaluated.
+    sorted_indexes, thresholds = zip(*sorted(
+        [(index, threshold) for index, threshold in enumerate(scores)],
+        key=itemgetter(1)))
+    sorted_labels = []
+    labels = [labels[i] for i in sorted_indexes]
+    fnrs = []
+    fprs = []
+
+    # At the end of this loop, fnrs[i] is the number of errors made by
+    # incorrectly rejecting scores less than thresholds[i]. And, fprs[i]
+    # is the total number of times that we have correctly accepted scores
+    # greater than thresholds[i].
+    for i in range(0, len(labels)):
+        if i == 0:
+            fnrs.append(labels[i])
+            fprs.append(1 - labels[i])
+        else:
+            fnrs.append(fnrs[i-1] + labels[i])
+            fprs.append(fprs[i-1] + 1 - labels[i])
+    fnrs_norm = sum(labels)
+    fprs_norm = len(labels) - fnrs_norm
+
+    # Now divide by the total number of false negative errors to
+    # obtain the false negative rates across all thresholds
+    fnrs = [x / float(fnrs_norm) for x in fnrs]
+
+    # Divide by the total number of correct positives to get the
+    # true positive rate. Subtract these quantities from 1 to
+    # get the false positive rates.
+    fprs = [1 - x / float(fprs_norm) for x in fprs]
+    return fnrs, fprs, thresholds
+
+# Computes the minimum of the detection cost function. The comments refer to
+# equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan.
+def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
+    min_c_det = float("inf")
+    min_c_det_threshold = thresholds[0]
+    for i in range(0, len(fnrs)):
+        # See Equation (2). It is a weighted sum of false negative
+        # and false positive errors.
+        c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
+        if c_det < min_c_det:
+            min_c_det = c_det
+            min_c_det_threshold = thresholds[i]
+    # See Equations (3) and (4). Now we normalize the cost.
+    c_def = min(c_miss * p_target, c_fa * (1 - p_target))
+    min_dcf = min_c_det / c_def
+    return min_dcf, min_c_det_threshold
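A usage sketch for the two functions added above, with made-up scores and labels; `p_target=0.05`, `c_miss=1` and `c_fa=1` are the values wired into `trainSpeakerNet.py`:

```
from tuneThreshold import ComputeErrorRates, ComputeMinDcf

# Toy verification trial scores (higher = same speaker) and ground-truth labels
scores = [0.9, 0.8, 0.6, 0.4, 0.3, 0.1]
labels = [1,   1,   0,   1,   0,   0  ]

fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
mindcf, threshold = ComputeMinDcf(fnrs, fprs, thresholds, p_target=0.05, c_miss=1, c_fa=1)
print('MinDCF %.5f at threshold %.2f' % (mindcf, threshold))
```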
