diff --git a/.gitignore b/.gitignore
index df38727..9ab6c7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,142 @@ kaldi
corpora
kaldifeat
*.egg-info/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/install.sh b/install.sh
index 6845fec..d33b5f6 100755
--- a/install.sh
+++ b/install.sh
@@ -6,19 +6,28 @@ nj=$(nproc)
home=$PWD
-# python/CONDA
+# CUDA version
+CUDAROOT=/usr/local/cuda
+if [ "$(id -g --name)" == "lium" ]; then
+ CUDAROOT=/opt/cuda/10.2 # LIUM Cluster
+ echo "Using local \$CUDAROOT: $CUDAROOT"
+fi
+
+[ ! -d "$CUDAROOT" ] && echo "CUDAROOT: '$CUDAROOT' does not exist." && exit 1 # abort early: the nvcc lookup below needs a valid CUDA root
+
+cuda_version=$($CUDAROOT/bin/nvcc --version | grep "Cuda compilation tools" | cut -d" " -f5 | sed s/,//)
+cuda_version_witout_dot=$(echo $cuda_version | xargs | sed 's/\.//')
+
+# CONDA
conda_url=https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
conda_url=https://repo.anaconda.com/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh
-# PYTORCH version
+# PYTORCH
torch_version=1.8.2
torchvision_version=0.9.2
torchaudio_version=0.8.2
-torch_wheels="https://download.pytorch.org/whl/lts/1.8/torch_lts.html"
-
-# CUDA version
-cuda_version=10.2
+torch_wheels="https://download.pytorch.org/whl/lts/1.8/torch_lts.html"
venv_dir=$PWD/venv
@@ -35,38 +44,24 @@ if [ ! -f $mark ]; then
. $venv_dir/bin/activate
echo "Installing conda dependencies"
- yes | conda install -c conda-forge sox || exit 1
- yes | conda install -c conda-forge libflac || exit 1
- yes | conda install -c conda-forge cudatoolkit=$cuda_version || exit 1
+ yes | conda install -c conda-forge sox
+ yes | conda install -c conda-forge libflac
touch $mark
fi
source $venv_dir/bin/activate
-exit 0
-
-# CUDA version
-CUDAROOT=/usr/local/cuda
-if [ "$(id -g --name)" == "lium" ]; then
- CUDAROOT=/opt/cuda/10.2 # LIUM Cluster
- echo "Using local \$CUDAROOT: $CUDAROOT"
-fi
-_cuda_version=$($CUDAROOT/bin/nvcc --version | grep "Cuda compilation tools" | cut -d" " -f5 | sed s/,//)
-if [[ $cuda_version != $_cuda_version ]]; then
- echo "CUDA env not properly setup! (installed cuda v$cuda_version != in path cuda v$_cuda_version)"
- exit 1
-fi
-cuda_version_witout_dot=$(echo $cuda_version | xargs | sed 's/\.//')
-
export PATH=$CUDAROOT/bin:$PATH
export LD_LIBRARY_PATH=$CUDAROOT/lib64:$LD_LIBRARY_PATH
export CFLAGS="-I$CUDAROOT/include $CFLAGS"
export CUDA_HOME=$CUDAROOT
export CUDA_PATH=$CUDAROOT
+
echo "if [ \$(which python) != $venv_dir/bin/python ]; then source $venv_dir/bin/activate; fi; export CUDAROOT=$CUDAROOT; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH;" > env.sh
mark=.done-pytorch
if [ ! -f $mark ]; then
echo " == Installing pytorch $torch_version for cuda $cuda_version =="
+ # pip3 install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
pip3 install torch==$torch_version+cu$cuda_version_witout_dot torchvision==$torchvision_version+cu$cuda_version_witout_dot torchaudio==$torchaudio_version -f $torch_wheels
cd $home
touch $mark
diff --git a/sidekit/.gitignore b/sidekit/.gitignore
new file mode 100755
index 0000000..b9e6703
--- /dev/null
+++ b/sidekit/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*.DS_Store
+docs
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/.gitignore b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/.gitignore
new file mode 100644
index 0000000..403c167
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/.gitignore
@@ -0,0 +1,3 @@
+log
+model
+.ipynb_checkpoints
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb12_ssd.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb12_ssd.yaml
new file mode 100644
index 0000000..dcf9d8e
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb12_ssd.yaml
@@ -0,0 +1,70 @@
+# Dataset description
+
+
+# General options
+data_path: /lium/scratch/larcher/
+data_file_extension: .wav
+dataset_csv: list/vox12.csv
+
+# DEV4S smaller dataset:
+# awk -F, 'x[$1]<10;{x[$1]++}' vox12.csv > vox12_DEV.csv
+# dataset_csv: list/vox12_DEV.csv
+
+# DEV4S smaller dataset (with only long utterances):
+# (head -n 2 list/vox12_DEV.csv ; cat list/vox12_DEV.csv | sort -t , -k 5,5 -rn | head -n 7204) > list/vox12_DEV_long.csv
+# tail -n +2 list/vox12_DEV.csv | cut -d , -f 1 | sort | uniq > /tmp/len
+# tail -n +2 list/vox12_DEV_long.csv | cut -d, -f2- > /tmp/vox12_DEV_long_side.csv
+# paste -d',' /tmp/len /tmp/vox12_DEV_long_side.csv > /tmp/vox12_DEV_long_wrongspkid.csv
+# (head -n 2 list/vox12_DEV.csv; cat /tmp/vox12_DEV_long_wrongspkid.csv) > list/vox12_DEV_long.csv
+# dataset_csv: list/vox12_DEV_long_sorted.csv
+
+sample_rate: 16000
+
+validation_ratio: 0.02
+batch_size: 512
+
+# Training set
+train:
+ duration: 3.
+ chunk_per_segment: -1
+ overlap: 3.
+
+ sampler:
+ examples_per_speaker: 1
+ samples_per_speaker: 100
+ augmentation_replica: 1
+
+ transform_number: 1
+
+ transformation:
+ pipeline: add_reverb,add_noise,filtering,phone_filtering,codec
+ spec_aug: 0.5
+ temp_aug: 0.5
+
+ add_noise:
+ noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv
+ data_path: /lium/scratch/larcher/musan_split/
+
+ add_reverb:
+ rir_db_csv: list/reverb.csv
+ data_path: /lium/scratch/amehrish/Database/REVERB_UPDATED/REVERB/RIRS_NOISES/
+
+# Validation set
+valid:
+ duration: 3.
+
+ transformation:
+ pipeline:
+ spec_aug: 0.5
+ temp_aug: 0.5
+
+ add_noise:
+ noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv
+ data_path: /lium/scratch/larcher/musan_split/
+
+# Test set
+test:
+ idmap: /lium/raid01_c/larcher/data/vox1_test_cleaned_idmap.h5
+ ndx: /lium/raid01_c/larcher/data/vox1_test_cleaned_ndx.h5
+ key: /lium/raid01_c/larcher/data/vox1_test_cleaned_key.h5
+ data_path: /lium/scratch/larcher/voxceleb1/test/wav
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb2_ssd.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb2_ssd.yaml
new file mode 100644
index 0000000..accfb3e
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb2_ssd.yaml
@@ -0,0 +1,59 @@
+# Dataset description
+
+
+# General options
+data_path: /lium/scratch/larcher/
+data_file_extension: .wav
+dataset_csv: list/voxceleb2_dev.csv
+sample_rate: 16000
+
+
+validation_ratio: 0.02
+batch_size: 256
+
+
+# Training set
+train:
+ duration: 4
+ chunk_per_segment: -1
+ overlap: 3.
+
+ sampler:
+ examples_per_speaker: 1
+ samples_per_speaker: 100
+ augmentation_replica: 1
+
+ transform_number: 1
+
+ transformation:
+ pipeline: add_reverb,add_noise,filtering,codec
+ spec_aug: 0.5
+ temp_aug: 0.5
+
+ add_noise:
+ noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv
+ data_path: /lium/scratch/larcher/musan_split/
+
+ add_reverb:
+ rir_db_csv: /lium/home/amehrish/Database/REVERB_UPDATED/REVERB/Reverb.csv
+ data_path: /lium/home/amehrish/Database/REVERB_UPDATED/REVERB/RIRS_NOISES/
+
+# Validation set
+valid:
+ duration: 4.
+
+ transformation:
+ pipeline:
+ spec_aug: 0.5
+ temp_aug: 0.5
+
+ add_noise:
+ noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv
+ data_path: /lium/scratch/larcher/musan_split/
+
+# Test set
+test:
+ idmap: /lium/raid01_c/larcher/data/vox1_test_cleaned_idmap.h5
+ ndx: /lium/raid01_c/larcher/data/vox1_test_cleaned_ndx.h5
+ key: /lium/raid01_c/larcher/data/vox1_test_cleaned_key.h5
+ data_path: /lium/scratch/larcher/voxceleb1/test/wav
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/allies.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/allies.yaml
new file mode 100644
index 0000000..ce08a16
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/allies.yaml
@@ -0,0 +1,55 @@
+# Dataset description
+
+
+# General options
+data_path: /lium/scratch/larcher/ALLIES_train_split/wav/
+data_file_extension: .wav
+dataset_csv: /lium/raid01_c/larcher/data/allies_train_split.csv
+sample_rate: 16000
+
+
+validation_ratio: 0.1
+batch_size: 64
+
+
+# Training set
+train:
+ duration: 1.5
+ chunk_per_segment: -1
+ overlap: 1.4
+
+ transformation:
+ pipeline: add_noise
+ spec_aug: 0.5
+ temp_aug: 0.5
+
+ add_noise:
+ noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv
+ data_path: /lium/scratch/larcher/musan_split/
+
+ add_reverb:
+
+
+# Validation set
+valid:
+ duration: 1.5
+
+ transformation:
+ pipeline:
+ spec_aug: 0.5
+ temp_aug: 0.5
+
+ add_noise:
+ noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv
+ data_path: /lium/scratch/larcher/musan_split/
+
+ add_reverb:
+
+# Test set
+test:
+ idmap: /lium/raid01_c/larcher/data/allies_dev_verif_idmap.h5
+ ndx: /lium/raid01_c/larcher/data/allies_dev_verif_ndx.h5
+ key: /lium/raid01_c/larcher/data/allies_dev_verif_key.h5
+ data_path: /lium/corpus/base/ALLIES/wav
+
+
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/model.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/model.yaml
new file mode 100644
index 0000000..43e9fd3
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/model.yaml
@@ -0,0 +1,20 @@
+# Model description
+
+speaker_number: 7205
+
+loss:
+ type: aam
+ aam_margin: 0.2
+ aam_s: 30
+
+# Initialize model from file, reset and freeze parts of it
+# initial_model_name: ./model/tmp_half_clr_adam_aam0.2_30_b256_vox12.pt
+initial_model_name:
+reset_parts: []
+freeze_parts: []
+
+# Model can be fastresnet34, resnet34, xvector or custom
+# if custom, then need more options
+model_type: halfresnet34
+
+feature_size: 80
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/training.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/training.yaml
new file mode 100644
index 0000000..e2af66b
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/training.yaml
@@ -0,0 +1,42 @@
+# Training description
+
+
+# General options
+log_file: log/aam_CLR2-40000-1e-10_adam.log
+torch_seed: 42
+numpy_seed: 42
+random_seed: 42
+deterministic: false
+
+epochs: 1000
+lr: 0.000001
+patience: 100
+
+multi_gpu: true
+num_cpu: 8
+
+mixed_precision: true
+clipping: false
+
+# Optimizer and scheduler options
+optimizer:
+ type: adam
+ options:
+
+scheduler:
+ type: CyclicLR
+ mode: triangular2
+ base_lr: 0.0000001
+ step_size_up: 40000
+
+
+
+# Evaluation options
+compute_test_eer: true
+log_interval: 10
+validation_frequency: 1
+
+# Save options
+tmp_model_name: model/tmp_half_clr_adam_aam0.2_30_b256_vox12.pt
+best_model_name: model/best_halp_clr_adam_aam0.2_30_b256_vox12.pt
+checkpoint_frequency:
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/h5f/.gitkeep b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/h5f/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/list/.gitkeep b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/list/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train.sh b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train.sh
new file mode 100755
index 0000000..3e2a650
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+export NUM_NODES=1
+export NUM_GPUS_PER_NODE=2
+export NODE_RANK=0
+export WORLD_SIZE=$(($NUM_NODES * $NUM_GPUS_PER_NODE))
+
+# env DEV4S=True WORLD_SIZE=1 ipython3 train_xtractor.py # DEV
+
+cd ../../..
+. ./env.sh
+cd -
+
+python3 -m torch.distributed.launch \
+ --nproc_per_node=$NUM_GPUS_PER_NODE \
+ --nnodes=$NUM_NODES \
+ --node_rank $NODE_RANK \
+ train_xtractor.py
diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train_xtractor.py b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train_xtractor.py
new file mode 100644
index 0000000..1862138
--- /dev/null
+++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train_xtractor.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import torch
+import sidekit
+import yaml
+
+from argparse import ArgumentParser
+
+def main():
+ parser = ArgumentParser('DDP usage example')
+ parser.add_argument('--local_rank', type=int, default=-1, metavar='N', help='Local process rank.')
+ args = parser.parse_args()
+
+ # keep track of whether the current process is the `master` process (totally optional, but useful for data loading, logging, etc.)
+ args.is_master = args.local_rank == 0
+
+ sidekit.nnet.xvector.xtrain(dataset_description="cfg/VoxCeleb12_ssd.yaml",
+ model_description="cfg/model.yaml",
+ training_description="cfg/training.yaml")
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/sidekit/setup.py b/sidekit/setup.py
new file mode 100644
index 0000000..25f5c4b
--- /dev/null
+++ b/sidekit/setup.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+from distutils.core import setup
+
+setup(name='sidekit',
+ version='1.0',
+ )
diff --git a/sidekit/sidekit/__init__.py b/sidekit/sidekit/__init__.py
new file mode 100755
index 0000000..ef36a52
--- /dev/null
+++ b/sidekit/sidekit/__init__.py
@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+"""
+
+from ctypes import *
+from ctypes.util import find_library
+import importlib
+import logging
+import numpy
+import os
+import sys
+
+
+# Read environment variable if it exists
+SIDEKIT_CONFIG={"libsvm":False,
+ "mpi":False,
+ "cuda":True
+ }
+
+if 'SIDEKIT' in os.environ:
+ for cfg in os.environ['SIDEKIT'].split(","):
+ k, val = cfg.split("=")
+ if k == "libsvm":
+ if val == "false":
+ SIDEKIT_CONFIG["libsvm"] = False
+ elif k == "mpi":
+ if val == "true":
+ SIDEKIT_CONFIG["mpi"] = True
+ if k == "cuda":
+ if val == "true":
+ SIDEKIT_CONFIG["cuda"] = True
+
+
+PARALLEL_MODULE = 'multiprocessing' # can be threading or multiprocessing; MPI is planned in the future
+PARAM_TYPE = numpy.float32
+STAT_TYPE = numpy.float64 # can be numpy.float32 to speed up the computation but can lead to numerical issues
+
+# Import bosaris-like classes
+from .bosaris import IdMap
+from .bosaris import Ndx
+from .bosaris import Key
+from .bosaris import Scores
+from .bosaris import DetPlot
+from .bosaris import effective_prior
+from .bosaris import logit_effective_prior
+from .bosaris import fast_minDCF
+
+# Import classes
+from .features_extractor import FeaturesExtractor
+from .features_server import FeaturesServer
+from .mixture import Mixture, vad_energy
+from .statserver import StatServer
+from .factor_analyser import FactorAnalyser
+
+from .frontend.io import write_pcm
+from .frontend.io import read_pcm
+from .frontend.io import pcmu2lin
+from .frontend.io import read_sph
+from .frontend.io import write_label
+from .frontend.io import read_label
+from .frontend.io import read_spro4
+from .frontend.io import read_audio
+from .frontend.io import write_spro4
+from .frontend.io import read_htk
+from .frontend.io import write_htk
+
+from .frontend.vad import vad_energy
+from .frontend.vad import vad_snr
+from .frontend.vad import label_fusion
+from .frontend.vad import speech_enhancement
+
+from .frontend.normfeat import cms
+from .frontend.normfeat import cmvn
+from .frontend.normfeat import stg
+from .frontend.normfeat import rasta_filt
+
+from .frontend.features import compute_delta
+from .frontend.features import framing
+from .frontend.features import pre_emphasis
+from .frontend.features import trfbank
+from .frontend.features import mel_filter_bank
+from .frontend.features import mfcc
+from .frontend.features import pca_dct
+from .frontend.features import shifted_delta_cepstral
+
+from .iv_scoring import cosine_scoring
+from .iv_scoring import mahalanobis_scoring
+from .iv_scoring import two_covariance_scoring
+from .iv_scoring import PLDA_scoring
+from .iv_scoring import fast_PLDA_scoring
+
+from .gmm_scoring import gmm_scoring
+
+from .jfa_scoring import jfa_scoring
+
+from .score_normalization import znorm
+from .score_normalization import tnorm
+from .score_normalization import ztnorm
+from .score_normalization import snorm
+
+from .sidekit_io import write_norm_hdf5
+from .sidekit_io import write_matrix_hdf5
+
+
+from .sv_utils import clean_stat_server
+
+libsvm_loaded = False
+if SIDEKIT_CONFIG["libsvm"]:
+ try:
+ dirname = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'libsvm')
+ if sys.platform == 'win32':
+ libsvm = CDLL(os.path.join(dirname, r'libsvm.dll'))
+ libsvm_loaded = True
+ else:
+ libsvm = CDLL(os.path.join(dirname, 'libsvm.so.2'))
+ libsvm_loaded = True
+ except:
+ # For unix the prefix 'lib' is not considered.
+ if find_library('svm'):
+ libsvm = CDLL(find_library('svm'))
+ libsvm_loaded = True
+ elif find_library('libsvm'):
+ libsvm = CDLL(find_library('libsvm'))
+ libsvm_loaded = True
+ else:
+ libsvm_loaded = False
+ logging.warning('WARNNG: libsvm is not installed, please refer to the' +
+ ' documentation if you intend to use SVM classifiers')
+
+if libsvm_loaded:
+ from sidekit.libsvm import *
+ from sidekit.svm_scoring import *
+ from sidekit.svm_training import *
+
+
+CUDA = False
+if SIDEKIT_CONFIG["cuda"]:
+ CUDA = True
+
+
+if CUDA:
+ from .nnet import Xtractor
+ from .nnet import xtrain
+ from .nnet import extract_embeddings
+ from .nnet import ResBlock
+ from .nnet import SincNet
+
+else:
+ print("Don't import Torch")
+
+if SIDEKIT_CONFIG["mpi"]:
+ found_mpi4py = importlib.find_loader('mpi4py') is not None
+ if found_mpi4py:
+ from .sidekit_mpi import EM_split
+ from .sidekit_mpi import total_variability
+ from .sidekit_mpi import extract_ivector
+ print("Import MPI")
+
+
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__version__="1.4"
+
diff --git a/sidekit/sidekit/bosaris/Licence b/sidekit/sidekit/bosaris/Licence
new file mode 100755
index 0000000..41af980
--- /dev/null
+++ b/sidekit/sidekit/bosaris/Licence
@@ -0,0 +1,124 @@
+License agreement
+
+Agnitio Labs
+
+Non-Commercial Use Only
+
+
+
+_____________________________________________________________________
+
+
+
+This AGNITIO Labs License Agreement, including all exhibits ("AGN-LA") is a
+legal agreement between you and AGNITIO S. L. (“AGNITIO” or “we”) for the
+software or data identified above, which may include source code, and any
+associated materials, text or speech files, associated media and "online" or
+electronic documentation and any updates we provide in our discretion
+(together, the "Software").
+
+
+
+By installing, copying, or otherwise using this Software, you agree to be bound
+by the terms of this AGN-LA. If you do not agree, do not install copy or use
+the Software. The Software is protected by copyright and other intellectual
+property laws and is licensed, not sold.
+
+
+
+SCOPE OF RIGHTS:
+
+You may use, copy, reproduce, and distribute this Software for any
+non-commercial purpose, subject to the restrictions in this AGN-LA. Some
+purposes which can be non-commercial are teaching, academic research, public
+demonstrations and personal experimentation. You may also distribute this
+Software with books or other teaching materials, or publish the Software on
+websites, that are intended to teach the use of the Software for academic or
+other non-commercial purposes.
+
+You may not use or distribute this Software or any derivative works in any form
+for commercial purposes. Examples of commercial purposes would be running
+business operations, licensing, leasing, or selling the Software, distributing
+the Software for use with commercial products, using the Software in the
+creation or use of commercial products or any other activity which purpose is
+to procure a commercial gain to you or others.
+
+If the Software includes source code or data, you may create derivative works
+of such portions of the Software and distribute the modified Software for
+non-commercial purposes, as provided herein.
+
+If you distribute the Software or any derivative works of the Software, you
+will distribute them under the same terms and conditions as in this license,
+and you will not grant other rights to the Software or derivative works that
+are different from those provided by this AGN-LA.
+
+If you have created derivative works of the Software, and distribute such
+derivative works, you will cause the modified files to carry prominent notices
+so that recipients know that they are not receiving the original Software. Such
+notices must state: (i) that you have changed the Software; and (ii) the date
+of any changes.
+
+
+
+In return, we simply require that you agree:
+
+1. That you will not remove any copyright or other notices from the Software.
+
+2. That if any of the Software is in binary format, you will not attempt to
+modify such portions of the Software, or to reverse engineer or decompile them,
+except and only to the extent authorized by applicable law.
+
+3. That AGNITIO is granted back, without any restrictions or limitations, a
+non-exclusive, perpetual, irrevocable, royalty-free, assignable and
+sub-licensable license, to reproduce, publicly perform or display, install,
+use, modify, post, distribute, make and have made, sell and transfer your
+modifications to and/or derivative works of the Software source code or data,
+for any purpose.
+
+4. That any feedback about the Software provided by you to us is voluntarily
+given, and AGNITIO shall be free to use the feedback as it sees fit without
+obligation or restriction of any kind, even if the feedback is designated by
+you as confidential.
+
+5. THAT THE SOFTWARE COMES "AS IS", WITH NO WARRANTIES. THIS MEANS NO EXPRESS,
+IMPLIED OR STATUTORY WARRANTY, INCLUDING WITHOUT LIMITATION, WARRANTIES OF
+MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, ANY WARRANTY AGAINST
+INTERFERENCE WITH YOUR ENJOYMENT OF THE SOFTWARE OR ANY WARRANTY OF TITLE OR
+NON-INFRINGEMENT. THERE IS NO WARRANTY THAT THIS SOFTWARE WILL FULFILL ANY OF
+YOUR PARTICULAR PURPOSES OR NEEDS. ALSO, YOU MUST PASS THIS DISCLAIMER ON
+WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS.
+
+6. THAT NEITHER AGNITIO NOR ANY CONTRIBUTOR TO THE SOFTWARE WILL BE LIABLE FOR
+ANY DAMAGES RELATED TO THE SOFTWARE OR THIS AGN-LA, INCLUDING DIRECT, INDIRECT,
+SPECIAL, CONSEQUENTIAL OR INCIDENTAL DAMAGES, TO THE MAXIMUM EXTENT THE LAW
+PERMITS, NO MATTER WHAT LEGAL THEORY IT IS BASED ON. ALSO, YOU MUST PASS THIS
+LIMITATION OF LIABILITY ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE
+WORKS.
+
+7. That we have no duty of reasonable care or lack of negligence, and we are
+not obligated to (and will not) provide technical support for the Software.
+
+8. That if you breach this AGN-LA or if you sue anyone over patents that you
+think may apply to or read on the Software or anyone's use of the Software,
+this AGN-LA (and your license and rights obtained herein) terminate
+automatically. Upon any such termination, you shall destroy all of your copies
+of the Software immediately. Sections 3, 4, 5, 6, 7, 8, 11 and 12 of this
+AGN-LA shall survive any termination of this AGN-LA.
+
+9. That the patent rights, if any, granted to you in this AGN-LA only apply to
+the Software, not to any derivative works you make.
+
+10. That the Software may be subject to Europe export jurisdiction at the time
+it is licensed to you, and it may be subject to additional export or import
+laws in other places. You agree to comply with all such laws and regulations
+that may apply to the Software after delivery of the software to you.
+
+11. That all rights not expressly granted to you in this AGN-LA are reserved.
+
+12. That this AGN-LA shall be construed and controlled by the laws of the
+Kingdom of Spain, without regard to conflicts of law. If any provision of this
+AGN-LA shall be deemed unenforceable or contrary to law, the rest of this
+AGN-LA shall remain in full effect and interpreted in an enforceable manner
+that most nearly captures the intent of the original language.
+
+Copyright (c) AGNITIO. All rights reserved.
diff --git a/sidekit/sidekit/bosaris/__init__.py b/sidekit/sidekit/bosaris/__init__.py
new file mode 100755
index 0000000..bdc3556
--- /dev/null
+++ b/sidekit/sidekit/bosaris/__init__.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""
+This package is a translation of a part of the BOSARIS toolkit.
+The authors thank Niko Brummer and Agnitio for allowing them to
+translate this code and provide the community with efficient structures
+and tools.
+
+The BOSARIS Toolkit is a collection of functions and classes in Matlab
+that can be used to calibrate, fuse and plot scores from speaker recognition
+(or other fields in which scores are used to test the hypothesis that two
+samples are from the same source) trials involving a model and a test segment.
+The toolkit was written at the BOSARIS2010 workshop which took place at the
+University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+See the User Guide (available on the toolkit website) for a discussion of the
+theory behind the toolkit and descriptions of some of the algorithms used.
+
+The BOSARIS toolkit in MATLAB can be downloaded from `the website
+<https://sites.google.com/site/bosaristoolkit/>`_.
+"""
+
+from .idmap import IdMap
+from .ndx import Ndx
+from .plotwindow import PlotWindow
+from .key import Key
+from .scores import Scores
+from .detplot import DetPlot
+from .detplot import effective_prior
+from .detplot import logit_effective_prior
+from .detplot import fast_minDCF
+
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
diff --git a/sidekit/sidekit/bosaris/detplot.py b/sidekit/sidekit/bosaris/detplot.py
new file mode 100755
index 0000000..02b3f91
--- /dev/null
+++ b/sidekit/sidekit/bosaris/detplot.py
@@ -0,0 +1,937 @@
+# -*- coding: utf-8 -*-
+
+# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to
+# translate this code and provide the community with efficient structures
+# and tools.
+#
+# The BOSARIS Toolkit is a collection of functions and classes in Matlab
+# that can be used to calibrate, fuse and plot scores from speaker recognition
+# (or other fields in which scores are used to test the hypothesis that two
+# samples are from the same source) trials involving a model and a test segment.
+# The toolkit was written at the BOSARIS2010 workshop which took place at the
+# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+# See the User Guide (available on the toolkit website) for a discussion of the
+# theory behind the toolkit and descriptions of some of the algorithms used.
+#
+# The BOSARIS toolkit in MATLAB can be downloaded from `the website
+# `_.
+
+"""
+This is the 'detplot' module
+
+ This module supplies tools for plotting DET curves.
+ It includes a class for creating a plot for displaying detection performance
+ with the axes scaled and labelled so that a normal Gaussian
+ distribution will plot as a straight line.
+
+ The y axis represents the miss probability.
+ The x axis represents the false alarm probability.
+
+ This file is a translation of the BOSARIS toolkit.
+ For more information, refer to the license provided with this package.
+"""
+import copy
+import matplotlib
+import numpy
+import os
+
+if "DISPLAY" not in os.environ:
+ matplotlib.use('PDF')
+import matplotlib.pyplot as mpl
+import scipy
+from collections import namedtuple
+import logging
+
+from . import PlotWindow
+from . import Scores
+from . import Key
+
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
+
+colorStyle = [  # default (RGB color, linestyle, linewidth) of the curve of each system
+    ((0, 0, 0), '-', 2),  # black
+    ((0, 0, 1.0), '--', 2),  # blue
+    ((0.8, 0.0, 0.0), '-.', 2),  # red
+    ((0, 0.6, 0.0), ':', 2),  # green
+    ((0.5, 0.0, 0.5), '-', 2),  # magenta
+    ((0.3, 0.3, 0.0), '--', 2),  # dark yellow (olive)
+    ((0, 0, 0), ':', 2),  # black
+    ((0, 0, 1.0), ':', 2),  # blue
+    ((0.8, 0.0, 0.0), ':', 2),  # red
+    ((0, 0.6, 0.0), '-', 2),  # green
+    ((0.5, 0.0, 0.5), '-.', 2),  # magenta
+    ((0.3, 0.3, 0.0), '-', 2),  # dark yellow (olive)
+    ]
+
+grayStyle = [  # gray-level counterpart of colorStyle, same tuple layout
+    ((0, 0, 0), '-', 2),  # black
+    ((0, 0, 0), '--', 2),  # black
+    ((0, 0, 0), '-.', 2),  # black
+    ((0, 0, 0), ':', 2),  # black
+    ((0.3, 0.3, 0.3), '-', 2),  # gray
+    ((0.3, 0.3, 0.3), '--', 2),  # gray
+    ((0.3, 0.3, 0.3), '-.', 2),  # gray
+    ((0.3, 0.3, 0.3), ':', 2),  # gray
+    ((0.6, 0.6, 0.6), '-', 2),  # lighter gray
+    ((0.6, 0.6, 0.6), '--', 2),  # lighter gray
+    ((0.6, 0.6, 0.6), '-.', 2),  # lighter gray
+    ((0.6, 0.6, 0.6), ':', 2),  # lighter gray
+    ]
+
+Box = namedtuple("Box", "left right top bottom")  # limits of a DET window: pfa (left, right), pmiss (bottom, top)
+
+
+def effective_prior(Ptar, cmiss, cfa):
+    """Fold the costs of a miss and of a false alarm into the target prior.
+
+    The effective prior is the cost-weighted prior, normalized:
+        Ptar * cmiss / (Ptar * cmiss + (1 - Ptar) * cfa)
+    so that, in particular:
+        effective_prior(effective_prior(p, cmiss, cfa), 1, 1) = effective_prior(p, cmiss, cfa)
+
+    The effective prior for the NIST SRE detection cost function,
+    with Ptar = 0.01, cmiss = 10, cfa = 1 is therefore:
+        effective_prior(0.01, 10, 1) = 0.0917
+
+    :param Ptar: is the probability of a target trial
+    :param cmiss: is the cost of a miss
+    :param cfa: is the cost of a false alarm
+
+    :return: the effective prior
+    """
+    cost_tar = Ptar * cmiss
+    return cost_tar / (cost_tar + (1 - Ptar) * cfa)
+
+
+def logit_effective_prior(Ptar, cmiss, cfa):
+    """Return the effective target prior expressed as log-odds.
+
+    The prior Ptar is first adjusted for the cost of a miss, cmiss, and
+    the cost of a false alarm, cfa, as done by effective_prior:
+
+        p = Ptar * cmiss / (Ptar * cmiss + (1 - Ptar) * cfa)
+
+    and the result is mapped to the real line: log(p / (1 - p)).
+
+    For the NIST SRE detection cost function, with Ptar = 0.01,
+    cmiss = 10 and cfa = 1, the returned value is about -2.29.
+
+    :param Ptar: is the probability of a target trial
+    :param cmiss: is the cost of a miss
+    :param cfa: is the cost of a false alarm
+
+    :return: the logit (log-odds) of the effective prior
+    """
+    return __logit__(effective_prior(Ptar, cmiss, cfa))
+
+
+def __probit__(p):
+    """Map from [0,1] to [-inf,inf] as used to make DET out of a ROC
+
+    :param p: the value to map
+
+    :return: probit(p) = sqrt(2) * erfinv(2 * p - 1)
+    """
+    from scipy.special import erfinv  # a bare "import scipy" does not load scipy.special on older SciPy
+    return numpy.sqrt(2) * erfinv(2 * p - 1)
+
+
+def __logit__(p):
+    """Logit function: one-to-one mapping from probability to log-odds.
+
+    It maps the interval (0,1) to the real line; 0 and 1 are mapped to
+    -inf and +inf. The inverse function is given by sigmoid.
+
+        log_odds = logit(p) = log(p / (1 - p))
+
+    :param p: the input value, scalar or array
+    :return: logit(input)
+    """
+    p = numpy.array(p)
+    is_zero = p == 0
+    is_one = p == 1
+    inside = (p > 0) & (p < 1)
+    lp = numpy.zeros(p.shape)
+
+    if lp.shape != ():
+        lp[inside] = numpy.log(p[inside] / (1 - p[inside]))
+        lp[is_zero] = -numpy.inf
+        lp[is_one] = numpy.inf
+        return lp
+
+    if inside:
+        return numpy.log(p / (1 - p))
+    if is_zero:
+        return -numpy.inf
+    if is_one:
+        return numpy.inf
+    return lp
+
+
+def __DETsort__(x, col=''):
+    """Sort the rows of x for the computation of a DET curve.
+
+    Rows are ordered by increasing value of the first column; rows
+    sharing the same first-column value are ordered by decreasing
+    value of the second column, thereby postponing the false alarms
+    on like scores. Based on SORTROWS.
+
+    Both passes use a stable merge sort, so rows that are equal in both
+    columns end up in reverse input order.
+
+    :param x: the 2D array to sort
+    :param col: not used here,
+        kept for backward compatibility
+
+    :return: a copy of x with its rows sorted
+    """
+    assert x.ndim > 1, 'x must be a 2D matrix'
+
+    # Stable sort on the second column, then reverse: the rows are
+    # now in descending order of the second column.
+    by_second = numpy.argsort(x[:, 1], kind='mergesort')[::-1]
+
+    # A stable sort on the first column preserves that descending
+    # order among rows whose first-column values are equal.
+    by_first = numpy.argsort(x[by_second, 0], kind='mergesort')
+
+    order = by_second[by_first]
+    return x[order, :]
+
+
+def __compute_roc__(true_scores, false_scores):
+    """Compute the observed miss and false-alarm probabilities of a set
+    of detection output scores, for every position of the threshold.
+
+    true_scores (false_scores) are detection output scores for a set of
+    detection trials, given that the target hypothesis is true (false).
+    By convention, the more positive the score, the more likely is the
+    target hypothesis. Both vectors must be non-empty.
+
+    :param true_scores: a 1D array of target scores
+    :param false_scores: a 1D array of non-target scores
+
+    :return: a tuple of two vectors (Pmiss, Pfa), each with one more
+        element than the total number of trials; the first point is
+        (Pmiss, Pfa) = (0, 1)
+    """
+    n_tar = true_scores.shape[0]
+    n_non = false_scores.shape[0]
+    assert n_tar > 0, "Vector of target scores is empty"
+    assert n_non > 0, "Vector of nontarget scores is empty"
+    n_trials = n_tar + n_non
+
+    # One row per trial: [score, label], label 1 for targets and 0 for
+    # non-targets; the non-target trials occupy the first rows.
+    labelled = numpy.zeros((n_trials, 2))
+    labelled[:n_non, 0] = false_scores
+    labelled[n_non:, 0] = true_scores
+    labelled[n_non:, 1] = 1
+    labelled = __DETsort__(labelled)
+
+    # For every position k in the sorted list: targets among the first k
+    # rows, non-targets among the first k rows, non-targets after them.
+    tar_below = numpy.cumsum(labelled[:, 1], axis=0)
+    non_below = numpy.arange(1, n_trials + 1) - tar_below
+    non_above = n_non - non_below
+
+    # The leading point has no target below and every non-target above.
+    Pmiss = numpy.concatenate(([0.], tar_below / n_tar))
+    Pfa = numpy.concatenate(([1.], non_above / n_non))
+    return Pmiss, Pfa
+
+
+def __filter_roc__(pm, pfa):
+    """Drop the redundant points of a ROC curve given as (pfa, pm).
+
+    Only the points at the start and at the end of each straight
+    (horizontal or vertical) section of the curve are retained; all the
+    points internal to such a section are removed. Since the plotting
+    code draws straight lines between points, the filtered curve plots
+    exactly like the input one, only faster.
+
+    The first and the last input points are always part of the output.
+
+    :param pm: the vector of miss probabilities of the ROC curve
+    :param pfa: the vector of false-alarm probabilities of the ROC curve
+
+    :return: a tuple of two vectors (Pmiss, Pfa) holding the retained
+        points
+    """
+    kept_pm = [pm[0]]
+    kept_pfa = [pfa[0]]
+
+    for cur in range(1, pm.shape[0]):
+        # Still on the horizontal or vertical run started at the last
+        # retained point: nothing to keep yet.
+        if (pm[cur] == kept_pm[-1]) or (pfa[cur] == kept_pfa[-1]):
+            continue
+        # The run ended at the previous point, which is therefore kept.
+        kept_pm.append(pm[cur - 1])
+        kept_pfa.append(pfa[cur - 1])
+
+    # The final point always closes the curve.
+    kept_pm.append(pm[-1])
+    kept_pfa.append(pfa[-1])
+    pm = numpy.array(kept_pm)
+    pfa = numpy.array(kept_pfa)
+    return pm, pfa
+
+
+def pavx(y):
+    """PAV: Pool Adjacent Violators algorithm.
+    Non-parametric optimization subject to monotonicity.
+
+        ghat, width, height = pavx(y)
+
+    fits a vector ghat with nondecreasing components to the data vector
+    y such that sum((y - ghat) ** 2) is minimal.
+
+    The fit is piecewise constant: consecutive samples are pooled into
+    bins and every sample of a bin receives the mean of the bin. A new
+    sample is pooled with the previous bin as long as the mean of the
+    previous bin is greater than or equal to its own, so the heights of
+    the returned bins are strictly increasing.
+
+    Author: This code is a simplified version of the 'IsoMeans.m' code
+    made available by Lutz Duembgen at:
+    http://www.imsv.unibe.ch/~duembgen/software
+
+    :param y: 1-D, non-empty array of values to fit
+
+    :return: a tuple (ghat, width, height) where ghat is the fitted
+        vector (same length as y), width holds the number of samples of
+        each bin, from left to right (the number of bins is data
+        dependent) and height holds the corresponding bin means
+    """
+    assert y.ndim == 1, 'Argument should be a 1-D array'
+    assert y.shape[0] > 0, 'Input array is empty'
+    n = y.shape[0]
+
+    # Bin ci is described by the number of samples it pools, length[ci],
+    # and by the mean of these samples, ghat[ci].
+    length = numpy.zeros(n, dtype=int)
+    ghat = numpy.zeros(n)
+
+    # ci is the number of the bin considered currently.
+    ci = 0
+    length[ci] = 1
+    ghat[ci] = y[0]
+
+    for j in range(1, n):
+        # a new bin, {j}, is created:
+        ci += 1
+        length[ci] = 1
+        ghat[ci] = y[j]
+        # Short-circuit "and": ghat[ci - 1] is only read when it exists.
+        while ci >= 1 and ghat[ci - 1] >= ghat[ci]:
+            # pool adjacent violators:
+            nw = length[ci - 1] + length[ci]
+            ghat[ci - 1] = ghat[ci - 1] + (length[ci] / nw) * (ghat[ci] - ghat[ci - 1])
+            length[ci - 1] = nw
+            ci -= 1
+
+    # Only the first ci + 1 entries describe bins.
+    height = ghat[:ci + 1].copy()
+    width = length[:ci + 1].copy()
+
+    # Expand the bins back to one value per input sample. The widths sum
+    # to n, so ghat has the length of y and its last element belongs to
+    # the last bin.
+    ghat = numpy.repeat(height, width)
+
+    return ghat, width, height
+
+
+def rocch2eer(pmiss, pfa):
+    """Compute the equal error rate (EER) of a ROC Convex Hull.
+
+    pmiss and pfa hold the coordinates of the vertices of the hull, as
+    returned by rocch: pfa in non-increasing and pmiss in
+    non-decreasing order.
+
+    :param pmiss: the vector of miss probabilities
+    :param pfa: the vector of false-alarm probabilities
+
+    :return: the equal error rate
+    """
+    eer = 0
+    for start in range(pfa.shape[0] - 1):
+        seg_pfa = pfa[start:start + 2]
+        seg_pmiss = pmiss[start:start + 2]
+
+        # pfa must not increase and pmiss must not decrease along the hull
+        assert (seg_pfa[1] <= seg_pfa[0]) & (seg_pmiss[0] <= seg_pmiss[1]), \
+            'pmiss and pfa have to be sorted'
+
+        corners = numpy.column_stack((seg_pfa, seg_pmiss))
+        delta = numpy.dot(numpy.array([1, -1]), corners)
+        if numpy.min(numpy.abs(delta)) == 0:
+            candidate = 0
+        else:
+            # coefficients of the line through both end points, written as
+            # coef[0] * pfa + coef[1] * pmiss = 1
+            coef = numpy.linalg.solve(corners, numpy.array([[1], [1]]))
+            candidate = 1 / (numpy.sum(coef))
+
+        # the EER is the highest candidate over all the edges
+        eer = max([eer, candidate])
+    return eer
+
+
+def rocch(tar_scores, nontar_scores):
+    """ROCCH: ROC Convex Hull.
+
+    The hull is obtained with the Pool Adjacent Violators algorithm
+    (see pavx) applied to the ideal posteriors of the trials sorted by
+    increasing score: each PAV bin yields one edge of the hull.
+
+    Note: pmiss and pfa contain the coordinates of the vertices of the
+    ROC Convex Hull, from the point (pmiss, pfa) = (0, 1) to the point
+    (1, 0).
+
+    :param tar_scores: vector of target scores (must not be empty)
+    :param nontar_scores: vector of non-target scores (must not be empty)
+
+    :return: a tuple of two vectors: Pmiss, Pfa
+    :raise AssertionError: if one of the two vectors is empty
+    """
+    Nt = tar_scores.shape[0]
+    Nn = nontar_scores.shape[0]
+    assert Nt > 0, "Vector of target scores is empty"
+    assert Nn > 0, "Vector of nontarget scores is empty"
+    scores = numpy.concatenate((tar_scores, nontar_scores))
+    # Pideal is the ideal, but non-monotonic posterior
+    Pideal = numpy.concatenate((numpy.ones(Nt), numpy.zeros(Nn)))
+
+    # It is important here that scores that are the same
+    # (i.e. already in order) should NOT be swapped: stable sort.
+    perturb = numpy.argsort(scores, kind='mergesort')
+    Pideal = Pideal[perturb]
+
+    # Only the bin widths of the isotonic fit are needed here.
+    width = pavx(Pideal)[1]
+
+    # left[k]: number of scores to the left of the threshold placed after
+    # the k-th bin; the leading 0 stands for the leftmost threshold, which
+    # accepts everything and misses nothing.
+    left = numpy.concatenate(([0], numpy.cumsum(width)))
+
+    # Number of targets among the `left` lowest scores: the misses ...
+    tar_below = numpy.concatenate(([0.], numpy.cumsum(Pideal)))
+    miss = tar_below[left]
+    # ... and of non-targets among the remaining ones: the false alarms.
+    fa = Nn - (left - miss)
+
+    pmiss = miss / Nt
+    pfa = fa / Nn
+    return pmiss, pfa
+
+
+def sigmoid(log_odds):
+    """SIGMOID: inverse of the logit function.
+
+    One-to-one mapping from log-odds to probability: it maps the real
+    line to the interval (0,1).
+
+        p = sigmoid(log_odds) = 1 / (1 + exp(-log_odds))
+
+    :param log_odds: the input value
+
+    :return: sigmoid(input)
+    """
+    return 1 / (1 + numpy.exp(-log_odds))
+
+
+def fast_minDCF(tar, non, plo, normalize=False):
+    """Compute the minimum detection cost for given target and non-target
+    scores, at the operating point described by the prior log-odds plo.
+
+        minDCF(Ptar) = min_t Ptar * Pmiss(t) + (1 - Ptar) * Pfa(t)
+
+    where t is the adjustable decision threshold and:
+
+        Ptar = sigmoid(plo) = 1 / (1 + exp(-plo))
+
+    The minimum is searched over the vertices of the ROC Convex Hull of
+    the scores (see rocch).
+
+    If normalize is true, the returned cost is:
+
+        minDCF(Ptar) / min(Ptar, 1 - Ptar)
+
+    Pmiss and Pfa are the miss and false-alarm probabilities at the
+    minimizing threshold; they are not altered by 'normalize'. For the
+    un-normalized case:
+
+        minDCF = sigmoid(plo) * Pmiss + sigmoid(-plo) * Pfa
+
+    NOTE: the implementation builds a single row of weights from plo and
+    returns the first minimum only, i.e. plo is expected to be a scalar.
+
+    :param tar: vector of target scores
+    :param non: vector of non-target scores
+    :param plo: prior log-odds:
+        plo = logit(Ptar) = log(Ptar) - log(1 - Ptar)
+    :param normalize: if true, return normalized minDCF, else
+        un-normalized (optional, default = false)
+
+    :return: a tuple (minDCF, Pmiss, Pfa, prbep, eer):
+        - minDCF: the (optionally normalized) minimum cost
+        - Pmiss: the miss probability at this point
+        - Pfa: the false-alarm probability at this point
+        - prbep: the precision-recall break-even point, where the
+          number of false alarms equals the number of misses
+        - eer: the equal error rate of the ROC Convex Hull
+    """
+    Pmiss, Pfa = rocch(tar, non)
+    # Same hull expressed in numbers of errors instead of rates.
+    prbep = rocch2eer(Pmiss * tar.shape[0], Pfa * non.shape[0])
+    eer = rocch2eer(Pmiss, Pfa)
+
+    Ptar = sigmoid(plo)
+    Pnon = sigmoid(-plo)
+    # Cost at every vertex of the hull, stored as a single row.
+    cdet = numpy.dot(numpy.array([[Ptar, Pnon]]), numpy.vstack((Pmiss, Pfa)))
+    best = numpy.argmin(cdet, axis=1)
+    minDCF = cdet[0, best][0]
+
+    # Only the cost is normalized, not the operating point.
+    if normalize:
+        minDCF = minDCF / min([Ptar, Pnon])
+
+    return minDCF, Pmiss[best][0], Pfa[best][0], prbep, eer
+
+
+def plotseg(xx, yy, box, dps):
+    """Clip one edge of the ROC Convex Hull to the plotting box and sample it.
+
+    :param xx: the two pfa values of the edge, in non-increasing order
+    :param yy: the two pmiss values of the edge, in non-decreasing order
+    :param box: Box of pfa (left, right) and pmiss (bottom, top) limits
+    :param dps: number of intervals between the returned dots
+
+    :return: a tuple (x, y, eer): probit of the pfa and pmiss of dps + 1
+        dots along the clipped edge (empty arrays when the edge is outside
+        of the box, or is horizontal or vertical) and the EER candidate of
+        this edge (0 for a horizontal or vertical edge); xx, yy are not modified
+    """
+    assert ((xx[1] <= xx[0]) & (yy[0] <= yy[1])), 'xx and yy should be sorted'
+    # Copies: xx, yy are usually views; clipping in place corrupts the next edge.
+    xx = numpy.array(xx, dtype=float)
+    yy = numpy.array(yy, dtype=float)
+    XY = numpy.column_stack((xx, yy))
+    dd = numpy.dot(numpy.array([1, -1]), XY)
+    degenerate = numpy.min(abs(dd)) == 0
+    eer = 0
+    if not degenerate:
+        # line through both points, written as a * x + b * y = 1
+        a, b = numpy.linalg.solve(XY, numpy.ones(2))
+        eer = 1.0 / (a + b)
+
+    outside = ((xx[0] < box.left) | (xx[1] > box.right)
+               | (yy[1] < box.bottom) | (yy[0] > box.top))
+    if outside or degenerate:
+        # nothing to draw; horizontal and vertical edges are skipped
+        return numpy.array([]), numpy.array([]), eer
+    if xx[1] < box.left:
+        xx[1] = box.left
+        yy[1] = (1 - a * box.left) / b
+    if xx[0] > box.right:
+        xx[0] = box.right
+        yy[0] = (1 - a * box.right) / b
+    if yy[0] < box.bottom:
+        yy[0] = box.bottom
+        xx[0] = (1 - b * box.bottom) / a
+    if yy[1] > box.top:
+        yy[1] = box.top
+        xx[1] = (1 - b * box.top) / a
+
+    dx = xx[1] - xx[0]
+    xdots = xx[0] + dx * numpy.arange(dps + 1) / dps
+    ydots = (1 - a * xdots) / b
+    return __probit__(xdots), __probit__(ydots), eer
+
+
+def rocchdet(tar, non,
+             dcfweights=numpy.array([]),
+             pfa_min=5e-4,
+             pfa_max=0.5,
+             pmiss_min=5e-4,
+             pmiss_max=0.5,
+             dps=100,
+             normalize=False):
+    """ROCCHDET: compute the ROC Convex Hull and map it to the DET axes.
+
+    Since the DET-curve is infinite, non-trivial limits (away from 0 and
+    1) are mandatory.
+
+    :param tar: vector of target scores
+    :param non: vector of non-target scores
+    :param dcfweights: 2-vector such that DCF = [pmiss, pfa] * dcfweights(:)
+        (optional: provide it only if mindcf is desired, otherwise omit it
+        or use an empty array)
+    :param pfa_min: limit of the DET-curve rectangle. Default is 0.0005
+    :param pfa_max: limit of the DET-curve rectangle. Default is 0.5
+    :param pmiss_min: limit of the DET-curve rectangle. Default is 0.0005
+    :param pmiss_max: limit of the DET-curve rectangle. Default is 0.5
+    :param dps: number of (x, y) dots (arranged in a curve) in DET space
+        for every straight edge of the ROC Convex Hull. Default is 100
+    :param normalize: if true, divide mindcf by the smallest DCF weight
+
+    :return: a tuple (x, y, eer, mindcf): probit(Pfa) and probit(Pmiss)
+        of the dots of the curve, the ROCCH EER (intersection of the
+        ROCCH with the line pfa = pmiss) and the mindcf obtained by
+        minimizing over the ROCCH vertices (0.0 when dcfweights is not a
+        2-vector)
+    """
+    assert ((pfa_min > 0) & (pfa_max < 1) & (pmiss_min > 0) & (pmiss_max < 1)), 'limits must be strictly inside (0,1)'
+    assert((pfa_min < pfa_max) & (pmiss_min < pmiss_max)), 'pfa and pmiss min and max values are not consistent'
+
+    pmiss, pfa = rocch(tar, non)
+
+    # mindcf is only computed when a pair of weights is supplied
+    mindcf = 0.0
+    if dcfweights.shape == (2,):
+        mindcf = numpy.min(numpy.dot(dcfweights, numpy.vstack((pmiss, pfa))))
+        if normalize:
+            mindcf = mindcf/min(dcfweights)
+
+    # pfa is decreasing and pmiss is increasing along the hull
+    box = Box(left=pfa_min, right=pfa_max, top=pmiss_max, bottom=pmiss_min)
+
+    # one piece of curve per edge of the hull
+    x = []
+    y = []
+    eer = 0
+    for first in range(pfa.shape[0] - 1):
+        last = first + 2
+        xdots, ydots, eerseg = plotseg(pfa[first:last], pmiss[first:last], box, dps)
+        x.extend(xdots.tolist())
+        y.extend(ydots.tolist())
+        eer = max(eer, eerseg)
+
+    return numpy.array(x), numpy.array(y), eer, mindcf
+
+
+class DetPlot:
+    """A class for creating a plot for displaying detection performance
+    with the axes scaled and labelled so that a normal Gaussian
+    distribution will plot as a straight line.
+
+        - The y axis represents the miss probability.
+        - The x axis represents the false alarm probability.
+
+    :attr __plotwindow__: PlotWindow object to plot into
+    :attr __title__: title of the plot
+    :attr __sys_name__: list of IDs of the systems
+    :attr __tar__: list of arrays of target scores for each system
+    :attr __non__: list of arrays of the non-target scores for each system
+    :attr __figure__: figure to plot into
+    """
+
+    def __init__(self, window_style='old', plot_title=''):
+        """Initialize an empty DetPlot object"""
+        self.__plotwindow__ = PlotWindow(window_style)
+        self.__title__ = plot_title
+        self.__sys_name__ = []
+        self.__tar__ = []
+        self.__non__ = []
+        self.__figure__ = ''
+        self.title = ''  # public copy of the title, updated by set_title
+
+    def set_title(self, title):
+        """Modify the title used by create_figure and for the saved file.
+
+        :param title: title of the plot to display
+        """
+        self.__title__ = self.title = title
+
+    def create_figure(self, idx=0):
+        """Create a figure to plot the DET-curve.
+        Default plot everything on one single figure
+
+        The axes use the probit scale: the limits and the ticks of the
+        PlotWindow are probabilities, mapped with __probit__, whereas the
+        tick labels are used as they are.
+
+        :param idx: Index of the figure to create. Default is 0.
+        """
+        window = self.__plotwindow__
+        # limits of the window, mapped to the probit scale of the axes
+        x_min = __probit__(window.__pfa_limits__[0])
+        x_max = __probit__(window.__pfa_limits__[1])
+        y_min = __probit__(window.__pmiss_limits__[0])
+        y_max = __probit__(window.__pmiss_limits__[1])
+
+        self.__figure__ = mpl.figure(idx)
+        ax = self.__figure__.add_subplot(111)
+        ax.set_aspect('equal')
+        mpl.axis([x_min, x_max, y_min, y_max])
+        # the ticks are located on the probit scale as well
+        ax.set_xticks(__probit__(window.__xticks__))
+        ax.set_xticklabels(window.__xticklabels__, size='x-small')
+        ax.set_yticks(__probit__(window.__yticks__))
+        ax.set_yticklabels(window.__yticklabels__, size='x-small')
+        if not self.__title__ == '':
+            mpl.title(self.__title__)
+        mpl.grid(True)
+        mpl.xlabel('False Acceptance Rate [in %]')
+        mpl.ylabel('False Rejection Rate [in %]')
+
+        # assuring, limits are kept by matplotlib after probit transform of axes
+        mpl.gca().set_xlim(left=x_min, right=x_max)
+        mpl.gca().set_ylim(bottom=y_min, top=y_max)
+
+    def set_system(self, tar, non, sys_name=''):
+        """Sets the scores to be plotted. This function must be called
+        before plots are made for a system, but it can be called several
+        times with different systems (with calls to plotting functions in
+        between) so that curves for different systems appear on the same plot.
+
+        :param tar: A vector of target scores.
+        :param non: A vector of non-target scores.
+        :param sys_name: A string describing the system. This string will
+            be prepended to the plot names in the legend.
+            You can pass an empty string to this argument or omit it.
+
+        """
+        assert tar.ndim == 1, 'Vector of target scores should be 1-dimensional'
+        assert non.ndim == 1, 'Vector of nontarget scores should be 1-dimensional'
+        assert tar.shape[0] > 0, 'Vector of target scores is empty'
+        assert non.shape[0] > 0, 'Vector of nontarget scores is empty'
+        self.__sys_name__.append(sys_name)
+        self.__tar__.append(tar)
+        self.__non__.append(non)
+
+    def set_system_from_scores(self, scores, key, sys_name=''):
+        """Sets the scores to be plotted. This function must be called
+        before plots are made for a system, but it can be called several
+        times with different systems (with calls to plotting functions in
+        between) so that curves for different systems appear on the same plot.
+
+        :param scores: A Scores object containing system scores.
+        :param key: A Key object for distinguishing target and non-target scores.
+        :param sys_name: A string describing the system. This string will be
+            prepended to the plot names in the legend. You can pass an
+            empty string to this argument or omit it.
+
+        """
+        assert isinstance(scores, Scores), 'First argument should be a Score object'
+        assert isinstance(key, Key), 'Second argument should be a Key object'
+        assert scores.validate(), 'Wrong format of Scores'
+        assert key.validate(), 'Wrong format of Key'
+        tar, non = scores.get_tar_non(key)
+        self.set_system(tar, non, sys_name)
+
+    def plot_steppy_det(self, idx=0, style='color', plot_args=''):
+        """Plots a DET curve.
+
+        :param idx: index of the system to plot (order of the set_system calls)
+        :param style: style of the curve, can be gray or color
+        :param plot_args: a tuple (color, linestyle, linewidth) that controls
+            the appearance of the curve; any other value selects the default
+            appearance of the style
+        """
+        Pmiss, Pfa = __compute_roc__(self.__tar__[idx], self.__non__[idx])
+        Pmiss, Pfa = __filter_roc__(Pmiss, Pfa)
+
+        x = __probit__(Pfa)
+        y = __probit__(Pmiss)
+
+        # In case the plotting arguments are not specified, they are initialized
+        # by using default values (reused cyclically when idx exceeds their number)
+        if not (isinstance(plot_args, tuple) and (len(plot_args) == 3)):
+            if style == 'gray':
+                plot_args = grayStyle[idx % len(grayStyle)]
+            else:
+                plot_args = colorStyle[idx % len(colorStyle)]
+
+        mpl.plot(x, y,
+                 label=self.__sys_name__[idx],
+                 color=plot_args[0],
+                 linestyle=plot_args[1],
+                 linewidth=plot_args[2])
+        mpl.legend()
+        if matplotlib.get_backend() == 'agg':
+            mpl.savefig(self.__title__ + '.pdf')
+
+    def plot_rocch_det(self, idx=0, style='color', target_prior=0.001, plot_args=''):
+        """Plots a DET curve using the ROCCH.
+
+        :param idx: index of the system to plot (order of the set_system calls)
+        :param style: style of the DET-curve, can be gray or color
+        :param target_prior: prior of the target trials
+        :param plot_args: a tuple (color, linestyle, linewidth) that controls
+            the appearance of the curve; any other value selects the default
+        """
+        # In case the plotting arguments are not specified, they are initialized
+        # by using default values (reused cyclically when idx exceeds their number)
+        if not (isinstance(plot_args, tuple) and (len(plot_args) == 3)):
+            if style == 'gray':
+                plot_args = grayStyle[idx % len(grayStyle)]
+            else:
+                plot_args = colorStyle[idx % len(colorStyle)]
+
+        pfa_min = self.__plotwindow__.__pfa_limits__[0]
+        pfa_max = self.__plotwindow__.__pfa_limits__[1]
+        pmiss_min = self.__plotwindow__.__pmiss_limits__[0]
+        pmiss_max = self.__plotwindow__.__pmiss_limits__[1]
+
+        dps = 100  # dots per segment
+
+        tmp = numpy.array([sigmoid(__logit__(target_prior)), sigmoid(-__logit__(target_prior))])
+        x, y, eer, mindcf = rocchdet(self.__tar__[idx], self.__non__[idx],
+                                     tmp, pfa_min, pfa_max,
+                                     pmiss_min, pmiss_max, dps, normalize=True)
+
+        mpl.plot(x, y,
+                 label='{}; (eer; minDCF) = ({:.03}; {:.04})'.format(
+                     self.__sys_name__[idx],
+                     100. * eer,
+                     100. * mindcf),
+                 color=plot_args[0],
+                 linestyle=plot_args[1],
+                 linewidth=plot_args[2])
+        mpl.legend()
+        if matplotlib.get_backend() == 'agg':
+            mpl.savefig(self.__title__ + '.pdf')
+
+    def plot_mindcf_point(self, target_prior, idx=0, plot_args='ok', legend_string=''):
+        """Places the mindcf point for the current system.
+
+        :param target_prior: The effective target prior.
+        :param idx: index of the system (order of the set_system calls)
+        :param plot_args: format argument passed to 'plot' (e.g. 'ok') that
+            controls the appearance of the point.
+        :param legend_string: Optional. A string to describe this point
+            in the legend (currently not used).
+        """
+        mindcf, pmiss, pfa, prbep, eer = fast_minDCF(self.__tar__[idx],
+                                                     self.__non__[idx], __logit__(target_prior), True)
+        if (pfa < self.__plotwindow__.__pfa_limits__[0]) | (pfa > self.__plotwindow__.__pfa_limits__[1]):
+            logging.warning('pfa of %f is not between %f and %f mindcf point will not be plotted.', pfa,
+                            self.__plotwindow__.__pfa_limits__[0],
+                            self.__plotwindow__.__pfa_limits__[1])
+        elif (pmiss < self.__plotwindow__.__pmiss_limits__[0]) | (pmiss > self.__plotwindow__.__pmiss_limits__[1]):
+            logging.warning('pmiss of %f is not between %f and %f. The mindcf point will not be plotted.',
+                            pmiss, self.__plotwindow__.__pmiss_limits__[0],
+                            self.__plotwindow__.__pmiss_limits__[1])
+        else:
+            mpl.plot(__probit__(pfa), __probit__(pmiss), plot_args)
+            if matplotlib.get_backend() == 'agg':
+                mpl.savefig(self.__title__ + '.pdf')
+
+    def plot_DR30_fa(self,
+                     idx=0,
+                     plot_args=((0, 0, 0), '--', 1),
+                     legend_string=''):
+        """Plots a vertical line indicating the Doddington 30 point for
+        false alarms. This is the point left of which the number of false
+        alarms is below 30, so that the estimate of the false alarm rate
+        is no longer good enough to satisfy Doddington's Rule of 30.
+        The line spans the whole height of the axes.
+
+        :param idx: index of the system, in the order of the calls to
+            set_system
+        :param plot_args: a tuple (color, linestyle, linewidth) that
+            controls the appearance of the line.
+        :param legend_string: Optional. A string to describe this line
+            in the legend.
+        """
+        assert isinstance(plot_args, tuple) & (len(plot_args) == 3), 'Invalid plot_args'
+
+        pfa_min = self.__plotwindow__.__pfa_limits__[0]
+        pfa_max = self.__plotwindow__.__pfa_limits__[1]
+        pfaval = 30.0 / self.__non__[idx].shape[0]
+
+        if (pfaval < pfa_min) | (pfaval > pfa_max):
+            logging.warning('Pfa DR30 of %f is not between %f and %f Pfa DR30 line will not be plotted.',
+                            pfaval, pfa_min, pfa_max)
+        else:
+            # ymin and ymax of axvline are fractions of the axes height, not
+            # data coordinates: their defaults (0, 1) span the whole window.
+            drx = __probit__(pfaval)
+            mpl.axvline(drx,
+                        color=plot_args[0],
+                        linestyle=plot_args[1],
+                        linewidth=plot_args[2],
+                        label=legend_string)
+
+    def plot_DR30_miss(self,
+                       idx=0,
+                       plot_args=((0, 0, 0), '--', 1),
+                       legend_string=''):
+        """Plots a horizontal line indicating the Doddington 30 point for
+        misses. This is the point above which the number of misses is
+        below 30, so that the estimate of the miss rate is no longer good
+        enough to satisfy Doddington's Rule of 30.
+        The line spans the whole width of the axes.
+
+        :param idx: index of the system, in the order of the calls to
+            set_system
+        :param plot_args: a tuple (color, linestyle, linewidth) that
+            controls the appearance of the line.
+        :param legend_string: Optional. A string to describe this line
+            in the legend.
+        """
+        assert isinstance(plot_args, tuple) & (len(plot_args) == 3), 'Invalid plot_args'
+
+        pmiss_min = self.__plotwindow__.__pmiss_limits__[0]
+        pmiss_max = self.__plotwindow__.__pmiss_limits__[1]
+        pmissval = 30.0 / self.__tar__[idx].shape[0]
+
+        if (pmissval < pmiss_min) | (pmissval > pmiss_max):
+            logging.warning('Pmiss DR30 of %f is not between %f and %f Pmiss DR30 line will not be plotted.',
+                            pmissval, pmiss_min, pmiss_max)
+        else:
+            # xmin and xmax of axhline are fractions of the axes width, not
+            # data coordinates: their defaults (0, 1) span the whole window.
+            dry = __probit__(pmissval)
+            mpl.axhline(y=dry,
+                        color=plot_args[0],
+                        linestyle=plot_args[1],
+                        linewidth=plot_args[2],
+                        label=legend_string)
+
+    def plot_DR30_both(self,
+                       idx=0,
+                       plot_args_fa=((0, 0, 0), '--', 1),
+                       plot_args_miss=((0, 0, 0), '--', 1),
+                       legend_string=''):
+        """Plots two lines indicating Doddington's Rule of 30 points: one
+        for false alarms and one for misses. See the documentation of
+        plot_DR30_fa and plot_DR30_miss for details.
+
+        :param idx: index of the system (order of the set_system calls)
+        :param plot_args_fa: A tuple of arguments to be passed to 'plot' that control
+            the appearance of the DR30_fa point.
+        :param plot_args_miss: A tuple of arguments to be passed to 'plot' that control
+            the appearance of the DR30_miss point.
+        :param legend_string: not used: the two lines are labelled
+            'pfa DR30' and 'pmiss DR30' in the legend.
+        """
+        self.plot_DR30_fa(idx, plot_args_fa, 'pfa DR30')
+        self.plot_DR30_miss(idx, plot_args_miss, 'pmiss DR30')
+
+    def display_legend(self):
+        """Not implemented yet: does nothing."""
+        pass
+
+    def save_as_pdf(self, outfilename):
+        """Not implemented yet: does nothing."""
+        pass
+
+    def add_legend_entry(self, lh, legend_string, append_name):
+        """Not implemented yet: does nothing."""
+        pass
+
+
diff --git a/sidekit/sidekit/bosaris/idmap.py b/sidekit/sidekit/bosaris/idmap.py
new file mode 100755
index 0000000..1c19412
--- /dev/null
+++ b/sidekit/sidekit/bosaris/idmap.py
@@ -0,0 +1,386 @@
+# -*- coding: utf-8 -*-
+
+# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to
+# translate this code and provide the community with efficient structures
+# and tools.
+#
+# The BOSARIS Toolkit is a collection of functions and classes in Matlab
+# that can be used to calibrate, fuse and plot scores from speaker recognition
+# (or other fields in which scores are used to test the hypothesis that two
+# samples are from the same source) trials involving a model and a test segment.
+# The toolkit was written at the BOSARIS2010 workshop which took place at the
+# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+# See the User Guide (available on the toolkit website) for a discussion of the
+# theory behind the toolkit and descriptions of some of the algorithms used.
+#
+# The BOSARIS toolkit in MATLAB can be downloaded from `the website
+# <https://sites.google.com/site/bosaristoolkit/>`_.
+
+"""
+This is the 'idmap' module
+"""
+import sys
+import numpy
+import logging
+import copy
+import h5py
+
+from ..sidekit_wrappers import check_path_existance
+
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
+
+class IdMap:
+    """A class that stores a map between identifiers (strings). One
+    list is called 'leftids' and the other 'rightids'. The class
+    provides methods that convert a sequence of left ids to a
+    sequence of right ids and vice versa. If `leftids` or `rightids`
+    contains duplicates, the mapping methods keep the last occurrence
+    of each duplicated identifier.
+
+    :attr leftids: a list of classes in a ndarray
+    :attr rightids: a list of segments in a ndarray
+    :attr start: index of the first frame of the segment
+    :attr stop: index of the last frame of the segment
+    """
+
+    def __init__(self, idmap_filename=''):
+        """Initialize an IdMap object
+
+        :param idmap_filename: name of a file to load. Default is ''.
+        In case the idmap_filename is empty, initialize an empty IdMap object.
+        """
+        self.leftids = numpy.empty(0, dtype="|O")
+        self.rightids = numpy.empty(0, dtype="|O")
+        self.start = numpy.empty(0, dtype="|O")
+        self.stop = numpy.empty(0, dtype="|O")
+
+        if idmap_filename == '':
+            pass
+        else:
+            tmp = IdMap.read(idmap_filename)
+            self.leftids = tmp.leftids
+            self.rightids = tmp.rightids
+            self.start = tmp.start
+            self.stop = tmp.stop
+
+    def __repr__(self):
+        ch = '-' * 30 + '\n'
+        ch += 'left ids:' + self.leftids.__repr__() + '\n'
+        ch += 'right ids:' + self.rightids.__repr__() + '\n'
+        ch += 'seg start:' + self.start.__repr__() + '\n'
+        ch += 'seg stop:' + self.stop.__repr__() + '\n'
+        ch += '-' * 30 + '\n'
+        return ch
+
+    @check_path_existance
+    def write(self, output_file_name):
+        """ Save IdMap in HDF5 format
+
+        :param output_file_name: name of the file to write to
+        """
+        assert self.validate(), "Error: wrong IdMap format"
+        with h5py.File(output_file_name, "w") as f:
+            f.create_dataset("leftids", data=self.leftids.astype('S'),
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            f.create_dataset("rightids", data=self.rightids.astype('S'),
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            # WRITE START and STOP
+            start = copy.deepcopy(self.start)
+            start[numpy.isnan(self.start.astype('float'))] = -1
+            start = start.astype('int32', copy=False)
+
+            stop = copy.deepcopy(self.stop)
+            stop[numpy.isnan(self.stop.astype('float'))] = -1
+            stop = stop.astype('int32', copy=False)
+
+            f.create_dataset("start", data=start,
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            f.create_dataset("stop", data=stop,
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+
+    @check_path_existance
+    def write_txt(self, output_file_name):
+        """Save as text, 'left right [start stop]' per line (None bounds omitted).
+
+        :param output_file_name: name of the output text file
+        """
+        with open(output_file_name, 'w') as outputFile:
+            for left, right, start, stop in zip(self.leftids, self.rightids, self.start, self.stop):
+                bounds = [str(value) for value in (start, stop) if value is not None]
+                outputFile.write(' '.join(filter(None, [left, right] + bounds)) + '\n')
+
+    def map_left_to_right(self, leftidlist):
+        """Maps an array of ids to a new array of ids using the given map.
+        The input ids are matched against the leftids of the map and the
+        output ids are taken from the corresponding rightids of the map.
+        Ids of leftidlist that are absent from the map are skipped.
+
+        Beware: if leftids are not unique in the IdMap, only the last value
+        corresponding is kept
+
+        :param leftidlist: an array of strings to be matched against the
+          leftids of the idmap. The rightids corresponding to these
+          leftids will be returned. Repeated ids are mapped each time.
+
+        :return: an array of strings that are the mappings of the
+          strings in leftidlist.
+        """
+        tmp_dict = dict(zip(self.leftids, self.rightids))
+
+        # One output per mappable input, so that an id repeated in leftidlist
+        # is mapped every time it occurs (the result is not sized from unique ids).
+        mapped = [tmp_dict[left] for left in leftidlist if left in tmp_dict]
+        rightids = numpy.empty(len(mapped), '|O')
+        rightids[:] = mapped
+
+        # number of distinct input ids that are missing from the map
+        lost_ids = len({left for left in leftidlist if left not in tmp_dict})
+        if lost_ids:
+            logging.warning('{} ids could not be mapped'.format(lost_ids))
+
+        return rightids
+
+    def map_right_to_left(self, rightidlist):
+        """Maps an array of ids to a new array of ids using the given map.
+        The input ids are matched against the rightids of the map and the
+        output ids are taken from the corresponding leftids of the map.
+        Ids of rightidlist that are absent from the map are skipped.
+
+        Beware: if rightids are not unique in the IdMap, only the last value
+        corresponding is kept
+
+        :param rightidlist: An array of strings to be matched against the
+          rightids of the idmap. The leftids corresponding to these
+          rightids will be returned. Repeated ids are mapped each time.
+
+        :return: an array of strings that are the mappings of the
+          strings in rightidlist.
+        """
+        tmp_dict = dict(zip(self.rightids, self.leftids))
+
+        # One output per mappable input, so that an id repeated in rightidlist
+        # is mapped every time it occurs (the result is not sized from unique ids).
+        mapped = [tmp_dict[right] for right in rightidlist if right in tmp_dict]
+        leftids = numpy.empty(len(mapped), '|O')
+        leftids[:] = mapped
+
+        # number of distinct input ids that are missing from the map
+        lost_ids = len({right for right in rightidlist if right not in tmp_dict})
+        if lost_ids:
+            logging.warning('{} ids could not be mapped'.format(lost_ids))
+
+        return leftids
+
+    def filter_on_left(self, idlist, keep):
+        """Removes some of the information in an idmap. Depending on the
+        value of 'keep', the idlist indicates the strings to retain or
+        the strings to discard.
+
+        :param idlist: an array of strings which will be compared with
+            the leftids of the current IdMap.
+        :param keep: A boolean indicating whether idlist contains the ids to
+            keep or to discard.
+
+        :return: a filtered version of the current IdMap.
+        """
+        # get the list of ids to keep
+        if keep:
+            keepids = numpy.unique(idlist)
+        else:
+            keepids = numpy.setdiff1d(self.leftids, idlist)
+
+        keep_idx = numpy.isin(self.leftids, keepids)
+        out_idmap = IdMap()
+        out_idmap.leftids = self.leftids[keep_idx]
+        out_idmap.rightids = self.rightids[keep_idx]
+        out_idmap.start = self.start[keep_idx]
+        out_idmap.stop = self.stop[keep_idx]
+
+        return out_idmap
+
+    def filter_on_right(self, idlist, keep):
+        """Removes some of the information in an idmap. Depending on the
+        value of 'keep', the idlist indicates the strings to retain or
+        the strings to discard.
+
+        :param idlist: an array of strings which will be compared with
+            the rightids of the current IdMap.
+        :param keep: a boolean indicating whether idlist contains the ids to
+            keep or to discard.
+
+        :return: a filtered version of the current IdMap.
+        """
+        # get the list of ids to keep
+        if keep:
+            keepids = numpy.unique(idlist)
+        else:
+            keepids = numpy.setdiff1d(self.rightids, idlist)
+
+        keep_idx = numpy.isin(self.rightids, keepids)
+        out_idmap = IdMap()
+        out_idmap.leftids = self.leftids[keep_idx]
+        out_idmap.rightids = self.rightids[keep_idx]
+        out_idmap.start = self.start[keep_idx]
+        out_idmap.stop = self.stop[keep_idx]
+        return out_idmap
+
+    def validate(self, warn=False):
+        """Checks that an object of type Id_Map obeys certain rules that
+        must always be true.
+
+        :param warn: boolean. If True, print a warning if strings are
+          duplicated in either left or right array
+
+        :return: a boolean value indicating whether the object is valid.
+        """
+        # 'and', not '&': '&' binds tighter than '==' and let ndim == 3 pass
+        ok = (self.leftids.shape == self.rightids.shape == self.start.shape == self.stop.shape) and self.leftids.ndim == 1
+
+        if warn and (self.leftids.shape != numpy.unique(self.leftids).shape):
+            logging.warning('The left id list contains duplicate identifiers')
+        if warn and (self.rightids.shape != numpy.unique(self.rightids).shape):
+            logging.warning('The right id list contains duplicate identifiers')
+        return ok
+
+    def set(self, left, right, start=None, stop=None):
+        """Replace the content of the IdMap with deep copies of the inputs.
+
+        :param left: array of left ids
+        :param right: array of right ids (its shape sizes the unset bounds)
+        :param start: optional array of segment starts; all None when omitted
+        :param stop: optional array of segment stops; all None when omitted
+        """
+        self.leftids = copy.deepcopy(left)
+        self.rightids = copy.deepcopy(right)
+        unset = self.rightids.shape
+        self.start = numpy.empty(unset, '|O') if start is None else copy.deepcopy(start)
+        self.stop = numpy.empty(unset, '|O') if stop is None else copy.deepcopy(stop)
+
+    @staticmethod
+    def read(input_file_name):
+        """Read IdMap in hdf5 format.
+
+        :param input_file_name: name of the file to read from
+        """
+        with h5py.File(input_file_name, "r") as f:
+            idmap = IdMap()
+
+            idmap.leftids = f.get("leftids")[()]
+            idmap.rightids = f.get("rightids")[()]
+
+            # if running python 3, need a conversion to unicode
+            if sys.version_info[0] == 3:
+                idmap.leftids = idmap.leftids.astype('U255', copy=False)
+                idmap.rightids = idmap.rightids.astype('U255', copy=False)
+
+            tmpstart = f.get("start")[()]
+            tmpstop = f.get("stop")[()]
+            idmap.start = numpy.empty(f["start"].shape, '|O')
+            idmap.stop = numpy.empty(f["stop"].shape, '|O')
+            idmap.start[tmpstart != -1] = tmpstart[tmpstart != -1]
+            idmap.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
+
+            assert idmap.validate(), "Error: wrong IdMap format"
+            return idmap
+
+    @classmethod
+    @check_path_existance
+    def read_txt(cls, input_file_name):
+        """Read IdMap in text format.
+
+        The first line gives the layout: 2 columns (left, right) or 4 columns
+        (left, right, start, stop). Any other layout yields an empty IdMap.
+
+        :param input_file_name: name of the file to read from
+        """
+        idmap = IdMap()
+
+        with open(input_file_name, "r") as f:
+            columns = len(f.readline().split())
+
+        if columns == 2:
+            idmap.leftids, idmap.rightids = numpy.loadtxt(input_file_name,
+                                                          dtype={'names': ('left', 'right'), 'formats': ('|O', '|O')},
+                                                          usecols=(0, 1), unpack=True)
+            idmap.start = numpy.empty(idmap.rightids.shape, '|O')
+            idmap.stop = numpy.empty(idmap.rightids.shape, '|O')
+        elif columns == 4:
+            idmap.leftids, idmap.rightids, idmap.start, idmap.stop = numpy.loadtxt(
+                input_file_name,
+                dtype={'names': ('left', 'right', 'start', 'stop'),
+                       'formats': ('|O', '|O', 'int', 'int')}, unpack=True)
+
+        if not idmap.validate():
+            raise Exception('Wrong format of IdMap')
+        return idmap
+
+    def merge(self, idmap2):
+        """Merges the current IdMap with another IdMap.
+
+        Entries of idmap2 whose (left, right) pair already exists in the
+        current IdMap are dropped; the others are appended in order.
+
+        :param idmap2: Another Id_Map object.
+
+        :return: an Id_Map object that contains the information from the two
+          input Id_Maps.
+        :raise Exception: if an input or the merged IdMap is not valid.
+        """
+        if not (self.validate() and idmap2.validate()):
+            raise Exception('Cannot merge IdMaps, wrong type')
+
+        # set of (model, seg) pairs of the current IdMap for O(1) membership tests
+        known = set(zip(self.leftids, self.rightids))
+        # positions in idmap2 of the sessions that are new; the explicit integer
+        # dtype keeps an empty selection usable as an index
+        idx_new = numpy.array([idx for idx, sess in enumerate(zip(idmap2.leftids, idmap2.rightids))
+                               if sess not in known], dtype=int)
+
+        idmap = IdMap()
+        idmap.leftids = numpy.concatenate((self.leftids, idmap2.leftids[idx_new]), axis=0)
+        idmap.rightids = numpy.concatenate((self.rightids, idmap2.rightids[idx_new]), axis=0)
+        idmap.start = numpy.concatenate((self.start, idmap2.start[idx_new]), axis=0)
+        idmap.stop = numpy.concatenate((self.stop, idmap2.stop[idx_new]), axis=0)
+
+        if not idmap.validate():
+            raise Exception('Wrong format of IdMap')
+
+        return idmap
+
+    def split(self, N):
+        """
+        Split an IdMap object into N IdMap of same size (if possible)
+
+        :param N: the number of IdMap to generate
+        :return: a list of IdMap
+        """
+        session_nb = self.leftids.shape[0]
+        sub_indices = numpy.array_split(numpy.arange(session_nb), N)
+
+        im_list = []
+        for ii in range(N):
+            im_list.append(IdMap())
+            im_list[ii].leftids = self.leftids[sub_indices[ii]]
+            im_list[ii].rightids = self.rightids[sub_indices[ii]]
+            im_list[ii].start = self.start[sub_indices[ii]]
+            im_list[ii].stop = self.stop[sub_indices[ii]]
+            assert im_list[ii].validate(), "Error: wrong IdMap format"
+
+        return im_list
diff --git a/sidekit/sidekit/bosaris/key.py b/sidekit/sidekit/bosaris/key.py
new file mode 100755
index 0000000..cf0cb1f
--- /dev/null
+++ b/sidekit/sidekit/bosaris/key.py
@@ -0,0 +1,360 @@
+# -*- coding: utf-8 -*-
+
+# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to
+# translate this code and provide the community with efficient structures
+# and tools.
+#
+# The BOSARIS Toolkit is a collection of functions and classes in Matlab
+# that can be used to calibrate, fuse and plot scores from speaker recognition
+# (or other fields in which scores are used to test the hypothesis that two
+# samples are from the same source) trials involving a model and a test segment.
+# The toolkit was written at the BOSARIS2010 workshop which took place at the
+# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+# See the User Guide (available on the toolkit website) for a discussion of the
+# theory behind the toolkit and descriptions of some of the algorithms used.
+#
+# The BOSARIS toolkit in MATLAB can be downloaded from `the website
+# <https://sites.google.com/site/bosaristoolkit/>`_.
+
+"""
+This is the 'key' module
+"""
+import h5py
+import logging
+import numpy
+import sys
+from .ndx import Ndx
+from ..sidekit_wrappers import check_path_existance
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
+
+def diff(list1, list2):
+    """Return, as a sorted list, the items of list1 that are not in list2."""
+    missing = (item for item in list1 if item not in list2)
+    return sorted(missing)
+
+
+def ismember(list1, list2):
+    """Return one boolean per item of list1: True when list2 contains it."""
+    return [candidate in list2 for candidate in list1]
+
+
+class Key:
+    """A class for representing a Key i.e. it classifies trials as
+    target or non-target trials.
+
+    :attr modelset: list of the models into a ndarray of strings
+    :attr segset: list of the test segments into a ndarray of strings
+    :attr tar: 2D ndarray of booleans which rows correspond to the models
+            and columns to the test segments. True if target trial.
+    :attr non: 2D ndarray of booleans which rows correspond to the models
+            and columns to the test segments. True if non-target trial.
+    """
+
+    def __init__(self, key_file_name=None,
+                 models=numpy.array([]),
+                 testsegs=numpy.array([]),
+                 trials=numpy.array([])):
+        """Initialize a Key object.
+        :param key_file_name: name of the file to load. Default is None.
+        :param models: ndarray with the model of each trial
+        :param testsegs: ndarray with the test segment of each trial
+        :param trials: ndarray with the 'target' / 'nontarget' label of each trial
+
+        In case the key_file_name is None, the Key is built from the three arrays.
+        """
+        self.modelset = numpy.empty(0, dtype="|O")
+        self.segset = numpy.empty(0, dtype="|O")
+        self.tar = numpy.array([], dtype="bool")
+        self.non = numpy.array([], dtype="bool")
+
+        if key_file_name is None and models is None and testsegs is None and trials is None:
+            pass
+        elif key_file_name is None:
+            modelset = numpy.unique(models)
+            segset = numpy.unique(testsegs)
+
+            tar = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+            non = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+
+            for idx_m, model in enumerate(modelset):
+                idx_current_model = numpy.argwhere(models == model).flatten()
+                current_model_keys = dict(zip(testsegs[idx_current_model],
+                                              trials[idx_current_model]))
+                for idx_s, seg in enumerate(segset):
+                    if seg in current_model_keys:
+                        tar[idx_m, idx_s] = (current_model_keys[seg] == 'target')
+                        non[idx_m, idx_s] = (current_model_keys[seg] == 'nontarget')
+
+            self.modelset = modelset
+            self.segset = segset
+            self.tar = tar
+            self.non = non
+            assert self.validate(), "Wrong Key format"
+
+        else:
+            tmp = self.read(key_file_name)
+            self.modelset = tmp.modelset
+            self.segset = tmp.segset
+            self.tar = tmp.tar
+            self.non = tmp.non
+
+    @classmethod
+    def create(cls, modelset, segset, tar, non):
+        key = Key()
+        key.modelset = modelset
+        key.segset = segset
+        key.tar = tar
+        key.non = non
+        assert key.validate(), "Wrong Key format"
+        return key
+
+    @check_path_existance
+    def write(self, output_file_name):
+        """ Save Key in HDF5 format
+
+        :param output_file_name: name of the file to write to
+        """
+        assert self.validate(), "Error: wrong Key format"
+
+        with h5py.File(output_file_name, "w") as f:
+            f.create_dataset("modelset", data=self.modelset.astype('S'),
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            f.create_dataset("segset", data=self.segset.astype('S'),
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            trialmask = numpy.array(self.tar, dtype='int8') - numpy.array(self.non, dtype='int8')
+            f.create_dataset("trial_mask", data=trialmask,
+                             maxshape=(None, None),
+                             compression="gzip",
+                             fletcher32=True)
+
+    @check_path_existance
+    def write_txt(self, output_file_name):
+        """Save a Key object to a text file.
+
+        One 'model segment label' line is written per trial: for each model,
+        the target trials come first, followed by the non-target trials.
+
+        :param output_file_name: name of the output text file
+        """
+        with open(output_file_name, 'w') as fid:
+            for m, model in enumerate(self.modelset):
+                for seg in self.segset[self.tar[m, ]]:
+                    fid.write('{} {} {}\n'.format(model, seg, 'target'))
+                for seg in self.segset[self.non[m, ]]:
+                    fid.write('{} {} {}\n'.format(model, seg, 'nontarget'))
+
+    def filter(self, modlist, seglist, keep):
+        """Removes some of the information in a key. Useful for creating a
+        gender specific key from a pooled gender key. Depending on the
+        value of \'keep\', the two input lists indicate the strings to
+        retain or the strings to discard.
+
+        :param modlist: a cell array of strings which will be compared with
+            the modelset of 'inkey'.
+        :param seglist: a cell array of strings which will be compared with
+            the segset of 'inkey'.
+        :param keep: a boolean indicating whether modlist and seglist are the
+            models to keep or discard.
+
+        :return: a filtered version of 'inkey'.
+        """
+        if keep:
+            keepmods = modlist
+            keepsegs = seglist
+        else:
+            keepmods = diff(self.modelset, modlist)
+            keepsegs = diff(self.segset, seglist)
+
+        keepmodidx = numpy.array(ismember(self.modelset, keepmods))
+        keepsegidx = numpy.array(ismember(self.segset, keepsegs))
+
+        outkey = Key()
+        outkey.modelset = self.modelset[keepmodidx]
+        outkey.segset = self.segset[keepsegidx]
+        tmp = self.tar[numpy.array(keepmodidx), :]
+        outkey.tar = tmp[:, numpy.array(keepsegidx)]
+        tmp = self.non[numpy.array(keepmodidx), :]
+        outkey.non = tmp[:, numpy.array(keepsegidx)]
+
+        assert(outkey.validate())
+
+        if self.modelset.shape[0] > outkey.modelset.shape[0]:
+            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outkey.modelset.shape[0])
+        if self.segset.shape[0] > outkey.segset.shape[0]:
+            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outkey.segset.shape[0])
+        return outkey
+
+    def to_ndx(self):
+        """Create a Ndx object based on the Key object
+
+        :return: a Ndx object based on the Key
+        """
+        ndx = Ndx()
+        ndx.modelset = self.modelset
+        ndx.segset = self.segset
+        ndx.trialmask = self.tar | self.non
+        return ndx
+
+    def validate(self):
+        """Checks that an object of type Key obeys certain rules that
+        must always be true.
+
+        :return: a boolean value indicating whether the object is valid.
+        """
+        # 'and' short-circuits: a wrong type or rank yields False instead of an AttributeError/IndexError
+        return (isinstance(self.modelset, numpy.ndarray)
+                and isinstance(self.segset, numpy.ndarray)
+                and isinstance(self.tar, numpy.ndarray)
+                and isinstance(self.non, numpy.ndarray)
+                and self.modelset.ndim == 1
+                and self.segset.ndim == 1
+                and self.tar.ndim == 2
+                and self.non.ndim == 2
+                and self.tar.shape == self.non.shape
+                and self.tar.shape[0] == self.modelset.shape[0]
+                and self.tar.shape[1] == self.segset.shape[0])
+
+    @staticmethod
+    def read(input_file_fame):
+        """Reads a Key object from an hdf5 file.
+
+        :param input_file_fame: name of the file to read from
+        """
+        with h5py.File(input_file_fame, "r") as f:
+
+            key = Key()
+            key.modelset = f["modelset"][()]
+            key.segset = f["segset"][()]
+
+            # if running python 3, need a conversion to unicode
+            if sys.version_info[0] == 3:
+                key.modelset = key.modelset.astype('U100', copy=False)
+                key.segset = key.segset.astype('U100', copy=False)
+
+            trialmask = f.get("trial_mask")[()]
+            key.tar = (trialmask == 1)
+            key.non = (trialmask == -1)
+
+            assert key.validate(), "Error: wrong Key format"
+            return key
+
+    @staticmethod
+    def read_txt(input_file_name):
+        """Creates a Key object from information stored in a text file.
+
+        :param input_file_name: name of the file to read from
+        """
+        key = Key()
+
+        models, testsegs, trial = numpy.loadtxt(input_file_name,
+                                                delimiter=' ',
+                                                dtype={'names': ('mod', 'seg', 'key'),
+                                                       'formats': ('S1000', 'S1000', 'S10')},
+                                                unpack=True)
+
+        models = models.astype('|O', copy=False).astype('S', copy=False)
+        testsegs = testsegs.astype('|O', copy=False).astype('S', copy=False)
+        trial = trial.astype('|O', copy=False).astype('S', copy=False)
+
+        if sys.version_info[0] == 3:
+            models = models.astype('U', copy=False)
+            testsegs = testsegs.astype('U', copy=False)
+            trial = trial.astype('U', copy=False)
+
+        modelset = numpy.unique(models)
+        segset = numpy.unique(testsegs)
+
+        tar = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+        non = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+
+        for idx_m, model in enumerate(modelset):
+            idx_current_model = numpy.argwhere(models == model).flatten()
+            current_model_keys = dict(zip(testsegs[idx_current_model], trial[idx_current_model]))
+            for idx_s, seg in enumerate(segset):
+                if seg in current_model_keys:
+                    tar[idx_m, idx_s] = (current_model_keys[seg] == 'target')
+                    non[idx_m, idx_s] = (current_model_keys[seg] == 'nontarget')
+
+        key.modelset = modelset
+        key.segset = segset
+        key.tar = tar
+        key.non = non
+        assert key.validate(), "Wrong Key format"
+        return key
+
+    def merge(self, key_list):
+        """Merges Key objects. This function takes as input a list of
+        Key objects to merge in the current one.
+
+        :param key_list: the list of Keys to merge
+        """
+        # the output key must have all models and segment in the input
+        # keys (only once) and the same target and non-target trials.
+        # It is an error if a trial is a target in one key and a
+        # non-target in another, but a target or non-target marker will
+        # override a 'non-trial' marker.
+        assert isinstance(key_list, list), "Input is not a list"
+        for key in key_list:
+            assert isinstance(key, Key), \
+                '{} {} {}'.format("Element ", key, " is not a Key")
+
+        for key2 in key_list:
+            key_new = Key()
+            key1 = self
+
+            # create new ndx with empty masks
+            key_new.modelset = numpy.union1d(key1.modelset, key2.modelset)
+            key_new.segset = numpy.union1d(key1.segset, key2.segset)
+
+            # expand ndx1 mask
+            tar_1 = numpy.zeros((key_new.modelset.shape[0],
+                                 key_new.segset.shape[0]),
+                                dtype="bool")
+            non_1 = numpy.zeros((key_new.modelset.shape[0],
+                                 key_new.segset.shape[0]), dtype="bool")
+            model_index_a = numpy.argwhere(numpy.isin(key_new.modelset, key1.modelset))
+            model_index_b = numpy.argwhere(numpy.isin(key1.modelset, key_new.modelset))
+            seg_index_a = numpy.argwhere(numpy.isin(key_new.segset, key1.segset))
+            seg_index_b = numpy.argwhere(numpy.isin(key1.segset, key_new.segset))
+            tar_1[model_index_a[:, None], seg_index_a] = key1.tar[model_index_b[:, None], seg_index_b]
+            non_1[model_index_a[:, None], seg_index_a] = key1.non[model_index_b[:, None], seg_index_b]
+
+            # expand ndx2 mask
+            tar_2 = numpy.zeros((key_new.modelset.shape[0],
+                                 key_new.segset.shape[0]), dtype="bool")
+            non_2 = numpy.zeros((key_new.modelset.shape[0],
+                                 key_new.segset.shape[0]), dtype="bool")
+            model_index_a = numpy.argwhere(numpy.isin(key_new.modelset, key2.modelset))
+            model_index_b = numpy.argwhere(numpy.isin(key2.modelset, key_new.modelset))
+            seg_index_a = numpy.argwhere(numpy.isin(key_new.segset, key2.segset))
+            seg_index_b = numpy.argwhere(numpy.isin(key2.segset, key_new.segset))
+            tar_2[model_index_a[:, None], seg_index_a] = key2.tar[model_index_b[:, None], seg_index_b]
+            non_2[model_index_a[:, None], seg_index_a] = key2.non[model_index_b[:, None], seg_index_b]
+
+            # merge masks
+            tar = tar_1 | tar_2
+            non = non_1 | non_2
+
+            # check for clashes
+            assert numpy.sum(tar & non) == 0, "Conflict in the new Key"
+
+            # build new key
+            key_new.tar = tar
+            key_new.non = non
+            self.modelset = key_new.modelset
+            self.segset = key_new.segset
+            self.tar = key_new.tar
+            self.non = key_new.non
+            assert self.validate(), "Wrong Key format"
diff --git a/sidekit/sidekit/bosaris/ndx.py b/sidekit/sidekit/bosaris/ndx.py
new file mode 100755
index 0000000..c485876
--- /dev/null
+++ b/sidekit/sidekit/bosaris/ndx.py
@@ -0,0 +1,284 @@
+# -*- coding: utf-8 -*-
+
+# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to
+# translate this code and provide the community with efficient structures
+# and tools.
+#
+# The BOSARIS Toolkit is a collection of functions and classes in Matlab
+# that can be used to calibrate, fuse and plot scores from speaker recognition
+# (or other fields in which scores are used to test the hypothesis that two
+# samples are from the same source) trials involving a model and a test segment.
+# The toolkit was written at the BOSARIS2010 workshop which took place at the
+# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+# See the User Guide (available on the toolkit website) for a discussion of the
+# theory behind the toolkit and descriptions of some of the algorithms used.
+#
+# The BOSARIS toolkit in MATLAB can be downloaded from `the website
+# <https://sites.google.com/site/bosaristoolkit/>`_.
+
+"""
+This is the 'ndx' module
+"""
+import h5py
+import logging
+import numpy
+import sys
+from ..sidekit_wrappers import check_path_existance, deprecated
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
+
+def diff(list1, list2):
+    """Return, as a sorted list, the items of list1 that are not in list2."""
+    missing = (item for item in list1 if item not in list2)
+    return sorted(missing)
+
+
+def ismember(list1, list2):
+    """Return one boolean per item of list1: True when list2 contains it."""
+    return [candidate in list2 for candidate in list1]
+
+
+class Ndx:
+    """A class that encodes trial index information. It has a list of
+    model names and a list of test segment names and a matrix
+    indicating which combinations of model and test segment are
+    trials of interest.
+
+    :attr modelset: list of unique models in a ndarray
+    :attr segset: list of unique test segments in a ndarray
+    :attr trialmask: 2D ndarray of boolean. Rows correspond to the models
+            and columns to the test segments. True if the trial is of interest.
+    """
+
+    def __init__(self, ndx_file_name='',
+                 models=numpy.array([]),
+                 testsegs=numpy.array([])):
+        """Initialize a Ndx object by loading information from a file
+        in HDF5 format, or by building it from two arrays.
+
+        :param ndx_file_name: name of the file to load
+        :param models: ndarray with the model of each trial (used when no file is given)
+        :param testsegs: ndarray with the test segment of each trial
+        """
+        self.modelset = numpy.empty(0, dtype="|O")
+        self.segset = numpy.empty(0, dtype="|O")
+        self.trialmask = numpy.array([], dtype="bool")
+
+        if ndx_file_name == '':
+            modelset = numpy.unique(models)
+            segset = numpy.unique(testsegs)
+            trialmask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+            for m in range(modelset.shape[0]):
+                # exact equality: 'in' against a single name is a substring test ('spk1' in 'spk10')
+                segs = testsegs[numpy.asarray(models) == modelset[m]]
+                trialmask[m, ] = ismember(segset, segs)
+            self.modelset = modelset
+            self.segset = segset
+            self.trialmask = trialmask
+            assert self.validate(), "Wrong Ndx format"
+        else:
+            ndx = Ndx.read(ndx_file_name)
+            self.modelset = ndx.modelset
+            self.segset = ndx.segset
+            self.trialmask = ndx.trialmask
+
+    @check_path_existance
+    def write(self, output_file_name):
+        """ Save Ndx object in HDF5 format
+
+        :param output_file_name: name of the file to write to
+        """
+        assert self.validate(), "Error: wrong Ndx format"
+
+        with h5py.File(output_file_name, "w") as f:
+            f.create_dataset("modelset", data=self.modelset.astype('S'),
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            f.create_dataset("segset", data=self.segset.astype('S'),
+                             maxshape=(None,),
+                             compression="gzip",
+                             fletcher32=True)
+            f.create_dataset("trial_mask", data=self.trialmask.astype('int8'),
+                             maxshape=(None, None),
+                             compression="gzip",
+                             fletcher32=True)
+
+    @check_path_existance
+    def save_txt(self, output_file_name):
+        """Save a Ndx object in a text file
+
+        One 'model segment' line is written for every trial of the mask.
+
+        :param output_file_name: name of the file to write to
+        """
+        with open(output_file_name, 'w') as fid:
+            for m in range(self.modelset.shape[0]):
+                segs = self.segset[self.trialmask[m, ]]
+                for s in segs:
+                    fid.write('{} {}\n'.format(self.modelset[m], s))
+
+    def filter(self, modlist, seglist, keep):
+        """Removes some of the information in an Ndx. Useful for creating a
+        gender specific Ndx from a pooled gender Ndx. Depending on the
+        value of \'keep\', the two input lists indicate the strings to
+        retain or the strings to discard.
+
+        :param modlist: a cell array of strings which will be compared with
+            the modelset of 'inndx'.
+        :param seglist: a cell array of strings which will be compared with
+            the segset of 'inndx'.
+        :param keep: a boolean indicating whether modlist and seglist are the
+            models to keep or discard.
+
+        :return: a filtered version of the current Ndx object.
+        """
+        if keep:
+            keepmods = modlist
+            keepsegs = seglist
+        else:
+            keepmods = diff(self.modelset, modlist)
+            keepsegs = diff(self.segset, seglist)
+
+        keepmodidx = numpy.array(ismember(self.modelset, keepmods))
+        keepsegidx = numpy.array(ismember(self.segset, keepsegs))
+
+        outndx = Ndx()
+        outndx.modelset = self.modelset[keepmodidx]
+        outndx.segset = self.segset[keepsegidx]
+        tmp = self.trialmask[numpy.array(keepmodidx), :]
+        outndx.trialmask = tmp[:, numpy.array(keepsegidx)]
+
+        assert outndx.validate(), "Wrong Ndx format"
+
+        if self.modelset.shape[0] > outndx.modelset.shape[0]:
+            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outndx.modelset.shape[0])
+        if self.segset.shape[0] > outndx.segset.shape[0]:
+            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outndx.segset.shape[0])
+        return outndx
+
+    def validate(self):
+        """Checks that an object of type Ndx obeys certain rules that
+        must always be true.
+
+        :return: a boolean value indicating whether the object is valid
+        """
+        # Short-circuit with 'and' so that a wrong type is reported as an
+        # invalid object (False) instead of raising on a missing attribute;
+        # every operand is a plain bool, hence so is the result.
+        return (isinstance(self.modelset, numpy.ndarray)
+                and isinstance(self.segset, numpy.ndarray)
+                and isinstance(self.trialmask, numpy.ndarray)
+                and self.modelset.ndim == 1
+                and self.segset.ndim == 1
+                and self.trialmask.ndim == 2
+                and self.trialmask.shape == (self.modelset.shape[0], self.segset.shape[0]))
+
+    @staticmethod
+    def read(input_file_name):
+        """Creates an Ndx object from the information in an hdf5 file.
+
+        :param input_file_name: name of the file to read from
+        """
+        with h5py.File(input_file_name, "r") as f:
+            ndx = Ndx()
+            ndx.modelset = f["modelset"][()]
+            ndx.segset = f["segset"][()]
+
+            # if running python 3, need a conversion to unicode
+            if sys.version_info[0] == 3:
+                ndx.modelset = ndx.modelset.astype('U100')
+                ndx.segset = ndx.segset.astype('U100')
+            # builtin bool: the numpy.bool alias was removed in NumPy 1.24
+            ndx.trialmask = numpy.zeros((ndx.modelset.shape[0], ndx.segset.shape[0]), dtype=bool)
+            f["trial_mask"].read_direct(ndx.trialmask)
+
+            assert ndx.validate(), "Error: wrong Ndx format"
+            return ndx
+
+    @classmethod
+    @check_path_existance
+    def read_txt(cls, input_filename):
+        """Creates an Ndx object from information stored in a text file.
+
+        :param input_filename: name of the file to read from
+        """
+        ndx = Ndx()
+
+        with open(input_filename, 'r') as fid:
+            lines = [l.rstrip().split() for l in fid]
+
+        models = numpy.empty(len(lines), '|O')
+        testsegs = numpy.empty(len(lines), '|O')
+        for ii in range(len(lines)):
+            models[ii] = lines[ii][0]
+            testsegs[ii] = lines[ii][1]
+
+        modelset = numpy.unique(models)
+        segset = numpy.unique(testsegs)
+        trialmask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+        for m in range(modelset.shape[0]):
+            # exact equality: 'in' against a single name is a substring test ('spk1' in 'spk10')
+            segs = testsegs[models == modelset[m]]
+            trialmask[m, ] = ismember(segset, segs)
+
+        ndx.modelset = modelset
+        ndx.segset = segset
+        ndx.trialmask = trialmask
+
+        assert ndx.validate(), "Wrong Ndx format"
+        return ndx
+
+    def merge(self, ndx_list):
+        """Merges a list of Ndx objects into the current one.
+        The resulting ndx must have all models and segment in the input
+        ndxs (only once). A trial in any ndx becomes a trial in the
+        output ndx
+
+        :param ndx_list: list of Ndx objects to merge
+        """
+        assert isinstance(ndx_list, list), "Input is not a list"
+        for ndx in ndx_list:
+            assert isinstance(ndx, Ndx), \
+                '{} {} {}'.format("Element ", ndx, " is not an Ndx")
+
+        assert self.validate(), "Wrong Ndx format"
+        for ndx2 in ndx_list:
+            ndx_new = Ndx()
+            ndx1 = self
+
+            # create new ndx with empty masks
+            ndx_new.modelset = numpy.union1d(ndx1.modelset, ndx2.modelset)
+            ndx_new.segset = numpy.union1d(ndx1.segset, ndx2.segset)
+
+            # expand ndx1 mask
+            trials_1 = numpy.zeros((ndx_new.modelset.shape[0], ndx_new.segset.shape[0]), dtype="bool")
+            model_index_a = numpy.argwhere(numpy.isin(ndx_new.modelset, ndx1.modelset))
+            model_index_b = numpy.argwhere(numpy.isin(ndx1.modelset, ndx_new.modelset))
+            seg_index_a = numpy.argwhere(numpy.isin(ndx_new.segset, ndx1.segset))
+            seg_index_b = numpy.argwhere(numpy.isin(ndx1.segset, ndx_new.segset))
+            trials_1[model_index_a[:, None], seg_index_a] = ndx1.trialmask[model_index_b[:, None], seg_index_b]
+
+            # expand ndx2 mask
+            trials_2 = numpy.zeros((ndx_new.modelset.shape[0], ndx_new.segset.shape[0]), dtype="bool")
+            model_index_a = numpy.argwhere(numpy.isin(ndx_new.modelset, ndx2.modelset))
+            model_index_b = numpy.argwhere(numpy.isin(ndx2.modelset, ndx_new.modelset))
+            seg_index_a = numpy.argwhere(numpy.isin(ndx_new.segset, ndx2.segset))
+            seg_index_b = numpy.argwhere(numpy.isin(ndx2.segset, ndx_new.segset))
+            trials_2[model_index_a[:, None], seg_index_a] = ndx2.trialmask[model_index_b[:, None], seg_index_b]
+
+            # merge masks
+            trials = trials_1 | trials_2
+
+            # build new ndx
+            ndx_new.trialmask = trials
+            self.modelset = ndx_new.modelset
+            self.segset = ndx_new.segset
+            self.trialmask = ndx_new.trialmask
diff --git a/sidekit/sidekit/bosaris/plotwindow.py b/sidekit/sidekit/bosaris/plotwindow.py
new file mode 100755
index 0000000..2b5597c
--- /dev/null
+++ b/sidekit/sidekit/bosaris/plotwindow.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+
+# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to
+# translate this code and provide the community with efficient structures
+# and tools.
+#
+# The BOSARIS Toolkit is a collection of functions and classes in Matlab
+# that can be used to calibrate, fuse and plot scores from speaker recognition
+# (or other fields in which scores are used to test the hypothesis that two
+# samples are from the same source) trials involving a model and a test segment.
+# The toolkit was written at the BOSARIS2010 workshop which took place at the
+# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+# See the User Guide (available on the toolkit website)1 for a discussion of the
+# theory behind the toolkit and descriptions of some of the algorithms used.
+#
+# The BOSARIS toolkit in MATLAB can be downloaded from `the website
+# <https://sites.google.com/site/bosaristoolkit/>`_.
+
+"""
+This is the 'plotwindow' module
+"""
+import numpy
+
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
+
+class PlotWindow:
+    """A class that is used to define the parameters of a plotting window.
+
+    :attr __pfa_limits__: ndarray of two values that determine the limits of the pfa axis.
+        Default is [0.0005, 0.5]
+    :attr __pmiss_limits__: ndarray of two values that determine the limits of the pmiss axis.
+        Default is [0.0005, 0.5]
+    :attr __xticks__: coordinates of the ticks on the horizontal axis
+    :attr __xticklabels__: labels of the ticks on the horizontal axis in a ndarray of strings
+    :attr __yticks__: coordinates of the ticks on the vertical axis
+    :attr __yticklabels__: labels of the ticks on the vertical axis in a ndarray of strings
+    """
+
+    def __init__(self, input_type=''):
+        """Initialize PlotWindow object to one of the pre-defined plotting types.
+        - '' (default, same window as 'old')
+        - 'new'
+        - 'old'
+        - 'big'
+        - 'sre10'
+
+        :param input_type: the type of DET plot to display. Default is '',
+            which selects the same axes as 'old'.
+        :raises ValueError: if input_type is not one of the values above.
+        """
+        # One setter per named window; '' is an alias of 'old' so that the
+        # default window cannot drift away from axis_old().
+        setters = {
+            '': self.axis_old,
+            'new': self.axis_new,
+            'old': self.axis_old,
+            'big': self.axis_big,
+            'sre10': self.axis_sre10,
+        }
+        try:
+            setter = setters[input_type]
+        except (KeyError, TypeError):
+            # ValueError is an Exception: handlers catching Exception still work
+            raise ValueError('Error, wrong type of PlotWindow') from None
+        setter()
+
+    def make_plot_window_from_values(self, pfa_limits, pmiss_limits, xticks, xticklabels, yticks, yticklabels):
+        """Initialize PlotWindow from provided values
+
+        :param pfa_limits: ndarray of two values that determine the limits of the pfa axis.
+        :param pmiss_limits: ndarray of two values that determine the limits of the pmiss axis.
+        :param xticks: coordinates of the ticks on the horizontal axis.
+        :param xticklabels: labels of the ticks on the horizontal axis in a ndarray of strings.
+        :param yticks: coordinates of the ticks on the vertical axis.
+        :param yticklabels: labels of the ticks on the vertical axis in a ndarray of strings.
+        """
+        self.__pfa_limits__ = pfa_limits
+        self.__pmiss_limits__ = pmiss_limits
+        self.__xticks__ = xticks
+        self.__xticklabels__ = xticklabels
+        self.__yticks__ = yticks
+        self.__yticklabels__ = yticklabels
+
+    def axis_new(self):
+        """Set axis value to new ones
+
+        - pfa ranges from 0.000005 to 0.005
+        - pmiss ranges from 0.01 to 0.99
+        """
+        self.__pfa_limits__ = numpy.array([5e-6, 5e-3])
+        self.__pmiss_limits__ = numpy.array([1e-2, 0.99])
+        self.__xticks__ = numpy.array([1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3])
+        self.__xticklabels__ = numpy.array(['1e-3', '2e-3', '5e-3', '0.01', '0.02', '0.05', '0.1 ', '0.2 '])
+        self.__yticks__ = numpy.array([0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98])
+        self.__yticklabels__ = numpy.array([' 2 ', ' 5 ', '10 ', '20 ', '30 ',
+                                            '40 ', '50 ', '60 ', '70 ', '80 ', '90 ', '95 ', '98 '])
+
+    def axis_old(self):
+        """Set axis value to old ones (NIST-SRE08 style)
+
+        - pfa ranges from 0.0005 to 0.5
+        - pmiss ranges from 0.0005 to 0.5
+        """
+        self.__pfa_limits__ = numpy.array([5e-4, 5e-1])
+        self.__pmiss_limits__ = numpy.array([5e-4, 5e-1])
+        self.__xticks__ = numpy.array([0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4])
+        self.__xticklabels__ = numpy.array(['0.1', '0.2', '0.5', ' 1 ', ' 2 ', ' 5 ', '10 ', '20 ', '30 ', '40 '])
+        self.__yticks__ = numpy.array([0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4])
+        self.__yticklabels__ = numpy.array(['0.1', '0.2', '0.5', ' 1 ', ' 2 ', ' 5 ', '10 ', '20 ', '30 ', '40 '])
+
+    def axis_big(self):
+        """Set axis value to big ones
+
+        - pfa ranges from 0.000005 to 0.99
+        - pmiss ranges from 0.000005 to 0.99
+        """
+        self.__pfa_limits__ = numpy.array([5e-6, 0.99])
+        self.__pmiss_limits__ = numpy.array([5e-6, 0.99])
+        self.__yticks__ = numpy.array([5e-6, 5e-5, 5e-4, 0.5e-2, 2.5e-2, 10e-2,
+                                       25e-2, 50e-2, 72e-2, 88e-2, 96e-2, 99e-2])
+        self.__yticklabels__ = numpy.array(['5e-4', '5e-3', '0.05', '0.5 ',
+                                            '2.5 ', ' 10 ', ' 25 ', ' 50 ', ' 72 ', ' 88 ', ' 96 ', ' 99 '])
+        self.__xticks__ = numpy.array([5e-5, 5e-4, 0.5e-2, 2.5e-2, 10e-2, 25e-2, 50e-2, 72e-2, 88e-2, 96e-2, 99e-2])
+        self.__xticklabels__ = numpy.array(['5e-3', '0.05', '0.5 ', '2.5 ',
+                                            ' 10 ', ' 25 ', ' 50 ', ' 72 ', ' 88 ', ' 96 ', ' 99 '])
+
+    def axis_sre10(self):
+        """Set axis value to NIST-SRE10 style
+
+        - pfa ranges from 0.000003 to 0.5
+        - pmiss ranges from 0.0003 to 0.9
+        """
+        self.__pfa_limits__ = numpy.array([3e-6, 5e-1])
+        self.__pmiss_limits__ = numpy.array([3e-4, 9e-1])
+        self.__xticks__ = numpy.array([1e-5, 1e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 4e-1])
+        self.__xticklabels__ = numpy.array(['0.001', ' 0.01', ' 0.1', ' 0.2',
+                                            ' 0.5', ' 1', ' 2', ' 5', ' 10', ' 20', ' 40'])
+        self.__yticks__ = numpy.array([1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 4e-1, 8e-1])
+        self.__yticklabels__ = numpy.array(['0.1', '0.2', '0.5', ' 1 ', ' 2 ', ' 5 ', ' 10', ' 20', ' 40', ' 80'])
diff --git a/sidekit/sidekit/bosaris/scores.py b/sidekit/sidekit/bosaris/scores.py
new file mode 100755
index 0000000..6fdd3e9
--- /dev/null
+++ b/sidekit/sidekit/bosaris/scores.py
@@ -0,0 +1,493 @@
+# -*- coding: utf-8 -*-
+
+# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to
+# translate this code and provide the community with efficient structures
+# and tools.
+#
+# The BOSARIS Toolkit is a collection of functions and classes in Matlab
+# that can be used to calibrate, fuse and plot scores from speaker recognition
+# (or other fields in which scores are used to test the hypothesis that two
+# samples are from the same source) trials involving a model and a test segment.
+# The toolkit was written at the BOSARIS2010 workshop which took place at the
+# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
+# See the User Guide (available on the toolkit website)1 for a discussion of the
+# theory behind the toolkit and descriptions of some of the algorithms used.
+#
+# The BOSARIS toolkit in MATLAB can be downloaded from `the website
+# <https://sites.google.com/site/bosaristoolkit/>`_.
+
+"""
+This is the 'scores' module
+
+"""
+import h5py
+import logging
+import numpy
+import os
+from .ndx import Ndx
+from .key import Key
+from ..sidekit_wrappers import check_path_existance
+
+
+__author__ = "Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+__credits__ = ["Niko Brummer", "Edward de Villiers"]
+
+
+def diff(list1, list2):
+    """Return, as a sorted list, the items of list1 that do not occur in list2."""
+    remaining = filter(lambda entry: entry not in list2, list1)
+    return sorted(remaining)
+
+
+def ismember(list1, list2):
+    """Return a list of booleans: for each item of list1, the result of ``item in list2``."""
+    return [element in list2 for element in list1]
+
+
+class Scores:
+ """A class for storing scores for trials. The modelset and segset
+ fields are lists of model and test segment names respectively.
+ The element i,j of scoremat and scoremask corresponds to the
+ trial involving model i and test segment j.
+
+ :attr modelset: list of unique models in a ndarray
+ :attr segset: list of unique test segments in a ndarray
+ :attr scoremask: 2D ndarray of boolean which indicates the trials of interest
+ i.e. the entry i,j in scoremat should be ignored if scoremask[i,j] is False
+ :attr scoremat: 2D ndarray of scores
+ """
+    def __init__(self, scores_file_name=''):
+        """Create an empty Scores object, or load one from a file in HDF5 format.
+
+        :param scores_file_name: name of the file to load; with the default
+            empty string nothing is read and all four fields are left empty
+        """
+        loaded = Scores.read(scores_file_name) if scores_file_name != '' else None
+
+        if loaded is None:
+            # Empty containers: object arrays for the names, bool for the mask
+            self.modelset = numpy.empty(0, dtype="|O")
+            self.segset = numpy.empty(0, dtype="|O")
+            self.scoremask = numpy.array([], dtype="bool")
+            self.scoremat = numpy.array([])
+        else:
+            self.modelset, self.segset = loaded.modelset, loaded.segset
+            self.scoremask = loaded.scoremask
+            self.scoremat = loaded.scoremat
+
+    def __repr__(self):
+        """Return a printable description of the four fields of the object."""
+        # Every field is an ndarray: it has to go through repr() before being
+        # joined to a str, and the assembled text has to be returned.
+        ch = 'modelset:\n' + repr(self.modelset) + '\n'
+        ch += 'segset:\n' + repr(self.segset) + '\n'
+        ch += 'scoremask:\n' + repr(self.scoremask) + '\n'
+        ch += 'scoremat:\n' + repr(self.scoremat) + '\n'
+        return ch
+
+    @check_path_existance
+    def write(self, output_file_name):
+        """ Save Scores in HDF5 format
+
+        Four datasets are written: "modelset", "segset", "score_mask"
+        and "scores".
+
+        :param output_file_name: name of the file to write to
+        """
+        # (dataset name, attribute, dtype to cast to or None, maxshape)
+        layout = (("modelset", "modelset", 'S', (None,)),
+                  ("segset", "segset", 'S', (None,)),
+                  ("score_mask", "scoremask", 'int8', (None, None)),
+                  ("scores", "scoremat", None, (None, None)))
+        with h5py.File(output_file_name, "w") as f:
+            for name, attribute, cast, maxshape in layout:
+                values = getattr(self, attribute)
+                if cast is not None:
+                    values = values.astype(cast)
+                f.create_dataset(name, data=values,
+                                 maxshape=maxshape,
+                                 compression="gzip",
+                                 fletcher32=True)
+
+    @check_path_existance
+    def write_txt(self, output_file_name):
+        """Save a Scores object in a text file, one "model segment score" line per trial
+
+        Only the trials whose scoremask entry is True are written.
+
+        :param output_file_name: name of the file to write to
+        """
+        directory = os.path.dirname(output_file_name)
+        if directory:  # a bare file name has no directory to create
+            os.makedirs(directory, exist_ok=True)
+        with open(output_file_name, 'w') as fid:
+            for model, mask, row in zip(self.modelset, self.scoremask, self.scoremat):
+                for seg, score in zip(self.segset[mask], row[mask]):
+                    fid.write('{} {} {}\n'.format(model, seg, score))
+
+    @check_path_existance
+    def write_matlab(self, output_file_name):
+        """Save a Scores object in Bosaris compatible HDF5 format
+
+        The model and segment names are stored under "/ID/row_ids" and
+        "/ID/column_ids", the matrices under "score_mask" and "scores".
+
+        :param output_file_name: name of the file to write to
+        """
+        # (dataset name, attribute, dtype to cast to or None, maxshape)
+        layout = (("/ID/row_ids", "modelset", 'S', (None,)),
+                  ("/ID/column_ids", "segset", 'S', (None,)),
+                  ("score_mask", "scoremask", 'int8', (None, None)),
+                  ("scores", "scoremat", None, (None, None)))
+        with h5py.File(output_file_name, "w") as f:
+            for name, attribute, cast, maxshape in layout:
+                values = getattr(self, attribute)
+                if cast is not None:
+                    values = values.astype(cast)
+                f.create_dataset(name, data=values,
+                                 maxshape=maxshape,
+                                 compression="gzip",
+                                 fletcher32=True)
+
+    def get_tar_non(self, key):
+        """Divides scores into target and non-target scores using
+        information in a key.
+
+        When the key does not list exactly the same models and segments, in
+        the same order, as this object, the scores are first aligned with
+        the key through align_with_ndx.
+
+        :param key: a Key object.
+
+        :return: a vector of target scores.
+        :return: a vector of non-target scores.
+        """
+        # array_equal also copes with name sets of different lengths, for
+        # which an element-wise "==" does not produce a boolean array.
+        same_layout = (numpy.array_equal(key.modelset, self.modelset)
+                       and numpy.array_equal(key.segset, self.segset)
+                       and self.scoremask.shape == key.tar.shape)
+        scr = self if same_layout else self.align_with_ndx(key)
+        valid = scr.scoremask
+        return scr.scoremat[key.tar & valid], scr.scoremat[key.non & valid]
+
+ def align_with_ndx(self, ndx):
+ """The ordering in the output Scores object corresponds to ndx, so
+ aligning several Scores objects with the same ndx will result in
+ them being comparable with each other.
+ Models or segments of ndx that this object does not hold keep a zero score and a False mask.
+ :param ndx: a Key or Ndx object
+
+ :return: resized version of the current Scores object to size of 'ndx'
+ and reordered according to the ordering of modelset and segset in 'ndx'.
+ """
+ aligned_scr = Scores()
+ aligned_scr.modelset = ndx.modelset
+ aligned_scr.segset = ndx.segset
+ # hasmodel/hasseg flag the entries of ndx found here; rindx/cindx are their positions in this object
+ hasmodel = numpy.array(ismember(ndx.modelset, self.modelset))
+ rindx = numpy.array([numpy.argwhere(self.modelset == v)[0][0]
+ for v in ndx.modelset[hasmodel]]).astype(int)
+ hasseg = numpy.array(ismember(ndx.segset, self.segset))
+ cindx = numpy.array([numpy.argwhere(self.segset == v)[0][0]
+ for v in ndx.segset[hasseg]]).astype(int)
+ # Copy the known scores into a matrix shaped like ndx; everything else stays at zero
+ aligned_scr.scoremat = numpy.zeros((ndx.modelset.shape[0], ndx.segset.shape[0]))
+ aligned_scr.scoremat[numpy.where(hasmodel)[0][:, None],
+ numpy.where(hasseg)[0]] = self.scoremat[rindx[:, None], cindx]
+ # Same copy for the mask; everything else stays False
+ aligned_scr.scoremask = numpy.zeros((ndx.modelset.shape[0], ndx.segset.shape[0]), dtype='bool')
+ aligned_scr.scoremask[numpy.where(hasmodel)[0][:, None],
+ numpy.where(hasseg)[0]] = self.scoremask[rindx[:, None], cindx]
+
+ assert numpy.sum(aligned_scr.scoremask) <= (numpy.sum(hasmodel) * numpy.sum(hasseg)), 'Error in new scoremask'
+ # Keep only the trials listed by ndx: trialmask for an Ndx, tar | non otherwise
+ if isinstance(ndx, Ndx):
+ aligned_scr.scoremask = aligned_scr.scoremask & ndx.trialmask
+ else:
+ aligned_scr.scoremask = aligned_scr.scoremask & (ndx.tar | ndx.non)
+ # The remaining statements only log what is missing and check the result
+ if numpy.sum(hasmodel) < ndx.modelset.shape[0]:
+ logging.info('models reduced from %d to %d', ndx.modelset.shape[0], numpy.sum(hasmodel))
+ if numpy.sum(hasseg) < ndx.segset.shape[0]:
+ logging.info('testsegs reduced from %d to %d', ndx.segset.shape[0], numpy.sum(hasseg))
+
+ if isinstance(ndx, Key):
+ tar = ndx.tar & aligned_scr.scoremask
+ non = ndx.non & aligned_scr.scoremask
+
+ missing = numpy.sum(ndx.tar) - numpy.sum(tar)
+ if missing > 0:
+ logging.info('%d of %d targets missing', missing, numpy.sum(ndx.tar))
+ missing = numpy.sum(ndx.non) - numpy.sum(non)
+ if missing > 0:
+ logging.info('%d of %d non targets missing', missing, numpy.sum(ndx.non))
+
+ else:
+ mask = ndx.trialmask & aligned_scr.scoremask
+ missing = numpy.sum(ndx.trialmask) - numpy.sum(mask)
+ if missing > 0:
+ logging.info('%d of %d trials missing', missing, numpy.sum(ndx.trialmask))
+
+ assert all(numpy.isfinite(aligned_scr.scoremat[aligned_scr.scoremask])), \
+ 'Inifinite or Nan value in the scoremat'
+ assert aligned_scr.validate(), 'Wrong Score format'
+ return aligned_scr
+
+    def set_missing_to_value(self, ndx, value):
+        """Sets all scores for which the trialmask is true but the scoremask
+        is false to the same value, supplied by the user.
+
+        :param ndx: a Key or Ndx object.
+        :param value: a value for the missing scores.
+
+        :return: a Scores object (with the missing scores added and set
+            to value).
+        """
+        if isinstance(ndx, Key):
+            ndx = ndx.to_ndx()
+        new_scr = self.align_with_ndx(ndx)
+        # "~" is the boolean complement; NumPy rejects unary "-" on boolean arrays
+        missing = ndx.trialmask & ~new_scr.scoremask
+        new_scr.scoremat[missing] = value
+        new_scr.scoremask[missing] = True
+        assert new_scr.validate(), "Wrong format of Scores"
+        return new_scr
+
+    def filter(self, modlist, seglist, keep):
+        """Removes some of the information in a Scores object. Useful for
+        creating a gender specific score set from a pooled gender score
+        set. Depending on the value of 'keep', the two input lists
+        indicate the models and test segments (and their associated
+        scores) to retain or discard.
+
+        :param modlist: a list of strings which will be compared with
+            the modelset of the current Scores object.
+        :param seglist: a list of strings which will be compared with
+            the segset of the current Scores object.
+        :param keep: a boolean indicating whether modlist and seglist are the
+            models to keep or discard.
+
+        :return: a filtered copy of the current Scores object.
+        """
+        retain = bool(keep)
+        listed_models = set(modlist)
+        listed_segs = set(seglist)
+
+        # An entry survives when "is listed" agrees with "keep the listed ones".
+        keepmodidx = numpy.array([(m in listed_models) == retain for m in self.modelset],
+                                 dtype='bool')
+        keepsegidx = numpy.array([(s in listed_segs) == retain for s in self.segset],
+                                 dtype='bool')
+
+        outscr = Scores()
+        outscr.modelset = self.modelset[keepmodidx]
+        outscr.segset = self.segset[keepsegidx]
+        # Boolean selection of the rows first, then of the columns
+        outscr.scoremat = self.scoremat[keepmodidx, :][:, keepsegidx]
+        outscr.scoremask = self.scoremask[keepmodidx, :][:, keepsegidx]
+
+        # Check the consistency of the result rather than merely its type
+        assert outscr.validate(), 'Wrong Scores format'
+
+        if self.modelset.shape[0] > outscr.modelset.shape[0]:
+            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outscr.modelset.shape[0])
+        if self.segset.shape[0] > outscr.segset.shape[0]:
+            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outscr.segset.shape[0])
+        return outscr
+
+    def validate(self):
+        """Checks that an object of type Scores obeys certain rules that
+        must always be true.
+
+        :return: a boolean value indicating whether the object is valid.
+        """
+        # Comparing whole shape tuples also rejects matrices that are not 2-D
+        # (e.g. the empty 1-D defaults) instead of raising IndexError on them.
+        expected_shape = (self.modelset.shape[0], self.segset.shape[0])
+        return self.scoremat.shape == expected_shape and self.scoremask.shape == expected_shape
+
+    @staticmethod
+    def read(input_file_name):
+        """Read a Scores object from information in a hdf5 file.
+
+        The datasets "modelset", "segset", "score_mask" and "scores" are read;
+        the names are converted to 'U100' strings and the mask to booleans.
+
+        :param input_file_name: name of the file to read from
+        :return: the Scores object read from the file
+        """
+        def load(handle, name, cast=None):
+            # Read the whole dataset into a new array of its stored dtype
+            dataset = handle[name]
+            values = numpy.empty(dataset.shape, dtype=dataset.dtype)
+            dataset.read_direct(values)
+            return values if cast is None else values.astype(cast, copy=False)
+
+        with h5py.File(input_file_name, "r") as f:
+            scores = Scores()
+            scores.modelset = load(f, "modelset", 'U100')
+            scores.segset = load(f, "segset", 'U100')
+            scores.scoremask = load(f, "score_mask", 'bool')
+            scores.scoremat = load(f, "scores")
+
+            assert scores.validate(), "Error: wrong Scores format"
+            return scores
+
+    @staticmethod
+    def read_matlab(input_file_name):
+        """Read a Scores object from information in a hdf5 file in Matlab BOSARIS format.
+
+        The datasets "ID/row_ids", "ID/column_ids", "score_mask" and "scores" are
+        read; the names are converted to 'U100' strings and the mask to booleans.
+
+        :param input_file_name: name of the file to read from
+        :return: the Scores object read from the file
+        """
+        def load(handle, name, cast=None):
+            # Read the whole dataset into a new array of its stored dtype
+            dataset = handle[name]
+            values = numpy.empty(dataset.shape, dtype=dataset.dtype)
+            dataset.read_direct(values)
+            return values if cast is None else values.astype(cast, copy=False)
+
+        with h5py.File(input_file_name, "r") as f:
+            scores = Scores()
+            scores.modelset = load(f, "ID/row_ids", 'U100')
+            scores.segset = load(f, "ID/column_ids", 'U100')
+            scores.scoremask = load(f, "score_mask", 'bool')
+            scores.scoremat = load(f, "scores")
+
+            assert scores.validate(), "Error: wrong Scores format"
+            return scores
+
+    @classmethod
+    @check_path_existance
+    def read_txt(cls, input_file_name):
+        """Creates a Scores object from information stored in a text file.
+
+        Each line must hold three whitespace-separated fields: the model
+        name, the test segment name and the score. Blank lines are ignored.
+
+        :param input_file_name: name of the file to read from
+        :return: the Scores object, with sorted models and segments
+        """
+        with open(input_file_name, 'r') as fid:
+            lines = [l.split() for l in fid if l.strip()]
+
+        models = numpy.array([fields[0] for fields in lines], dtype='|O')
+        testsegs = numpy.array([fields[1] for fields in lines], dtype='|O')
+        scores = numpy.array([float(fields[2]) for fields in lines])
+
+        modelset = numpy.unique(models)
+        segset = numpy.unique(testsegs)
+        scoremask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
+        scoremat = numpy.zeros((modelset.shape[0], segset.shape[0]))
+        for m in range(modelset.shape[0]):
+            # Exact name comparison: a substring test ("in" applied to a str)
+            # would also pick the trials of "spk1" while processing "spk10".
+            of_model = models == modelset[m]
+            segs = testsegs[of_model]
+            scrs = scores[of_model]
+            idx = segs.argsort()
+            segs = segs[idx]
+            scrs = scrs[idx]
+            # segset is sorted too, so the sorted scores line up with the True columns
+            present = numpy.array(ismember(segset, segs))
+            scoremask[m, ] = present
+            scoremat[m, present] = scrs
+
+        s = Scores()
+        s.modelset = modelset
+        s.segset = segset
+        s.scoremask = scoremask
+        s.scoremat = scoremat
+        assert s.validate(), "Wrong Scores format"
+        s.sort()
+        return s
+
+    def merge(self, score_list):
+        """Merges a list of Scores objects into the current one.
+        The result holds every model and segment of the input Scores (only
+        once) and the union of all the scoremasks. It is an error if two of
+        the Scores objects have a score for the same trial.
+        The current object and every element of score_list are sorted in place.
+
+        :param score_list: the list of Scores object to merge
+        """
+        assert isinstance(score_list, list), "Input is not a list"
+        for scr in score_list:
+            assert isinstance(scr, Scores), \
+                '{} {} {}'.format("Element ", scr, " is not a Score")
+
+        assert self.validate(), 'Wrong Scores format'
+        for scr2 in score_list:
+            scr_new = Scores()
+            scr1 = self
+            scr1.sort()
+            scr2.sort()
+
+            # create new scr with empty matrices
+            scr_new.modelset = numpy.union1d(scr1.modelset, scr2.modelset)
+            scr_new.segset = numpy.union1d(scr1.segset, scr2.segset)
+
+            # expand scr1 matrices
+            scoremat_1 = numpy.zeros((scr_new.modelset.shape[0], scr_new.segset.shape[0]))
+            scoremask_1 = numpy.zeros((scr_new.modelset.shape[0], scr_new.segset.shape[0]), dtype='bool')
+            model_index_a = numpy.argwhere(numpy.in1d(scr_new.modelset, scr1.modelset))
+            model_index_b = numpy.argwhere(numpy.in1d(scr1.modelset, scr_new.modelset))
+            seg_index_a = numpy.argwhere(numpy.in1d(scr_new.segset, scr1.segset))
+            seg_index_b = numpy.argwhere(numpy.in1d(scr1.segset, scr_new.segset))
+            scoremat_1[model_index_a[:, None], seg_index_a] = scr1.scoremat[model_index_b[:, None], seg_index_b]
+            scoremask_1[model_index_a[:, None], seg_index_a] = scr1.scoremask[model_index_b[:, None], seg_index_b]
+
+            # expand scr2 matrices
+            scoremat_2 = numpy.zeros((scr_new.modelset.shape[0], scr_new.segset.shape[0]))
+            scoremask_2 = numpy.zeros((scr_new.modelset.shape[0], scr_new.segset.shape[0]), dtype='bool')
+            model_index_a = numpy.argwhere(numpy.in1d(scr_new.modelset, scr2.modelset))
+            model_index_b = numpy.argwhere(numpy.in1d(scr2.modelset, scr_new.modelset))
+            seg_index_a = numpy.argwhere(numpy.in1d(scr_new.segset, scr2.segset))
+            seg_index_b = numpy.argwhere(numpy.in1d(scr2.segset, scr_new.segset))
+            scoremat_2[model_index_a[:, None], seg_index_a] = scr2.scoremat[model_index_b[:, None], seg_index_b]
+            scoremask_2[model_index_a[:, None], seg_index_a] = scr2.scoremask[model_index_b[:, None], seg_index_b]
+
+            # check for clashes
+            assert numpy.sum(scoremask_1 & scoremask_2) == 0, "Conflict in the new scoremask"
+
+            # merge the matrices: entries absent from one side are zero / False there
+            self.scoremat = scoremat_1 + scoremat_2
+            self.scoremask = scoremask_1 | scoremask_2
+            self.modelset = scr_new.modelset
+            self.segset = scr_new.segset
+            assert self.validate(), 'Wrong Scores format'
+
+    def sort(self):
+        """Sort models and segments"""
+        model_order = numpy.argsort(self.modelset)[:, None]
+        seg_order = numpy.argsort(self.segset)
+        reordered_mat = self.scoremat[model_order, seg_order]
+        reordered_mask = self.scoremask[model_order, seg_order]
+        # The two name arrays themselves are sorted in place
+        self.modelset.sort()
+        self.segset.sort()
+        self.scoremat, self.scoremask = reordered_mat, reordered_mask
+
+    def get_score(self, modelID, segID):
+        """return a score given a model and segment identifiers
+        raise an error if the model or the segment does not exist
+        :param modelID: id of the model
+        :param segID: id of the test segment
+        :return: the matching entries of scoremat, as a 2D ndarray
+        """
+        model_idx = numpy.argwhere(self.modelset == modelID)
+        seg_idx = numpy.argwhere(self.segset == segID)
+        # The identifier is formatted into the message itself
+        if model_idx.shape[0] == 0:
+            raise Exception('No such model as: {}'.format(modelID))
+        if seg_idx.shape[0] == 0:
+            raise Exception('No such segment as: {}'.format(segID))
+        return self.scoremat[model_idx, seg_idx]
diff --git a/sidekit/sidekit/factor_analyser.py b/sidekit/sidekit/factor_analyser.py
new file mode 100755
index 0000000..f4651f5
--- /dev/null
+++ b/sidekit/sidekit/factor_analyser.py
@@ -0,0 +1,1101 @@
+# -* coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+"""
+Copyright 2014-2021 Sylvain Meignier and Anthony Larcher
+
+ :mod:`factor_analyser` provides methods to train different types of factor analysers
+
+"""
+import copy
+import numpy
+import multiprocessing
+import logging
+import h5py
+import scipy
+import warnings
+import ctypes
+from sidekit.sv_utils import serialize
+from sidekit.statserver import StatServer
+from sidekit.mixture import Mixture
+from sidekit.sidekit_wrappers import process_parallel_lists, deprecated, check_path_existance
+from sidekit import STAT_TYPE
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher & Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+def e_on_batch(stat0, stat1, ubm, F):
+ """
+ Compute statistics for the Expectation step on a batch of data
+
+ :param stat0: matrix of zero-order statistics (1 session per line)
+ :param stat1: matrix of first-order statistics (1 session per line); it is centred in place by this function
+ :param ubm: Mixture object
+ :param F: factor loading matrix
+ :return: e_h and e_hh, one row per session; a row of e_hh is an upper triangle picked with numpy.triu_indices
+ """
+ tv_rank = F.shape[1]
+ nb_distrib = stat0.shape[1]
+ feature_size = ubm.mu.shape[1]
+ index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
+ upper_triangle_indices = numpy.triu_indices(tv_rank)
+ # index_map repeats every distribution index feature_size times: one column per (distribution, feature) pair
+ gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
+
+ # Allocate the memory to save
+ session_nb = stat0.shape[0]
+ e_h = numpy.zeros((session_nb, tv_rank), dtype=STAT_TYPE)
+ e_hh = numpy.zeros((session_nb, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE)
+
+ # Whiten the statistics for diagonal or full models
+ stat1 -= stat0[:, index_map] * ubm.get_mean_super_vector()
+ # "diag" scales stat1 in place as well; "full" rebinds stat1 to a new array
+ if gmm_covariance == "diag":
+ stat1 *= numpy.sqrt(ubm.get_invcov_super_vector())
+ elif gmm_covariance == "full":
+ stat1 = numpy.einsum("ikj,ikl->ilj",
+ stat1.T.reshape(-1, nb_distrib, session_nb),
+ ubm.invchol
+ ).reshape(-1, session_nb).T
+ # One pass per session: inv_lambda is the inverse of (I + F^T * stat0[idx, index_map] * F)
+ for idx in range(session_nb):
+ inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + (F.T * stat0[idx, index_map]).dot(F))
+ aux = F.T.dot(stat1[idx, :])
+ e_h[idx] = numpy.dot(aux, inv_lambda)
+ e_hh[idx] = (inv_lambda + numpy.outer(e_h[idx], e_h[idx]))[upper_triangle_indices]
+
+ return e_h, e_hh
+
+
+def e_worker(arg, q):
+    """
+    Producer that runs the expectation step on one batch and queues the result
+
+    :param arg: a tuple made of a matrix of zero-order statistics and a matrix
+        of first-order statistics (1 session per line), a Mixture object and
+        a factor loading matrix
+    :param q: output queue (a multiprocessing.Queue object); it receives the
+        two statistics matrices followed by the outputs of e_on_batch
+    """
+    batch_statistics = arg[:2]
+    q.put(batch_statistics + e_on_batch(*arg))
+
+
+def e_gather(arg, q):
+    """
+    Consumer that sums accumulators stored in the memory
+
+    :param arg: a tuple of three accumulators for the estimation of the Factor Analysis matrix
+    :param q: input queue that is filled by the producers and emptied in this function
+        (a multiprocessing.Queue object)
+    :return: the three accumulators
+    """
+    acc_a, acc_c, acc_r = arg
+
+    # An item whose third field is None tells the consumer to stop
+    stat0, stat1, e_h, e_hh = q.get()
+    while e_h is not None:
+        acc_a += stat0.T.dot(e_hh)
+        acc_c += e_h.T.dot(stat1)
+        acc_r += numpy.sum(e_hh, axis=0)
+        stat0, stat1, e_h, e_hh = q.get()
+
+    return acc_a, acc_c, acc_r
+
+
+def iv_extract_on_batch(arg, q):
+    """
+    Extract i-vectors for a batch of sessions (shows) and put them in a queue
+    :param arg: a tuple of inputs that includes a list of batch_indices, a matrix of zero-order statistics
+        a matrix of first order statistics, a Mixture model and loading factor matrix
+    :param q: the output queue to fill (a multiprocessing.Queue object)
+    """
+    batch_indices, stat0, stat1, ubm, F = arg
+    E_h, E_hh = e_on_batch(stat0, stat1, ubm, F)
+    rank = E_h.shape[1]
+    diagonal = numpy.array([k * rank - ((k * (k - 1)) // 2) for k in range(rank)])
+    q.put((batch_indices, E_h, E_hh[:, diagonal]))
+
+
+def iv_collect(arg, q):
+    """
+    Consumer method that takes inputs from a queue and fill matrices with i-vectors
+    and uncertainty matrices (diagonal version only)
+
+    :param arg: a tuple of inputs including a matrix to store i-vectors and
+        a matrix to store uncertainty matrices
+    :param q: the input queue (a multiprocessing.Queue object)
+    :return: the matrices of i-vectors and uncertainty matrices
+    """
+    ivectors, uncertainties = arg
+
+    # An item whose i-vector field is None ends the collection
+    batch_idx, e_h, e_hh = q.get()
+    while e_h is not None:
+        ivectors[batch_idx, :] = e_h
+        uncertainties[batch_idx, :] = e_hh
+        batch_idx, e_h, e_hh = q.get()
+
+    return ivectors, uncertainties
+
+
+@process_parallel_lists
+def fa_model_loop(batch_start,
+ mini_batch_indices,
+ factor_analyser,
+ stat0,
+ stat1,
+ e_h,
+ e_hh,
+ num_thread=1):
+ """
+ Method that is called for PLDA estimation for parallelization on classes
+
+ :param batch_start: index to start at in the list
+ :param mini_batch_indices: indices of the elements in the list (should start at zero)
+ :param factor_analyser: FactorAnalyser object
+ :param stat0: matrix of zero order statistics
+ :param stat1: matrix of first order statistics
+ :param e_h: accumulator, row idx is overwritten in place for every idx of mini_batch_indices
+ :param e_hh: accumulator, entry idx is overwritten in place for every idx of mini_batch_indices
+ :param num_thread: number of parallel process to run (not read in this body; presumably used by the decorator)
+ """
+ rank = factor_analyser.F.shape[1]
+ if factor_analyser.Sigma.ndim == 2:
+ A = factor_analyser.F.T.dot(factor_analyser.F)
+ inv_lambda_unique = dict()
+ for sess in numpy.unique(stat0[:, 0]):
+ inv_lambda_unique[sess] = scipy.linalg.inv(sess * A + numpy.eye(A.shape[0]))
+ # tmp is handed to numpy.outer below as its output buffer
+ tmp = numpy.zeros((factor_analyser.F.shape[1], factor_analyser.F.shape[1]), dtype=STAT_TYPE)
+ # NOTE(review): inv_lambda_unique only exists when Sigma.ndim == 2, yet the else branch runs for any ndim != 1
+ for idx in mini_batch_indices:
+ if factor_analyser.Sigma.ndim == 1:
+ inv_lambda = scipy.linalg.inv(numpy.eye(rank) +
+ (factor_analyser.F.T * stat0[idx + batch_start, :]).dot(factor_analyser.F))
+ else:
+ inv_lambda = inv_lambda_unique[stat0[idx + batch_start, 0]]
+
+ aux = factor_analyser.F.T.dot(stat1[idx + batch_start, :])
+ numpy.dot(aux, inv_lambda, out=e_h[idx])
+ e_hh[idx] = inv_lambda + numpy.outer(e_h[idx], e_h[idx], tmp)
+
+
+class FactorAnalyser:
+ """
+ A class to train factor analyser such as total variability models and Probabilistic
+ Linear Discriminant Analysis (PLDA).
+
+ :attr mean: mean vector
+ :attr F: between class matrix
+ :attr G: within class matrix
+ :attr H: MAP covariance matrix (for Joint Factor Analysis only)
+ :attr Sigma: residual covariance matrix
+ """
+
+    def __init__(self,
+                 input_file_name=None,
+                 mean=None,
+                 F=None,
+                 G=None,
+                 H=None,
+                 Sigma=None):
+        """
+        Initialize a Factor Analyser object to None or by reading from an HDF5 file.
+        When loading from a file, other parameters can be provided to overwrite
+        each of the components.
+
+        Each attribute takes the first available value among: the matching
+        argument when it is not None, the value read from input_file_name when
+        a file name is given, None.
+
+        FactorAnalyser.read is only called when input_file_name is not None.
+
+        :param input_file_name: name of the HDF5 file to read from, default is None
+        :param mean: the mean vector
+        :param F: between class matrix
+        :param G: within class matrix
+        :param H: MAP covariance matrix
+        :param Sigma: residual covariance matrix
+        """
+        # Content of the file, when a file name is given
+        stored = None
+        if input_file_name is not None:
+            stored = FactorAnalyser.read(input_file_name)
+
+        def choose(explicit, name):
+            # An explicit argument wins over the content of the file
+            if explicit is not None:
+                return explicit
+            if stored is not None:
+                return getattr(stored, name)
+            return None
+
+        self.mean = choose(mean, "mean")
+        self.F = choose(F, "F")
+        self.G = choose(G, "G")
+        self.H = choose(H, "H")
+        self.Sigma = choose(Sigma, "Sigma")
+
+ @check_path_existance
+ def write(self, output_file_name):
+ """
+ Write a FactorAnalyser object into HDF5 file
+
+ :param output_file_name: the name of the file to write to
+ """
+ with h5py.File(output_file_name, "w") as fh:
+ kind = numpy.zeros(5, dtype="int16") # FA with 5 matrix
+ if self.mean is not None:
+ kind[0] = 1
+ fh.create_dataset("fa/mean", data=self.mean,
+ compression="gzip",
+ fletcher32=True)
+ if self.F is not None:
+ kind[1] = 1
+ fh.create_dataset("fa/f", data=self.F,
+ compression="gzip",
+ fletcher32=True)
+ if self.G is not None:
+ kind[2] = 1
+ fh.create_dataset("fa/g", data=self.G,
+ compression="gzip",
+ fletcher32=True)
+ if self.H is not None:
+ kind[3] = 1
+ fh.create_dataset("fa/h", data=self.H,
+ compression="gzip",
+ fletcher32=True)
+ if self.Sigma is not None:
+ kind[4] = 1
+ fh.create_dataset("fa/sigma", data=self.Sigma,
+ compression="gzip",
+ fletcher32=True)
+ fh.create_dataset("fa/kind", data=kind,
+ compression="gzip",
+ fletcher32=True)
+
+ @staticmethod
+ def read(input_filename):
+ """
+ Read a generic FactorAnalyser model from a HDF5 file
+
+ :param input_filename: the name of the file to read from
+
+ :return: a FactorAnalyser object
+ """
+ fa = FactorAnalyser()
+ with h5py.File(input_filename, "r") as fh:
+ kind = fh.get("fa/kind")[()]
+ if kind[0] != 0:
+ fa.mean = fh.get("fa/mean")[()]
+ if kind[1] != 0:
+ fa.F = fh.get("fa/f")[()]
+ if kind[2] != 0:
+ fa.G = fh.get("fa/g")[()]
+ if kind[3] != 0:
+ fa.H = fh.get("fa/h")[()]
+ if kind[4] != 0:
+ fa.Sigma = fh.get("fa/sigma")[()]
+ return fa
+
+ def total_variability_raw(self,
+ stat_server,
+ ubm,
+ tv_rank,
+ nb_iter=20,
+ min_div=True,
+ tv_init=None,
+ save_init=False,
+ output_file_name=None):
+ """
+ Train a total variability model using a single process on a single node.
+ This method is provided for didactic purpose and should not be used as it uses
+ to much memory and is to slow. If you want to use a single process
+ run: "total_variability_single"
+
+ :param stat_server: the StatServer containing data to train the model
+ :param ubm: a Mixture object
+ :param tv_rank: rank of the total variability model
+ :param nb_iter: number of EM iteration
+ :param min_div: boolean, if True, apply minimum divergence re-estimation
+ :param tv_init: initial matrix to start the EM iterations with
+ :param save_init: boolean, if True, save the initial matrix
+ :param output_file_name: name of the file where to save the matrix
+ """
+ assert(isinstance(stat_server, StatServer) and stat_server.validate()), \
+ "First argument must be a proper StatServer"
+ assert(isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"
+ assert(isinstance(tv_rank, int) and (0 < tv_rank <= min(stat_server.stat1.shape))), \
+ "tv_rank must be a positive integer less than the dimension of the statistics"
+ assert(isinstance(nb_iter, int) and (0 < nb_iter)), "nb_iter must be a positive integer"
+
+ gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
+
+ # Set useful variables
+ nb_sessions, sv_size = stat_server.stat1.shape
+ feature_size = ubm.mu.shape[1]
+ nb_distrib = ubm.w.shape[0]
+
+ # Whiten the statistics for diagonal or full models
+ if gmm_covariance == "diag":
+ stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. / ubm.get_invcov_super_vector())
+ elif gmm_covariance == "full":
+ stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)
+
+ # mean and Sigma are initialized at ZEROS as statistics are centered
+ self.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
+ self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)
+
+ # Initialize TV from given data or randomly
+ # mean and Sigma are initialized at ZEROS as statistics are centered
+ self.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
+ self.F = numpy.random.randn(sv_size, tv_rank) if tv_init is None else tv_init
+ self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)
+
+ # Save init if required
+ if save_init:
+ if output_file_name is None:
+ self.write(output_file_name + "temporary_factor_analyser_init.h5")
+ else:
+ self.write(output_file_name + "_init.h5")
+
+ # Estimate TV iteratively
+ for it in range(nb_iter):
+ print(f"Start it: {it}")
+ # Create accumulators for the list of models to process
+ _A = numpy.zeros((nb_distrib, tv_rank, tv_rank), dtype=STAT_TYPE)
+ _C = numpy.zeros((tv_rank, feature_size * nb_distrib), dtype=STAT_TYPE)
+
+ _R = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
+
+ # E-step:
+ index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
+
+ for sess in range(stat_server.segset.shape[0]):
+
+ inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank)
+ + (self.F.T * stat_server.stat0[sess, index_map]).dot(self.F))
+ Aux = self.F.T.dot(stat_server.stat1[sess, :])
+ e_h = Aux.dot(inv_lambda)
+ e_hh = inv_lambda + numpy.outer(e_h, e_h)
+
+ # Accumulate for minimum divergence step
+ _R += e_hh
+
+ # Accumulate for M-step
+ _C += numpy.outer(e_h, stat_server.stat1[sess, :])
+ _A += e_hh * stat_server.stat0[sess][:, numpy.newaxis, numpy.newaxis]
+
+ _R /= nb_sessions
+
+ # M-step ( + MinDiv si _R n'est pas None)
+ for g in range(nb_distrib):
+ distrib_idx = range(g * feature_size, (g + 1) * feature_size)
+ self.F[distrib_idx, :] = scipy.linalg.solve(_A[g], _C[:, distrib_idx]).T
+
+ # MINIMUM DIVERGENCE STEP
+ if min_div:
+ ch = scipy.linalg.cholesky(_R)
+ self.F = self.F.dot(ch)
+
+ # Save the FactorAnalyser if output_file_name is not None
+ if output_file_name is not None:
+ if it < nb_iter - 1:
+ self.write(output_file_name + "_it-{}.h5".format(it))
+ else:
+ self.write(output_file_name + ".h5")
+
+ def total_variability_single(self,
+ stat_server_filename,
+ ubm,
+ tv_rank,
+ nb_iter=20,
+ min_div=True,
+ tv_init=None,
+ batch_size=300,
+ save_init=False,
+ output_file_name=None):
+ """
+ Train a total variability model using a single process on a single node.
+ Use this method to run a single process on a single node with optimized code.
+
+ Optimization:
+ Only half of symmetric matrices are stored here
+ process sessions per batch in order to control the memory footprint
+
+ :param stat_server_filename: the name of the file for StatServer, containing data to train the model
+ :param ubm: a Mixture object
+ :param tv_rank: rank of the total variability model
+ :param nb_iter: number of EM iteration
+ :param min_div: boolean, if True, apply minimum divergence re-estimation
+ :param tv_init: initial matrix to start the EM iterations with
+ :param batch_size: number of sessions to process at once to reduce memory footprint
+ :param save_init: boolean, if True, save the initial matrix
+ :param output_file_name: name of the file where to save the matrix
+ """
+ assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"
+ assert (isinstance(nb_iter, int) and (0 < nb_iter)), "nb_iter must be a positive integer"
+
+ gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
+
+ # Set useful variables
+ with h5py.File(stat_server_filename, 'r') as fh:
+ nb_sessions, sv_size = fh["stat1"].shape
+ feature_size = ubm.mu.shape[1]
+ nb_distrib = ubm.w.shape[0]
+ sv_size = nb_distrib * feature_size
+
+ # Initialize TV from given data or randomly
+ # mean and Sigma are initialized at ZEROS as statistics are centered
+ self.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
+ self.F = numpy.random.randn(sv_size, tv_rank) if tv_init is None else tv_init
+ self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)
+
+ # Save init if required
+ if save_init:
+ if output_file_name is None:
+ self.write(output_file_name + "temporary_factor_analyser_init.h5")
+ else:
+ self.write(output_file_name + "_init.h5")
+
+ # Create index to replicate self.stat0 and save only upper triangular coefficients of symmetric matrices
+ index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
+ upper_triangle_indices = numpy.triu_indices(tv_rank)
+
+ # Open the StatServer file
+ with h5py.File(stat_server_filename, 'r') as fh:
+ nb_sessions, sv_size = fh['stat1'].shape
+ batch_nb = int(numpy.floor(fh['segset'].shape[0] / float(batch_size) + 0.999))
+ batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb)
+
+ # Estimate TV iteratively
+ for it in range(nb_iter):
+
+ # Create accumulators for the list of models to process
+ _A = numpy.zeros((nb_distrib, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE)
+ _C = numpy.zeros((tv_rank, feature_size * nb_distrib), dtype=STAT_TYPE)
+ _R = numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE)
+
+ # Load data per batch to reduce the memory footprint
+ for batch_idx in batch_indices:
+
+ stat0 = fh['stat0'][batch_idx, :]
+ stat1 = fh['stat1'][batch_idx, :]
+
+ e_h, e_hh = e_on_batch(stat0, stat1, ubm, self.F)
+
+ _R += numpy.sum(e_hh, axis=0)
+ _C += e_h.T.dot(stat1)
+ _A += stat0.T.dot(e_hh)
+
+ _R /= nb_sessions
+
+ # M-step
+ _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
+ for c in range(nb_distrib):
+ distrib_idx = range(c * feature_size, (c + 1) * feature_size)
+ _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = _A[c, :]
+ self.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, _C[:, distrib_idx]).T
+
+ # Minimum divergence
+ if min_div:
+ _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
+ _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = _R
+ ch = scipy.linalg.cholesky(_R_tmp)
+ self.F = self.F.dot(ch)
+
+ # Save the FactorAnalyser if output_file_name is not None
+ if output_file_name is not None:
+ if it < nb_iter - 1:
+ self.write(output_file_name + "_it-{}.h5".format(it))
+ else:
+ self.write(output_file_name + ".h5")
+
+ def total_variability(self,
+ stat_server_filename,
+ ubm,
+ tv_rank,
+ nb_iter=20,
+ min_div=True,
+ tv_init=None,
+ batch_size=300,
+ save_init=False,
+ output_file_name=None,
+ num_thread=1):
+ """
+ Train a total variability model using multiple process on a single node.
+ this method is the recommended one to train a Total Variability matrix.
+
+ Optimization:
+ Only half of symmetric matrices are stored here
+ process sessions per batch in order to control the memory footprint
+ Batches are processed by a pool of workers running in different process
+ The implementation is based on a multiple producers / single consumer approach
+
+ :param stat_server_filename: a list of StatServer file names to process
+ :param ubm: a Mixture object
+ :param tv_rank: rank of the total variability model
+ :param nb_iter: number of EM iteration
+ :param min_div: boolean, if True, apply minimum divergence re-estimation
+ :param tv_init: initial matrix to start the EM iterations with
+ :param batch_size: size of batch to load in memory for each worker
+ :param save_init: boolean, if True, save the initial matrix
+ :param output_file_name: name of the file where to save the matrix
+ :param num_thread: number of process to run in parallel
+ """
+ if not isinstance(stat_server_filename, list):
+ stat_server_filename = [stat_server_filename]
+
+ assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"
+ assert (isinstance(nb_iter, int) and (0 < nb_iter)), "nb_iter must be a positive integer"
+
+ gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
+
+ # Set useful variables
+ if not isinstance(stat_server_filename[0], h5py._hl.files.File):
+ with h5py.File(stat_server_filename[0], 'r') as fh: # open the first StatServer to get size
+ _, sv_size = fh['stat1'].shape
+ feature_size = fh['stat1'].shape[1] // fh['stat0'].shape[1]
+ distrib_nb = fh['stat0'].shape[1]
+ else:
+ _, sv_size = stat_server_filename[0]['stat1'].shape
+ feature_size = stat_server_filename[0]['stat1'].shape[1] // stat_server_filename[0]['stat0'].shape[1]
+ distrib_nb = stat_server_filename[0]['stat0'].shape[1]
+
+ upper_triangle_indices = numpy.triu_indices(tv_rank)
+
+ # mean and Sigma are initialized at ZEROS as statistics are centered
+ self.mean = numpy.zeros(ubm.get_mean_super_vector().shape, dtype=STAT_TYPE)
+ self.F = serialize(numpy.zeros((sv_size, tv_rank)).astype(STAT_TYPE))
+ if tv_init is None:
+ self.F = numpy.random.randn(sv_size, tv_rank).astype(STAT_TYPE)
+ else:
+ self.F = tv_init
+ self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape, dtype=STAT_TYPE)
+
+ # Save init if required
+ if save_init:
+ if output_file_name is None:
+ self.write(output_file_name + "temporary_factor_analyser_init.h5")
+ else:
+ self.write(output_file_name + "_init.h5")
+
+ # Estimate TV iteratively
+ for it in range(nb_iter):
+ print(f"Iteration {it}")
+ # Create serialized accumulators for the list of models to process
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore', RuntimeWarning)
+ _A = serialize(numpy.zeros((distrib_nb, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE))
+ _C = serialize(numpy.zeros((tv_rank, sv_size), dtype=STAT_TYPE))
+ _R = serialize(numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE))
+
+ total_session_nb = 0
+
+ # E-step
+ # Accumulate statistics for each StatServer from the list
+ for stat_server_file in stat_server_filename:
+
+ # get info from the current StatServer
+ if not isinstance(stat_server_file, h5py._hl.files.File):
+ fh = h5py.File(stat_server_file, 'r')
+ else:
+ fh = stat_server_file
+
+ nb_sessions = fh["modelset"].shape[0]
+ total_session_nb += nb_sessions
+ batch_nb = int(numpy.floor(nb_sessions / float(batch_size) + 0.999))
+ batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb)
+
+ manager = multiprocessing.Manager()
+ q = manager.Queue()
+ pool = multiprocessing.Pool(num_thread + 2)
+
+ # put Consumer to work first
+ watcher = pool.apply_async(e_gather, ((_A, _C, _R), q))
+ # fire off workers
+ jobs = []
+
+ # Load data per batch to reduce the memory footprint
+ for batch_idx in batch_indices:
+
+ # Create list of argument for a process
+ arg = fh["stat0"][batch_idx, :], fh["stat1"][batch_idx, :], ubm, self.F
+ job = pool.apply_async(e_worker, (arg, q))
+ jobs.append(job)
+
+ # collect results from the workers through the pool result queue
+ for job in jobs:
+ job.get()
+
+ # now we are done, kill the consumer
+ q.put((None, None, None, None))
+ pool.close()
+
+ _A, _C, _R = watcher.get()
+
+ if not isinstance(stat_server_file, h5py._hl.files.File):
+ fh.close()
+
+ _R /= total_session_nb
+
+ # M-step
+ _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
+ for c in range(distrib_nb):
+ distrib_idx = range(c * feature_size, (c + 1) * feature_size)
+ _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = _A[c, :]
+ self.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, _C[:, distrib_idx]).T
+
+ # Minimum divergence
+ if min_div:
+ _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE)
+ _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = _R
+ ch = scipy.linalg.cholesky(_R_tmp)
+ self.F = self.F.dot(ch)
+
+ # Save the FactorAnalyser if output_file_name is not None
+ if output_file_name is not None:
+ if it < nb_iter - 1:
+ self.write(output_file_name + "_it-{}.h5".format(it))
+ else:
+ self.write(output_file_name + ".h5")
+
+ def extract_ivectors_single(self,
+ ubm,
+ stat_server,
+ uncertainty=False):
+ """
+ Estimate i-vectors for a given StatServer using single process on a single node.
+
+ :param stat_server: sufficient statistics stored in a StatServer
+ :param ubm: Mixture object (the UBM)
+ :param uncertainty: boolean, if True, return an additional matrix with uncertainty matrices (diagonal of the matrices)
+
+ :return: a StatServer with i-vectors in the stat1 attribute and a matrix of uncertainty matrices (optional)
+ """
+ assert(isinstance(stat_server, StatServer) and stat_server.validate()), \
+ "First argument must be a proper StatServer"
+ assert(isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"
+
+ gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
+
+ # Set useful variables
+ tv_rank = self.F.shape[1]
+ feature_size = ubm.mu.shape[1]
+ nb_distrib = ubm.w.shape[0]
+
+ # Whiten the statistics for diagonal or full models
+ if gmm_covariance == "diag":
+ stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. / ubm.get_invcov_super_vector())
+ elif gmm_covariance == "full":
+ stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)
+
+ # Extract i-vectors
+ iv_stat_server = StatServer()
+ iv_stat_server.modelset = copy.deepcopy(stat_server.modelset)
+ iv_stat_server.segset = copy.deepcopy(stat_server.segset)
+ iv_stat_server.start = copy.deepcopy(stat_server.start)
+ iv_stat_server.stop = copy.deepcopy(stat_server.stop)
+ iv_stat_server.stat0 = numpy.ones((stat_server.modelset.shape[0], 1))
+ iv_stat_server.stat1 = numpy.ones((stat_server.modelset.shape[0], tv_rank))
+
+ iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))
+
+ # Replicate self.stat0
+ index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
+
+ for sess in range(stat_server.segset.shape[0]):
+
+ inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + (self.F.T *
+ stat_server.stat0[sess, index_map]).dot(self.F))
+ Aux = self.F.T.dot(stat_server.stat1[sess, :])
+ iv_stat_server.stat1[sess, :] = Aux.dot(inv_lambda)
+ iv_sigma[sess, :] = numpy.diag(inv_lambda + numpy.outer(iv_stat_server.stat1[sess, :],
+ iv_stat_server.stat1[sess, :]))
+
+ if uncertainty:
+ return iv_stat_server, iv_sigma
+ else:
+ return iv_stat_server
+
+ def extract_ivectors(self,
+ ubm,
+ stat_server_filename,
+ prefix='',
+ batch_size=300,
+ uncertainty=False,
+ num_thread=1):
+ """
+ Parallel extraction of i-vectors using multiprocessing module
+
+ :param ubm: Mixture object (the UBM)
+ :param stat_server_filename: name of the file from which the input StatServer is read
+ :param prefix: prefix used to store the StatServer in its file
+ :param batch_size: number of sessions to process in a batch
+ :param uncertainty: a boolean, if True, return the diagonal of the uncertainty matrices
+ :param num_thread: number of process to run in parallel
+ :return: a StatServer with i-vectors in the stat1 attribute and a matrix of uncertainty matrices (optional)
+ """
+ assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"
+
+ tv_rank = self.F.shape[1]
+
+ fh = stat_server_filename
+ if isinstance(stat_server_filename, str):
+ fh = h5py.File(stat_server_filename, 'r')
+
+ _, sv_size = fh[prefix + 'stat1'].shape
+ nb_sessions = fh[prefix + "modelset"].shape[0]
+
+ iv_server = StatServer()
+ iv_server.modelset = fh.get(prefix + 'modelset')[()]
+ iv_server.segset = fh.get(prefix + 'segset')[()]
+
+ tmpstart = fh.get(prefix+"start")[()]
+ tmpstop = fh.get(prefix+"stop")[()]
+ iv_server.start = numpy.empty(fh[prefix+"start"].shape, '|O')
+ iv_server.stop = numpy.empty(fh[prefix+"stop"].shape, '|O')
+ iv_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
+ iv_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
+
+ iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE)
+ with warnings.catch_warnings():
+ iv_server.stat1 = serialize(numpy.zeros((nb_sessions, tv_rank)))
+ iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank)))
+
+ nb_sessions = iv_server.modelset.shape[0]
+ batch_nb = int(numpy.floor(nb_sessions / float(batch_size) + 0.999))
+ batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb)
+
+ manager = multiprocessing.Manager()
+ q = manager.Queue()
+ pool = multiprocessing.Pool(num_thread + 2)
+
+ # put listener to work first
+ watcher = pool.apply_async(iv_collect, ((iv_server.stat1, iv_sigma), q))
+ # fire off workers
+ jobs = []
+
+ # Load data per batch to reduce the memory footprint
+ for batch_idx in batch_indices:
+
+ # Create list of argument for a process
+ arg = batch_idx, fh["stat0"][batch_idx, :], fh["stat1"][batch_idx, :], ubm, self.F
+ job = pool.apply_async(iv_extract_on_batch, (arg, q))
+ jobs.append(job)
+
+ # collect results from the workers through the pool result queue
+ for job in jobs:
+ job.get()
+
+ # now we are done, kill the listener
+ q.put((None, None, None))
+ pool.close()
+
+ iv_server.stat1, iv_sigma = watcher.get()
+
+ if isinstance(stat_server_filename, str):
+ fh.close()
+
+ if uncertainty:
+ return iv_server, iv_sigma
+ else:
+ return iv_server
+
+ def plda(self,
+ stat_server,
+ rank_f,
+ nb_iter=10,
+ scaling_factor=1.,
+ output_file_name=None,
+ save_partial=False,
+ save_final=True,
+ num_thread=1):
+ """
+ Train a simplified Probabilistic Linear Discriminant Analysis model (no within class covariance matrix
+ but full residual covariance matrix)
+
+ :param stat_server: StatServer object with training statistics
+ :param rank_f: rank of the between class covariance matrix
+ :param nb_iter: number of iterations to run
+ :param scaling_factor: scaling factor to downscale statistics (value bewteen 0 and 1)
+ :param output_file_name: name of the output file where to store PLDA model
+ :param save_partial: boolean, if True, save PLDA model after each iteration
+ """
+ vect_size = stat_server.stat1.shape[1]
+
+ # Initialize mean and residual covariance from the training data
+ self.mean = stat_server.get_mean_stat1()
+ self.Sigma = stat_server.get_total_covariance_stat1()
+
+ # Sum stat per model
+ model_shifted_stat, session_per_model = stat_server.sum_stat_per_model()
+ class_nb = model_shifted_stat.modelset.shape[0]
+
+ # Multiply statistics by scaling_factor
+ model_shifted_stat.stat0 *= scaling_factor
+ model_shifted_stat.stat1 *= scaling_factor
+ session_per_model *= scaling_factor
+
+ # Compute Eigen Decomposition of Sigma in order to initialize the EigenVoice matrix
+ sigma_obs = stat_server.get_total_covariance_stat1()
+ evals, evecs = scipy.linalg.eigh(sigma_obs)
+ idx = numpy.argsort(evals)[::-1]
+ evecs = evecs.real[:, idx[:rank_f]]
+ self.F = evecs[:, :rank_f]
+
+ # Estimate PLDA model by iterating the EM algorithm
+ for it in range(nb_iter):
+ logging.info('Estimate between class covariance, it %d / %d', it + 1, nb_iter)
+
+ # E-step
+ print("E_step")
+
+ # Copy stats as they will be whitened with a different Sigma for each iteration
+ local_stat = copy.deepcopy(model_shifted_stat)
+
+ # Whiten statistics (with the new mean and Sigma)
+ local_stat.whiten_stat1(self.mean, self.Sigma)
+
+ # Whiten the EigenVoice matrix
+ eigen_values, eigen_vectors = scipy.linalg.eigh(self.Sigma)
+ ind = eigen_values.real.argsort()[::-1]
+ eigen_values = eigen_values.real[ind]
+ eigen_vectors = eigen_vectors.real[:, ind]
+ sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real)
+ sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma))
+ self.F = sqr_inv_sigma.T.dot(self.F)
+
+ # Replicate self.stat0
+ index_map = numpy.zeros(vect_size, dtype=int)
+ _stat0 = local_stat.stat0[:, index_map]
+
+ e_h = numpy.zeros((class_nb, rank_f))
+ e_hh = numpy.zeros((class_nb, rank_f, rank_f))
+
+ # loop on model id's
+ fa_model_loop(batch_start=0,
+ mini_batch_indices=numpy.arange(class_nb),
+ factor_analyser=self,
+ stat0=_stat0,
+ stat1=local_stat.stat1,
+ e_h=e_h,
+ e_hh=e_hh,
+ num_thread=num_thread)
+
+ # Accumulate for minimum divergence step
+ _R = numpy.sum(e_hh, axis=0) / session_per_model.shape[0]
+
+ _C = e_h.T.dot(local_stat.stat1).dot(scipy.linalg.inv(sqr_inv_sigma))
+ _A = numpy.einsum('ijk,i->jk', e_hh, local_stat.stat0.squeeze())
+
+ # M-step
+ self.F = scipy.linalg.solve(_A, _C).T
+
+ # Update the residual covariance
+ self.Sigma = sigma_obs - self.F.dot(_C) / session_per_model.sum()
+
+ # Minimum Divergence step
+ self.F = self.F.dot(scipy.linalg.cholesky(_R))
+
+ if output_file_name is None:
+ output_file_name = "plda"
+
+ if save_partial and it < nb_iter - 1:
+ self.write(output_file_name + "_it-{}.h5".format(it))
+ elif it == nb_iter - 1 and save_final:
+ self.write(output_file_name + ".h5")
+
+
+ def weighted_likelihood_plda(self,
+ out_stat_server,
+ in_stat_server,
+ alpha,
+ rank_f,
+ nb_iter=10,
+ scaling_factor=1.,
+ output_file_name=None,
+ save_partial=False):
+ """
+ Train a simplified Probabilistic Linear Discriminant Analysis model (no within class covariance matrix
+ but full residual covariance matrix)
+
+ :param out_stat_server: out-domain StatServer object with training statistics
+ :param rank_f: rank of the between class covariance matrix
+ :param nb_iter: number of iterations to run
+ :param scaling_factor: scaling factor to downscale statistics (value bewteen 0 and 1)
+ :param output_file_name: name of the output file where to store PLDA model
+ :param save_partial: boolean, if True, save PLDA model after each iteration
+ """
+ vect_size = out_stat_server.stat1.shape[1]
+
+
+ # Initialize mean and residual covariance from the training data
+ # First version
+ self.mean = alpha * in_stat_server.get_mean_stat1() + (1-alpha) * out_stat_server.get_mean_stat1()
+ self.Sigma = alpha * in_stat_server.get_total_covariance_stat1_mix(self.mean) \
+ + (1-alpha) * out_stat_server.get_total_covariance_stat1_mix(self.mean)
+ #self.mean = in_stat_server.get_mean_stat1()
+ #self.Sigma = in_stat_server.get_total_covariance_stat1()
+
+ # # Second version
+ # in_stat_server.center_stat1(in_stat_server.get_mean_stat1())
+ # out_stat_server.center_stat1(out_stat_server.get_mean_stat1())
+ # self.mean = 0.5 * in_stat_server.get_mean_stat1() + 0.5 * out_stat_server.get_mean_stat1()
+ # self.Sigma = alpha * in_stat_server.get_total_covariance_stat1() \
+ # + (1-alpha) * out_stat_server.get_total_covariance_stat1()
+
+
+ # Sum stat per model
+ out_model_shifted_stat, out_session_per_model = out_stat_server.sum_stat_per_model()
+ in_model_shifted_stat, in_session_per_model = in_stat_server.sum_stat_per_model()
+ out_class_nb = out_model_shifted_stat.modelset.shape[0]
+ in_class_nb = in_model_shifted_stat.modelset.shape[0]
+
+ # Multiply statistics by scaling_factor
+ out_model_shifted_stat.stat0 *= scaling_factor
+ out_model_shifted_stat.stat1 *= scaling_factor
+ out_session_per_model *= scaling_factor
+
+ in_model_shifted_stat.stat0 *= scaling_factor
+ in_model_shifted_stat.stat1 *= scaling_factor
+ in_session_per_model *= scaling_factor
+
+ # Compute Eigen Decomposition of Sigma in order to initialize the EigenVoice matrix
+ sigma_obs = copy.deepcopy(self.Sigma)
+ evals, evecs = scipy.linalg.eigh(sigma_obs)
+ idx = numpy.argsort(evals)[::-1]
+ evecs = evecs.real[:, idx[:rank_f]]
+ self.F = evecs[:, :rank_f]
+
+ # Estimate PLDA model by iterating the EM algorithm
+ for it in range(nb_iter):
+ logging.info('Estimate between class covariance, it %d / %d', it + 1, nb_iter)
+
+ # E-step
+ print("E_step")
+
+ # Copy stats as they will be whitened with a different Sigma for each iteration
+ out_local_stat = copy.deepcopy(out_model_shifted_stat)
+ in_local_stat = copy.deepcopy(in_model_shifted_stat)
+
+ # Whiten statistics (with the new mean and Sigma)
+ out_local_stat.whiten_stat1(self.mean, self.Sigma)
+ in_local_stat.whiten_stat1(self.mean, self.Sigma)
+
+
+ # Whiten the EigenVoice matrix
+ eigen_values, eigen_vectors = scipy.linalg.eigh(self.Sigma)
+ ind = eigen_values.real.argsort()[::-1]
+ eigen_values = eigen_values.real[ind]
+ eigen_vectors = eigen_vectors.real[:, ind]
+ sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real)
+ sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma))
+ self.F = sqr_inv_sigma.T.dot(self.F)
+
+ # Replicate self.stat0
+ index_map = numpy.zeros(vect_size, dtype=int)
+ _stat0 = out_local_stat.stat0[:, index_map]
+
+ e_h = numpy.zeros((out_class_nb, rank_f))
+ e_hh = numpy.zeros((out_class_nb, rank_f, rank_f))
+
+ # loop on model id's
+
+ fa_model_loop(batch_start=0,
+ mini_batch_indices=numpy.arange(out_class_nb),
+ factor_analyser=self,
+ stat0=_stat0,
+ stat1=out_local_stat.stat1,
+ e_h=e_h,
+ e_hh=e_hh,
+ num_thread=1)
+ # Accumulate for minimum divergence step
+ out_R = numpy.sum(e_hh, axis=0) / out_session_per_model.shape[0]
+
+ out_C = e_h.T.dot(out_local_stat.stat1).dot(scipy.linalg.inv(sqr_inv_sigma))
+ out_A = numpy.einsum('ijk,i->jk', e_hh, out_local_stat.stat0.squeeze())
+
+ # Replicate self.stat0
+ index_map = numpy.zeros(vect_size, dtype=int)
+ _stat0 = in_local_stat.stat0[:, index_map]
+
+ e_h = numpy.zeros((in_class_nb, rank_f))
+ e_hh = numpy.zeros((in_class_nb, rank_f, rank_f))
+
+ # loop on model id's
+ fa_model_loop(batch_start=0,
+ mini_batch_indices=numpy.arange(in_class_nb),
+ factor_analyser=self,
+ stat0=_stat0,
+ stat1=in_local_stat.stat1,
+ e_h=e_h,
+ e_hh=e_hh,
+ num_thread=1)
+ # Accumulate for minimum divergence step
+ in_R = numpy.sum(e_hh, axis=0) / in_session_per_model.shape[0]
+
+ in_C = e_h.T.dot(in_local_stat.stat1).dot(scipy.linalg.inv(sqr_inv_sigma))
+ in_A = numpy.einsum('ijk,i->jk', e_hh, in_local_stat.stat0.squeeze())
+
+ in_alpha = alpha / in_session_per_model.sum()
+ out_alpha = (1 - alpha) / out_session_per_model.sum()
+
+ #import ipdb
+ #ipdb.set_trace()
+
+ _A = in_alpha * in_A + out_alpha * out_A
+ _C = in_alpha * in_C + out_alpha * out_C
+ _R = alpha * in_R + (1 - alpha) * out_R
+
+ # M-step
+ self.F = scipy.linalg.solve(_A, _C).T
+
+ # Update the residual covariance
+ self.Sigma = sigma_obs - self.F.dot(_C)
+ #self.Sigma = sigma_obs - self.F.dot(_C) / (in_alpha * in_session_per_model.sum() + out_alpha * out_session_per_model.sum())
+
+ """
+ dans la ligne du dessus on ne divise pas par le nombre total de sessions ???
+ """
+
+ # Minimum Divergence step
+ self.F = self.F.dot(scipy.linalg.cholesky(_R))
+
+ if output_file_name is None:
+ output_file_name = "temporary_plda"
+
+ if save_partial and it < nb_iter - 1:
+ self.write(output_file_name + "_it-{}.h5".format(it))
+ elif it == nb_iter - 1:
+ self.write(output_file_name + ".h5")
+
+
+
+
+
diff --git a/sidekit/sidekit/features_extractor.py b/sidekit/sidekit/features_extractor.py
new file mode 100755
index 0000000..0722952
--- /dev/null
+++ b/sidekit/sidekit/features_extractor.py
@@ -0,0 +1,750 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+"""
+Copyright 2014-2021 Sylvain Meignier and Anthony Larcher
+
+ :mod:`features_extractor` provides methods to extract acoustic features from audio files
+
+"""
+import copy
+import h5py
+import logging
+import numpy
+import os
+
+
+from . import PARAM_TYPE
+from .frontend.features import mfcc, plp
+from .frontend.io import read_audio, read_label, write_hdf5, _add_reverb, _add_noise
+from .frontend.vad import vad_snr, vad_percentil
+from .mixture import vad_energy
+from .sidekit_wrappers import process_parallel_lists
+from .bosaris import IdMap
+
+
+# Module metadata: license, authorship, contact, release status and docstring format.
+__license__ = "LGPL"
+__author__ = "Anthony Larcher & Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+class FeaturesExtractor(object):
+ """
+ A FeaturesExtractor processes an audio file in SPHERE, WAVE or RAW PCM format and extracts filter-banks,
+ cepstral coefficients, bottle-neck features (in the future) and log-energy, and performs speech activity detection.
+ """
+
+    def __init__(self,
+                 audio_filename_structure=None,
+                 feature_filename_structure=None,
+                 sampling_frequency=None,
+                 lower_frequency=None,
+                 higher_frequency=None,
+                 filter_bank=None,
+                 filter_bank_size=None,
+                 window_size=None,
+                 shift=None,
+                 ceps_number=None,
+                 vad=None,
+                 snr=None,
+                 pre_emphasis=None,
+                 save_param=None,
+                 keep_all_features=None,
+                 feature_type=None,
+                 rasta_plp=None,
+                 compressed='percentile'):
+        """
+        Store the extraction configuration.
+
+        Every argument left to None falls back to the default given below.
+
+        :param audio_filename_structure: a string that gives the structure of the input file to process
+        :param feature_filename_structure: a string that gives the structure of the output file to write
+            (default '{}')
+        :param sampling_frequency: optional, only required if processing RAW PCM. For other formats, this information
+            is read from the file (default 8000)
+        :param lower_frequency: lower frequency (in Hertz) of the filter bank
+        :param higher_frequency: higher frequency of the filter bank
+        :param filter_bank: type of filter scale to use, can be "lin" or "log" (for linear or log-scale)
+        :param filter_bank_size: number of filters in the bank
+        :param window_size: size of the sliding window to process (in seconds)
+        :param shift: time shift of the sliding window (in seconds)
+        :param ceps_number: number of cepstral coefficients to extract
+        :param vad: type of voice activity detection algorithm to use. Can be "energy", "snr", "percentil" or "lbl"
+            to read from a file
+        :param snr: signal to noise ratio used for the "snr" vad algorithm
+        :param pre_emphasis: value given for the pre-emphasis filter (default is 0.97)
+        :param save_param: list of strings that indicate which parameters to save. The strings can be:
+            "cep" for cepstral coefficients, "fb" for filter-banks, "energy" for the log-energy, "bnf"
+            for bottle-neck features and "vad" for the frame selection labels. In the resulting files, parameters are
+            always concatenated in the following order: (energy, fb, cep, bnf, vad_label). Default keeps all.
+        :param keep_all_features: boolean, if True, all frames are written; if False, keep only frames according to
+            the vad label
+        :param feature_type: family of features computed by extract(), 'mfcc' (default) or 'plp'
+        :param rasta_plp: value passed as the ``rasta`` argument of plp() when feature_type is 'plp' (default True)
+        :param compressed: compression mode handed to write_hdf5 (default 'percentile')
+        """
+        # Settings whose default is None are stored as given.
+        self.audio_filename_structure = audio_filename_structure
+        self.lower_frequency = lower_frequency
+        self.higher_frequency = higher_frequency
+        self.filter_bank = filter_bank
+        self.filter_bank_size = filter_bank_size
+        self.window_size = window_size
+        self.shift = shift
+        self.ceps_number = ceps_number
+        self.vad = vad
+        self.snr = snr
+        self.keep_all_features = keep_all_features
+
+        # Settings with a real default: an explicit None keeps that default.
+        self.feature_filename_structure = '{}' if feature_filename_structure is None else feature_filename_structure
+        self.sampling_frequency = 8000 if sampling_frequency is None else sampling_frequency
+        self.pre_emphasis = 0.97 if pre_emphasis is None else pre_emphasis
+        self.save_param = ["energy", "cep", "fb", "bnf", "vad"] if save_param is None else save_param
+        self.feature_type = 'mfcc' if feature_type is None else feature_type
+        self.rasta_plp = True if rasta_plp is None else rasta_plp
+        # Fix: this attribute used to be missing altogether when compressed=None was passed,
+        # which made every later read of self.compressed raise AttributeError.
+        self.compressed = 'percentile' if compressed is None else compressed
+
+        # Window length and shift converted to a number of samples (None while the duration is unknown).
+        self.window_sample = None
+        if self.window_size is not None and self.sampling_frequency is not None:
+            self.window_sample = int(self.window_size * self.sampling_frequency)
+
+        self.shift_sample = None
+        if self.shift is not None and self.sampling_frequency is not None:
+            self.shift_sample = int(self.shift * self.sampling_frequency)
+
+        self.show = 'empty'
+
+    def __repr__(self):
+        """Return a tab-indented, multi-line summary of the main extraction settings."""
+        # One entry per output chunk; the chunks are concatenated without any separator.
+        chunks = [
+            '\t show: {} keep_all_features: {}\n'.format(self.show, self.keep_all_features),
+            '\t audio_filename_structure: {} \n'.format(self.audio_filename_structure),
+            '\t feature_filename_structure: {} \n'.format(self.feature_filename_structure),
+            '\t pre-emphasis: {} \n'.format(self.pre_emphasis),
+            '\t lower_frequency: {} higher_frequency: {} \n'.format(self.lower_frequency, self.higher_frequency),
+            '\t sampling_frequency: {} \n'.format(self.sampling_frequency),
+            '\t filter bank: {} filters of type {}\n'.format(self.filter_bank_size, self.filter_bank),
+            '\t ceps_number: {} \n\t window_size: {} shift: {} \n'.format(
+                self.ceps_number, self.window_size, self.shift),
+            '\t vad: {} snr: {} \n'.format(self.vad, self.snr),
+        ]
+        return ''.join(chunks)
+
+    def extract(self, show, channel,
+                input_audio_filename=None,
+                output_feature_filename=None,
+                backing_store=False,
+                noise_file_name=None,
+                snr=10,
+                reverb_file_name=None,
+                reverb_level=-26.):
+        """
+        Compute the acoustic parameters (filter banks, cepstral coefficients and log-energy) of a single
+        channel of an audio file and store them, with their statistics, in an HDF5 file object.
+
+        :param show: ID of the show
+        :param channel: channel number (0 if mono file)
+        :param input_audio_filename: name of the input audio file to consider if the name of the audio file is
+            independent from the ID of the show; when given it REPLACES self.audio_filename_structure
+        :param output_feature_filename: name of the output feature file to consider if the name of the feature
+            file is independent from the ID of the show; when given it REPLACES self.feature_filename_structure
+        :param backing_store: boolean, if False, nothing is written to disk, if True, the file is written to disk
+            when closed
+        :param noise_file_name: optional file handed to _add_noise to corrupt the selected channel
+        :param snr: value passed to _add_noise together with noise_file_name
+        :param reverb_file_name: optional file handed to _add_reverb to reverberate the selected channel
+        :param reverb_level: value passed to _add_reverb together with reverb_file_name
+
+        :return: the h5py.File handler holding the features; closing it is left to the caller
+        :raises ValueError: if self.feature_type is neither 'mfcc' nor 'plp'
+        """
+        # If the audio file name does not depend on the ID of the show (i.e. audio_filename_structure
+        # contains no {}), the structure is overwritten with the explicit input_audio_filename.
+        if input_audio_filename is not None:
+            self.audio_filename_structure = input_audio_filename
+        audio_filename = self.audio_filename_structure.format(show)
+
+        # Same rule for the output: an explicit output_feature_filename replaces the structure.
+        if output_feature_filename is not None:
+            self.feature_filename_structure = output_feature_filename
+        feature_filename = self.feature_filename_structure.format(show)
+
+        # Open audio file, get the signal and possibly the sampling frequency
+        signal, sample_rate = read_audio(audio_filename, self.sampling_frequency)
+        if signal.ndim == 1:
+            signal = signal[:, numpy.newaxis]
+        # Add noise and reverberation to the signal if required
+        if noise_file_name is not None:
+            signal[:, channel] = _add_noise(signal[:, channel], noise_file_name, snr, sample_rate)
+
+        if reverb_file_name is not None:
+            signal[:, channel] = _add_reverb(signal[:, channel], reverb_file_name, sample_rate, reverb_level)
+
+        # Process the target channel to return Filter-Banks, Cepstral coefficients and BNF if required
+        length, chan = signal.shape
+
+        # If the size of the signal is not enough for one frame, return zero features
+        if length < self.window_sample:
+            cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
+            energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
+            fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
+            label = numpy.empty((0, 1), dtype='int8')
+
+        else:
+            # Random noise is added to the input signal to avoid zero frames.
+            numpy.random.seed(0)
+            signal[:, channel] += 0.0001 * numpy.random.randn(signal.shape[0])
+
+            dec = self.shift_sample * 250 * 25000 + self.window_sample
+            dec2 = self.window_sample - self.shift_sample
+            start = 0
+            end = min(dec, length)
+
+            # Process the signal by batch to avoid problems for very long signals
+            # NOTE(review): each pass overwrites cep/energy/fb/label instead of accumulating them, so if
+            # the loop ever runs more than once only the last batch is kept - confirm this is intended.
+            while start < (length - dec2):
+                logging.info('process part : %f %f %f',
+                             start / self.sampling_frequency,
+                             end / self.sampling_frequency,
+                             length / self.sampling_frequency)
+
+                if self.feature_type == 'mfcc':
+                    # Extract cepstral coefficients, energy and filter banks
+                    cep, energy, _, fb = mfcc(signal[start:end, channel],
+                                              fs=self.sampling_frequency,
+                                              lowfreq=self.lower_frequency,
+                                              maxfreq=self.higher_frequency,
+                                              nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
+                                              nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
+                                              nwin=self.window_size,
+                                              shift=self.shift,
+                                              nceps=self.ceps_number,
+                                              get_spec=False,
+                                              get_mspec=True,
+                                              prefac=self.pre_emphasis)
+                elif self.feature_type == 'plp':
+                    cep, energy, _, fb = plp(signal[start:end, channel],
+                                             nwin=self.window_size,
+                                             fs=self.sampling_frequency,
+                                             plp_order=self.ceps_number,
+                                             shift=self.shift,
+                                             get_spec=False,
+                                             get_mspec=True,
+                                             prefac=self.pre_emphasis,
+                                             rasta=self.rasta_plp)
+                else:
+                    # Used to surface later as an unbound-variable error on cep.
+                    raise ValueError("Unsupported feature_type: {}".format(self.feature_type))
+
+                # Perform feature selection
+                label, threshold = self._vad(cep, energy, fb, signal[start:end, channel])
+
+                # Pad the labels with False so that there is one label per frame
+                if len(label) < len(energy):
+                    label = numpy.hstack((label, numpy.zeros(len(energy)-len(label), dtype='bool')))
+
+                start = end - dec2
+                end = min(end + dec, length)
+                if cep.shape[0] > 0:
+                    logging.info('!! size of signal cep: %f len %d type size %d', cep[-1].nbytes/1024/1024,
+                                 len(cep[-1]),
+                                 cep[-1].nbytes/len(cep[-1]))
+
+        # Compute the mean and std of fb and cepstral coefficient computed for all selected frames
+        energy_mean = energy[label].mean(axis=0)
+        energy_std = energy[label].std(axis=0)
+        fb_mean = fb[label, :].mean(axis=0)
+        fb_std = fb[label, :].std(axis=0)
+        cep_mean = cep[label, :].mean(axis=0)
+        cep_std = cep[label, :].std(axis=0)
+
+        # Bottle-neck features are not computed by this method: None is always handed to write_hdf5.
+        # Fix: these names used to be bound only when "bnf" was absent from save_param, so the default
+        # save_param (which lists "bnf") ended in a NameError at the write_hdf5 call.
+        bnf = None
+        bnf_mean = None
+        bnf_std = None
+
+        # Create the HDF5 file
+        # Create the directory if it doesn't exist (exist_ok avoids the check-then-create race)
+        dir_name = os.path.dirname(feature_filename)  # get the path
+        if dir_name:
+            os.makedirs(dir_name, exist_ok=True)
+        h5f = h5py.File(feature_filename, 'w', backing_store=backing_store, driver='core')
+        if "cep" not in self.save_param:
+            cep = None
+            cep_mean = None
+            cep_std = None
+        if "energy" not in self.save_param:
+            energy = None
+            energy_mean = None
+            energy_std = None
+        if "fb" not in self.save_param:
+            fb = None
+            fb_mean = None
+            fb_std = None
+        if "vad" not in self.save_param:
+            label = None
+        logging.info(label)
+        write_hdf5(show, h5f,
+                   cep, cep_mean, cep_std,
+                   energy, energy_mean, energy_std,
+                   fb, fb_mean, fb_std,
+                   bnf, bnf_mean, bnf_std,
+                   label,
+                   self.compressed)
+
+        return h5f
+
+    def extract_from_signal(self, signal,
+                            sample_rate,
+                            noise_file_name=None,
+                            snr=10,
+                            reverb_file_name=None,
+                            reverb_level=-26.):
+        """
+        Compute the acoustic parameters (filter banks, cepstral coefficients and log-energy) of the first
+        channel of a signal that is already loaded in memory. Nothing is written to disk.
+
+        The input array is written to: noise/reverberation (if requested) and a small random dither are
+        stored back into its first channel.
+
+        :param signal: the audio samples; a 1-dimensional array is treated as a single channel, otherwise
+            only column 0 is processed
+        :param sample_rate: only forwarded to _add_noise and _add_reverb; the feature extraction itself uses
+            self.sampling_frequency
+        :param noise_file_name: optional file handed to _add_noise to corrupt the signal
+        :param snr: value passed to _add_noise together with noise_file_name
+        :param reverb_file_name: optional file handed to _add_reverb to reverberate the signal
+        :param reverb_level: value passed to _add_reverb together with reverb_file_name
+
+        :return: a tuple (label, energy, cep, fb)
+        :raises ValueError: if self.feature_type is neither 'mfcc' nor 'plp'
+        """
+        if signal.ndim == 1:
+            signal = signal[:, numpy.newaxis]
+
+        # Add noise and reverberation to the signal if required
+        if noise_file_name is not None:
+            signal[:, 0] = _add_noise(signal[:, 0], noise_file_name, snr, sample_rate)
+
+        if reverb_file_name is not None:
+            signal[:, 0] = _add_reverb(signal[:, 0], reverb_file_name, sample_rate, reverb_level)
+
+        # Process the target channel to return Filter-Banks, Cepstral coefficients and BNF if required
+        length, chan = signal.shape
+
+        # If the size of the signal is not enough for one frame, return zero features
+        if length < self.window_sample:
+            cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
+            energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
+            fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
+            label = numpy.empty((0, 1), dtype='int8')
+
+        else:
+            # Random noise is added to the input signal to avoid zero frames.
+            numpy.random.seed(0)
+            signal[:, 0] += 0.0001 * numpy.random.randn(signal.shape[0])
+
+            dec = self.shift_sample * 250 * 25000 + self.window_sample
+            dec2 = self.window_sample - self.shift_sample
+            start = 0
+            end = min(dec, length)
+
+            # Process the signal by batch to avoid problems for very long signals
+            # NOTE(review): each pass overwrites cep/energy/fb/label instead of accumulating them, so if
+            # the loop ever runs more than once only the last batch is returned - confirm this is intended.
+            while start < (length - dec2):
+                logging.info('process part : %f %f %f',
+                             start / self.sampling_frequency,
+                             end / self.sampling_frequency,
+                             length / self.sampling_frequency)
+
+                if self.feature_type == 'mfcc':
+                    # Extract cepstral coefficients, energy and filter banks
+                    cep, energy, _, fb = mfcc(signal[start:end, 0],
+                                              fs=self.sampling_frequency,
+                                              lowfreq=self.lower_frequency,
+                                              maxfreq=self.higher_frequency,
+                                              nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
+                                              nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
+                                              nwin=self.window_size,
+                                              shift=self.shift,
+                                              nceps=self.ceps_number,
+                                              get_spec=False,
+                                              get_mspec=True,
+                                              prefac=self.pre_emphasis)
+                elif self.feature_type == 'plp':
+                    cep, energy, _, fb = plp(signal[start:end, 0],
+                                             nwin=self.window_size,
+                                             fs=self.sampling_frequency,
+                                             plp_order=self.ceps_number,
+                                             shift=self.shift,
+                                             get_spec=False,
+                                             get_mspec=True,
+                                             prefac=self.pre_emphasis,
+                                             rasta=self.rasta_plp)
+                else:
+                    # Used to surface later as an unbound-variable error on cep.
+                    raise ValueError("Unsupported feature_type: {}".format(self.feature_type))
+
+                # Perform feature selection
+                label, threshold = self._vad(cep, energy, fb, signal[start:end, 0])
+
+                # Pad the labels with False so that there is one label per frame
+                if len(label) < len(energy):
+                    label = numpy.hstack((label, numpy.zeros(len(energy) - len(label), dtype='bool')))
+
+                start = end - dec2
+                end = min(end + dec, length)
+                if cep.shape[0] > 0:
+                    logging.info('!! size of signal cep: %f len %d type size %d', cep[-1].nbytes / 1024 / 1024,
+                                 len(cep[-1]),
+                                 cep[-1].nbytes / len(cep[-1]))
+
+        return label, energy, cep, fb
+
+    def save(self,
+             show,
+             channel=0,
+             input_audio_filename=None,
+             output_feature_filename=None,
+             noise_file_name=None,
+             snr=10,
+             reverb_file_name=None,
+             reverb_level=-26.):
+        """
+        Extract the acoustic parameters of one channel of an audio file with :meth:`extract` and
+        save them to disk in HDF5 format.
+
+        :param show: ID of the show, forwarded to extract
+        :param channel: channel number, forwarded to extract
+        :param input_audio_filename: explicit input file name, forwarded to extract
+        :param output_feature_filename: explicit output file name, forwarded to extract
+        :param noise_file_name: noise file, forwarded to extract
+        :param snr: value used with the noise file, forwarded to extract
+        :param reverb_file_name: reverberation file, forwarded to extract
+        :param reverb_level: value used with the reverberation file, forwarded to extract
+        :return: None
+        """
+        # Options that differ from a plain in-memory extraction
+        degradation = dict(noise_file_name=noise_file_name,
+                           snr=snr,
+                           reverb_file_name=reverb_file_name,
+                           reverb_level=reverb_level)
+
+        # backing_store=True: the content of the handler is written to disk when it is closed
+        handler = self.extract(show, channel, input_audio_filename, output_feature_filename,
+                               backing_store=True, **degradation)
+        logging.info(handler.filename)
+
+        # Write the hdf5 file to disk
+        handler.close()
+
+    @staticmethod
+    def _save(show,
+              feature_filename_structure,
+              save_param,
+              cep,
+              energy,
+              fb,
+              bnf,
+              label,
+              compressed='percentile'):
+        """
+        Write already computed features, with their mean and standard deviation over the frames
+        selected by ``label``, into an HDF5 file that is opened in append mode and closed before returning.
+
+        :param show: ID of the show; also used to format feature_filename_structure
+        :param feature_filename_structure: structure of the output file name, formatted with ``show``
+        :param save_param: list of the parameters to keep among "cep", "energy", "fb", "bnf" and "vad";
+            anything that is not listed is written as None
+        :param cep: cepstral coefficients, one row per frame
+        :param energy: log-energy, one value per frame
+        :param fb: filter-bank coefficients, one row per frame
+        :param bnf: bottle-neck features, or None when they are not available
+        :param label: frame selection used to index the rows of the features
+        :param compressed: compression mode handed to write_hdf5
+        :return: None
+        """
+        feature_filename = feature_filename_structure.format(show)
+        logging.info('output finename: '+feature_filename)
+        dir_name = os.path.dirname(feature_filename)  # get the path
+        if dir_name:
+            # exist_ok avoids the check-then-create race when several processes write side by side
+            os.makedirs(dir_name, exist_ok=True)
+
+        h5f = h5py.File(feature_filename, 'a', backing_store=True, driver='core')
+        try:
+            if "cep" not in save_param:
+                cep = None
+                cep_mean = None
+                cep_std = None
+            else:
+                cep_mean = cep[label, :].mean(axis=0)
+                cep_std = cep[label, :].std(axis=0)
+            if "energy" not in save_param:
+                energy = None
+                energy_mean = None
+                energy_std = None
+            else:
+                energy_mean = energy[label].mean(axis=0)
+                energy_std = energy[label].std(axis=0)
+            if "fb" not in save_param:
+                fb = None
+                fb_mean = None
+                fb_std = None
+            else:
+                fb_mean = fb[label, :].mean(axis=0)
+                fb_std = fb[label, :].std(axis=0)
+            # Fix: bnf_mean / bnf_std used to be left undefined whenever "bnf" was listed in save_param,
+            # which ended in a NameError at the write_hdf5 call (save_multispeakers passes bnf=None).
+            if "bnf" not in save_param or bnf is None:
+                bnf = None
+                bnf_mean = None
+                bnf_std = None
+            else:
+                bnf_mean = bnf[label, :].mean(axis=0)
+                bnf_std = bnf[label, :].std(axis=0)
+            if "vad" not in save_param:
+                label = None
+            logging.info(label)
+
+            write_hdf5(show, h5f,
+                       cep, cep_mean, cep_std,
+                       energy, energy_mean, energy_std,
+                       fb, fb_mean, fb_std,
+                       bnf, bnf_mean, bnf_std,
+                       label, compressed)
+        finally:
+            # Release the handler even when the statistics or write_hdf5 fail
+            h5f.close()
+
+    def save_multispeakers(self,
+                           idmap,
+                           channel=0,
+                           input_audio_filename=None,
+                           output_feature_filename=None,
+                           keep_all=True,
+                           skip_existing_file=False,
+                           compressed='percentile'):
+        """
+        Extract the features of every show listed in an IdMap once, recompute the frame selection
+        separately for each (show, id) pair and save the result with :meth:`_save`.
+
+        While running, self.vad, self.save_param and self.compressed are overridden; they are set
+        back at the end of the method (not if an exception is raised on the way).
+
+        :param idmap: object whose rightids, leftids, start and stop are iterated together as
+            (show, id, start, stop)
+        :param channel: channel number, forwarded to extract
+        :param input_audio_filename: explicit input file name, forwarded to extract
+        :param output_feature_filename: structure of the output file name; defaults to
+            self.feature_filename_structure
+        :param keep_all: if True, one output per show holding all frames and the merged labels;
+            if False, one output per "show/id" holding only the frames of that id
+        :param skip_existing_file: if True, segments whose output file already exists are ignored
+        :param compressed: NOTE(review): never read in the body, self.compressed is used instead - confirm
+        :return: a deep copy of idmap when keep_all is True, otherwise a new IdMap describing the
+            "show/id" outputs (start 0, stop = number of frames kept)
+        """
+        # Remember the settings that are temporarily overridden
+        param_vad = self.vad
+        save_param = copy.deepcopy(self.save_param)
+        self.save_param = ["energy", "cep", "fb", "vad"]
+
+        self.vad = None
+        if output_feature_filename is None:
+            output_feature_filename = self.feature_filename_structure
+
+        # tmp_dict[show][id] -> frame indices of all the segments of this id in this show
+        tmp_dict = dict()
+        nb = 0  # NOTE(review): incremented below but never read
+        for show, _id, start, stop in zip(idmap.rightids, idmap.leftids, idmap.start, idmap.stop):
+
+            if skip_existing_file:
+                if keep_all:
+                    file_name = output_feature_filename.format(show)
+                else:
+                    file_name = output_feature_filename.format(show+'/' + _id)
+                if os.path.isfile(file_name):
+                    logging.info('existing file: SKIP '+file_name)
+                    continue
+
+            if show not in tmp_dict:
+                tmp_dict[show] = dict()
+            # NOTE(review): arange(start, stop-1) leaves out index stop-1 - confirm this is intended
+            if _id not in tmp_dict[show]:
+                tmp_dict[show][_id] = numpy.arange(start, stop-1)
+                nb += 1
+            else:
+                tmp_dict[show][_id] = numpy.concatenate((tmp_dict[show][_id], numpy.arange(start, stop-1)), axis=0)
+
+
+        output_show = list()
+        output_id = list()
+        output_start = list()
+        output_stop = list()
+        # 'percentile' compression is postponed to _save: extract runs with 'none' in the meantime.
+        # NOTE(review): any value of self.compressed other than 'percentile' ends up replaced by
+        # 'none' when the method returns - confirm this is intended.
+        global_compression = 'none'
+        if self.compressed == 'percentile':
+            global_compression = 'percentile'
+            self.compressed = 'none'
+
+        for show in tmp_dict:
+            # temp_file_name = tempfile.NamedTemporaryFile().name
+            # logging.info('tmp file name: '+temp_file_name)
+            # Extract the whole show in memory without any frame selection
+            self.vad = None
+            h5f = self.extract(show, channel, input_audio_filename, backing_store=False)
+            energy = h5f.get(show + '/energy')[()]
+            label = h5f.get(show + '/vad')[()]
+            fb = h5f.get(show + '/fb')[()]
+            cep = h5f.get(show + '/cep')[()]
+            h5f.close()
+            self.vad = param_vad
+            l = energy.shape[0]
+            for _id in tmp_dict[show]:
+                # Drop the indices that fall beyond the number of extracted frames
+                idx = tmp_dict[show][_id]
+                idx = idx[idx < l]
+                # Threshold estimated on the frames of this id only, then applied to those frames
+                _, threshold_id = self._vad(None, energy[idx], None, None)
+                logging.info('show: ' + show + ' cluster: ' + _id + ' thr:' + str(threshold_id))
+                label_id = energy > threshold_id
+                label[idx] = label_id[idx].flatten()
+
+                if not keep_all:
+                    output_show.append(show + '/' + _id)
+                    output_id.append(_id)
+                    output_start.append(0)
+                    output_stop.append(idx.shape[0])
+                    logging.info('keep_all id: ' + show + ' show: ' + show + '/' + _id + ' start: 0 stop: ' +
+                                 str(idx.shape[0]))
+                    self._save(show+'/' + _id,
+                               output_feature_filename,
+                               save_param, cep[idx],
+                               energy[idx],
+                               fb[idx],
+                               None,
+                               label[idx],
+                               global_compression)
+
+            if keep_all:
+                self._save(show, output_feature_filename, save_param, cep, energy, fb, None, label, global_compression)
+
+        # Put the overridden settings back
+        self.vad = param_vad
+        self.save_param = save_param
+
+        self.compressed = global_compression
+
+        if keep_all:
+            return copy.deepcopy(idmap)
+        out_idmap = IdMap()
+        out_idmap.set(numpy.array(output_id),
+                      numpy.array(output_show),
+                      start=numpy.array(output_start, dtype='int32'),
+                      stop=numpy.array(output_stop, dtype='int32'))
+        return out_idmap
+
+    def _vad(self, cep, log_energy, fb, x, label_file_name=None):
+        """
+        Apply the Voice Activity Detection selected by self.vad.
+
+        :param cep: cepstral coefficients (not used yet, kept for future VAD)
+        :param log_energy: logarithm of the energy
+        :param fb: filter bank coefficients (not used yet, kept for future VAD)
+        :param x: signal, only used by the "snr" detector
+        :param label_file_name: file read by the "lbl" mode
+        :return: a tuple (label, threshold); the threshold is -inf unless the "energy" or "percentil"
+            detector provides one, and the label is None for the "dnn" mode and for unknown modes
+        """
+        mode = self.vad
+        no_threshold = -numpy.inf
+
+        if mode is None:
+            # No detection requested: every frame is selected
+            logging.info('no vad')
+            return numpy.array([True] * log_energy.shape[0]), no_threshold
+
+        if mode == 'snr':
+            logging.info('vad : snr')
+            samples_per_window = int(self.window_size * self.sampling_frequency)
+            selected = vad_snr(x, self.snr, fs=self.sampling_frequency,
+                               shift=self.shift, nwin=samples_per_window)
+            return selected, no_threshold
+
+        if mode == 'energy':
+            logging.info('vad : energy')
+            selected, energy_threshold = vad_energy(log_energy, distrib_nb=3,
+                                                    nb_train_it=8, flooring=0.0001,
+                                                    ceiling=1.5, alpha=0.2)
+            return selected, energy_threshold
+
+        if mode == 'percentil':
+            selected, percentil_threshold = vad_percentil(log_energy, 10)
+            logging.info('percentil '+str(percentil_threshold))
+            return selected, percentil_threshold
+
+        if mode == 'lbl':  # load existing labels as reference
+            logging.info('vad : lbl')
+            return read_label(label_file_name), no_threshold
+
+        # 'dnn' is accepted but not implemented yet (TO DO); anything else is reported
+        if mode != 'dnn':
+            logging.warning('Wrong VAD type')
+        return None, no_threshold
+
+    @process_parallel_lists
+    def save_list(self,
+                  show_list,
+                  channel_list,
+                  audio_file_list=None,
+                  feature_file_list=None,
+                  noise_file_list=None,
+                  snr_list=None,
+                  reverb_file_list=None,
+                  reverb_levels=None,
+                  num_thread=1):
+        """
+        Compute the acoustic parameters (filter banks, cepstral coefficients and log-energy)
+        for a list of audio files and save them to disk in HDF5 format by calling :meth:`save`
+        once per item.
+
+        :param show_list: list of IDs of the shows to process (None: one None per item)
+        :param channel_list: list of channel indices corresponding to each show (None: channel 0 everywhere)
+        :param audio_file_list: list of input audio files if the name is independent from the ID of the show
+        :param feature_file_list: list of output feature files if the name is independent from the ID of the show
+        :param noise_file_list: list of noise files, one per item (None: no noise, snr_list is then ignored)
+        :param snr_list: list of values used with the noise files (None: 5. for every item)
+        :param reverb_file_list: list of reverberation files, one per item (None: no reverberation,
+            reverb_levels is then ignored)
+        :param reverb_levels: list of values used with the reverberation files (None: -26. for every item)
+        :param num_thread: not read in the body; presumably consumed by @process_parallel_lists to
+            parallelize the call - confirm against the decorator
+        :return: None
+        """
+        # TODO: manage multi-processing when writing in a single file
+        # (use a queue, but can we still use @process_parallel_lists?)
+
+        logging.info(self)
+
+        # get the length of the longest list
+        lengths = [len(candidate)
+                   for candidate in (show_list, channel_list, audio_file_list, feature_file_list)
+                   if candidate is not None]
+        max_length = int(max(lengths))
+
+        def _all_none():
+            # An object array created by numpy.empty holds None in every cell
+            return numpy.empty(max_length, dtype='|O')
+
+        if show_list is None:
+            show_list = _all_none()
+        if channel_list is None:
+            # Fix: None was accepted by the length computation above but then broke zip() below;
+            # fall back on channel 0, the default of save().
+            channel_list = numpy.zeros(max_length, dtype=int)
+        if audio_file_list is None:
+            audio_file_list = _all_none()
+        if feature_file_list is None:
+            feature_file_list = _all_none()
+        if noise_file_list is None:
+            noise_file_list = _all_none()
+            snr_list = _all_none()
+        elif snr_list is None:
+            snr_list = numpy.full(max_length, 5.)
+        if reverb_file_list is None:
+            reverb_file_list = _all_none()
+            reverb_levels = _all_none()
+        elif reverb_levels is None:
+            reverb_levels = numpy.full(max_length, -26.)
+
+        items = zip(show_list, channel_list, audio_file_list, feature_file_list,
+                    noise_file_list, snr_list, reverb_file_list, reverb_levels)
+        for show, channel, audio_file, feature_file, noise_file, snr, reverb_file, reverb_level in items:
+            self.save(show,
+                      channel=channel,
+                      input_audio_filename=audio_file,
+                      output_feature_filename=feature_file,
+                      noise_file_name=noise_file,
+                      snr=snr,
+                      reverb_file_name=reverb_file,
+                      reverb_level=reverb_level)
diff --git a/sidekit/sidekit/features_server.py b/sidekit/sidekit/features_server.py
new file mode 100755
index 0000000..977742f
--- /dev/null
+++ b/sidekit/sidekit/features_server.py
@@ -0,0 +1,726 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see http://www.gnu.org/licenses/.
+
+"""
+Copyright 2014-2021 Sylvain Meignier and Anthony Larcher
+
+ :mod:`features_server` provides methods to manage features
+
+"""
+import copy
+import multiprocessing
+import numpy
+import logging
+import h5py
+
+from sidekit.frontend.features import pca_dct, shifted_delta_cepstral, compute_delta, framing, dct_basis
+from sidekit.frontend.io import read_hdf5_segment
+from sidekit.frontend.vad import label_fusion
+from sidekit.frontend.normfeat import cms, cmvn, stg, cep_sliding_norm, rasta_filt
+from sidekit.sv_utils import parse_mask
+from sidekit.features_extractor import FeaturesExtractor
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher & Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+#comment
+
+class FeaturesServer(object):
+    """
+    Management of features. FeaturesServer instances load datasets from HDF5 files
+    (that can be read from disk or produced by a FeaturesExtractor object).
+    Datasets read from one or many files are concatenated and post-processed.
+    """
+
+    def __init__(self,
+                 features_extractor=None,
+                 feature_filename_structure=None,
+                 sources=None,
+                 dataset_list=None,
+                 mask=None,
+                 feat_norm=None,
+                 global_cmvn=None,
+                 dct_pca=False,
+                 dct_pca_config=None,
+                 sdc=False,
+                 sdc_config=None,
+                 delta=None,
+                 double_delta=None,
+                 delta_filter=None,
+                 context=None,
+                 traps_dct_nb=None,
+                 rasta=None,
+                 keep_all_features=True):
+        """
+        Set up a FeaturesServer for one of two usages:
+
+        1. each call to load reads datasets from a single file; this mode requires a
+           dataset_list (the datasets to read from each file).
+        2. each call to load reads datasets from several files (possibly several datasets
+           per file) and concatenates them. In this mode, provide one FeaturesServer per
+           source so that each source is post-processed independently before the
+           concatenation; the concatenated dataset is then post-processed again.
+
+        Any argument left to None falls back to the default listed below.
+
+        :param features_extractor: a FeaturesExtractor if features must be extracted from audio,
+            if None, data are loaded from an existing HDF5 file
+        :param feature_filename_structure: structure of the filename used to load HDF5 files,
+            default is '{}'
+        :param sources: tuple of sources (only when datasets are loaded from several files
+            and concatenated), default is an empty tuple
+        :param dataset_list: string of the form '["cep", "fb", "vad", "energy", "bnf"]'
+            listing the datasets to load (only when loading from a single file)
+        :param mask: string of the form '[1-3,10,15-20]', mask applied on the concatenated
+            dataset to select specific components. In this example, coefficients
+            1,2,3,10,15,16,17,18,19,20 are kept. The string is converted with parse_mask
+        :param feat_norm: type of normalization to apply as post-processing
+        :param global_cmvn: boolean, if True, use a global mean and std when normalizing the frames
+        :param dct_pca: if True, add temporal context by using a PCA-DCT approach
+        :param dct_pca_config: configuration of the PCA-DCT, default is (12, 12, None)
+        :param sdc: if True, compute shifted delta cepstra coefficients
+        :param sdc_config: configuration to compute sdc coefficients, default is (1, 3, 7)
+        :param delta: if True, append the first order derivative
+        :param double_delta: if True, append the second order derivative
+        :param delta_filter: coefficients of the filter used to compute delta coefficients
+        :param context: add a left and right context, default is (0, 0)
+        :param traps_dct_nb: number of DCT coefficients to keep when computing TRAP coefficients,
+            default is 0
+        :param rasta: if True, perform RASTA filtering
+        :param keep_all_features: boolean, if True, keep all features, if False, keep frames
+            according to the vad labels
+        :return:
+        """
+        def _or_default(value, default):
+            """Return value, or default when value is None."""
+            return default if value is None else value
+
+        # Where the features come from (these four default to None / '{}' / ())
+        self.features_extractor = features_extractor
+        self.feature_filename_structure = _or_default(feature_filename_structure, '{}')
+        self.sources = _or_default(sources, ())
+        self.dataset_list = dataset_list
+
+        # Post processing options
+        self.mask = None if mask is None else parse_mask(mask)
+        self.feat_norm = feat_norm
+        self.global_cmvn = global_cmvn
+        self.dct_pca = _or_default(dct_pca, False)
+        self.dct_pca_config = _or_default(dct_pca_config, (12, 12, None))
+        self.sdc = _or_default(sdc, False)
+        self.sdc_config = _or_default(sdc_config, (1, 3, 7))
+        self.delta = _or_default(delta, False)
+        self.double_delta = _or_default(double_delta, False)
+        self.delta_filter = _or_default(delta_filter,
+                                        numpy.array([.25, .5, .25, 0, -.25, -.5, -.25]))
+        self.context = _or_default(context, (0, 0))
+        self.traps_dct_nb = _or_default(traps_dct_nb, 0)
+        self.rasta = _or_default(rasta, False)
+        self.keep_all_features = _or_default(keep_all_features, True)
+
+        # State of the last call to load, used to return the previous result
+        # when the same request is repeated
+        self.show = 'empty'
+        self.input_feature_filename = 'empty'
+        self.start_stop = (None, None)
+        self.previous_load = None
+
+    def __repr__(self):
+        """
+        Describe the server as a tab-indented, multi-line text: the current show, the
+        input file name, the filename structure and the post-processing options.
+
+        :return: a string to display the object
+        """
+        fragments = [
+            '\t show: {} \n\n'.format(self.show),
+            '\t input_feature_filename: {} \n\n'.format(self.input_feature_filename),
+            '\t feature_filename_structure: {} \n'.format(self.feature_filename_structure),
+            '\t \n',
+            '\t \n\n',
+            '\t Post processing options: \n',
+            '\t\t mask: {} \n'.format(self.mask),
+            '\t\t feat_norm: {} \n'.format(self.feat_norm),
+            '\t\t dct_pca: {}, dct_pca_config: {} \n'.format(self.dct_pca, self.dct_pca_config),
+            '\t\t sdc: {}, sdc_config: {} \n'.format(self.sdc, self.sdc_config),
+            '\t\t delta: {}, double_delta: {}, delta_filter: {} \n'.format(self.delta,
+                                                                          self.double_delta,
+                                                                          self.delta_filter),
+            '\t\t rasta: {} \n'.format(self.rasta),
+            '\t\t keep_all_features: {} \n'.format(self.keep_all_features),
+        ]
+        return ''.join(fragments)
+
+    def post_processing(self, feat, label, global_mean=None, global_std=None):
+        """
+        Post-process acoustic parameters (cepstral coefficients, filter banks or bottleneck
+        parameters) once they have been computed or read from file.
+
+        The steps are applied in this order: RASTA filtering, temporal context, mask,
+        label fusion, normalization and, unless keep_all_features is set, frame selection.
+
+        :param feat: the matrix of acoustic parameters to post-process
+        :param label: the VAD labels for the acoustic parameters
+        :param global_mean: vector of mean to use for normalization
+        :param global_std: vector of standard deviation to use for normalization
+
+        :return: the matrix of acoustic parameters and their VAD labels after post-processing
+        """
+        # RASTA filtering comes first when it is enabled
+        if self.rasta:
+            feat, label = self._rasta(feat, label)
+
+        # Temporal context: only one scheme is applied, deltas take precedence
+        # over PCA-DCT, which takes precedence over shifted delta cepstra
+        if self.delta or self.double_delta:
+            feat = self._delta_and_2delta(feat)
+        elif self.dct_pca:
+            pca_cfg = self.dct_pca_config
+            feat = pca_dct(feat, pca_cfg[0], pca_cfg[1], pca_cfg[2])
+        elif self.sdc:
+            sdc_cfg = self.sdc_config
+            feat = shifted_delta_cepstral(feat, d=sdc_cfg[0], p=sdc_cfg[1], k=sdc_cfg[2])
+
+        # Keep only the components selected by the mask
+        if self.mask is not None:
+            feat = self._mask(feat)
+
+        # Smooth the labels and fuse the channels if more than one.
+        logging.debug('Smooth the labels and fuse the channels if more than one')
+        label = label_fusion(label)
+
+        # Normalization; the result of _normalize is not used, the method is
+        # documented as working in place
+        if self.feat_norm is not None:
+            self._normalize(label, feat, global_mean, global_std)
+        else:
+            logging.debug('no norm')
+
+        if self.keep_all_features:
+            return feat, label
+
+        # Only the frames selected by the labels are kept
+        logging.debug('no keep all')
+        feat = feat[label]
+        label = label[label]
+        return feat, label
+
+    def _mask(self, cep):
+        """
+        Select the columns of cep listed in self.mask.
+
+        :param cep: acoustic parameters to filter, one frame per line
+
+        :return: the columns of cep whose indices are in the mask
+        :raise Exception: if the mask is empty
+        """
+        selected = self.mask
+        if not len(selected):
+            raise Exception('filter list is empty')
+        logging.debug('applied mask')
+        return cep[:, selected]
+
+    def _normalize(self, label, cep, global_mean=None, global_std=None):
+        """
+        Normalize acoustic parameters in place, over the entire session, according to
+        self.feat_norm ('cms', 'cmvn', 'stg', 'cmvn_sliding' or 'cms_sliding').
+        A None value is a no-op and any other value only logs a warning.
+
+        :param label: vad labels to use for normalization
+        :param cep: acoustic parameters to normalize
+        :param global_mean: mean vector to use if provided
+        :param global_std: standard deviation vector to use if provided
+        """
+        norm_type = self.feat_norm
+
+        if norm_type is None:
+            logging.debug('no norm')
+            return
+
+        if norm_type == 'cms':
+            logging.debug('cms norm')
+            cms(cep, label, global_mean)
+            return
+
+        if norm_type == 'cmvn':
+            logging.debug('cmvn norm')
+            cmvn(cep, label, global_mean, global_std)
+            return
+
+        if norm_type == 'stg':
+            logging.debug('stg norm')
+            stg(cep, label=label)
+            return
+
+        # The two sliding variants share a 301-frame window and differ only
+        # by the reduce flag
+        if norm_type == 'cmvn_sliding':
+            logging.debug('sliding cmvn norm')
+            cep_sliding_norm(cep, label=label, win=301, center=True, reduce=True)
+            return
+
+        if norm_type == 'cms_sliding':
+            logging.debug('sliding cms norm')
+            cep_sliding_norm(cep, label=label, win=301, center=True, reduce=False)
+            return
+
+        logging.warning('Wrong feature normalisation type')
+
+    def _delta_and_2delta(self, cep):
+        """
+        Add deltas and/or double deltas, depending on self.delta and self.double_delta.
+
+        The double deltas are the derivative of the deltas, so the deltas are computed
+        whenever one of the two options is set; they are only appended to the output
+        when self.delta is set. Asking for double deltas alone therefore no longer fails
+        on an undefined first order derivative.
+
+        :param cep: a matrix of cepstral coefficients
+
+        :return: the cepstral coefficients stacked with the requested derivatives,
+            or cep itself when none is requested
+        """
+        if not (self.delta or self.double_delta):
+            return cep
+
+        # Needed both as an output (delta) and as the input of the second derivative
+        delta = compute_delta(cep, filt=self.delta_filter)
+
+        stacked = [cep]
+        if self.delta:
+            logging.debug('add delta')
+            stacked.append(delta)
+        if self.double_delta:
+            logging.debug('add delta delta')
+            stacked.append(compute_delta(delta, filt=self.delta_filter))
+        return numpy.column_stack(stacked)
+
+    def _rasta(self, cep, label):
+        """
+        Perform RASTA filtering if self.rasta is set, otherwise return the inputs untouched.
+
+        After filtering, the two first frames are overwritten with the third one; the same
+        is done on the labels. This replacement is skipped when there are fewer than three
+        frames, or when label is None or has fewer than three entries (e.g. no VAD
+        available), instead of failing on the missing third element.
+
+        :param cep: the acoustic features to filter
+        :param label: the VAD label
+        :return: the filtered features and the VAD label
+        """
+        if not self.rasta:
+            return cep, label
+
+        logging.debug('perform RASTA %s', self.rasta)
+        cep = rasta_filt(cep)
+
+        # Frames 0 and 1 are replaced by frame 2 (needs at least three frames)
+        if cep.shape[0] > 2:
+            cep[:2, :] = cep[2, :]
+        if label is not None and len(label) > 2:
+            label[:2] = label[2]
+        return cep, label
+
+    def get_context(self, feat, start=None, stop=None, label=None):
+        """
+        Add a left and right context to each frame of a segment.
+        First and last frames are duplicated to provide context at the beginning and at the end.
+
+        :param feat: sequence of feature frames (one frame per line)
+        :param start: index of the first frame of the selected segment, default is 0
+        :param stop: index of the last frame of the selected segment, default is the number of frames
+        :param label: vad label if available
+
+        :return: a sequence of frames with their left and right context, and the labels
+            of the segment (None if no label is given)
+        """
+        nb_frames = feat.shape[0]
+        first = 0 if start is None else start
+        last = nb_frames if stop is None else stop
+
+        left = self.context[0]
+        right = self.context[1]
+        window = 1 + sum(self.context)
+
+        # Edge-pad only by the amount the requested context overflows the signal
+        pad_left = max(left - first, 0)
+        pad_right = max(last - nb_frames + right + 1, 0)
+        padded = numpy.pad(feat, ((pad_left, pad_right), (0, 0)), mode='edge')
+
+        # Indices in the padded signal are shifted by pad_left
+        segment = padded[first - left + pad_left:last + right + pad_left, :]
+        context_feat = framing(segment, win_size=window).reshape(-1, window * feat.shape[1])
+
+        context_label = None if label is None else label[first:last]
+        return context_feat, context_label
+
+    def get_traps(self, feat, start=None, stop=None, label=None):
+        """
+        Compute TRAP parameters. The input frames are concatenated to add their left and right context,
+        a Hamming window is applied and a DCT reduces the dimensionality of the resulting vector.
+
+        The amount of edge padding is clamped to zero on both sides, as in get_context:
+        numpy.pad rejects negative pad widths, which were produced for any segment starting
+        after the left context or ending before the end of the signal.
+
+        :param feat: input acoustic parameters to process
+        :param start: index of the first frame of the selected segment, default is 0
+        :param stop: index of the last frame of the selected segment, default is the number of frames
+        :param label: vad label if available
+
+        :return: a sequence of TRAP parameters and the labels of the segment
+            (None if no label is given)
+        """
+        if start is None:
+            start = 0
+        if stop is None:
+            stop = feat.shape[0]
+
+        left = self.context[0]
+        right = self.context[1]
+        window = 1 + sum(self.context)
+
+        # Pad only when the context overflows the signal
+        pad_left = max(left - start, 0)
+        pad_right = max(stop - feat.shape[0] + right + 1, 0)
+        padded = numpy.pad(feat, ((pad_left, pad_right), (0, 0)), mode='edge')
+
+        # Indices in the padded signal are shifted by pad_left
+        context_feat = framing(
+            padded[start - left + pad_left:stop + right + pad_left, :],
+            win_size=window
+        ).transpose(0, 2, 1)
+
+        # DCT basis weighted by a Hamming window over the temporal context
+        hamming_dct = (dct_basis(self.traps_dct_nb, window) * numpy.hamming(window)).T
+
+        if label is not None:
+            context_label = label[start:stop]
+        else:
+            context_label = None
+
+        return numpy.dot(
+            context_feat.reshape(-1, hamming_dct.shape[0]),
+            hamming_dct
+        ).reshape(context_feat.shape[0], -1), context_label
+
+    def load(self, show, channel=0, input_feature_filename=None, label=None, start=None, stop=None):
+        """
+        Depending on the setting of the FeaturesServer, either:
+
+        1. get the datasets from a single HDF5 file (when dataset_list is set), through
+           get_features;
+
+        2. load datasets from multiple sources through get_tandem_features.
+
+        The result is memorized: a call repeating the show, input_feature_filename, start
+        and stop of the previous call returns the previous result without loading anything.
+
+        :param show: ID of the show to load (should be the same for each HDF5 file to read from)
+        :param channel: audio channel index in case the parameters are extracted from an audio file
+        :param input_feature_filename: name of the input feature file in case it is independent from the ID of the show
+        :param label: vad labels
+        :param start: index of the first frame of the selected segment
+        :param stop: index of the last frame of the selected segment
+
+        :return: acoustic parameters and their vad labels
+        """
+        # NOTE(review): channel and label are not part of the comparison below, so a
+        # repeated call that only changes one of them gets the memorized result - confirm
+        # this is intended.
+        same_request = (self.show == show
+                        and self.input_feature_filename == input_feature_filename
+                        and self.start_stop == (start, stop))
+        if same_request and self.previous_load is not None:
+            logging.debug('return previous load')
+            return self.previous_load
+
+        # Remember the request for the next call
+        self.show = show
+        self.input_feature_filename = input_feature_filename
+        self.start_stop = (start, stop)
+
+        # When a file name is given, it replaces the filename structure of the server
+        # and is formatted with the show
+        if input_feature_filename is None:
+            feature_filename = None
+        else:
+            self.feature_filename_structure = input_feature_filename
+            feature_filename = self.feature_filename_structure.format(show)
+
+        if self.dataset_list is None:
+            logging.info('Extract tandem features from multiple sources')
+            loaded = self.get_tandem_features(show,
+                                              channel=channel,
+                                              label=label,
+                                              start=start, stop=stop)
+        else:
+            loaded = self.get_features(show,
+                                       channel=channel,
+                                       input_feature_filename=feature_filename,
+                                       label=label,
+                                       start=start, stop=stop)
+
+        self.previous_load = loaded
+        return self.previous_load
+
+    def get_features(self, show, channel=0, input_feature_filename=None, label=None, start=None, stop=None):
+        """
+        Get the datasets from a single HDF5 file.
+        The HDF5 file is loaded from disk or processed on the fly
+        via the FeaturesExtractor of the current FeaturesServer.
+        The HDF5 handle is closed before returning, whether or not reading succeeds.
+
+        :param show: ID of the show
+        :param channel: index of the channel to read
+        :param input_feature_filename: name of the input file in case it does not include the ID of the show
+        :param label: vad labels
+        :param start: index of the first frame of the selected segment
+        :param stop: index of the last frame of the selected segment
+
+        :return: acoustic parameters and their vad labels
+        """
+        # If the name of the input file is completely independent of the show
+        # (i.e. feature_filename_structure does not contain "{}"), the structure is
+        # replaced so that the name of the feature file can be given directly
+        if input_feature_filename is not None:
+            self.feature_filename_structure = input_feature_filename
+
+        # If no extractor for this source, open hdf5 file and return handler
+        if self.features_extractor is None:
+            h5f = h5py.File(self.feature_filename_structure.format(show), "r")
+
+        # If an extractor is provided for this source, extract features and return an hdf5 handler
+        else:
+            h5f = self.features_extractor.extract(show, channel, input_audio_filename=input_feature_filename)
+
+        # The handle stays open until post-processing is done so that nothing read
+        # from it is used after the file is closed
+        try:
+            feat, label, global_mean, global_std, global_cmvn = read_hdf5_segment(h5f,
+                                                                                  show,
+                                                                                  dataset_list=self.dataset_list,
+                                                                                  label=label,
+                                                                                  start=start, stop=stop,
+                                                                                  global_cmvn=self.global_cmvn)
+
+            # Post-process the features and return the features and vad label
+            if global_cmvn:
+                feat, label = self.post_processing(feat, label, global_mean, global_std)
+            else:
+                feat, label = self.post_processing(feat, label)
+        finally:
+            h5f.close()
+
+        return feat, label
+
+    def get_features_per_speaker(self, show, idmap, channel=0, input_feature_filename=None, label=None):
+        """
+        Load a single file and return a dictionary with spk_ids as keys and (feature, label) as data.
+        The HDF5 handle is closed before returning, whether or not processing succeeds.
+
+        :param show: ID of the show
+        :param idmap: object whose leftids, start and stop attributes give, for each segment,
+            the speaker ID and the frame boundaries
+        :param channel: index of the channel to read
+        :param input_feature_filename: name of the input file in case it does not include the ID of the show
+        :param label: vad labels
+        :return: a dictionary of (features, labels) tuples, one entry per speaker ID
+        """
+        if input_feature_filename is not None:
+            self.feature_filename_structure = input_feature_filename
+
+        # If no extractor for this source, open hdf5 file and return handler
+        if self.features_extractor is None:
+            h5f = h5py.File(self.feature_filename_structure.format(show), "r")
+
+        # If an extractor is provided for this source, extract features and return an hdf5 handler
+        else:
+            h5f = self.features_extractor.extract(show, channel, input_audio_filename=input_feature_filename)
+
+        try:
+            # Frame indices of each speaker, accumulated over all its segments
+            tmp_dict = dict()
+            for spk_id, start, stop in zip(idmap.leftids, idmap.start, idmap.stop):
+                if spk_id not in tmp_dict:
+                    tmp_dict[spk_id] = numpy.arange(start, stop - 1)
+                else:
+                    tmp_dict[spk_id] = numpy.concatenate((tmp_dict[spk_id], numpy.arange(start, stop - 1)), axis=0)
+
+            # The whole show is read, the segments are selected afterwards
+            feat, lbl, global_mean, global_std, global_cmvn = read_hdf5_segment(h5f,
+                                                                                show,
+                                                                                dataset_list=self.dataset_list,
+                                                                                label=label,
+                                                                                start=None, stop=None,
+                                                                                global_cmvn=self.global_cmvn)
+
+            # This extractor is only used for its _vad method below
+            fe = FeaturesExtractor(audio_filename_structure="",
+                                   feature_filename_structure=None,
+                                   sampling_frequency=16000,
+                                   lower_frequency=133.3333,
+                                   higher_frequency=6855.4976,
+                                   filter_bank="log",
+                                   filter_bank_size=40,
+                                   window_size=0.025,
+                                   shift=0.01,
+                                   ceps_number=13,
+                                   pre_emphasis=0.97,
+                                   keep_all_features=True,
+                                   vad='percentil',
+                                   save_param=["energy", "cep", "vad"]
+                                   )
+
+            feat_per_spk = dict()
+            for spk_id, spk_frames in tmp_dict.items():
+                lbl1 = copy.deepcopy(lbl)
+                feat1 = copy.deepcopy(feat)
+
+                # A threshold is estimated on the first column of the frames of this speaker
+                # (presumably the energy - TODO confirm) and applied to relabel these frames
+                _, threshold_id = fe._vad(None, feat1[spk_frames, 0], None, None)
+                label_id = feat1[:, 0] > threshold_id
+                lbl1[spk_frames] = label_id[spk_frames].flatten()
+
+                # Post-process the features and return the features and vad label
+                if global_cmvn:
+                    tmp_feat, tmp_lbl = self.post_processing(feat1[spk_frames, :], lbl1[spk_frames],
+                                                             global_mean, global_std)
+                else:
+                    tmp_feat, tmp_lbl = self.post_processing(feat1[spk_frames, :], lbl1[spk_frames])
+
+                feat_per_spk[spk_id] = (tmp_feat, tmp_lbl)
+        finally:
+            h5f.close()
+
+        return feat_per_spk
+
+    def get_tandem_features(self, show, channel=0, label=None, start=None, stop=None):
+        """
+        Read acoustic parameters from every source of the server, stack them horizontally
+        and post-process the result.
+
+        Each element of self.sources is a (features_server, get_vad) pair. When get_vad is
+        set, the labels returned by that source replace the current labels, which are also
+        the ones passed to the following sources.
+
+        :param show: Id of the show
+        :param channel: index of the channel
+        :param label: vad labels
+        :param start: index of the first frame of the selected segment
+        :param stop: index of the last frame of the selected segment
+
+        :return: acoustic parameters and their vad labels
+        """
+        per_source = []
+        for source_server, use_its_vad in self.sources:
+            # Get features from this source
+            feat, source_label = source_server.get_features(show,
+                                                            channel=channel,
+                                                            label=label,
+                                                            start=start,
+                                                            stop=stop)
+            if use_its_vad:
+                label = source_label
+            per_source.append(feat)
+
+        tandem = numpy.hstack(per_source)
+
+        # Without any label, every frame (counted on the last source) is marked True
+        if label is None:
+            label = numpy.ones(feat.shape[0], dtype='bool')
+
+        # Apply the final post-processing on the concatenated features
+        return self.post_processing(tandem, label)
+
+    def mean_std(self, show, channel=0, start=None, stop=None):
+        """
+        Compute the statistics needed to derive the mean and standard deviation vectors
+        of a segment of acoustic features. The sums are returned as they are, they are
+        not divided by the number of frames.
+
+        :param show: the ID of the show
+        :param channel: the index of the channel
+        :param start: index of the first frame of the selected segment
+        :param stop: index of the last frame of the selected segment
+
+        :return: the number of frames, the sum of the frames and the sum of their squares
+        """
+        frames, _ = self.load(show, channel=channel, start=start, stop=stop)
+        frame_count = frames.shape[0]
+        first_order = frames.sum(axis=0)
+        second_order = numpy.sum(frames**2, axis=0)
+        return frame_count, first_order, second_order
+
+    def stack_features(self,
+                       show_list,
+                       channel_list=None,
+                       feature_filename_list=None,
+                       label_list=None,
+                       start_list=None,
+                       stop_list=None):
+        """
+        Load acoustic features from a list of files and return them stacked in a 2D-array,
+        one line per frame. Files are loaded one after the other with load.
+
+        :param show_list: list of IDs of the shows to load
+        :param channel_list: list of channel indices, one per show, default is 0 for every show
+        :param feature_filename_list: list of input feature file names, one per show, default is None for every show
+        :param label_list: list of vad labels, one per show, default is None for every show
+        :param start_list: list of first frame indices, one per show, default is None for every show
+        :param stop_list: list of last frame indices, one per show, default is None for every show
+        :return: the features of all shows stacked vertically
+        """
+        nb_shows = len(show_list)
+
+        def _all_none():
+            """Return an object array holding one None per show."""
+            return numpy.empty(nb_shows, dtype='|O')
+
+        if channel_list is None:
+            channel_list = numpy.zeros(nb_shows)
+        if feature_filename_list is None:
+            feature_filename_list = _all_none()
+        if label_list is None:
+            label_list = _all_none()
+        if start_list is None:
+            start_list = _all_none()
+        if stop_list is None:
+            stop_list = _all_none()
+
+        requests = zip(show_list, channel_list, feature_filename_list, label_list, start_list, stop_list)
+
+        stacked = []
+        for position, request in enumerate(requests, start=1):
+            logging.info("load file {} / {}".format(position, len(show_list)))
+            # load returns (features, labels), only the features are kept
+            features, _ = self.load(*request)
+            stacked.append(features)
+
+        return numpy.vstack(stacked)
+
+
+    def _stack_features_worker(self,
+                               input_queue,
+                               output_queue):
+        """Load the features of every task read from a queue and put them in another queue
+
+        Each task is the tuple of arguments given to load. A None task stops the worker,
+        which then puts None in the output queue to mark the end of its results.
+
+        :param input_queue: a joinable Queue object providing the tasks
+        :param output_queue: a Queue object to fill with the loaded features
+        """
+        task = input_queue.get()
+        while task is not None:
+            # Only the features are forwarded, not the labels
+            output_queue.put(self.load(*task)[0])
+            input_queue.task_done()
+            task = input_queue.get()
+
+        # Poison pill means shutdown
+        output_queue.put(None)
+        input_queue.task_done()
+
+
+ #@profile
+    def stack_features_parallel(self,
+                                show_list,
+                                channel_list=None,
+                                feature_filename_list=None,
+                                label_list=None,
+                                start_list=None,
+                                stop_list=None,
+                                num_thread=1):
+        """Load a list of feature files and stack them in a unique ndarray.
+        The tasks are put in a shared queue consumed by num_thread worker processes
+        running _stack_features_worker; each worker has its own output queue.
+
+        The results are collected worker by worker, so the order of the stacked
+        features is not necessarily the order of show_list.
+
+        :param show_list: list of IDs of the shows to load
+        :param channel_list: list of channel indices, one per show, default is 0 for every show
+        :param feature_filename_list: list of input feature file names, one per show, default is None for every show
+        :param label_list: list of vad labels, one per show, default is None for every show
+        :param start_list: list of first frame indices, one per show, default is None for every show
+        :param stop_list: list of last frame indices, one per show, default is None for every show
+        :param num_thread: number of worker processes (optional, default is 1)
+        :return: the loaded features concatenated along the first axis
+        """
+        if channel_list is None:
+            channel_list = numpy.zeros(len(show_list))
+        if feature_filename_list is None:
+            feature_filename_list = numpy.empty(len(show_list), dtype='|O')
+        if label_list is None:
+            label_list = numpy.empty(len(show_list), dtype='|O')
+        if start_list is None:
+            start_list = numpy.empty(len(show_list), dtype='|O')
+        if stop_list is None:
+            stop_list = numpy.empty(len(show_list), dtype='|O')
+
+        # The input queue is sized to hold every task plus one stop marker per worker
+        #queue_in = Queue.Queue(maxsize=len(fileList)+numThread)
+        queue_in = multiprocessing.JoinableQueue(maxsize=len(show_list)+num_thread)
+        queue_out = []
+
+        # Start worker processes, each one with its own output queue
+        jobs = []
+        for i in range(num_thread):
+            queue_out.append(multiprocessing.Queue())
+            p = multiprocessing.Process(target=self._stack_features_worker,
+                                        args=(queue_in, queue_out[i]))
+            jobs.append(p)
+            p.start()
+
+        # Submit tasks: one tuple of load arguments per show
+        for task in zip(show_list, channel_list, feature_filename_list,
+                        label_list, start_list, stop_list):
+            queue_in.put(task)
+
+        # Add None to the queue to kill the workers (one per worker)
+        for task in range(num_thread):
+            queue_in.put(None)
+
+        # Wait for all the tasks to finish
+        queue_in.join()
+
+        # Drain each output queue up to the None put by its worker on shutdown
+        output = []
+        for q in queue_out:
+            while True:
+                data = q.get()
+                if data is None:
+                    break
+                output.append(data)
+
+        # The processes are joined only once their queues have been emptied
+        for p in jobs:
+            p.join()
+        return numpy.concatenate(output, axis=0)
+
+
diff --git a/sidekit/sidekit/frontend/__init__.py b/sidekit/sidekit/frontend/__init__.py
new file mode 100755
index 0000000..821b1d3
--- /dev/null
+++ b/sidekit/sidekit/frontend/__init__.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see http://www.gnu.org/licenses/.
+
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+:mod:`frontend` provides methods to process an audio signal in order to extract
+useful parameters for speaker verification.
+"""
+
+from .io import write_pcm
+from .io import read_pcm
+from .io import pcmu2lin
+from .io import read_sph
+from .io import write_label
+from .io import read_label
+from .io import read_spro4
+from .io import read_audio
+from .io import write_spro4
+from .io import read_htk
+from .io import write_htk
+from .io import read_hdf5_segment
+from .io import write_hdf5
+from .io import read_hdf5
+
+from .vad import vad_snr
+from .vad import label_fusion
+from .vad import speech_enhancement
+
+
+from .normfeat import cms
+from .normfeat import cmvn
+from .normfeat import stg
+from .normfeat import rasta_filt
+from .normfeat import cep_sliding_norm
+
+
+from .features import compute_delta
+from .features import framing
+from .features import pre_emphasis
+from .features import trfbank
+from .features import mel_filter_bank
+from .features import mfcc
+from .features import pca_dct
+from .features import shifted_delta_cepstral
+
+
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
diff --git a/sidekit/sidekit/frontend/features.py b/sidekit/sidekit/frontend/features.py
new file mode 100755
index 0000000..4ee2ed7
--- /dev/null
+++ b/sidekit/sidekit/frontend/features.py
@@ -0,0 +1,1126 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+:mod:`frontend` provides methods to process an audio signal in order to extract
+useful parameters for speaker verification.
+"""
+
+import numpy
+import numpy.matlib
+import scipy
+from scipy.fftpack.realtransforms import dct
+from .vad import pre_emphasis
+import numpy.matlib
+
# dtype passed to the numpy allocations (filter banks, spectra, deltas...)
# made by the functions of this module.
PARAM_TYPE = numpy.float32

# Module-level metadata: authorship, licensing, maintainer contact and
# the markup format used by the docstrings of this module.
__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
+
+
def hz2mel(f, htk=True):
    """Convert frequencies in Hz into mel.

    :param f: frequency (scalar or array) to convert, in Hertz
    :param htk: if True (default) use the HTK formula
        ``2595 * log10(1 + f / 700)``; otherwise use the piecewise
        linear / logarithmic curve matching Slaney's Auditory Toolbox mfcc.m

    :return: the equivalence on the mel scale. With ``htk=False`` an input
        holding exactly one element in a 1-D array is returned as a scalar.
    """
    if htk:
        return 2595 * numpy.log10(1 + f / 700.)

    # Force floating point: the output is allocated with zeros_like(f), so an
    # integer input would otherwise silently truncate every mel value.
    f = numpy.array(f, dtype=float)

    # Mel fn to match Slaney's Auditory Toolbox mfcc.m
    f_0 = 0.
    f_sp = 200. / 3.
    brkfrq = 1000.
    brkpt = (brkfrq - f_0) / f_sp
    logstep = numpy.exp(numpy.log(6.4) / 27)

    # Below the break frequency the scale is linear, above it is logarithmic
    linpts = f < brkfrq

    z = numpy.zeros_like(f)
    # fill in parts separately
    z[linpts] = (f[linpts] - f_0) / f_sp
    z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)

    if z.shape == (1,):
        return z[0]
    return z
+
+
def mel2hz(z, htk=True):
    """Convert mel values back into frequencies in Hertz.

    :param z: mel value(s) to convert (scalar or array)
    :param htk: if True (default) invert the HTK formula, otherwise invert
        the piecewise linear / logarithmic curve (linear below 1000 Hz)

    :return: the equivalent values in Hertz. With ``htk=False`` an input
        holding exactly one element in a 1-D array is returned as a scalar.
    """
    if htk:
        return 700. * (10**(z / 2595.) - 1)

    mels = numpy.array(z, dtype=float)

    # Constants of the piecewise curve
    origin_hz = 0
    lin_step = 200. / 3.
    break_hz = 1000.
    break_mel = (break_hz - origin_hz) / lin_step
    log_step = numpy.exp(numpy.log(6.4) / 27)

    is_linear = mels < break_mel
    is_log = ~is_linear

    hertz = numpy.zeros_like(mels)
    # Each region of the curve is filled independently
    hertz[is_linear] = origin_hz + lin_step * mels[is_linear]
    hertz[is_log] = break_hz * numpy.exp(numpy.log(log_step) * (mels[is_log] - break_mel))

    return hertz[0] if hertz.shape == (1,) else hertz
+
+
def hz2bark(f):
    """
    Map frequencies expressed in Hertz onto the Bark scale.

    :param f: the input frequency (scalar or numpy array), in Hertz
    :return: ``6 * arcsinh(f / 600)``, the corresponding Bark value(s)
    """
    scaled = f / 600.
    return 6. * numpy.arcsinh(scaled)
+
+
def bark2hz(z):
    """
    Map Bark values back to frequencies expressed in Hertz.

    :param z: the Bark value(s) (scalar or numpy array)
    :return: ``600 * sinh(z / 6)``, the corresponding frequency in Hertz
    """
    scaled = z / 6.
    return 600. * numpy.sinh(scaled)
+
+
def compute_delta(features,
                  win=3,
                  method='filter',
                  filt=numpy.array([.25, .5, .25, 0, -.25, -.5, -.25])):
    """Compute delta coefficients of a stream of feature frames.

    features is a 2D-ndarray, each row of features is a frame.

    :param features: the feature frames to compute the delta coefficients
    :param win: parameter that sets the length of the computation window.
            The size of the window is (win x 2) + 1
    :param method: method used to compute the delta coefficients,
        can be diff or filter
    :param filt: definition of the filter to use in "filter" mode; it is
        expected to hold (win x 2) + 1 taps. It is ignored in "diff" mode,
        where a filter with -1 as first tap and 1 as last tap is used.

    :return: the delta coefficients computed on the original features,
        one row per input frame.
    """
    # First and last features are appended to the beginning and the end of the
    # stream to avoid border effect
    x = numpy.zeros((features.shape[0] + 2 * win, features.shape[1]), dtype=PARAM_TYPE)
    x[:win, :] = features[0, :]
    x[win:-win, :] = features
    x[-win:, :] = features[-1, :]

    delta = numpy.zeros(x.shape, dtype=PARAM_TYPE)

    if method == 'diff':
        filt = numpy.zeros(2 * win + 1, dtype=PARAM_TYPE)
        filt[0] = -1
        filt[-1] = 1

    for i in range(features.shape[1]):
        # Filter the edge-padded stream (not the raw features) so that the
        # replicated border frames are actually used. With a (2 * win + 1)-tap
        # filter, 'same' mode keeps the output aligned with x, and frames away
        # from the borders get the same value as a full convolution of the
        # unpadded features.
        delta[:, i] = numpy.convolve(x[:, i], filt, mode='same')

    return delta[win:-win, :]
+
+
def pca_dct(cep, left_ctx=12, right_ctx=12, p=None):
    """Apply DCT PCA as in [McLaren 2015] paper:
    Mitchell McLaren and Yun Lei, 'Improved Speaker Recognition
    Using DCT coefficients as features' in ICASSP, 2015

    The stream is extended by repeating its first frame ``left_ctx`` times
    and its last frame ``right_ctx`` times, a 1D-DCT is applied along time
    on every sliding window of ``left_ctx + 1 + right_ctx`` frames, the
    result is flattened per frame and finally multiplied by ``p``.

    :param cep: a matrix of cepstral coefficients, 1 line per feature vector
    :param left_ctx: number of frames to consider for left context
    :param right_ctx: number of frames to consider for right context
    :param p: a PCA matrix trained on a development set to reduce the
       dimension of the features; the identity is used when None

    :return: one row of projected DCT coefficients per input frame
    """
    n_dim = cep.shape[1]
    ctx_len = left_ctx + 1 + right_ctx

    # Replicate the boundary frames so every frame has a full context
    head = numpy.resize(cep[0, :], (left_ctx, n_dim))
    tail = numpy.resize(cep[-1, :], (right_ctx, n_dim))
    padded = numpy.r_[head, cep, tail]

    # (frames, coefficients, time) so that the last axis is the context
    windows = framing(padded, win_size=ctx_len).transpose(0, 2, 1)
    basis = dct_basis(ctx_len, ctx_len).T

    if p is None:
        p = numpy.eye(basis.shape[0] * n_dim, dtype=PARAM_TYPE)

    projected = numpy.dot(windows.reshape(-1, basis.shape[0]), basis)
    return projected.reshape(windows.shape[0], -1).dot(p)
+
+
def shifted_delta_cepstral(cep, d=1, p=3, k=7):
    """
    Compute the Shifted-Delta-Cepstral features for language identification

    :param cep: matrix of feature, 1 vector per line
    :param d: represents the time advance and delay for the delta computation
    :param p: time shift between consecutive blocks.
    :param k: number of delta-cepstral blocks whose delta-cepstral
       coefficients are stacked to form the final feature vector

    :return: cepstral coefficients concatenated with shifted deltas
    """
    # The last frame needs deltas up to (k - 1) * p frames ahead, so the tail
    # padding has to scale with the block shift p (it used to assume p == 3,
    # which made larger shifts index past the end of the delta stream).
    y = numpy.r_[numpy.resize(cep[0, :], (d, cep.shape[1])),
                 cep,
                 numpy.resize(cep[-1, :], (k * p + d, cep.shape[1]))]

    delta = compute_delta(y, win=d, method='diff')
    sdc = numpy.empty((cep.shape[0], cep.shape[1] * k))

    # Boolean mask selecting the k delta frames stacked for the first frame
    idx = numpy.zeros(delta.shape[0], dtype='bool')
    for ii in range(k):
        idx[d + ii * p] = True

    for ff in range(len(cep)):
        sdc[ff, :] = delta[idx, :].reshape(1, -1)
        # Slide the selection by one frame for the next feature vector
        idx = numpy.roll(idx, 1)
    return numpy.hstack((cep, sdc))
+
+
def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
    """Compute triangular filterbank for cepstral coefficient computation.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param nlinfilt: number of linear filters to use in low frequencies
    :param nlogfilt: number of log-linear filters to use in high frequencies
    :param midfreq: frequency boundary between linear and log-linear filters

    :return: the filter bank, one row per filter and ``nfft // 2 + 1``
        columns, and the array of edge/center frequencies of the filters
    """
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Compute start/middle/end points of the triangular filters in spectral
    # domain
    frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
    if nlogfilt == 0:
        linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
    elif nlinfilt == 0:
        low_mel = hz2mel(lowfreq)
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2)
        melsc = (max_mel - low_mel) / (nfilt + 1)
        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences = mel2hz(mels)
    else:
        # Compute linear filters on [lowfreq; midfreq]
        linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
        # Compute log-linear filters on [midfreq; maxfreq]; the boundary is
        # the midfreq parameter (it used to be hard-coded to its default)
        low_mel = hz2mel(min([midfreq, maxfreq]))
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
        melsc = (max_mel - low_mel) / (nlogfilt + 1)

        # Verify that mel2hz(melsc)>linsc
        while mel2hz(melsc) < linsc:
            # in this case, we add a linear filter
            nlinfilt += 1
            nlogfilt -= 1
            frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
            low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
            max_mel = hz2mel(maxfreq)
            mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
            melsc = (max_mel - low_mel) / (nlogfilt + 1)

        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences[nlinfilt:] = mel2hz(mels)

    # Height of each triangle: 2 / (width of its support)
    heights = 2. / (frequences[2:] - frequences[0:-2])

    # Compute filterbank coeff (in fft domain, in bins)
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    n_frequences = numpy.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = frequences[i]
        cen = frequences[i + 1]
        hi = frequences[i + 2]

        # Built-in int replaces the numpy.int alias, removed from recent NumPy
        lid = numpy.arange(numpy.floor(low * nfft / fs) + 1,
                           numpy.floor(cen * nfft / fs) + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,
                           min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)
        fbank[i][lid] = left_slope * (n_frequences[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])

    return fbank, frequences
+
+
def mel_filter_bank(fs, nfft, lowfreq, maxfreq, widest_nlogfilt, widest_lowfreq, widest_maxfreq,):
    """Compute triangular filterbank for cepstral coefficient computation.

    A mel-spaced set of filter edges is first laid out on the widest band
    ``[widest_lowfreq; widest_maxfreq]``; only the edges falling inside
    ``[lowfreq; maxfreq]`` are then kept to build the filters.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param widest_nlogfilt: number of log filters
    :param widest_lowfreq: lower frequency of the filter bank
    :param widest_maxfreq: higher frequency of the filter bank

    :return: the filter bank and the edge/center frequencies kept in the
        narrow band
    :raise ValueError: if fewer than three filter edges fall in the band
    """
    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Compute start/middle/end points of the triangular filters in spectral
    # domain
    low_mel = hz2mel(widest_lowfreq)
    max_mel = hz2mel(widest_maxfreq)
    mels = numpy.zeros(widest_nlogfilt + 2)
    melsc = (max_mel - low_mel) / (widest_nlogfilt + 1)
    mels[:widest_nlogfilt + 2] = low_mel + numpy.arange(widest_nlogfilt + 2) * melsc
    # Back to the frequency domain
    widest_freqs = mel2hz(mels)

    # Select filters in the narrow band
    sub_band_freqs = numpy.array([fr for fr in widest_freqs if lowfreq <= fr <= maxfreq],
                                 dtype=PARAM_TYPE)

    nfilt = sub_band_freqs.shape[0] - 2
    if nfilt < 1:
        raise ValueError("no complete filter lies between lowfreq and maxfreq")
    heights = 2. / (sub_band_freqs[2:] - sub_band_freqs[0:-2])

    # Compute filterbank coeff (in fft domain, in bins); the number of columns
    # is converted to int on its own (a tuple has no astype method)
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    nfreqs = numpy.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = sub_band_freqs[i]
        cen = sub_band_freqs[i + 1]
        hi = sub_band_freqs[i + 2]

        # Built-in int replaces the numpy.int alias, removed from recent NumPy
        lid = numpy.arange(numpy.floor(low * nfft / fs) + 1,
                           numpy.floor(cen * nfft / fs) + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,
                           min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)
        fbank[i][lid] = left_slope * (nfreqs[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - nfreqs[rid[:-1]])

    return fbank, sub_band_freqs
+
+
def power_spectrum(input_sig,
                   fs=8000,
                   win_time=0.025,
                   shift=0.01,
                   prefac=0.97):
    """
    Compute the power spectrum of the signal, frame by frame.

    :param input_sig: the signal to analyse
    :param fs: sampling frequency of the signal
    :param win_time: duration of the analysis window, in seconds
    :param shift: shift between two consecutive windows, in seconds
    :param prefac: coefficient handed to the pre-emphasis filter
    :return: a tuple made of the power spectrum (one row per frame,
        ``n_fft / 2 + 1`` columns, n_fft being the first power of two not
        smaller than the window length) and of the log-energy of each frame,
        computed after pre-emphasis and before windowing
    """
    frame_len = int(round(win_time * fs))
    hop = int(shift * fs)
    frames = framing(input_sig, frame_len, win_shift=hop).copy()
    # Pre-emphasis filtering is applied after framing to be consistent with stream processing
    frames = pre_emphasis(frames, prefac)
    n_frames = frames.shape[0]
    n_fft = 2 ** int(numpy.ceil(numpy.log2(frame_len)))
    # A Hanning window is used, it is supposed to have less noisy sidelobes
    taper = numpy.hanning(frame_len)

    spec = numpy.ones((n_frames, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
    log_energy = numpy.log((frames**2).sum(axis=1))

    # The FFT is computed by blocks of frames to bound the size of temporaries
    block = 500000
    for first in range(0, n_frames, block):
        last = min(first + block, n_frames)
        windowed = frames[first:last, :] * taper
        fourier = numpy.fft.rfft(windowed, n_fft, axis=-1)
        spec[first:last, :] = fourier.real**2 + fourier.imag**2

    return spec, log_energy
+
+
def mfcc(input_sig,
         lowfreq=100, maxfreq=8000,
         nlinfilt=0, nlogfilt=24,
         nwin=0.025,
         fs=16000,
         nceps=13,
         shift=0.01,
         get_spec=False,
         get_mspec=False,
         prefac=0.97):
    """Compute Mel Frequency Cepstral Coefficients.

    :param input_sig: input signal from which the coefficients are computed.
            Input audio is supposed to be RAW PCM 16bits
    :param lowfreq: lower limit of the frequency band filtered.
            Default is 100Hz.
    :param maxfreq: higher limit of the frequency band filtered.
            Default is 8000Hz.
    :param nlinfilt: number of linear filters to use in low frequencies.
            Default is 0.
    :param nlogfilt: number of log-linear filters to use in high frequencies.
            Default is 24.
    :param nwin: length of the sliding window in seconds
            Default is 0.025.
    :param fs: sampling frequency of the original signal. Default is 16000Hz.
    :param nceps: number of cepstral coefficients to extract.
            Default is 13.
    :param shift: shift between two analyses. Default is 0.01 (10ms).
    :param get_spec: boolean, if true returns the spectrogram
    :param get_mspec: boolean, if true returns the output of the filter banks
    :param prefac: pre-emphasis filter value

    :return: a list of four items: the cepstral coefficients, the
        log-energy of each frame, the power spectrum (None unless get_spec)
        and the log filter-bank output (None unless get_mspec).

    .. note:: MFCC are computed as follows:

            - Framing, then pre-emphasis and windowing (done by
              power_spectrum, which applies a Hanning window)
            - Filter the power spectrum with a triangular filter-bank
            - Compute the DCT of the log filter-bank output and drop the
              C0 term
            - Log-energy is returned separately, as second item of the list.

            For more details, refer to [Davis80]_.
    """
    # Compute power spectrum
    spec, log_energy = power_spectrum(input_sig,
                                      fs,
                                      win_time=nwin,
                                      shift=shift,
                                      prefac=prefac)

    # Filter the spectrum through the triangle filter-bank
    window_length = int(round(nwin * fs))
    n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
    fbank, _ = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)
    mspec = numpy.log(numpy.dot(spec, fbank.T))

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    # The C0 term is removed as it is the constant term
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]

    return [ceps,
            log_energy,
            spec if get_spec else None,
            mspec if get_mspec else None]
+
+
def fft2barkmx(n_fft, fs, nfilts=0, width=1., minfreq=0., maxfreq=8000):
    """
    Generate a matrix of weights to combine FFT bins into Bark
    bins. n_fft defines the source FFT size at sampling rate fs.
    Optional nfilts specifies the number of output bands required
    (else one per bark), and width is the constant width of each
    band in Bark (default 1).
    While wts has n_fft columns, the second half are all zero.
    Hence, Bark spectrum is fft2barkmx(n_fft,fs) * abs(fft(xincols, n_fft));
    2004-09-05  dpwe@ee.columbia.edu  based on rastamat/audspec.m

    :param n_fft: the source FFT size at sampling rate fs
    :param fs: sampling rate
    :param nfilts: number of output bands required, 0 means one per Bark
    :param width: constant width of each band in Bark (default 1)
    :param minfreq: lowest frequency covered, in Hz
    :param maxfreq: highest frequency covered, in Hz (clipped to fs / 2)
    :return: a matrix of weights to combine FFT bins into Bark bins
    """
    maxfreq = min(maxfreq, fs / 2.)

    min_bark = hz2bark(minfreq)
    nyqbark = hz2bark(maxfreq) - min_bark

    if nfilts == 0:
        nfilts = numpy.ceil(nyqbark) + 1
    # numpy.ceil returns a float: shapes and range() need a true integer
    nfilts = int(nfilts)

    wts = numpy.zeros((nfilts, n_fft))

    # bark per filt
    step_barks = nyqbark / (nfilts - 1)

    # Frequency of each FFT bin in Bark (non-negative frequencies only)
    n_bins = n_fft // 2 + 1
    binbarks = hz2bark(numpy.arange(n_bins) * fs / n_fft)

    for i in range(nfilts):
        f_bark_mid = min_bark + i * step_barks
        # Linear slopes in log-space (i.e. dB) intersect to trapezoidal window
        lof = (binbarks - f_bark_mid - 0.5)
        hif = (binbarks - f_bark_mid + 0.5)
        wts[i, :n_bins] = 10 ** (numpy.minimum(numpy.zeros_like(hif),
                                               numpy.minimum(hif, -2.5 * lof) / width))

    return wts
+
+
def fft2melmx(n_fft,
              fs=8000,
              nfilts=0,
              width=1.,
              minfreq=0,
              maxfreq=4000,
              htkmel=False,
              constamp=False):
    """
    Generate a matrix of weights to combine FFT bins into Mel
    bins. n_fft defines the source FFT size at sampling rate fs.
    Optional nfilts specifies the number of output bands required
    (else one per "mel/width"), and width is the constant width of each
    band relative to standard Mel (default 1).
    While wts has n_fft columns, the second half are all zero.
    Hence, Mel spectrum is fft2melmx(n_fft,fs)*abs(fft(xincols,n_fft));
    minfreq is the frequency (in Hz) of the lowest band edge;
    default is 0, but 133.33 is a common standard (to skip LF).
    maxfreq is frequency in Hz of upper edge; it is clipped to fs/2.
    You can exactly duplicate the mel matrix in Slaney's mfcc.m
    as fft2melmx(512, 8000, 40, 1, 133.33, 6855.5, 0);
    htkmel=1 means use HTK's version of the mel curve, not Slaney's.
    constamp=1 means make integration windows peak at 1, not sum to 1.

    % 2004-09-05  dpwe@ee.columbia.edu  based on fft2barkmx

    :param n_fft: the source FFT size at sampling rate fs
    :param fs: sampling rate
    :param nfilts: number of output bands, 0 selects a default derived
        from maxfreq
    :param width: width of each band relative to standard Mel
    :param minfreq: frequency in Hz of the lowest band edge
    :param maxfreq: frequency in Hz of the upper band edge
    :param htkmel: use HTK's mel curve instead of Slaney's
    :param constamp: make integration windows peak at 1, not sum to 1
    :return: the matrix of weights (nfilts x n_fft) and the nfilts + 2
        band edge/center frequencies in Hz
    """
    # Floats keep the Slaney mel conversion away from integer arithmetic
    minfreq = float(minfreq)
    maxfreq = float(min(maxfreq, fs / 2.))

    if nfilts == 0:
        nfilts = numpy.ceil(hz2mel(maxfreq, htkmel) / 2.)
    # numpy.ceil returns a float: shapes and range() need a true integer
    nfilts = int(nfilts)

    wts = numpy.zeros((nfilts, n_fft))

    # Center freqs of each non-negative FFT bin
    n_bins = n_fft // 2 + 1
    fftfrqs = numpy.arange(n_bins) / n_fft * fs

    # 'Center freqs' of mel bands - uniformly spaced between limits
    minmel = hz2mel(minfreq, htkmel)
    maxmel = hz2mel(maxfreq, htkmel)
    binfrqs = mel2hz(minmel + numpy.arange(nfilts + 2) / (nfilts + 1) * (maxmel - minmel), htkmel)

    for i in range(nfilts):
        _fs = binfrqs[i:i + 3]
        # scale by width
        _fs = _fs[1] + width * (_fs - _fs[1])
        # lower and upper slopes for all bins
        loslope = (fftfrqs - _fs[0]) / (_fs[1] - _fs[0])
        hislope = (_fs[2] - fftfrqs) / (_fs[2] - _fs[1])

        # Bin b of the FFT is column b (0-based); the weight is the
        # triangle formed by the two slopes, floored at zero
        wts[i, :n_bins] = numpy.maximum(numpy.zeros_like(loslope),
                                        numpy.minimum(loslope, hislope))

    if not constamp:
        # Slaney-style mel is scaled to be approx constant E per channel
        scale = 2. / (binfrqs[2:nfilts + 2] - binfrqs[:nfilts])
        wts = wts * scale[:, numpy.newaxis]

    # Make sure 2nd half of FFT is zero
    wts[:, n_bins:n_fft] = 0

    return wts, binfrqs
+
+
def audspec(power_spectrum,
            fs=16000,
            nfilts=None,
            fbtype='bark',
            minfreq=0,
            maxfreq=8000,
            sumpower=True,
            bwidth=1.):
    """
    Map a power spectrum onto auditory (Bark or Mel) frequency bands.

    :param power_spectrum: power spectrum, one frame per row and one
        column per non-negative FFT bin
    :param fs: sampling frequency
    :param nfilts: number of bands; by default one per Bark up to fs / 2,
        plus one
    :param fbtype: type of filter bank: 'bark', 'mel', 'htkmel' or 'fcmel'
    :param minfreq: lowest frequency of the filter bank, in Hz
    :param maxfreq: highest frequency of the filter bank, in Hz
    :param sumpower: if True the bands integrate the power, otherwise they
        integrate the magnitude and the result is squared
    :param bwidth: width of the bands, handed to the filter bank builder
    :return: the auditory spectrum (frames x bands) and the matrix of
        weights used, restricted to the bins present in power_spectrum
    :raise ValueError: if fbtype is not one of the supported values
    """
    if nfilts is None:
        nfilts = int(numpy.ceil(hz2bark(fs / 2)) + 1)

    if not fs == 16000:
        maxfreq = min(fs / 2, maxfreq)

    nframes, nfreqs = power_spectrum.shape
    n_fft = (nfreqs - 1) * 2

    if fbtype == 'bark':
        wts = fft2barkmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq)
    elif fbtype == 'mel':
        # fft2melmx returns (weights, band frequencies): keep the weights
        wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq)[0]
    elif fbtype == 'htkmel':
        wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq, True, True)[0]
    elif fbtype == 'fcmel':
        wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq, True, False)[0]
    else:
        # Fail explicitly rather than continue with undefined weights
        raise ValueError('fbtype {} not recognized'.format(fbtype))

    # Only the non-negative frequency bins are available in the input
    wts = wts[:, :nfreqs]

    if sumpower:
        audio_spectrum = power_spectrum.dot(wts.T)
    else:
        audio_spectrum = numpy.dot(numpy.sqrt(power_spectrum), wts.T)**2

    return audio_spectrum, wts
+
+
def postaud(x, fmax, fbtype='bark', broaden=0):
    """
    do loudness equalization and cube root compression

    :param x: auditory spectrum, one frame per row and one band per column
    :param fmax: highest frequency covered by the bands, in Hz
    :param fbtype: scale of the bands: 'bark', 'mel', 'htkmel' or 'fcmel'
    :param broaden: 1 if one extra band must be considered at each extreme
        of the scale, 0 otherwise
    :return: the equalized and compressed spectrum, whose first and last
        bands are replaced by copies of their neighbours, and the
        equal-loudness weights applied to the bands
    :raise ValueError: if fbtype is not one of the supported values
    """
    nframes, nbands = x.shape

    # Include frequency points at extremes, discard later
    nfpts = nbands + 2 * broaden
    fmax = float(fmax)

    # Center frequencies of the bands, uniformly spaced on the chosen scale
    if fbtype == 'bark':
        bandcfhz = bark2hz(numpy.linspace(0, hz2bark(fmax), num=nfpts))
    elif fbtype == 'mel':
        # Slaney mel curve, on the mel scale (it used to be laid out with
        # hz2bark, mixing two different scales)
        bandcfhz = mel2hz(numpy.linspace(0, hz2mel(fmax, htk=False), num=nfpts), htk=False)
    elif fbtype == 'htkmel' or fbtype == 'fcmel':
        bandcfhz = mel2hz(numpy.linspace(0, hz2mel(fmax, htk=True), num=nfpts), htk=True)
    else:
        # Fail explicitly rather than continue with undefined frequencies
        raise ValueError('unknown fbtype {}'.format(fbtype))

    # Remove extremal bands (the ones that will be duplicated)
    bandcfhz = bandcfhz[broaden:(nfpts - broaden)]

    # Hynek's magic equal-loudness-curve formula
    fsq = bandcfhz ** 2
    ftmp = fsq + 1.6e5
    eql = ((fsq / ftmp) ** 2) * ((fsq + 1.44e6) / (fsq + 9.61e6))

    # weight the critical bands (the weights broadcast over the frames)
    z = eql[numpy.newaxis, :] * x

    # cube root compress
    z = z ** .33

    # replicate first and last band (because they are unreliable as calculated)
    if broaden == 1:
        y = z[:, numpy.hstack((0, numpy.arange(nbands), nbands - 1))]
    else:
        y = z[:, numpy.hstack((1, numpy.arange(1, nbands - 1), nbands - 2))]

    return y, eql
+
+
def dolpc(x, model_order=8):
    """
    Compute an autoregressive model from spectral magnitude samples.

    The autocorrelation of each frame is obtained as the real part of the
    inverse FFT of its spectrum mirrored around the last band; the LPC
    coefficients are then found by Levinson-Durbin recursion.

    :param x: spectral samples, one frame per row and one band per column
    :param model_order: order of the autoregressive model
    :return: one row per frame holding a leading one followed by the
        model_order coefficients, the whole row being divided by the
        prediction error of the frame
    """
    nframes, nbands = x.shape

    # Mirror the inner bands to build an even, symmetric spectrum
    mirrored = x[:, numpy.arange(nbands - 2, 0, -1)]
    r = numpy.real(numpy.fft.ifft(numpy.hstack((x, mirrored))))

    # First half only
    r = r[:, :nbands]

    y_lpc = numpy.ones((r.shape[0], model_order + 1))

    # Find LPC coeffs by Levinson-Durbin recursion, frame by frame
    for ff in range(r.shape[0]):
        coeffs, err, _ = levinson(r[ff, :-1].T, order=model_order, allow_singularity=True)
        y_lpc[ff, 1:] = coeffs
        # Normalize each poly by gain
        y_lpc[ff, :] /= err

    return y_lpc
+
+
def lpc2cep(a, nout=None):
    """
    Convert the LPC 'a' coefficients of each row of a into frames of cepstra.
    nout is number of cepstra to produce, defaults to the number of columns
    of a.
    2003-04-11  dpwe@ee.columbia.edu

    :param a: LPC coefficients, one frame per row; the first column holds
        the term the other coefficients are normalized by. The array is
        left untouched.
    :param nout: number of cepstral coefficients to produce per frame
    :return: the cepstra, one frame per row and nout columns
    """
    a = numpy.asarray(a)
    ncol, nin = a.shape

    order = nin - 1

    if nout is None:
        nout = order + 1

    c = numpy.zeros((ncol, nout))

    # First cep is log(Error) from Durbin
    c[:, 0] = -numpy.log(a[:, 0])

    # Renormalize lpc A coeffs into a new array: an in-place division would
    # overwrite the first column of the caller's matrix with ones
    a = a / a[:, [0]]

    for n in range(1, nout):
        acc = 0
        for m in range(1, n):
            acc += (n - m) * a[:, m] * c[:, n - m]
        c[:, n] = -(a[:, n] + acc / n)

    return c
+
+
def lpc2spec(lpcas, nout=17):
    """
    Convert LPC coeffs back into spectra
    nout is number of freq channels, default 17 (i.e. for 8 kHz)

    :param lpcas: LPC coefficients, one frame per row, the first column
        being the term the other coefficients are normalized by
    :param nout: number of frequency channels to evaluate
    :return: a tuple made of the spectra (one row per frame, nout columns),
        of the positive angles of the polynomial roots of each frame sorted
        in increasing order, and of the magnitudes evaluated at those angles;
        all-zero columns are removed from the last two matrices
    """
    n_frames, n_coef = lpcas.shape
    order = n_coef - 1

    gain = lpcas[:, 0]
    poly = lpcas / gain[:, numpy.newaxis]

    # Calculate the actual z-plane polyvals: nout points around unit circle
    circle = numpy.exp((-1j * numpy.pi / (nout - 1))
                       * numpy.outer(numpy.arange(nout).T, numpy.arange(order + 1)))

    # Actual polyvals, in power (mag^2)
    features = (1. / numpy.abs(poly.dot(circle.T))**2) / gain[:, numpy.newaxis]

    F = numpy.zeros((n_frames, n_coef - 1))
    M = numpy.zeros((n_frames, n_coef - 1))

    for c, frame_poly in enumerate(poly):
        angles = numpy.angle(numpy.roots(frame_poly).T)
        at_roots = numpy.exp(1j * numpy.outer(angles, numpy.arange(len(frame_poly))))
        mags = numpy.sqrt(((1. / numpy.abs(at_roots.dot(frame_poly)))**2) / gain[c])

        # Keep the roots of strictly positive angle, by increasing angle
        order_idx = numpy.argsort(angles)
        order_idx = order_idx[angles[order_idx] > 0]
        F[c, :len(order_idx)] = angles[order_idx]
        M[c, :len(order_idx)] = mags[order_idx]

    F = F[:, F.sum(axis=0) != 0]
    M = M[:, M.sum(axis=0) != 0]

    return features, F, M
+
+
def spec2cep(spec, ncep=13, type=2):
    """
    Calculate cepstra from spectral samples (in columns of spec)
    Return ncep cepstral rows (defaults to 13)
    This one does type II dct, or type I if type is specified as 1
    dctm returns the DCT matrix that spec was multiplied by to give cep.

    :param spec: spectral samples, one spectrum per column
    :param ncep: number of cepstral coefficients to compute
    :param type: kind of DCT: 2 for the orthogonal, unitary one; 3 for the
        same without the rescaling of the first row (HTK); 4 for type 1
        with implicit repeating of first, last bins; any other value for
        the dpwe type 1
    :return: a tuple (cep, dctm): the cepstra, one column per input
        spectrum and ncep rows, and the ncep x nrow DCT matrix such that
        ``cep = dctm.dot(log(spec))``
    """
    spec = numpy.asarray(spec, dtype=float)
    nrow, ncol = spec.shape

    # Index of the cepstral coefficient, as a column to broadcast over bins
    coef = numpy.arange(ncep)[:, numpy.newaxis]

    # Make the DCT matrix
    if type == 2 or type == 3:
        # this is the orthogonal one, the one you want
        odd = numpy.arange(1, 2 * nrow, 2)
        dctm = numpy.cos(coef * odd / (2. * nrow) * numpy.pi) * numpy.sqrt(2. / nrow)
        if type == 2:
            # make it unitary! (but not for HTK type 3)
            dctm[:1, :] /= numpy.sqrt(2.)
    elif type == 4:
        # type 1 with implicit repeating of first, last bins:
        # the first and last auditory bands are treated as if they were
        # repeated at each end of an nrow+2-length spectrum, which is
        # simulated by adding extra DCT weight to the first and last bins.
        dctm = numpy.cos(coef * numpy.arange(1, nrow + 1) / (nrow + 1.) * numpy.pi) * 2.
        # Add in edge points at ends (includes fixup scale)
        dctm[:, 0] += 1.
        dctm[:, nrow - 1] += (-1.) ** numpy.arange(ncep)
        dctm /= 2. * (nrow + 1)
    else:
        # dpwe type 1 - same as old spec2cep that expanded & used fft
        dctm = numpy.cos(coef * numpy.arange(nrow) / (nrow - 1.) * numpy.pi) * 2. / (2. * (nrow - 1))
        # fixup 'non-repeated' points
        dctm[:, [0, nrow - 1]] /= 2.

    cep = dctm.dot(numpy.log(spec))
    return cep, dctm
+
+
def lifter(x, lift=0.6, invs=False):
    """
    Apply lifter to matrix of cepstra (one frame per row)
    lift = exponent of x i^n liftering
    or, as a negative integer, the length of HTK-style sin-curve liftering.
    If invs is True (default False), undo the liftering.

    :param x: matrix of cepstra, one frame per row
    :param lift: liftering exponent when positive, length of the HTK
        sin-curve when negative, 0 to leave the cepstra untouched
    :param invs: if True, divide by the liftering weights instead of
        multiplying by them
    :return: the liftered cepstra (x itself when lift is 0)
    """
    n_frames, n_ceps = x.shape

    # Nothing to do without liftering
    if lift == 0:
        return x

    quefrency = numpy.arange(1, n_ceps)
    if lift > 0:
        if lift > 10:
            print('Unlikely lift exponent of {} did you mean -ve?'.format(lift))
        weights = numpy.hstack((1, quefrency**lift))
    elif lift < 0:
        # Hack to support HTK liftering
        L = float(-lift)
        if L != numpy.round(L):
            print('HTK liftering value {} must be integer'.format(L))
        weights = numpy.hstack((1, 1 + L / 2 * numpy.sin(quefrency * numpy.pi / L)))

    if invs:
        weights = 1 / weights

    # The first coefficient always keeps a unit weight
    return x.dot(numpy.diag(weights))
+
+
def plp(input_sig,
        nwin=0.025,
        fs=16000,
        plp_order=13,
        shift=0.01,
        get_spec=False,
        get_mspec=False,
        prefac=0.97,
        rasta=True):
    """
    Compute (RASTA-)PLP cepstral coefficients.

    :param input_sig: the signal to analyse
    :param nwin: length of the sliding window in seconds
    :param fs: sampling rate of the samples, default is 16000
    :param plp_order: number of cepstral coefficients per frame; the PLP
        model is of order plp_order - 1, and no LPC smoothing is applied
        when that order is not positive
    :param shift: shift between two analyses, in seconds
    :param get_spec: boolean, if true returns the power spectrum
    :param get_mspec: boolean, if true returns the spectrum obtained after
        the auditory compression
    :param prefac: pre-emphasis filter value
    :param rasta: default is True, if False, just compute PLP

    :return: a list of four items: the liftered cepstra, the log-energy
        of each frame, the power spectrum (None unless get_spec) and the
        post-processed auditory spectrum (None unless get_mspec)
    """
    plp_order -= 1

    # first compute power spectrum
    powspec, log_energy = power_spectrum(input_sig, fs, nwin, shift, prefac)

    # next group to critical bands
    audio_spectrum = audspec(powspec, fs)[0]
    # audspec returns frames x bands: the bands are on the second axis
    nbands = audio_spectrum.shape[1]

    if rasta:
        # rasta_filt is exported by the sibling normfeat module and is not
        # imported at the top of this file; import it where it is needed
        from .normfeat import rasta_filt

        # put in log domain
        nl_aspectrum = numpy.log(audio_spectrum)

        # next do rasta filtering
        # NOTE(review): confirm the orientation rasta_filt expects
        ras_nl_aspectrum = rasta_filt(nl_aspectrum)

        # do inverse log
        audio_spectrum = numpy.exp(ras_nl_aspectrum)

    # do final auditory compressions
    post_spectrum = postaud(audio_spectrum, fs / 2.)[0]

    if plp_order > 0:

        # LPC analysis
        lpcas = dolpc(post_spectrum, plp_order)

        # convert lpc to cepstra
        cepstra = lpc2cep(lpcas, plp_order + 1)

        # .. or to spectra
        spectra, F, M = lpc2spec(lpcas, nbands)

    else:

        # No LPC smoothing of spectrum
        spectra = post_spectrum
        # spec2cep documents its input as one spectrum per column, hence the
        # transposes; only the first item it returns (the cepstra) is used.
        # NOTE(review): confirm against the implementation of spec2cep
        cepstra = spec2cep(spectra.T)[0].T

    cepstra = lifter(cepstra, 0.6)

    return [cepstra,
            log_energy,
            powspec if get_spec else None,
            post_spectrum if get_mspec else None]
+
+
def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
    """
    Cut a signal into overlapping windows without copying the samples.

    :param sig: input signal, can be mono or multi dimensional
    :param win_size: size of the window in term of samples
    :param win_shift: shift of the sliding window in term of samples
    :param context: tuple of left and right context, in samples, added
        around the signal and to every window
    :param pad: how the context is filled, can be zeros or edge

    :return: a strided view on the padded signal indexed by (window, sample
        in the window, channel), from which every axis of length one has
        been squeezed out. The windows share memory, copy the result before
        modifying it.
    :raise ValueError: if pad is neither 'zeros' nor 'edge'
    """
    if pad == 'zeros':
        pad_options = {'mode': 'constant', 'constant_values': (0,)}
    elif pad == 'edge':
        pad_options = {'mode': 'edge'}
    else:
        # An unknown mode used to fall through and silently return None
        raise ValueError("pad must be 'zeros' or 'edge', got {!r}".format(pad))

    dsize = sig.dtype.itemsize
    if sig.ndim == 1:
        sig = sig[:, numpy.newaxis]

    # Manage padding: only the time axis receives the context
    c = (context, ) + (sig.ndim - 1) * ((0, 0), )
    _win_size = win_size + sum(context)
    shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
    # Strides in bytes: one window ahead, dummy axis, one sample, one channel
    strides = tuple(step * dsize for step in (win_shift * sig.shape[1], 1, sig.shape[1], 1))

    # numpy.pad is the public entry point of the padding routine
    padded = numpy.pad(sig, c, **pad_options)
    return numpy.lib.stride_tricks.as_strided(padded,
                                              shape=shape,
                                              strides=strides).squeeze()
+
+
def dct_basis(nbasis, length):
    """
    Build a DCT basis as the orthonormal inverse DCT of the rows of a
    rectangular identity matrix.

    :param nbasis: number of DCT coefficients to keep
    :param length: length of the matrix to process
    :return: a basis of DCT coefficients, of shape (nbasis, length)
    """
    identity = numpy.eye(nbasis, length)
    return scipy.fftpack.idct(identity, norm='ortho')
+
+
def levinson(r, order=None, allow_singularity=False):
    r"""Levinson-Durbin recursion.

    Find the coefficients of a length(r)-1 order autoregressive linear process

    :param r: autocorrelation sequence of length N + 1 (first element being the zero-lag autocorrelation)
    :param order: requested order of the autoregressive coefficients. default is N.
    :param allow_singularity: false by default. Other implementations may be True (e.g., octave)

    :return:
        * the autoregressive coefficients :math:`(a_1...a_M)`, M being the
          requested order; the leading one of :math:`A=(1, a_1...a_M)` is
          not part of the returned array
        * the prediction error
        * the `M` reflections coefficients values

    :raise ValueError: if the prediction error becomes null or negative
        while allow_singularity is False

    This algorithm solves the set of complex linear simultaneous equations
    using Levinson algorithm.

    .. math::

        \bold{T}_M \left( \begin{array}{c} 1 \\ \bold{a}_M \end{array} \right) =
        \left( \begin{array}{c} \rho_M \\ \bold{0}_M \end{array} \right)

    where :math:`\bold{T}_M` is a Hermitian Toeplitz matrix with elements
    :math:`T_0, T_1, \dots ,T_M`.

    .. note:: Solving this equations by Gaussian elimination would
        require :math:`M^3` operations whereas the levinson algorithm
        requires :math:`M^2+M` additions and :math:`M^2+M` multiplications.

    This is equivalent to solve the following symmetric Toeplitz system of
    linear equations

    .. math::

        \left( \begin{array}{cccc}
        r_1 & r_2^* & \dots & r_{n}^*\\
        r_2 & r_1^* & \dots & r_{n-1}^*\\
        \dots & \dots & \dots & \dots\\
        r_n & \dots & r_2 & r_1 \end{array} \right)
        \left( \begin{array}{cccc}
        a_2\\
        a_3 \\
        \dots \\
        a_{N+1} \end{array} \right)
        =
        \left( \begin{array}{cccc}
        -r_2\\
        -r_3 \\
        \dots \\
        -r_{N+1} \end{array} \right)

    where :math:`r = (r_1 ... r_{N+1})` is the input autocorrelation vector, and
    :math:`r_i^*` denotes the complex conjugate of :math:`r_i`. The input r is typically
    a vector of autocorrelation coefficients where lag 0 is the first
    element :math:`r_1`.

    Example::

        >>> import numpy
        >>> T = numpy.array([3., -2+0.5j, .7-1j])
        >>> a, e, k = levinson(T)

    """
    # Zero-lag autocorrelation, and the remaining lags
    T0 = numpy.real(r[0])
    T = r[1:]
    M = len(T)

    if order is None:
        M = len(T)
    else:
        assert order <= M, 'order must be less than size of the input data'
        M = order

    # Real and complex inputs use dedicated storage and update rules
    realdata = numpy.isrealobj(r)
    if realdata is True:
        A = numpy.zeros(M, dtype=float)
        ref = numpy.zeros(M, dtype=float)
    else:
        A = numpy.zeros(M, dtype=complex)
        ref = numpy.zeros(M, dtype=complex)

    # Prediction error, initialised with the zero-lag autocorrelation
    P = T0

    for k in range(M):
        # Reflection coefficient of order k + 1
        save = T[k]
        if k == 0:
            temp = -save / P
        else:
            for j in range(0, k):
                save = save + A[j] * T[k-j-1]
            temp = -save / P
        # Update of the prediction error
        if realdata:
            P = P * (1. - temp**2.)
        else:
            P = P * (1. - (temp.real**2+temp.imag**2))

        # NOTE(review): .any() requires P to be a numpy scalar or array,
        # which presumably holds since T0 comes from numpy.real - confirm
        if (P <= 0).any() and allow_singularity==False:
            raise ValueError("singular matrix")
        A[k] = temp
        ref[k] = temp  # save reflection coeff at each step
        if k == 0:
            continue

        # Update the coefficients of lower order, two at a time, walking
        # from both ends of the array towards its middle
        khalf = (k+1)//2
        if realdata is True:
            for j in range(0, khalf):
                kj = k-j-1
                save = A[j]
                A[j] = save + temp * A[kj]
                if j != kj:
                    A[kj] += temp*save
        else:
            for j in range(0, khalf):
                kj = k-j-1
                save = A[j]
                A[j] = save + temp * A[kj].conjugate()
                if j != kj:
                    A[kj] = A[kj] + temp * save.conjugate()

    return A, P, ref
diff --git a/sidekit/sidekit/frontend/io.py b/sidekit/sidekit/frontend/io.py
new file mode 100755
index 0000000..1ce2fc4
--- /dev/null
+++ b/sidekit/sidekit/frontend/io.py
@@ -0,0 +1,1706 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`frontend` provides methods to process an audio signal in order to extract
+useful parameters for speaker verification.
+"""
+import audioop
+import decimal
+import h5py
+import logging
+import math
+import numpy
+import os
+import soundfile
+import struct
+import warnings
+import wave
+import scipy.signal
+import scipy.io.wavfile
+from scipy.signal import lfilter
+from scipy.signal import decimate
+from ..sidekit_wrappers import check_path_existance, process_parallel_lists
+
+
+ __author__ = "Anthony Larcher"
+ __copyright__ = "Copyright 2014-2021 Anthony Larcher"
+ __license__ = "LGPL"
+ __maintainer__ = "Anthony Larcher"
+ __email__ = "anthony.larcher@univ-lemans.fr"
+ __status__ = "Production"
+ __docformat__ = 'reStructuredText'
+
+
+ # Sample dtype requested from soundfile by read_wav
+ wav_flag = "float32" # Could be "int16"
+
+ # HTK base parameter kinds (the value held in the low six bits of an HTK type code)
+ WAVEFORM = 0
+ LPC = 1
+ LPCREFC = 2
+ LPCEPSTRA = 3
+ LPCDELCEP = 4
+ IREFC = 5
+ MFCC = 6
+ FBANK = 7
+ MELSPEC = 8
+ USER = 9
+ DISCRETE = 10
+ PLP = 11
+ ANON = 12
+
+ # HTK qualifier bits (written in octal) that are OR-ed with a base kind to form a type code
+ _E = 0o000100 # has energy
+ _N = 0o000200 # absolute energy suppressed
+ _D = 0o000400 # has delta coefficients
+ _A = 0o001000 # has acceleration coefficients
+ _C = 0o002000 # is compressed
+ _Z = 0o004000 # has zero mean static coef.
+ _K = 0o010000 # has CRC checksum
+ _0 = 0o020000 # has 0th cepstral coef.
+ _V = 0o040000 # has VQ data
+ _T = 0o100000 # has third differential coef.
+
+ # Base kinds whose values are stored as 16-bit integers rather than 32-bit floats
+ parms16bit = [WAVEFORM, IREFC, DISCRETE]
+
+
+ @check_path_existance
+ def write_pcm(data, output_file_name):
+     """Write a signal to a single-channel, 16-bit little-endian RAW PCM file.
+
+     Data whose absolute peak is below 1 is scaled by 16384 and rounded.
+     Any other data is taken to be expressed in 16-bit sample units already;
+     non-integer values are rounded to the nearest integer before packing.
+
+     :param data: one-dimensional audio signal (ndarray or plain sequence).
+     :param output_file_name: name of the file to write
+     :raises struct.error: if a sample does not fit in a signed 16-bit integer
+     """
+     samples = numpy.asarray(data)
+     if samples.size and numpy.abs(samples).max() < 1.:
+         samples = numpy.around(samples * 16384, decimals=0).astype('int16')
+     elif not numpy.issubdtype(samples.dtype, numpy.integer):
+         # e.g. the float32 arrays returned by read_pcm: struct cannot pack
+         # floats with the 'h' format, so convert them to integers first
+         samples = numpy.around(samples).astype(numpy.int64)
+
+     # Pack before opening the file so that a failed conversion does not
+     # truncate an already existing output file
+     payload = struct.pack('<' + 'h' * samples.shape[0], *samples)
+     with open(output_file_name, 'wb') as of:
+         of.write(payload)
+
+ @check_path_existance
+ def write_wav(data, output_file_name, fs):
+     """Write a signal to a 16-bit WAV file.
+
+     int16 data is written unchanged. float32 data is first normalised so
+     that its absolute peak equals 0.9. Every non-int16 signal is then
+     multiplied by 2 ** 15 and converted to int16; values that fall outside
+     the int16 range trigger a warning and are saturated.
+
+     The input array is never modified.
+
+     :param data: audio signal as a ndarray
+     :param output_file_name: name of the file to write
+     :param fs: sampling frequency in Hz
+     """
+     samples = numpy.asarray(data)
+     if samples.dtype != numpy.int16:
+         if samples.dtype == numpy.float32:
+             peak = numpy.abs(samples).max() if samples.size else 0.
+             if peak > 0:
+                 # out-of-place operations: the caller's array is left untouched
+                 samples = samples / peak * 0.9
+         elif not numpy.issubdtype(samples.dtype, numpy.floating):
+             # avoid integer overflow when scaling wide integer types
+             samples = samples.astype(numpy.float64)
+         scaled = samples * 2 ** 15
+
+         # The range check has to happen before the cast: once the values
+         # are int16 they can no longer be out of range
+         limits = numpy.iinfo(numpy.int16)
+         if numpy.any(scaled > limits.max) or numpy.any(scaled < limits.min):
+             warnings.warn('Warning: clipping detected when writing {}'.format(output_file_name))
+             scaled = numpy.clip(scaled, limits.min, limits.max)
+         samples = numpy.array(scaled, dtype=numpy.int16)
+     scipy.io.wavfile.write(output_file_name, fs, samples)
+
+
+ def read_pcm(input_file_name):
+     """Read a signal from a single-channel, 16-bit little-endian RAW PCM file.
+
+     A trailing odd byte (incomplete sample) is ignored.
+
+     :param input_file_name: name of the PCM file to read.
+
+     :return: a tuple made of the audio signal as a float32 ndarray (values
+         keep their 16-bit integer scale), None (the sample rate is not
+         stored in a RAW file) and 2 (depth of the encoding in bytes)
+     """
+     # numpy.fromfile only reads complete items, so a file with an odd number
+     # of bytes no longer makes the decoding fail
+     data = numpy.fromfile(input_file_name, dtype='<i2')
+     return data.astype(numpy.float32), None, 2
+
+
+ def read_wav(input_file_name):
+     """Read a WAV file through the soundfile library.
+
+     :param input_file_name: name of the WAV file to read
+     :return: a tuple made of the signal as a float32 ndarray (channels in
+         columns, single-channel signals are squeezed to one dimension),
+         the sampling rate and the constant 4 used as sample width
+     """
+     channel_count = soundfile.info(input_file_name).channels
+     samples, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
+     samples = numpy.array(samples).reshape((-1, channel_count)).squeeze()
+     return samples.astype(numpy.float32), sample_rate, 4
+
+
+ def pcmu2lin(p, s=4004.189931):
+     """Convert Mu-law PCM to linear X=(P,S)
+     lin = pcmu2lin(pcmu) where pcmu contains a vector
+     of mu-law values in the range 0 to 255.
+     No checking is performed to see that numbers are in this range.
+
+     Output values are divided by the scale factor s:
+
+         s Output Range
+         1 +-8031 (integer values)
+         4004.2 +-2.005649 (default)
+         8031 +-1
+         8159 +-0.9843118 (+-1 nominal full scale)
+
+     The default scaling factor 4004.189931 is equal to
+     sqrt((2207^2 + 5215^2)/2) this follows ITU standard G.711.
+     The sine wave with PCM-Mu values [158 139 139 158 30 11 11 30]
+     has a mean square value of unity corresponding to 0 dBm0.
+
+     :param p: input signal encoded in PCM mu-law to convert (scalar,
+         sequence or ndarray of any numeric dtype, including uint8)
+     :param s: conversion value from mu-scale to linear scale
+     :return: the linear signal as float64
+     """
+     # Work in floating point: with unsigned 8-bit input, "127 - p" would
+     # wrap around instead of going negative and corrupt the exponent
+     p = numpy.asarray(p, dtype=numpy.float64)
+     t = 4 / s
+     m = 15 - (p % 16)            # inverted 4-bit mantissa
+     q = numpy.floor(p / 128)     # sign bit: 0 -> negative, 1 -> positive
+     e = (127 - p - m + 128 * q) / 16
+     x = (m + 16.5) * numpy.power(2, e) - 16.5
+     return (q - 0.5) * x * t
+
+
+ def read_sph(input_file_name, mode='p'):
+ """
+ Read a SPHERE audio file
+
+ :param input_file_name: name of the file to read
+ :param mode: specifies the following (\* =default)
+
+ .. note::
+
+ - Scaling:
+
+ - 's' Auto scale to make data peak = +-1 (use with caution if reading in chunks)
+ - 'r' Raw unscaled data (integer values)
+ - 'p' Scaled to make +-1 equal full scale
+ - 'o' Scale to bin centre rather than bin edge (e.g. 127 rather than 127.5 for 8 bit values,
+ can be combined with n+p,r,s modes)
+ - 'n' Scale to negative peak rather than positive peak (e.g. 128.5 rather than 127.5 for 8 bit values,
+ can be combined with o+p,r,s modes)
+
+ - Format
+
+ - 'l' Little endian data (Intel,DEC) (overrides indication in file)
+ - 'b' Big endian data (non Intel/DEC) (overrides indication in file)
+
+ - File I/O
+
+ - 'f' Do not close file on exit
+ - 'd' Look in data directory: voicebox('dir_data')
+ - 'w' Also read the annotation file \*.wrd if present (as in TIMIT)
+ - 't' Also read the phonetic transcription file \*.phn if present (as in TIMIT)
+
+ - NMAX maximum number of samples to read (or -1 for unlimited [default])
+ - NSKIP number of samples to skip from start of file (or -1 to continue from previous read when FFX
+ is given instead of FILENAME [default])
+
+ :return: a tuple (Y, FS, NBYTES)
+
+ .. note::
+
+ - Y data matrix of dimension (samples,channels), returned as float32
+ - FS sample frequency in Hz
+ - NBYTES number of bytes per sample (``sample_n_bytes`` header field, 2 when the field is absent)
+ - WRD{\*,2} cell array with word annotations: WRD{\*,:)={[t_start t_end],'text'} where times are in seconds
+ only present if 'w' option is given
+ - PHN{\*,2} cell array with phoneme annotations: PHN{\*,:)={[t_start t_end],'phoneme'} where times
+ are in seconds only present if 't' option is present
+ - FFX Cell array containing
+
+ 1. filename
+ 2. header information
+
+ 1. first header field name
+ 2. first header field value
+ 3. format string (e.g. NIST_1A)
+ 4.
+ 1. file id
+ 2. current position in file
+ 3. dataoff byte offset in file to start of data
+ 4. order byte order (l or b)
+ 5. nsamp number of samples
+ 6. number of channels
+ 7. nbytes bytes per data value
+ 8. bits number of bits of precision
+ 9. fs sample frequency
+ 10. min value
+ 11. max value
+ 12. coding 0=PCM,1=uLAW + 0=no compression, 0=shorten,20=wavpack,30=shortpack
+ 13. file not yet decompressed
+
+ 5. temporary filename
+
+ If no output parameters are specified,
+ header information will be printed.
+ The code to decode shorten-encoded files, is
+ not yet released with this toolkit.
+ """
+ # Lookup tables used to turn the 'sample_coding' header field into a numeric code
+ codings = dict([('pcm', 1), ('ulaw', 2)])
+ compressions = dict([(',embedded-shorten-', 1),
+ (',embedded-wavpack-', 2),
+ (',embedded-shortpack-', 3)])
+ byteorder = 'l'
+ # numpy dtype prefix for each byte order
+ endianess = dict([('l', '<'), ('b', '>')])
+
+ # Any mode other than the default 'p' is turned into a two-item list [mode, 'p']
+ if not mode == 'p':
+ mode = [mode, 'p']
+ k = list((m >= 'p') & (m <= 's') for m in mode)
+ # scale to input limits not output limits
+ mno = all([m != 'o' for m in mode])
+ sc = ''
+ if k[0]:
+ sc = mode[0]
+ # Get byte order (little/big endian)
+ if any([m == 'l' for m in mode]):
+ byteorder = 'l'
+ elif any([m == 'b' for m in mode]):
+ byteorder = 'b'
+ ffx = ['', '', '', '', '']
+
+ if isinstance(input_file_name, str):
+ if os.path.exists(input_file_name):
+ fid = open(input_file_name, 'rb')
+ elif os.path.exists("".join((input_file_name, '.sph'))):
+ input_file_name = "".join((input_file_name, '.sph'))
+ fid = open(input_file_name, 'rb')
+ else:
+ raise Exception('Cannot find file {}'.format(input_file_name))
+ ffx[0] = input_file_name
+ # NOTE(review): this test is the exact complement of the isinstance() test above,
+ # so the final else branch can never run; fid is not assigned in this branch.
+ elif not isinstance(input_file_name, str):
+ ffx = input_file_name
+ else:
+ fid = input_file_name
+
+ # Read the header
+ if ffx[3] == '':
+ fid.seek(0, 0) # go to the beginning of the file
+ l1 = fid.readline().decode("utf-8")
+ l2 = fid.readline().decode("utf-8")
+ # Only a warning is logged when the two expected header lines are not found
+ if not (l1 == 'NIST_1A\n') & (l2 == ' 1024\n'):
+ logging.warning('File does not begin with a SPHERE header')
+ ffx[2] = l1.rstrip()
+ hlen = int(l2[3:7])
+ hdr = {}
+ while True: # Read the header and fill a dictionary
+ st = fid.readline().decode("utf-8").rstrip()
+ if st[0] != ';':
+ elt = st.split(' ')
+ if elt[0] == 'end_head':
+ break
+ if elt[1][0] != '-':
+ # Adjacent string literals are concatenated, so the logged text is
+ # "Missing - in SPHERE header"
+ logging.warning('Missing ''-'' in SPHERE header')
+ break
+ # The letter following '-' gives the field type: s(tring), i(nteger), anything else is read as float
+ if elt[1][1] == 's':
+ hdr[elt[0]] = elt[2]
+ elif elt[1][1] == 'i':
+ hdr[elt[0]] = int(elt[2])
+ else:
+ hdr[elt[0]] = float(elt[2])
+
+ if 'sample_byte_format' in list(hdr.keys()):
+ if hdr['sample_byte_format'][0] == '0':
+ bord = 'l'
+ else:
+ bord = 'b'
+ # The byte order found in the file is only used when the mode forces neither 'b' nor 'l'
+ if (bord != byteorder) & all([m != 'b' for m in mode]) \
+ & all([m != 'l' for m in mode]):
+ byteorder = bord
+
+ icode = 0 # Get encoding, default is PCM
+ if 'sample_coding' in list(hdr.keys()):
+ icode = -1 # unknown code
+ for coding in list(codings.keys()):
+ if hdr['sample_coding'].startswith(coding):
+ # is the signal compressed
+ # if len(hdr['sample_coding']) > codings[coding]:
+ if len(hdr['sample_coding']) > len(coding):
+ for compression in list(compressions.keys()):
+ if hdr['sample_coding'].endswith(compression):
+ icode = 10 * compressions[compression] \
+ + codings[coding] - 1
+ break
+ else: # if the signal is not compressed
+ icode = codings[coding] - 1
+ break
+ # initialize info of the files with default values
+ info = [fid, 0, hlen, ord(byteorder), 0, 1, 2, 16, 1, 1, -1, icode]
+ # Get existing info from the header
+ if 'sample_count' in list(hdr.keys()):
+ info[4] = hdr['sample_count']
+ if not info[4]: # if no info sample_count or zero
+ # go to the end of the file
+ fid.seek(0, 2) # Go to the end of the file
+ # get the sample count
+ info[4] = int(math.floor((fid.tell() - info[2]) / (info[5] * info[6]))) # get the sample_count
+ if 'channel_count' in list(hdr.keys()):
+ info[5] = hdr['channel_count']
+ if 'sample_n_bytes' in list(hdr.keys()):
+ info[6] = hdr['sample_n_bytes']
+ if 'sample_sig_bits' in list(hdr.keys()):
+ info[7] = hdr['sample_sig_bits']
+ if 'sample_rate' in list(hdr.keys()):
+ info[8] = hdr['sample_rate']
+ if 'sample_min' in list(hdr.keys()):
+ info[9] = hdr['sample_min']
+ if 'sample_max' in list(hdr.keys()):
+ info[10] = hdr['sample_max']
+
+ ffx[1] = hdr
+ ffx[3] = info
+ info = ffx[3]
+ ksamples = info[4]
+ if ksamples > 0:
+ fid = info[0]
+ if (icode >= 10) & (ffx[4] == ''): # read compressed signal
+ # need to use a script with SHORTEN
+ raise Exception('compressed signal, need to unpack in a script with SHORTEN')
+ info[1] = ksamples
+ # use modes o and n to determine effective peak
+ pk = 2 ** (8 * info[6] - 1) * (1 + (float(mno) / 2 - int(all([m != 'b'
+ for m in
+ mode]))) / 2 **
+ info[7])
+ # NOTE(review): the data offset is hard-coded to 1024 although the header length
+ # was parsed into hlen / info[2] above.
+ fid.seek(1024) # jump after the header
+ nsamples = info[5] * ksamples
+ # info[6] is the number of bytes per sample: 1 and 2 are handled first, then 3 and 4
+ if info[6] < 3:
+ if info[6] < 2:
+ logging.debug('Sphere i1 PCM')
+ y = numpy.fromfile(fid, endianess[byteorder]+"i1", -1)
+ # a coding code ending in 1 means mu-law: expand it to 16-bit linear with audioop
+ if info[11] % 10 == 1:
+ if y.shape[0] % 2:
+ y = numpy.frombuffer(audioop.ulaw2lin(
+ numpy.concatenate((y, numpy.zeros(1, 'int8'))), 2),
+ numpy.int16)[:-1]/32768.
+ else:
+ y = numpy.frombuffer(audioop.ulaw2lin(y, 2), numpy.int16)/32768.
+ pk = 1.
+ else:
+ y = y - 128
+ else:
+ logging.debug('Sphere i2')
+ y = numpy.fromfile(fid, endianess[byteorder]+"i2", -1)
+ else: # not verified
+ if info[6] < 4:
+ y = numpy.fromfile(fid, endianess[byteorder]+"i1", -1)
+ y = y.reshape(nsamples, 3).transpose()
+ y = (numpy.dot(numpy.array([1, 256, 65536]), y) - (numpy.dot(y[2, :], 2 ** (-7)).astype(int) * 2 ** 24))
+ else:
+ y = numpy.fromfile(fid, endianess[byteorder]+"i4", -1)
+
+ # Apply the scaling selected by the mode unless raw values ('r') were requested
+ if sc != 'r':
+ if sc == 's':
+ if info[9] > info[10]:
+ info[9] = numpy.min(y)
+ info[10] = numpy.max(y)
+ sf = 1 / numpy.max(list(list(map(abs, info[9:11]))), axis=0)
+ else:
+ sf = 1 / pk
+ y = sf * y
+
+ if info[5] > 1:
+ y = y.reshape(ksamples, info[5])
+ else:
+ y = numpy.array([])
+ # NOTE(review): mode is a list whenever it was not 'p' on entry, in which case this
+ # comparison with the string 'f' is always True.
+ if mode != 'f':
+ fid.close()
+ info[0] = -1
+ if not ffx[4] == '':
+ pass # VERIFY SCRIPT, WHICH CASE IS HANDLED HERE
+ return y.astype(numpy.float32), int(info[8]), int(info[6])
+
+
+ def read_audio(input_file_name, framerate=None):
+     """ Read a 1 or 2-channel audio file in SPHERE, WAVE or RAW PCM format.
+     The format is determined from the file extension.
+     If the sample rate read from the file is a multiple of the one given
+     as parameter, we apply a decimation function to subsample the signal.
+
+     :param input_file_name: name of the file to read from
+     :param framerate: expected sampling frequency, mandatory despite the
+         default value; RAW PCM files are assumed to be sampled at this rate
+     :return: the signal as a float32 numpy array and ``framerate``
+     :raises TypeError: if ``framerate`` is None or the extension is unknown
+     """
+     if framerate is None:
+         raise TypeError("Expected sampling frequency required in sidekit.frontend.io.read_audio")
+     ext = os.path.splitext(input_file_name)[-1].lower()
+     if ext == '.sph':
+         sig, read_framerate, _ = read_sph(input_file_name, 'p')
+     elif ext in ('.wav', '.wave'):
+         try:
+             sig, read_framerate, _ = read_wav(input_file_name)
+         except Exception:
+             # Fall back to pydub for files that read_wav cannot decode.
+             # "except Exception" lets KeyboardInterrupt / SystemExit through,
+             # which the former bare "except" swallowed.
+             import pydub
+             audio = pydub.AudioSegment.from_wav(input_file_name)
+             read_framerate = audio.frame_rate
+             sig = numpy.array(audio.split_to_mono()[0].get_array_of_samples())
+     elif ext in ('.pcm', '.raw'):
+         sig, _, _ = read_pcm(input_file_name)
+         # RAW files carry no header: trust the rate given by the caller
+         read_framerate = framerate
+     else:
+         raise TypeError("Unknown extension of audio file")
+
+     # NOTE(review): the returned rate is always ``framerate``, even when the
+     # signal could not be resampled (up-sampling, or non-integer ratio).
+     if framerate > read_framerate:
+         print("Warning in read_audio, up-sampling function is not implemented yet!")
+     elif read_framerate % float(framerate) == 0 and not framerate == read_framerate:
+         print("downsample {}".format(input_file_name))
+         sig = scipy.signal.decimate(sig, int(read_framerate / float(framerate)), n=None, ftype='iir', axis=0)
+     return sig.astype(numpy.float32), framerate
+
+
+ @check_path_existance
+ def write_label(label,
+                 output_file_name,
+                 selected_label='speech',
+                 frame_per_second=100,
+                 show=None,
+                 format="mdtm"):
+     """Save labels in ALIZE ("lab") or MDTM format
+
+     Nothing is written when ``label`` is empty.
+
+     :param label: label to write in the file given as a ndarray of boolean
+     :param output_file_name: name of the file to write to
+     :param selected_label: label to write to the file. Default is 'speech'.
+     :param frame_per_second: number of frame per seconds. Used to convert
+         the frame number into time. Default is 100.
+     :param show: name of the show, written in the first column of MDTM files
+         (must be a string when the MDTM format is used)
+     :param format: "lab" for the ALIZE format, anything else selects MDTM
+     """
+     if label.shape[0] == 0:
+         return
+
+     bits = label[:-1] ^ label[1:]
+     # convert true value into a list of feature indexes
+     # append 0 at the beginning of the list, append the last index to the list
+     idx = [0] + (numpy.arange(len(bits))[bits] + 1).tolist() + [len(label)]
+     framerate = decimal.Decimal(1) / decimal.Decimal(frame_per_second)
+
+     # Boundaries alternate between selected and unselected segments: start
+     # on the first boundary when the first frame is selected, otherwise on
+     # the second one. An explicit integer replaces the former "~label[0]",
+     # which relied on a NumPy boolean being accepted as a range() argument.
+     first = 0 if label[0] else 1
+     segments = [(idx[i] * framerate, idx[i + 1] * framerate)
+                 for i in range(first, len(idx) - 1, 2)]
+
+     if format == "lab":
+         lines = ['{} {} {}\n'.format(str(start), str(stop), selected_label)
+                  for start, stop in segments]
+         with open(output_file_name, 'w') as fid:
+             fid.writelines(lines)
+     else:
+         # write in MDTM format; gender, channel and environment are unknown ('U')
+         lines = ['{:s} 1 {:.2f} {:.2f} {:s} {:s} {:s} {:s}\n'.format(
+             show, start, stop - start, 'U', 'U', 'U', selected_label)
+             for start, stop in segments]
+         with open(output_file_name, 'w', encoding="utf8") as fid:
+             fid.writelines(lines)
+
+
+ def read_label(input_file_name, selected_label='speech', frame_per_second=100):
+     """Read label file in ALIZE format
+
+     Each non-empty line holds three fields: start time, stop time (both in
+     seconds) and a label. The length of the returned array is given by the
+     stop time of the last line.
+
+     :param input_file_name: the label file name
+     :param selected_label: the label to return. Default is 'speech'.
+     :param frame_per_second: number of frame per seconds. Used to convert
+         the time into frame number. Default is 100.
+
+     :return: a logical array, True on the frames covered by a segment
+         carrying the selected label
+     """
+     with open(input_file_name) as f:
+         # blank lines (e.g. a trailing empty line) are skipped
+         segments = [line.split() for line in f if line.strip()]
+
+     if not segments:
+         return numpy.zeros(0).astype(bool)
+
+     def to_frame(seconds):
+         # round instead of truncating: float("0.29") * 100 is 28.999...
+         return int(round(float(seconds) * frame_per_second))
+
+     # initialize the length from the last segment's end
+     lbl = numpy.zeros(to_frame(segments[-1][1]), dtype=bool)
+     for start, stop, label in segments:
+         if label == selected_label:
+             # slice bounds must be plain integers
+             lbl[to_frame(start):to_frame(stop)] = True
+     return lbl
+
+
+ def read_spro4(input_file_name,
+                label_file_name="",
+                selected_label="",
+                frame_per_second=100):
+     """Read a feature stream in SPRO4 format
+
+     The file may start with an empty 19-byte textual header
+     ("<header>" and "</header>", each followed by a newline). It is followed
+     by the feature dimension (unsigned short), 4 flag bytes, the frame rate
+     (float) and the frames stored as native 32-bit floats.
+
+     :param input_file_name: name of the feature file to read from
+     :param label_file_name: name of the label file to read if required.
+         By Default, the method assumes no label to read from.
+     :param selected_label: label to select in the label file. Default is none.
+     :param frame_per_second: number of frame per seconds. Used to convert
+         the label times into frame numbers. Default is 100.
+
+     :return: a sequence of features in a float32 numpy array
+     """
+     with open(input_file_name, 'rb') as f:
+         f.seek(0, 2)  # Go to the end of the file
+         size = f.tell()  # get the position
+         f.seek(0, 0)  # go back to the beginning of the file
+
+         # Compare raw bytes: the first bytes of a header-less file are binary
+         # data that cannot always be decoded as UTF-8
+         head_size = 0
+         if f.read(8) == b'<header>':
+             # skip the empty header; a general header is not supported
+             head_size = 19
+         f.seek(head_size, 0)
+
+         dim = struct.unpack("H", f.read(2))[0]
+         f.read(4 + 4)  # flags (4 bytes) and frame rate (float) are ignored
+         n_frames = (size - 10 - head_size) // (4 * dim)
+
+         features = numpy.frombuffer(f.read(4 * n_frames * dim), dtype='f')
+     features = features.reshape(n_frames, dim)
+
+     lbl = numpy.ones(n_frames, dtype=bool)
+     if not label_file_name == "":
+         lbl = read_label(label_file_name, selected_label, frame_per_second)
+
+     return features[lbl, :].astype(numpy.float32)
+
+
+ def read_hdf5_segment(file_handler,
+ show,
+ dataset_list,
+ label,
+ start=None, stop=None,
+ global_cmvn=False):
+ """Read a segment from a stream in HDF5 format. Return the features in the
+ range start:stop
+ In case the start and stop cannot be reached, the first or last feature are copied
+ so that the length of the returned segment is always stop-start
+
+ :param file_handler: handle on the HDF5 file to read from (presumably an open h5py.File)
+ :param show: identifier of the show, i.e. the group that holds the datasets in the HDF5 file
+ :param dataset_list: names of the datasets to load, among 'energy', 'cep', 'fb' and 'bnf'
+ :param label: frame selection mask; when None it is read from the "vad" dataset of the show
+ if there is one, otherwise all frames are selected
+ :param start: index of the first frame to read; a negative value is compensated
+ by repeating the first frame
+ :param stop: index following the last frame to read; a value beyond the end of the dataset
+ is compensated by repeating the last frame
+ :param global_cmvn: flag handed back to the caller in the returned tuple
+
+ :return: a tuple (feat, label, global_mean, global_std, global_cmvn)
+ """
+ h5f = file_handler
+
+ # The "compression" entry stores an integer code that is mapped to a storage mode name
+ compression_type = {0: 'none', 1: 'htk', 2: 'percentile'}
+ if "compression" not in h5f:
+ compression = 'none'
+ print("Warning, default feature storage mode is now using compression")
+ else:
+ if isinstance(h5f["compression"], h5py._hl.dataset.Dataset):
+ compression = compression_type[h5f["compression"][()]]
+ else:
+ compression = compression_type[h5f["compression"]]
+
+ if show not in h5f:
+ raise Exception('show {} is not in the HDF5 file'.format(show))
+
+ # Get the selected segment
+ # The length is taken from the first dataset found in the show's group
+ dataset_length = h5f[show + "/" + next(h5f[show].__iter__())].shape[0]
+
+ # Deal with the case where start < 0 or stop > feat.shape[0]
+ if start is None:
+ start = 0
+ pad_begining = -start if start < 0 else 0
+ start = max(start, 0)
+
+ if stop is None:
+ stop = dataset_length
+ pad_end = stop - dataset_length if stop > dataset_length else 0
+ stop = min(stop, dataset_length)
+ # NOTE(review): start and stop have both been replaced by integers above, so the
+ # "not (...)" term is always True here and global_cmvn is left unchanged.
+ global_cmvn = global_cmvn and not (start is None or stop is None)
+
+ # Get the data between start and stop
+ # Concatenate all required datasets
+ feat = []
+ global_mean = []
+ global_std = []
+
+ # NOTE(review): feat is initialised a second time here
+ feat = []
+ # Datasets are always concatenated in this fixed order, whatever the order of dataset_list
+ for data_id in ['energy', 'cep', 'fb', 'bnf']:
+ if data_id in dataset_list:
+ if "/".join((show, data_id)) in h5f:
+ dataset_id = show + '/{}'.format(data_id)
+ if compression == 'none':
+ data = _read_segment(h5f, dataset_id, start, stop)
+ if data.ndim ==1:
+ data = data[:, numpy.newaxis]
+ feat.append(data)
+ elif compression == 'htk':
+ feat.append(_read_segment_htk(h5f, dataset_id, start, stop))
+ else:
+ feat.append(_read_segment_percentile(h5f, dataset_id, start, stop))
+ # Mean and standard deviation are read from the "<data_id>_mean" / "<data_id>_std" datasets
+ global_mean.append(h5f["/".join((show, "{}_mean".format(data_id)))][()])
+ global_std.append(h5f["/".join((show, "{}_std".format(data_id)))][()])
+
+ else:
+ raise Exception('{} is not in the HDF5 file'.format(data_id))
+
+ feat = numpy.hstack(feat)
+ global_mean = numpy.hstack(global_mean)
+ global_std = numpy.hstack(global_std)
+
+ if label is None:
+ if "/".join((show, "vad")) in h5f:
+ label = h5f.get("/".join((show, "vad")))[()].astype('bool').squeeze()[start:stop]
+ else:
+ label = numpy.ones(feat.shape[0], dtype='bool')
+ # Pad the segment if needed
+ # mode='edge' repeats the first / last row (or label value)
+ feat = numpy.pad(feat, ((pad_begining, pad_end), (0, 0)), mode='edge')
+ label = numpy.pad(label, (pad_begining, pad_end), mode='edge')
+ #stop += pad_begining + pad_end
+
+ return feat, label, global_mean, global_std, global_cmvn
+
+
+ def read_spro4_segment(input_file_name, start=0, end=None):
+ """Read a segment from a stream in SPRO4 format. Return the features in the
+ range start:end
+ In case the start and end cannot be reached, the first or last feature are copied
+ so that the length of the returned segment is always end-start
+
+ :param input_file_name: name of the feature file to read from
+ :param start: index of the first frame to read (start at zero)
+ :param end: index of the last frame following the segment to read.
+ end < 0 means that end is the value of the right_context to add
+ at the end of the file
+
+ :return: a sequence of features in a ndarray of length end-start
+ """
+ with open(input_file_name, 'rb') as f:
+
+ # Read the first 8 bytes as characters to look for a textual header
+ tmpS = struct.unpack("8c", f.read(8))
+ s = ()
+ for i in range(len(tmpS)):
+ s += (tmpS[i].decode("utf-8"),)
+ f.seek(0, 2) # Go to the end of the file
+ size = f.tell() # get the position
+ f.seek(0, 0) # go back to the beginning of the file
+ head_size = 0
+
+ # NOTE(review): s holds 8 characters, so this comparison with '' can never be True;
+ # the expected header marker seems to have been lost (presumably '<header>') - confirm.
+ if "".join(s) == '':
+ # swap empty header for general header the code need changing
+ struct.unpack("19b", f.read(19))
+ head_size = 19
+
+ # Fixed part of the file: dimension (unsigned short), 4 flag bytes, one float
+ dim = struct.unpack("H", f.read(2))[0]
+ struct.unpack("4b", f.read(4))
+ struct.unpack("f", f.read(4))
+ n_frames = int(math.floor((size - 10 - head_size) / (4 * dim)))
+ if end is None:
+ end = n_frames
+ elif end < 0:
+ end = n_frames - end
+
+ # s and e are start and end restricted to the frames available in the file
+ s, e = max(0, start), min(n_frames, end)
+ # NOTE(review): this offset does not include head_size
+ f.seek(2 + 4 + 4 + dim * 4 * s, 0)
+ # NOTE(review): the statement below is corrupted. Its string literal opens with ' and
+ # closes with ", and the names used from here on (dt, pk, fh, sampling_period) are never
+ # defined in this function. The end of read_spro4_segment and the beginning of the
+ # following HTK writing routine appear to be missing; restore them from the upstream file.
+ features = numpy.fromfile(f, 'IIHH", len(features)+(4 if dt & _C else 0), sampling_period*1e7,
+ features.shape[1] * (2 if (pk in parms16bit or dt & _C) else 4), dt))
+ if pk == 5:
+ features *= 32767.0
+ if pk in parms16bit:
+ features = features.astype('>h')
+ elif dt & _C:
+ mmax, mmin = features.max(axis=0), features.min(axis=0)
+ mmax[mmax == mmin] += 32767
+ mmin[mmax == mmin] -= 32767 # to avoid division by zero for constant coefficients
+ scale = 2 * 32767. / (mmax - mmin)
+ bias = 0.5 * scale * (mmax + mmin)
+ features = features * scale - bias
+ # compressed files store the scale and bias vectors ahead of the 16-bit data
+ numpy.array([scale]).astype('>f').tofile(fh)
+ numpy.array([bias]).astype('>f').tofile(fh)
+ features = features.astype('>h')
+ else:
+ features = features.astype('>f')
+ features.tofile(fh)
+
+ def read_htk(input_file_name,
+              label_file_name="",
+              selected_label="",
+              frame_per_second=100):
+     """Read a sequence of features in HTK format
+
+     :param input_file_name: name of the file to read from
+     :param label_file_name: name of the label file to read from
+     :param selected_label: label to select
+     :param frame_per_second: number of frames per second
+
+     :return: a tuple (d, fp, dt, tc, t) described below
+
+     .. note::
+
+         - d = data: column vector for waveforms, 1 row per frame for other types
+         - fp = frame period in seconds
+         - dt = data type (also includes Voicebox code for generating data)
+
+             0. WAVEFORM Acoustic waveform
+             1. LPC Linear prediction coefficients
+             2. LPREFC LPC Reflection coefficients: -lpcar2rf([1 LPC]);LPREFC(1)=[];
+             3. LPCEPSTRA LPC Cepstral coefficients
+             4. LPDELCEP LPC cepstral+delta coefficients (obsolete)
+             5. IREFC LPC Reflection coefficients (16 bit fixed point)
+             6. MFCC Mel frequency cepstral coefficients
+             7. FBANK Log Filter bank energies
+             8. MELSPEC linear Mel-scaled spectrum
+             9. USER User defined features
+             10. DISCRETE Vector quantised codebook
+             11. PLP Perceptual Linear prediction
+             12. ANON
+
+         - tc = full type code = dt plus (optionally)
+           one or more of the following modifiers
+
+             - 64 _E Includes energy terms
+             - 128 _N Suppress absolute energy
+             - 256 _D Include delta coefs
+             - 512 _A Include acceleration coefs
+             - 1024 _C Compressed
+             - 2048 _Z Zero mean static coefs
+             - 4096 _K CRC checksum (not implemented yet)
+             - 8192 _0 Include 0'th cepstral coef
+             - 16384 _V Attach VQ index
+             - 32768 _T Attach delta-delta-delta index
+
+         - t = name of the base data type, e.g. MFCC (modifier suffixes are
+           not appended)
+
+     This function is a translation of the Matlab code from
+     VOICEBOX is a MATLAB toolbox for speech processing.
+     by Mike Brookes
+     Home page: `VOICEBOX <http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html>`_
+     """
+     kinds = ['WAVEFORM', 'LPC', 'LPREFC', 'LPCEPSTRA', 'LPDELCEP', 'IREFC',
+              'MFCC', 'FBANK', 'MELSPEC', 'USER', 'DISCRETE', 'PLP', 'ANON',
+              '???']
+     compressed_bit = 0o002000  # same value as the module-level _C flag
+
+     def read_array(handle, dtype, count):
+         """Read ``count`` big-endian items of type ``dtype`` from ``handle``."""
+         itemsize = numpy.dtype(dtype).itemsize
+         return numpy.frombuffer(handle.read(itemsize * count), dtype=dtype, count=count)
+
+     with open(input_file_name, 'rb') as fid:
+         # 12-byte big-endian header: number of frames, frame period in units
+         # of 100 ns, bytes per frame and (unsigned) type code
+         nf, period, by, tc = struct.unpack(">llhH", fid.read(12))
+         fp = period * 1.e-7
+         dt = tc & 0x3f  # low six bits of tc represent data type
+         compressed = bool(tc & compressed_bit)
+
+         if dt == 5:
+             fid.seek(0, 2)  # Go to the end of the file
+             flen = fid.tell()  # get the position
+             # go back to the first byte following the header (not to the
+             # beginning of the file, which would decode the header as data)
+             fid.seek(12, 0)
+             if flen > 14 + by * nf:  # if file too long
+                 dt = 2  # change type to LPREFC
+                 compressed = True  # set compressed flag
+                 nf += 4  # frame count doesn't include
+                 # compression constants in this case
+
+         # 16 bit data for waveforms, IREFC and DISCRETE
+         if dt in (0, 5, 10):
+             n_col = by // 2
+             d = read_array(fid, '>i2', nf * n_col).reshape(nf, n_col)
+             if dt == 5:
+                 d = d / 32767  # scale IREFC
+         elif compressed:  # compressed data - first read scales
+             nf -= 4  # frame count includes compression constants
+             n_col = by // 2
+             scales = read_array(fid, '>f4', n_col).astype(numpy.float64)
+             biases = read_array(fid, '>f4', n_col).astype(numpy.float64)
+             d = read_array(fid, '>i2', n_col * nf).reshape(nf, n_col)
+             d = (d + biases) / scales
+         else:
+             n_col = by // 4
+             d = read_array(fid, '>f4', n_col * nf).reshape(nf, n_col)
+
+     t = kinds[min(dt, len(kinds) - 1)]
+
+     lbl = numpy.ones(numpy.shape(d)[0]).astype(bool)
+     if not label_file_name == "":
+         lbl = read_label(label_file_name, selected_label, frame_per_second)
+
+     d = d[lbl, :]
+
+     return d.astype(numpy.float32), fp, dt, tc, t
+
+
+ def read_htk_segment(input_file_name,
+                      start=0,
+                      stop=None):
+     """Read a segment from a stream in HTK format. Return the features in the
+     range start:stop
+     In case the start and stop cannot be reached, the first or last feature are copied
+     so that the length of the returned segment is always stop-start
+
+     :param input_file_name: name of the feature file to read from or binary
+         file-like object allowing to seek in the file
+     :param start: index of the first frame to read (start at zero)
+     :param stop: index of the frame following the segment to read;
+         None means the end of the file
+
+     :return: a sequence of features in a float32 ndarray of length stop-start
+     """
+     try:
+         fh = open(input_file_name, 'rb')
+     except TypeError:
+         # not a path: assume an already opened binary file-like object
+         fh = input_file_name
+     try:
+         fh.seek(0)
+         n_samples, _, sample_size, parm_kind = struct.unpack(">IIHH", fh.read(12))
+         pk = parm_kind & 0x3f
+         compressed = bool(parm_kind & _C)
+         if compressed:
+             # compressed files store one scale and one bias per coefficient
+             # (two bytes each in the frames) ahead of the data
+             n_coef = sample_size // 2
+             scale, bias = numpy.frombuffer(fh.read(4 * 2 * n_coef), '>f').reshape(2, n_coef)
+             n_samples -= 4
+         if stop is None:
+             stop = n_samples
+         s, e = max(0, start), min(n_samples, stop)
+         fh.seek(s * sample_size, 1)
+         dtype, _bytes = ('>h', 2) if compressed or pk in parms16bit else ('>f', 4)
+         # read() + frombuffer also works with in-memory file-like objects,
+         # which numpy.fromfile does not accept
+         m = numpy.frombuffer(fh.read((e - s) * sample_size), dtype).reshape(e - s, sample_size // _bytes)
+         if compressed:
+             m = (m + bias) / scale
+         if pk == IREFC:
+             # out-of-place division: the data is still made of integers here
+             m = m / 32767.0
+         if pk == WAVEFORM:
+             m = m.ravel()
+     finally:
+         if fh is not input_file_name:
+             fh.close()
+     if start != s or stop != e:  # repeat first or/and last frame as required
+         m = numpy.r_[numpy.repeat(m[[0]], s-start, axis=0), m, numpy.repeat(m[[-1]], stop-e, axis=0)]
+     return m.astype(numpy.float32)
+
+ def _add_dataset_header(fh,
+                         dataset_id,
+                         _min_val,
+                         _range,
+                         _header):
+     """
+     Write the compression header of a dataset in the HDF5 file.
+
+     The header values are normalised to [0, 1] with ``_min_val`` and
+     ``_range``, quantised on 16 bits and stored in ``<dataset_id>_header``;
+     ``_min_val`` and ``_range`` themselves are stored as float32 in
+     ``<dataset_id>_min_range``.
+
+     :param fh: handle on the HDF5 file (object providing ``create_dataset``)
+     :param dataset_id: identifier of the dataset the header belongs to
+     :param _min_val: minimum value of the data
+     :param _range: peak-to-peak range of the data
+     :param _header: array of header values to compress
+     """
+     # A constant dataset has a null range: every header value equals
+     # _min_val, so any non-zero divisor yields the expected zeros
+     divisor = _range if _range != 0 else 1.
+     _c_header = (_header - _min_val) / divisor
+     # numpy.clip returns a new array: the result has to be kept
+     _c_header = numpy.clip(_c_header, 0., 1.)
+     _c_header = (_c_header * 65535 + 0.499).astype(int)
+
+     fh.create_dataset(dataset_id + '_header',
+                       data=_c_header,
+                       maxshape=(None, None),
+                       compression="gzip",
+                       fletcher32=True)
+     fh.create_dataset(dataset_id + '_min_range',
+                       data=numpy.array([_min_val, _range]).astype('float32'),
+                       maxshape=(2,),
+                       compression="gzip",
+                       fletcher32=True)
+
+ def _quantise_band(values, low, high, levels, offset):
+     """Map values of [low, high] linearly onto the codes offset .. offset + levels."""
+     span = high - low
+     if span == 0:
+         # degenerate band: every value equals ``low``
+         return numpy.full(values.shape, offset, dtype=numpy.uint8)
+     codes = numpy.floor((values - low) / span * levels + 0.5)
+     # clip while still in floating point so that the cast is always in range
+     return (numpy.clip(codes, 0, levels) + offset).astype(numpy.uint8)
+
+
+ def _add_percentile_dataset(fh,
+                             dataset_id,
+                             data):
+     """
+     Create the dataset in the HDF5 file, write the data
+     compressed in uint8 format and the header compressed in
+     int format
+
+     For each column, the percentiles 0, 25, 75 and 100 delimit three bands
+     that are quantised on the codes 0-64, 64-192 and 192-255 respectively.
+
+     :param fh: handle on the HDF5 file (object providing ``create_dataset``)
+     :param dataset_id: identifier of the dataset to create
+     :param data: data to store, one frame per row (a vector is handled as
+         a single column)
+     """
+     _min_val = data.min()
+     # numpy.ptp works with every NumPy version, the ndarray.ptp method
+     # is not available any more in NumPy 2
+     _range = numpy.ptp(data)
+
+     if data.ndim == 1:
+         data = data[:, numpy.newaxis]
+
+     # First write the compression information in the dataset header
+     _header = numpy.zeros((data.shape[1], 4))
+
+     for j, p in enumerate([0, 25, 75, 100]):
+         try:
+             _header[:, j] = numpy.percentile(data, p, axis=0, method='lower')
+         except TypeError:
+             # older NumPy releases only know the ``interpolation`` keyword
+             _header[:, j] = numpy.percentile(data, p, axis=0, interpolation='lower')
+     _add_dataset_header(fh, dataset_id, _min_val, _range, _header)
+
+     # now write the compressed data
+     c_data = numpy.zeros(data.shape, dtype=numpy.uint8)
+     for i in range(data.shape[1]):
+         p0, p25, p75, p100 = _header[i]
+         column = data[:, i]
+         lower = column < p25
+         middle = (column >= p25) & (column < p75)
+         upper = column >= p75
+         c_data[lower, i] = _quantise_band(column[lower], p0, p25, 64, 0)
+         c_data[middle, i] = _quantise_band(column[middle], p25, p75, 128, 64)
+         c_data[upper, i] = _quantise_band(column[upper], p75, p100, 63, 192)
+
+     fh.create_dataset(dataset_id,
+                       data=c_data,
+                       maxshape=(None, None),
+                       compression="gzip",
+                       fletcher32=True)
+
+def _read_dataset(h5f, dataset_id):
+ data = h5f[dataset_id][()]
+ if data.ndim == 1:
+ data = data[:, numpy.newaxis]
+ return data
+
+def _read_segment(h5f, dataset_id, s, e):
+ data = h5f[dataset_id][s:e]
+ return data
+
+def _read_dataset_htk(h5f, dataset_id):
+ (A, B) = h5f[dataset_id + "comp"][()]
+ data = (h5f[dataset_id][()] + B) / A
+ if data.ndim == 1:
+ data = data[:, numpy.newaxis]
+ return data
+
+def _read_segment_htk(h5f, dataset_id, e, s):
+ (A, B) = h5f[dataset_id + "comp"][()]
+ data = (h5f[dataset_id][s:e, :] + B) / A
+ return data
+
def read_dataset_percentile(h5f, dataset_id):
    """
    Read a whole dataset stored with the percentile compression and
    convert it back to floating point values.

    :param h5f: HDF5 file handler to read from
    :param dataset_id: name of the dataset to read; the header is read from
        ``dataset_id + '_header'`` and ``dataset_id + '_min_range'``
    :return: the decoded values, same shape as the stored dataset
    """
    # rebuild the percentiles of each column (1.52590218966964e-05 = 1 / 65535)
    lowest, extent = h5f[dataset_id + "_min_range"][()]
    quantized = h5f[dataset_id + "_header"][()]
    bounds = numpy.full(quantized.shape, lowest)
    bounds += quantized * extent * 1.52590218966964e-05

    # one row per column of the stored data
    codes = h5f[dataset_id][()].T
    p0, p25, p75, p100 = (bounds[:, [k]] for k in range(4))

    # each code belongs to exactly one of the three segments
    first = (p0 + (p25 - p0) * codes * (1/64)) * (codes <= 64)
    second = (p25 + (p75 - p25) * (codes - 64) * (1/128)) * ((codes > 64) & (codes <= 192))
    third = (p75 + (p100 - p75) * (codes - 192) * (1/63)) * (codes > 192)
    return (first + second + third).T
+
+def _read_segment_percentile(h5f, dataset_id, s, e):
+ # read the header
+ (_min_val, _range) = h5f[dataset_id + "_min_range"][()]
+ c_header = h5f[dataset_id + "_header"][()]
+ _header = numpy.full(c_header.shape, _min_val)
+ _header += c_header * _range * 1.52590218966964e-05
+
+ c_data = h5f[dataset_id][()][s:e, :]
+ mat1 = (_header[:,[0]] + (_header[:,[1]] - _header[:,[0]]) * c_data.T * (1/64)) * (c_data.T <= 64)
+ mat2 = (_header[:,[1]] + (_header[:,[2]] - _header[:,[1]]) * (c_data.T - 64) * (1/128)) * ((c_data.T > 64) & (c_data.T<=192))
+ mat3 = (_header[:,[2]] + (_header[:,[3]] - _header[:,[2]]) * (c_data.T - 192) * (1/63)) * (c_data.T > 192)
+ return (mat1+mat2+mat3).T
+
+
+def _write_show(show,
+ fh,
+ cep, cep_mean, cep_std,
+ energy, energy_mean, energy_std,
+ fb, fb_mean, fb_std,
+ bnf, bnf_mean, bnf_std,
+ label):
+ if cep is not None:
+ fh.create_dataset(show + '/cep', data=cep.astype('float32'),
+ maxshape=(None, None),
+ compression="gzip",
+ fletcher32=True)
+ if cep_mean is not None:
+ fh.create_dataset(show + '/cep_mean', data=cep_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if cep_std is not None:
+ fh.create_dataset(show + '/cep_std', data=cep_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if energy is not None:
+ energy = energy.squeeze()
+ fh.create_dataset(show + '/energy', data=energy.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if energy_mean is not None:
+ fh.create_dataset(show + '/energy_mean', data=energy_mean)
+ if energy_std is not None:
+ fh.create_dataset(show + '/energy_std', data=energy_std)
+ if fb is not None:
+ fh.create_dataset(show + '/fb', data=fb.astype('float32'),
+ maxshape=(None, None),
+ compression="gzip",
+ fletcher32=True)
+ if fb_mean is not None:
+ fh.create_dataset(show + '/fb_mean', data=fb_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if fb_std is not None:
+ fh.create_dataset(show + '/fb_std', data=fb_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if bnf is not None:
+ fh.create_dataset(show + '/bnf', data=bnf.astype('float32'),
+ maxshape=(None, None),
+ compression="gzip",
+ fletcher32=True)
+ if bnf_mean is not None:
+ fh.create_dataset(show + '/bnf_mean', data=bnf_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if bnf_std is not None:
+ fh.create_dataset(show + '/bnf_std', data=bnf_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if label is not None and not show + "/vad" in fh:
+ fh.create_dataset(show + '/' + "vad", data=label.astype('int8'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+def _write_show_htk(show,
+ fh,
+ cep, cep_mean, cep_std,
+ energy, energy_mean, energy_std,
+ fb, fb_mean, fb_std,
+ bnf, bnf_mean, bnf_std,
+ label):
+ if cep is not None:
+ A_cep = 2 * 32767. / (cep.max() - cep.min())
+ B_cep = (cep.max() + cep.min()) * 32767. / (cep.max() - cep.min())
+ fh.create_dataset(show + '/cep_comp', data=numpy.array([A_cep, B_cep]).astype('float32'),
+ maxshape=(2,),
+ compression="gzip",
+ fletcher32=True)
+ fh.create_dataset(show + '/cep', data=(A_cep*cep - B_cep).astype("short"),
+ maxshape=(None, None),
+ compression="gzip",
+ fletcher32=True)
+ if energy is not None:
+ A_energy = 2 * 32767. / (energy.max() - energy.min())
+ B_energy = (energy.max() + energy.min()) * 32767. / (energy.max() - energy.min())
+ fh.create_dataset(show + '/energy_comp', data=numpy.array([A_energy, B_energy]).astype('float32'),
+ maxshape=(2,),
+ compression="gzip",
+ fletcher32=True)
+ fh.create_dataset(show + '/energy', data=(A_energy * energy - B_energy).astype("short"),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if fb is not None:
+ A_fb = 2 * 32767. / (fb.max() - fb.min())
+ B_fb = (fb.max() + fb.min()) * 32767. / (fb.max() - fb.min())
+ fh.create_dataset(show + '/fb_comp', data=numpy.array([A_fb, B_fb]).astype('float32'),
+ maxshape=(2,),
+ compression="gzip",
+ fletcher32=True)
+ fh.create_dataset(show + '/fb', data=(A_fb * fb - B_fb).astype("short"),
+ maxshape=(None, None),
+ compression="gzip",
+ fletcher32=True)
+ if bnf is not None:
+ A_bnf = 2 * 32767. / (bnf.max() - bnf.min())
+ B_bnf = (bnf.max() + bnf.min()) * 32767. / (bnf.max() - bnf.min())
+ fh.create_dataset(show + '/bnf_comp', data=numpy.array([A_bnf, B_bnf]).astype('float32'),
+ maxshape=(2,),
+ compression="gzip",
+ fletcher32=True)
+ fh.create_dataset(show + '/bnf', data=(A_bnf * bnf - B_bnf).astype("short"),
+ maxshape=(None, None),
+ compression="gzip",
+ fletcher32=True)
+ if energy_mean is not None:
+ fh.create_dataset(show + '/energy_mean', data=energy_mean)
+ if energy_std is not None:
+ fh.create_dataset(show + '/energy_std', data=energy_std)
+ if cep_mean is not None:
+ fh.create_dataset(show + '/cep_mean', data=cep_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if cep_std is not None:
+ fh.create_dataset(show + '/cep_std', data=cep_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if fb_mean is not None:
+ fh.create_dataset(show + '/fb_mean', data=fb_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if fb_std is not None:
+ fh.create_dataset(show + '/fb_std', data=fb_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if bnf_mean is not None:
+ fh.create_dataset(show + '/bnf_mean', data=bnf_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if bnf_std is not None:
+ fh.create_dataset(show + '/bnf_std', data=bnf_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+ if label is not None and not show + "/vad" in fh:
+ fh.create_dataset(show + '/' + "vad", data=label.astype('int8'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+def _write_show_percentile(show,
+ fh,
+ cep, cep_mean, cep_std,
+ energy, energy_mean, energy_std,
+ fb, fb_mean, fb_std,
+ bnf, bnf_mean, bnf_std,
+ label):
+ if cep is not None:
+ _add_percentile_dataset(fh, show + '/cep', cep)
+
+ if energy is not None:
+ _add_percentile_dataset(fh, show + '/energy', energy)
+
+ if fb is not None:
+ _add_percentile_dataset(fh, show + '/fb', fb)
+
+ if bnf is not None:
+ _add_percentile_dataset(fh, show + '/bnf', bnf)
+
+ if cep_mean is not None:
+ fh.create_dataset(show + '/cep_mean', data=cep_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+ if cep_std is not None:
+ fh.create_dataset(show + '/cep_std', data=cep_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+ if energy_mean is not None:
+ fh.create_dataset(show + '/energy_mean', data=energy_mean)
+
+ if energy_std is not None:
+ fh.create_dataset(show + '/energy_std', data=energy_std)
+
+ if fb_mean is not None:
+ fh.create_dataset(show + '/fb_mean', data=fb_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if fb_std is not None:
+ fh.create_dataset(show + '/fb_std', data=fb_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if bnf_mean is not None:
+ fh.create_dataset(show + '/bnf_mean', data=bnf_mean.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+ if bnf_std is not None:
+ fh.create_dataset(show + '/bnf_std', data=bnf_std.astype('float32'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+ if label is not None and not show + "/vad" in fh:
+ fh.create_dataset(show + '/' + "vad", data=label.astype('int8'),
+ maxshape=(None,),
+ compression="gzip",
+ fletcher32=True)
+
+
+
def write_hdf5(show,
               fh,
               cep, cep_mean, cep_std,
               energy, energy_mean, energy_std,
               fb, fb_mean, fb_std,
               bnf, bnf_mean, bnf_std,
               label,
               compression='percentile'):
    """
    Write the features of a show in a HDF5 file.

    The type of compression is recorded in the ``compression`` dataset of
    the file the first time a show is written; all shows of a file must use
    the same one.

    :param show: identifier of the show to write
    :param fh: HDF5 file handler
    :param cep: cepstral coefficients to store
    :param cep_mean: pre-computed mean of the cepstral coefficient
    :param cep_std: pre-computed standard deviation of the cepstral coefficient
    :param energy: energy coefficients to store
    :param energy_mean: pre-computed mean of the energy
    :param energy_std: pre-computed standard deviation of the energy
    :param fb: filter-banks coefficients to store
    :param fb_mean: pre-computed mean of the filter bank coefficient
    :param fb_std: pre-computed standard deviation of the filter bank coefficient
    :param bnf: bottle-neck features to store
    :param bnf_mean: pre-computed mean of the bottleneck features
    :param bnf_std: pre-computed standard deviation of the bottleneck features
    :param label: vad labels to store
    :param compression: type of compression, one of 'none', 'htk' or
        'percentile' (default)
    :raise KeyError: if ``compression`` is not a known type
    :raise ValueError: if the file already uses another type of compression
    :return:
    """
    # write the type of compression: could be:
    #    0 = no compression
    #    1 HTK'style compression
    #    2 compression percentile
    compression_type = {'none': 0, 'htk': 1, 'percentile': 2}
    requested = compression_type[compression]
    if "compression" not in fh:
        fh.create_dataset('compression', data=requested)
    elif fh['compression'][()] != requested:
        # Explicit check (an assert would be stripped by `python -O`):
        # mixing compression types would make the file unreadable
        raise ValueError("HDF5 file already uses compression type {}, cannot write '{}'".format(
            fh['compression'][()], compression))

    if compression == 'none':
        writer = _write_show
    elif compression == 'htk':
        writer = _write_show_htk
    else:
        # Default: use percentile compression
        writer = _write_show_percentile

    writer(show,
           fh,
           cep, cep_mean, cep_std,
           energy, energy_mean, energy_std,
           fb, fb_mean, fb_std,
           bnf, bnf_mean, bnf_std,
           label)
+
def read_hdf5(h5f, show, dataset_list=("cep", "fb", "energy", "vad", "bnf")):
    """
    Read the features of a show from a HDF5 file and stack them
    column-wise, always in the order energy, cep, fb, bnf.

    :param h5f: HDF5 file handler to read from
    :param show: identifier of the show to read
    :param dataset_list: list of datasets to read and concatenate
    :raise Exception: if the show or one of the requested features is
        missing from the file
    :return: a tuple (features as float32, vad labels as booleans);
        the labels are None if "vad" is not requested and all True if
        the file has no vad for this show
    """
    compression_type = {0: 'none', 1: 'htk', 2: 'percentile'}
    if "compression" not in h5f:
        compression = 'none'
        print("Warning, default feature storage mode is now using compression")
    else:
        compression = compression_type[int(h5f["compression"][()])]

    if show not in h5f:
        raise Exception('show {} is not in the HDF5 file'.format(show))

    # initialize the list of features to concatenate
    feat = []

    for name in ("energy", "cep", "fb", "bnf"):
        if name not in dataset_list:
            continue
        dataset_id = show + '/' + name
        if dataset_id not in h5f:
            # Report the feature that is actually missing
            raise Exception('{} is not in the HDF5 file'.format(name))
        if compression == 'none':
            feat.append(_read_dataset(h5f, dataset_id))
        elif compression == 'htk':
            feat.append(_read_dataset_htk(h5f, dataset_id))
        else:
            feat.append(read_dataset_percentile(h5f, dataset_id))

    feat = numpy.hstack(feat)

    label = None
    if "vad" in dataset_list:
        vad_id = show + '/vad'
        if vad_id in h5f:
            label = h5f[vad_id][()].astype('bool').squeeze()
        else:
            warnings.warn("Warning...........no VAD in this HDF5 file")
            label = numpy.ones(feat.shape[0], dtype='bool')

    return feat.astype(numpy.float32), label
+
+
+
+def _rms_energy(x):
+ return 10*numpy.log10((1e-12 + x.dot(x))/len(x))
+
def _add_noise(signal, noise_file_name, snr, sample_rate):
    """
    Add noise to a signal at a given signal to noise ratio.

    :param signal: vector of samples to degrade
    :param noise_file_name: the noise, either as a numpy.ndarray of samples
        or as the name of an audio file read with ``read_audio``
    :param snr: target signal to noise ratio in dB
    :param sample_rate: sampling frequency passed to ``read_audio``
    :return: the noisy signal with its mean removed and divided by its
        standard deviation
    """
    # Get the samples of the noise
    if isinstance(noise_file_name, numpy.ndarray):
        noise = noise_file_name
    else:
        noise, _ = read_audio(noise_file_name, sample_rate)

    n_samples = len(signal)

    # Repeat the noise until it is longer than the signal
    if len(noise) < n_samples:
        noise = numpy.tile(noise, n_samples // len(noise) + 1)

    # Pick a random excerpt of the noise with the length of the signal
    if len(noise) != n_samples:
        offset = numpy.random.randint(0, len(noise) - n_samples)
        noise = noise[offset:offset + n_samples]

    # Level of both signals in dB
    noise_db = _rms_energy(noise)
    signal_db = _rms_energy(signal)

    # Bring the noise to the level giving the requested SNR
    target_db = signal_db - snr
    scaled_noise = 10 ** (target_db / 20) * noise / 10 ** (noise_db / 20)
    noisy = signal + scaled_noise

    return (noisy - noisy.mean()) / noisy.std()
+
def bin_interp(upcount, lwcount, upthr, lwthr, margin, tol=0.1):
    """
    Search by successive halving between the points (upcount, upthr) and
    (lwcount, lwthr) for the count such that ``count - thr - margin`` is
    within ``tol`` of zero.

    :param upcount: count at the upper point
    :param lwcount: count at the lower point
    :param upthr: threshold at the upper point
    :param lwthr: threshold at the lower point
    :param margin: targeted difference between count and threshold
    :param tol: tolerance on the difference; it is increased by 10% at each
        iteration once 20 iterations have been done
    :return: the count found
    """
    # One of the two ends may already be within the tolerance
    if abs(upcount - upthr - margin) < tol:
        return upcount
    if abs(lwcount - lwthr - margin) < tol:
        return lwcount

    # Start from the middle point
    midcount, midthr = (upcount + lwcount)/2, (upthr + lwthr)/2
    gap = midcount - midthr - margin
    iteration = 1
    while abs(gap) > tol:
        iteration += 1
        if iteration > 20:
            tol *= 1.1
        if gap > tol:
            # move half way towards the upper point
            midcount, midthr = (upcount + midcount)/2, (upthr + midthr)/2
        elif gap < -tol:
            # move half way towards the lower point
            midcount, midthr = (lwcount + midcount)/2, (lwthr + midthr)/2
        gap = midcount - midthr - margin
    return midcount
+
def asl_meter(x, fs, nbits=16):
    '''Measure the Active Speech Level (ASL) of x following ITU-T P.56.
    If x is integer, it will be scaled to (-1, 1) according to nbits.

    :param x: vector of samples (numpy.ndarray)
    :param fs: sampling frequency in Hz
    :param nbits: number of bits of the samples; nbits - 1 thresholds are used
    :return: the active speech level in dB, -100 if it cannot be measured
    '''

    if numpy.issubdtype(x.dtype, numpy.integer):
        x = x / 2**(nbits-1)

    # Constants
    T = 0.03                # Time constant of smoothing in seconds
    g = numpy.exp(-1/(T*fs))
    H = 0.20                # Time of handover in seconds
    I = int(numpy.ceil(H*fs))
    M = 15.9                # Margin between threshold and ASL in dB

    n_levels = nbits - 1
    a = numpy.zeros(n_levels)                       # Activity count
    c = 0.5**numpy.arange(nbits-1, 0, step=-1)      # Threshold level
    h = numpy.ones(n_levels)*I                      # Hangover count
    p = 0
    q = 0
    asl = -100

    # Energy of the signal (summed sample by sample, as the activity below)
    sq = sum(x**2)
    c_dB = 20*numpy.log10(c)

    for i in range(len(x)):
        # two cascaded first order smoothings of the rectified signal
        p = g * p + (1-g) * abs(x[i])
        q = g * q + (1-g) * p

        for j in range(n_levels):
            if q >= c[j]:
                # envelope above the threshold: active, hangover restarted
                a[j] += 1
                h[j] = 0
            elif h[j] < I:
                # below the threshold but still within the hangover time
                a[j] += 1
                h[j] += 1

    # Level of the signal over the active time of each threshold
    a_dB = -100 * numpy.ones(n_levels)
    for i in range(n_levels):
        if a[i] != 0:
            a_dB[i] = 10*numpy.log10(sq/a[i])

    delta = a_dB - c_dB
    idx = numpy.where(delta <= M)[0]

    if len(idx) != 0:
        idx = idx[0]
        # NOTE(review): `idx > 1` skips the interpolation for idx == 1 although
        # idx - 1 is a valid index; possibly a leftover of 1-based indexing,
        # to be confirmed before changing it.
        if idx > 1:
            asl = bin_interp(a_dB[idx], a_dB[idx-1], c_dB[idx], c_dB[idx-1], M)
        else:
            asl = a_dB[idx]

    return asl
+
def _add_reverb(signal, reverb_file_name, sample_rate, reverb_level=-26.0):
    '''Adds reverb (convolutive noise) to a speech signal.
    The output speech level is normalized to reverb_level.

    :param signal: vector of samples to degrade
    :param reverb_file_name: the filter to apply, either as a numpy.ndarray
        or as the name of an audio file read with ``read_audio``
    :param sample_rate: sampling frequency of the signal
    :param reverb_level: level in dB to which the filtered signal is set,
        its level being measured with ``asl_meter``
    :return: the filtered signal with its mean removed and divided by its
        standard deviation
    '''
    if isinstance(reverb_file_name, numpy.ndarray):
        impulse_response = reverb_file_name
    else:
        impulse_response, _ = read_audio(reverb_file_name, sample_rate)

    # the reverberation is applied as a FIR filter
    reverberated = lfilter(impulse_response, 1, signal)
    measured_level = asl_meter(reverberated, sample_rate)
    reverberated = reverberated/10**(measured_level/20) * 10**(reverb_level/20)

    return (reverberated - reverberated.mean()) / reverberated.std()
+
+
def degrade_audio(input_path,
                  input_extension,
                  output_path,
                  output_extension,
                  input_filename,
                  output_filename,
                  sampling_frequency=16000,
                  noise_file_name=None,
                  snr=-10,
                  reverb_file_name=None,
                  reverb_level=-26.):
    """
    Read an audio file, add noise and/or reverberation to each of its
    channels and write the result in a wav file.

    :param input_path: not used by this function
    :param input_extension: not used by this function
    :param output_path: not used by this function
    :param output_extension: not used by this function
    :param input_filename: name of the audio file to read
    :param output_filename: name of the wav file to write
    :param sampling_frequency: sampling frequency passed to the reader and
        to the degradation functions
    :param noise_file_name: noise to add, no noise is added if None
    :param snr: signal to noise ratio in dB used when adding the noise
    :param reverb_file_name: reverberation to apply, none is applied if None
    :param reverb_level: level in dB of the signal after reverberation
    :return:
    """
    # Open audio file, get the signal and possibly the sampling frequency
    signal, sample_rate = read_audio(input_filename, sampling_frequency)
    if signal.ndim == 1:
        # a mono signal is processed as a single channel
        signal = signal[:, numpy.newaxis]

    with_noise = noise_file_name is not None
    with_reverb = reverb_file_name is not None

    # The channels are degraded in place, noise first and then reverberation
    for channel in range(signal.shape[1]):
        if with_noise:
            signal[:, channel] = _add_noise(signal[:, channel], noise_file_name, snr, sampling_frequency)
        if with_reverb:
            signal[:, channel] = _add_reverb(signal[:, channel], reverb_file_name, sampling_frequency, reverb_level)

    write_wav(signal, output_filename, sample_rate)
+
+
@process_parallel_lists
def augment_list(input_path,
                 input_extension,
                 output_path,
                 output_extension,
                 sampling_frequency,
                 show_list,
                 channel_list,
                 audio_file_list=None,
                 feature_file_list=None,
                 noise_file_list=None,
                 snr_list=None,
                 reverb_file_list=None,
                 reverb_levels=None,
                 num_thread=1):
    """
    Degrade a list of audio files by adding noise and/or reverberation
    and write the degraded signals to disk.
    The lists are processed in parallel by the `process_parallel_lists`
    decorator.

    :param input_path: forwarded to `degrade_audio`
    :param input_extension: forwarded to `degrade_audio`
    :param output_path: forwarded to `degrade_audio`
    :param output_extension: forwarded to `degrade_audio`
    :param sampling_frequency: sampling frequency of the audio files
    :param show_list: list of IDs of the show to process
    :param channel_list: list of channel indices corresponding to each show
    :param audio_file_list: list of input audio files
    :param feature_file_list: list of output audio files
    :param noise_file_list: list of noises to add, no noise is added if None
    :param snr_list: list of signal to noise ratios in dB, 5 dB if None
    :param reverb_file_list: list of reverberations to apply, none if None
    :param reverb_levels: list of levels in dB after reverberation, -26 dB if None
    :param num_thread: number of parallel process to run
    :return:
    """

    # get the length of the longest list
    max_length = max(len(l) for l in (show_list, channel_list, audio_file_list, feature_file_list)
                     if l is not None)

    def _placeholders():
        # list of None used for the arguments that are not given
        return numpy.empty(int(max_length), dtype='|O')

    if show_list is None:
        show_list = _placeholders()
    if audio_file_list is None:
        audio_file_list = _placeholders()
    if feature_file_list is None:
        feature_file_list = _placeholders()
    if noise_file_list is None:
        noise_file_list = _placeholders()
        snr_list = _placeholders()
    elif snr_list is None:
        snr_list = numpy.full(int(max_length), 5.)
    if reverb_file_list is None:
        reverb_file_list = _placeholders()
        reverb_levels = _placeholders()
    elif reverb_levels is None:
        reverb_levels = numpy.full(int(max_length), -26.)

    # show and channel are part of the zip but are not used by degrade_audio
    for show, channel, input_file, output_file, noise_file, snr, reverb_file, reverb_level in zip(
            show_list,
            channel_list,
            audio_file_list,
            feature_file_list,
            noise_file_list,
            snr_list,
            reverb_file_list,
            reverb_levels):
        # Keyword arguments: passing `show` as well gave degrade_audio one
        # positional argument too many and shifted all the following ones
        degrade_audio(input_path, input_extension, output_path, output_extension,
                      input_filename=input_file,
                      output_filename=output_file,
                      sampling_frequency=sampling_frequency,
                      noise_file_name=noise_file,
                      snr=snr,
                      reverb_file_name=reverb_file,
                      reverb_level=reverb_level)
+
+
+
diff --git a/sidekit/sidekit/frontend/normfeat.py b/sidekit/sidekit/frontend/normfeat.py
new file mode 100755
index 0000000..40dfcd6
--- /dev/null
+++ b/sidekit/sidekit/frontend/normfeat.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+:mod:`frontend` provides methods to process an audio signal in order to extract
+useful parameters for speaker verification.
+"""
+import numpy
+import pandas
+import scipy.stats as stats
+from scipy.signal import lfilter
+
+
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
def rasta_filt(x):
    """Apply RASTA filtering to the input signal.

    :param x: the input audio signal to filter.
        cols of x = critical bands, rows of x = frame
        same for y but after filtering
        default filter is single pole at 0.94

    :return: the filtered signal, the first four frames are set to zero
    """
    bands = x.T
    fir_taps = numpy.arange(.2, -.3, -.1)
    iir_taps = numpy.array([1, -0.94])

    filtered = numpy.zeros(bands.shape)
    states = numpy.zeros((bands.shape[0], 4))
    for row in range(filtered.shape[0]):
        # Initialize the state. This avoids a big spike at the beginning
        # resulting from the dc offset level in each band.
        # Only the FIR part is run on the first four frames; its output is
        # not kept, only the state of the filter is.
        _, states[row, :4] = lfilter(fir_taps, 1, bands[row, :4], axis=-1, zi=[0, 0, 0, 0])

        # Apply the full filter to the rest of the signal
        filtered[row, 4:] = lfilter(fir_taps, iir_taps, bands[row, 4:], axis=-1, zi=states[row, :])[0]

    return filtered.T
+
+
def cms(features, label=None, global_mean=None):
    """Performs cepstral mean subtraction, the features are modified in place

    :param features: a feature stream, one frame per row and one
        coefficient per column
    :param label: a logical vector selecting the frames used to compute the mean
    :param global_mean: pre-computed mean to use for feature normalization if given

    :return: None, ``features`` is modified in place
    """
    # If no label file as input: all speech are speech
    if label is None:
        label = numpy.ones(features.shape[0]).astype(bool)

    if global_mean is not None:
        mu = global_mean
    elif label.sum() == 0:
        # No frame selected: the mean of an empty selection would be NaN,
        # leave the features unchanged
        mu = numpy.zeros((features.shape[1]))
    else:
        mu = numpy.mean(features[label, :], axis=0)
    features -= mu
+
+
def cmvn(features, label=None, global_mean=None, global_std=None):
    """Performs mean and variance normalization, the features are modified
    in place

    :param features: a feature stream, one frame per row and one
        coefficient per column
    :param global_mean: pre-computed mean to use for feature normalization if given
    :param global_std: pre-computed standard deviation to use for feature normalization if given
    :param label: a logical vector selecting the frames used to compute
        the mean and the standard deviation

    :return: None, ``features`` is modified in place; the features are left
        unchanged when no frame is selected and the global statistics are
        not both given
    """
    # If no label file as input: all speech are speech
    if label is None:
        label = numpy.ones(features.shape[0]).astype(bool)

    use_global = global_mean is not None and global_std is not None
    if use_global:
        mu, stdev = global_mean, global_std
    elif label.sum() != 0:
        selected = features[label, :]
        mu = numpy.mean(selected, axis=0)
        stdev = numpy.std(selected, axis=0)
    else:
        # nothing to estimate the statistics from
        return

    features -= mu
    features /= stdev
+
+
def stg(features, label=None, win=301):
    """Performs feature warping on a sliding window, the selected frames of
    ``features`` are modified in place

    :param features: a feature stream, one frame per row and one
        coefficient per column
    :param label: label of selected frames to compute the Short Term Gaussianization, by default, all frames are used
    :param win: size of the frame window to consider, must be an odd number to get a symmetric context on left and right
    :return: None, ``features`` is modified in place
    """

    # If no label file as input: all speech are speech
    if label is None:
        label = numpy.ones(features.shape[0]).astype(bool)
    speech_features = features[label, :]

    if win % 2 != 1:
        raise Exception('Sliding window should have an odd length')

    # one feature per line
    nframes, _ = numpy.shape(speech_features)

    # If the number of frames is not enough for one window, a single window
    # covering all the frames is used
    padded = False
    if nframes < win:
        # the window has to be odd: duplicate the last frame if needed
        if nframes % 2 != 1:
            nframes += 1
            padded = True
            speech_features = numpy.concatenate((speech_features, [speech_features[-1, ]]))
        win = nframes

    # number of frames on each side of the center of the window
    half = (win - 1) // 2

    # create the output feature stream
    stg_features = numpy.zeros(numpy.shape(speech_features))

    # First window: the frames before its center are warped according to
    # their rank in this window
    ranks = numpy.argsort(numpy.argsort(speech_features[:win, ], axis=0), axis=0)
    stg_features[:half, :] = stats.norm.ppf((ranks[:half] + 0.5) / win, 0, 1)

    # Following windows: only the center frame is warped
    for m in range(half, nframes - half):
        window = speech_features[m - half:m + half + 1, :]
        rank = numpy.sum(window < window[half], axis=0) + 1
        stg_features[m, :] = stats.norm.ppf((rank - 0.5) / win, 0, 1)

    # Last window: the frames after its center are warped according to
    # their rank in this window
    ranks = numpy.argsort(numpy.argsort(speech_features[nframes - win:nframes, ], axis=0), axis=0)
    tail = (ranks[(win + 1) // 2: win, :] + 0.5) / win
    stg_features[nframes - half:nframes, ] = stats.norm.ppf(tail, 0, 1)

    # remove the duplicated frame before writing the result back
    if padded:
        stg_features = stg_features[:-1]
    features[label, :] = stg_features
+
+
def cep_sliding_norm(features, win=301, label=None, center=True, reduce=False):
    """
    Performs a cepstral mean subtraction and standard deviation normalization
    in a sliding window. MFCC is modified in place.

    When the number of selected frames does not exceed the size of the
    window, the statistics are computed over all the selected frames
    (``cmvn`` if ``reduce`` is set, ``cms`` otherwise).

    :param features: the MFCC, a numpy array
    :param win: the size of the sliding windows
    :param label: vad label if available
    :param center: performs mean subtraction
    :param reduce: performs standard deviation division

    """
    if label is None:
        label = numpy.ones(features.shape[0]).astype(bool)

    if numpy.sum(label) <= win:
        if reduce:
            cmvn(features, label)
        else:
            cms(features, label)
    else:
        d_win = win // 2

        df = pandas.DataFrame(features[label, :])
        r = df.rolling(window=win, center=True)
        # Work on copies: the arrays are modified below and the one returned
        # by `.values` may be read-only (pandas Copy-on-Write)
        mean = numpy.array(r.mean().values)
        std = numpy.array(r.std().values)

        # The window is not complete at both ends: reuse the statistics of
        # the first and of the last complete windows
        mean[0:d_win, :] = mean[d_win, :]
        mean[-d_win:, :] = mean[-d_win-1, :]

        std[0:d_win, :] = std[d_win, :]
        std[-d_win:, :] = std[-d_win-1, :]

        if center:
            features[label, :] -= mean
            if reduce:
                features[label, :] /= std
diff --git a/sidekit/sidekit/frontend/vad.py b/sidekit/sidekit/frontend/vad.py
new file mode 100755
index 0000000..02e0c93
--- /dev/null
+++ b/sidekit/sidekit/frontend/vad.py
@@ -0,0 +1,428 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+:mod:`frontend` provides methods to process an audio signal in order to extract
+useful parameters for speaker verification.
+"""
+import copy
+import logging
+import numpy
+from scipy.fftpack import fft, ifft
+from scipy import ndimage
+
+
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def pre_emphasis(input_sig, pre):
+    """Apply a first-order pre-emphasis filter along the last axis.
+
+    Each sample has ``pre`` times the previous sample subtracted from it; the
+    first sample uses itself as its predecessor.
+
+    :param input_sig: the input signal to pre-emphasize, a numpy array
+    :param pre: value that defines the pre-emphasis filter.
+
+    :return: the filtered signal, with the same shape as ``input_sig``
+    """
+    # Signal delayed by one sample, with the first sample repeated
+    first_sample = input_sig[..., :1]
+    all_but_last = input_sig[..., :-1]
+    delayed = numpy.concatenate((first_sample, all_but_last), axis=-1)
+    return input_sig - delayed * pre
+
+
+def segment_axis(a, length, overlap=0, axis=None, end='cut', endvalue=0):
+    """Generate a new array that chops the given array along the given axis
+    into overlapping frames.
+
+    This method has been implemented by Anne Archibald,
+    as part of the talk box toolkit
+    example::
+
+        segment_axis(arange(10), 4, 2)
+        array([[0, 1, 2, 3],
+               [2, 3, 4, 5],
+               [4, 5, 6, 7],
+               [6, 7, 8, 9]])
+
+    :param a: the array to segment
+    :param length: the length of each frame
+    :param overlap: the number of array elements by which the frames should overlap
+    :param axis: the axis to operate on; if None, act on the flattened array.
+        Negative values count from the last axis.
+    :param end: what to do with the last frame, if the array is not evenly
+            divisible into pieces. Options are:
+            - 'cut'   Simply discard the extra values
+            - 'wrap'  Copy values from the beginning of the array
+            - 'pad'   Pad with a constant value
+
+    :param endvalue: the value to use for end='pad'
+
+    :return: a ndarray in which ``axis`` is replaced by two axes
+        (frame index, position in frame)
+
+    :raise ValueError: if ``overlap``/``length`` are inconsistent, if ``end``
+        is unknown while the array needs trimming or extending, or if 'cut'
+        leaves no data
+
+    The array is not copied unless necessary (either because it is unevenly
+    strided and being flattened or because end is set to 'pad' or 'wrap').
+    The result is a strided view, so overlapping frames share memory.
+    """
+    # Local import: keeps this block self-contained
+    from numpy.lib.stride_tricks import as_strided
+
+    if axis is None:
+        a = numpy.ravel(a)  # may copy
+        axis = 0
+    elif axis < 0:
+        # Normalize so that the shape/stride slicing below stays valid
+        axis += a.ndim
+
+    l = a.shape[axis]
+
+    if overlap >= length:
+        raise ValueError("frames cannot overlap by more than 100%")
+    if overlap < 0 or length <= 0:
+        raise ValueError("overlap must be nonnegative and length must be positive")
+
+    step = length - overlap
+
+    if l < length or (l - length) % step:
+        # The axis does not hold a whole number of frames: trim or extend it
+        if l > length:
+            roundup = length + (1 + (l - length) // step) * step
+            rounddown = length + ((l - length) // step) * step
+        else:
+            roundup = length
+            rounddown = 0
+        assert rounddown < l < roundup
+        assert roundup == rounddown + step or (roundup == length and rounddown == 0)
+        a = a.swapaxes(-1, axis)
+
+        if end == 'cut':
+            a = a[..., :rounddown]
+        elif end in ['pad', 'wrap']:  # copying will be necessary
+            s = list(a.shape)
+            s[-1] = roundup
+            b = numpy.empty(s, dtype=a.dtype)
+            b[..., :l] = a
+            if end == 'pad':
+                b[..., l:] = endvalue
+            else:
+                b[..., l:] = a[..., :roundup - l]
+            a = b
+        else:
+            raise ValueError("end must be 'cut', 'pad' or 'wrap'")
+
+        a = a.swapaxes(-1, axis)
+        # Refresh the length along the working axis for every mode; reading
+        # shape[0] (or keeping the old value) is wrong for axis != 0 and for
+        # 'pad'/'wrap'
+        l = a.shape[axis]
+
+    if l == 0:
+        raise ValueError("Not enough data points to segment array " +
+                         "in 'cut' mode; try 'pad' or 'wrap'")
+    assert l >= length
+    assert (l - length) % step == 0
+    n = 1 + (l - length) // step
+    s = a.strides[axis]
+    new_shape = a.shape[:axis] + (n, length) + a.shape[axis + 1:]
+    new_strides = a.strides[:axis] + (step * s, s) + a.strides[axis + 1:]
+
+    # as_strided builds the framed view from the array's own strides, so it
+    # also works on non-contiguous inputs without a copy
+    return as_strided(a, shape=new_shape, strides=new_strides)
+
+
+def speech_enhancement(X, Gain, NN):
+    """Reduce the noise of an audio signal, frame by frame, in the spectral domain.
+
+    The signal is read in frames of ``FrameSize`` samples shifted by
+    ``FrameSize / NN`` samples. A first pass over at most 200 frames
+    initializes a minimum-tracking estimate of the noise magnitude spectrum.
+    A second pass attenuates the magnitude of every frame according to that
+    noise estimate (scaled by ``Gain``), keeps the original phase, and
+    rebuilds the signal by overlap-add.
+
+    Usage: SpeechENhance(wavefilename, Gain, Noise_floor)
+
+    :param X: input audio signal, indexed along its first axis
+    :param Gain: default value is 0.9, suggestion range 0.6 to 1.4,
+            higher value means more subtraction or noise reduction
+    :param NN: ratio between the frame size and the frame shift
+        (``FrameShift = int(FrameSize / NN)``)
+
+    :return: ``X`` itself when it holds fewer than 512 samples, otherwise a
+        new array with the shape of ``X`` containing the processed signal.
+
+    Copyright 2014 Sun Han Wu and Anthony Larcher
+    """
+    if X.shape[0] < 512:  # too short to process; TODO raise an exception instead
+        return X
+
+    num1 = 40  # sets the period of the noise-buffer rotation in the main loop
+    Alpha = 0.75  # smoothing factor between the previous output and the current estimate
+    FrameSize = 32 * 2  # samples per analysis frame
+    FrameShift = int(FrameSize / NN)  # samples between two consecutive frames
+    nfft = FrameSize  # FFT length
+    Fmax = int(numpy.floor(nfft / 2) + 1)  # number of non-redundant frequency bins
+    # Hamming window scaled by 1.08
+    Hamm = 1.08 * (0.54 - 0.46 * numpy.cos(2 * numpy.pi * numpy.arange(FrameSize) / (FrameSize - 1)))
+    y0 = numpy.zeros(FrameSize - FrameShift)  # overlap-add tail carried from the previous frame
+
+    Eabsn = numpy.zeros(Fmax)
+    Eta1 = Eabsn
+
+    ###################################################################
+    # Initial parameters for the minimum-tracking noise estimate
+    mb = numpy.ones((1 + FrameSize // 2, 4)) * FrameSize / 2  # four buffers of per-bin minima
+    im = 0  # frame counter used to rotate the buffers
+    Beta1 = 0.9024  # recursive smoothing factor of the magnitude spectrum
+    pxn = numpy.zeros(1 + FrameSize // 2)  # smoothed magnitude spectrum
+
+    ###################################################################
+    old_absx = Eabsn
+    x = numpy.zeros(FrameSize)
+    # The first FrameShift samples enter at the end of the analysis buffer
+    x[FrameSize - FrameShift:FrameSize] = X[
+        numpy.arange(numpy.min((int(FrameShift), X.shape[0])))]
+
+    # NOTE(review): x was just allocated with FrameSize samples, so this guard
+    # looks unreachable
+    if x.shape[0] < FrameSize:
+        EOF = 1
+        return X
+
+    EOF = 0
+    Frame = 0
+
+    ###################################################################
+    # First pass: pre-estimate the noise on at most 200 frames
+    for i in range(200):
+        Frame += 1
+        fftn = fft(x * Hamm)  # spectrum of the windowed frame
+        absn = numpy.abs(fftn[0:Fmax])  # its magnitude
+
+        # Recursive smoothing of the magnitude spectrum
+        pxn = Beta1 * pxn + (1 - Beta1) * absn
+        im = (im + 1) % 40  # buffer rotation period: 40 frames
+
+        if im:
+            # Within a period: track the per-bin minimum in the first buffer
+            mb[:, 0] = numpy.minimum(mb[:, 0], pxn)
+        else:
+            # End of a period: shift buffers 0-2 into 1-3 and restart buffer 0
+            mb[:, 1:] = mb[:, :3]
+            mb[:, 0] = pxn
+
+        pn = 2 * numpy.min(mb, axis=1)  # noise estimate: twice the minimum over the buffers
+
+        # Slide the analysis buffer and read the next FrameShift samples
+        x[:FrameSize - FrameShift] = x[FrameShift:FrameSize]
+        index1 = numpy.arange(FrameShift * Frame, numpy.min((FrameShift * (Frame + 1), X.shape[0])))
+        In_data = X[index1]
+
+        if In_data.shape[0] < FrameShift:  # the signal is exhausted
+            EOF = 1
+            break
+        else:
+            x[FrameSize - FrameShift:FrameSize] = In_data  # append the new samples
+
+    # Second pass: restart from the beginning of the signal; pxn, mb and im
+    # keep the state reached during the pre-estimation
+    x = numpy.zeros(FrameSize)
+    x[FrameSize - FrameShift:FrameSize] = X[numpy.arange(numpy.min((int(FrameShift), X.shape[0])))]
+
+    # NOTE(review): same apparently unreachable guard as above
+    if x.shape[0] < FrameSize:
+        EOF = 1
+        return X
+
+    EOF = 0
+    Frame = 0
+
+    X1 = numpy.zeros(X.shape)  # output signal
+    Frame = 0
+
+    while EOF == 0:
+        Frame += 1
+        xwin = x * Hamm
+
+        fftx = fft(xwin, nfft)
+        absx = numpy.abs(fftx[0:Fmax])  # magnitude of the frame
+        argx = fftx[:Fmax] / (absx + numpy.spacing(1))  # unit-magnitude spectrum (phase)
+
+        absn = absx
+
+        # Same minimum-tracking noise update as in the first pass
+        pxn = Beta1 * pxn + (1 - Beta1) * absn
+
+        im = int((im + 1) % (num1 * NN / 2))  # rotation period now depends on NN
+
+        if im:
+            mb[:, 0] = numpy.minimum(mb[:, 0], pxn)
+        else:
+            mb[:, 1:] = mb[:, :3]
+            mb[:, 0] = pxn
+
+        pn = 2 * numpy.min(mb, axis=1)
+
+        Eabsn = pn
+        Gaina = Gain
+
+        temp1 = Eabsn * Gaina  # scaled noise magnitude
+
+        # Smoothed estimate of the clean magnitude, then Wiener-like gain
+        Eta1 = Alpha * old_absx + (1 - Alpha) * numpy.maximum(absx - temp1, 0)
+        new_absx = (absx * Eta1) / (Eta1 + temp1)  # wiener filter
+        old_absx = new_absx
+
+        ffty = new_absx * argx  # attenuated magnitude with the original phase
+
+        # Rebuild the conjugate-symmetric spectrum and go back to the time domain
+        y = numpy.real(ifft(numpy.concatenate((ffty, numpy.conj(ffty[numpy.arange(Fmax - 2, 0, -1)])))))
+        y[:FrameSize - FrameShift] = y[:FrameSize - FrameShift] + y0  # overlap-add
+        y0 = y[FrameShift:FrameSize]  # tail kept for the next frame
+        x[:FrameSize - FrameShift] = x[FrameShift:FrameSize]
+
+        index1 = numpy.arange(FrameShift * Frame, numpy.min((FrameShift * (Frame + 1), X.shape[0])))
+        In_data = X[index1]
+
+        z = 2 / NN * y[:FrameShift]  # finished output samples for this frame
+        z /= 1.15
+        z = numpy.minimum(z, 32767)  # clip to the 16-bit integer range
+        z = numpy.maximum(z, -32768)
+        index0 = numpy.arange(FrameShift * (Frame - 1), FrameShift * Frame)
+        if not all(index0 < X1.shape[0]):
+            # The last frame overruns the output: copy only the samples that fit
+            idx = 0
+            while (index0[idx] < X1.shape[0]) & (idx < index0.shape[0]):
+                X1[index0[idx]] = z[idx]
+                idx += 1
+        else:
+            X1[index0] = z
+
+        if In_data.shape[0] == 0:
+            EOF = 1
+        else:
+            # Append the (possibly partial) block of new samples
+            x[numpy.arange(FrameSize - FrameShift, FrameSize + In_data.shape[0] - FrameShift)] = In_data
+
+    X1 = X1[X1.shape[0] - X.shape[0]:]
+    return X1
+
+
+def vad_percentil(log_energy, percent):
+    """Select the frames whose log-energy exceeds a percentile of the input.
+
+    :param log_energy: log-energy values, one per frame
+    :param percent: percentile, between 0 and 100, used as threshold
+    :return: a tuple (label, threshold) where label is a boolean array that
+        is True for the frames strictly above the threshold
+    """
+    threshold = numpy.percentile(log_energy, percent)
+    speech_frames = log_energy > threshold
+    return speech_frames, threshold
+
+
+def vad_energy(log_energy,
+               distrib_nb=3,
+               nb_train_it=8,
+               flooring=0.0001, ceiling=1.0,
+               alpha=2):
+    """Select high-energy frames with a small mixture trained on the log-energy.
+
+    The log-energy is centered and reduced, a mixture of ``distrib_nb``
+    one-dimensional components is trained with ``nb_train_it`` iterations of
+    EM, and the threshold is set ``alpha`` standard deviations below the mean
+    of the component with the highest mean.
+
+    :param log_energy: log-energy values, one per frame
+    :param distrib_nb: number of components of the mixture; their initial
+        means are spread evenly between -2 and 2
+    :param nb_train_it: number of EM iterations
+    :param flooring: passed to the M-step of the mixture together with
+        ``ceiling`` (presumably variance bounds; confirm in ``Mixture``)
+    :param ceiling: see ``flooring``
+    :param alpha: number of standard deviations subtracted from the highest
+        mean to obtain the threshold
+    :return: a tuple (label, threshold); label is a boolean array that is True
+        for the frames whose normalized log-energy exceeds the threshold
+    """
+    # Mixture is not imported at the top of this module; import it here so the
+    # function no longer fails with a NameError. Importing locally also keeps
+    # any import cycle with the rest of the package out of module load time.
+    from sidekit.mixture import Mixture
+
+    # center and normalize the energy
+    log_energy = (log_energy - numpy.mean(log_energy)) / numpy.std(log_energy)
+
+    # Initialize a Mixture with 2 or 3 distributions
+    world = Mixture()
+    # set the covariance of each component to 1.0 and the mean to mu + meanIncrement
+    world.cst = numpy.ones(distrib_nb) / (numpy.pi / 2.0)
+    world.det = numpy.ones(distrib_nb)
+    world.mu = -2 + 4.0 * numpy.arange(distrib_nb) / (distrib_nb - 1)
+    world.mu = world.mu[:, numpy.newaxis]
+    world.invcov = numpy.ones((distrib_nb, 1))
+    # set equal weights for each component
+    world.w = numpy.ones(distrib_nb) / distrib_nb
+    world.cov_var_ctl = copy.deepcopy(world.invcov)
+
+    # Initialize the accumulator
+    accum = copy.deepcopy(world)
+
+    # Perform nb_train_it iterations of EM
+    for _ in range(nb_train_it):
+        accum._reset()
+        # E-step
+        world._expectation(accum, log_energy)
+        # M-step
+        world._maximization(accum, ceiling, flooring)
+
+    # Threshold: alpha standard deviations below the highest component mean
+    top_component = world.mu.argmax()
+    threshold = world.mu.max() - alpha * numpy.sqrt(1.0 / world.invcov[top_component, 0])
+
+    # Apply frame selection with the current threshold
+    label = log_energy > threshold
+    return label, threshold
+
+
+def vad_snr(sig, snr, fs=16000, shift=0.01, nwin=256):
+    """Select high energy frames based on the Signal to Noise Ratio
+    of the signal.
+    Input signal is expected encoded on 16 bits
+
+    The signal is scaled, denoised with :func:`speech_enhancement`, dithered
+    with Gaussian noise and cut into overlapping frames. A frame is kept when
+    its standard deviation in dB is within ``snr`` dB of the loudest frame and
+    above -75 dB.
+
+    :param sig: the input audio signal; it is left unmodified
+    :param snr: Signal to noise ratio to consider
+    :param fs: sampling frequency of the input signal in Hz. Default is 16000.
+    :param shift: shift between two frames in seconds. Default is 0.01
+    :param nwin: number of samples of the sliding window. Default is 256.
+
+    :return: a 1-dimensional boolean array, True for the selected frames
+    """
+    overlap = nwin - int(shift * fs)
+    # Scale on a floating-point copy: an in-place division would modify the
+    # caller's array and raises on integer (e.g. int16) input
+    sig = numpy.asarray(sig, dtype=float) / 32768.
+    sig = speech_enhancement(numpy.squeeze(sig), 1.2, 2)
+
+    # Add a small random dither before computing the standard deviation
+    sig += 0.1 * numpy.random.randn(sig.shape[0])
+
+    # Frame-wise standard deviation
+    std2 = segment_axis(sig, nwin, overlap, axis=None, end='cut', endvalue=0).T
+    std2 = numpy.std(std2, axis=0)
+    std2 = 20 * numpy.log10(std2)  # convert to dB
+
+    # APPLY VAD
+    label = (std2 > numpy.max(std2) - snr) & (std2 > -75)
+
+    return label
+
+
+def label_fusion(label, win=3):
+    """Apply a morphological filtering on the label to remove isolated labels.
+    In case the input is a two channel label (2D ndarray of boolean of same
+    length) the labels of two channels are fused to remove
+    overlapping segments of speech.
+
+    :param label: input labels given in a 1D or 2D ndarray. A 1D ndarray is
+        a single channel; otherwise each element of ``label`` is one channel.
+    :param win: parameter of the morphological filters (size of the closing
+        and opening structuring element)
+
+    :return: the filtered labels. A 1D input yields a new 1D array; otherwise
+        ``label`` is updated in place, channel by channel, and returned.
+    """
+    # A single channel given as a 1D array: filter it as a whole. Iterating
+    # over it (as done for multi-channel input) would hand scalars to the
+    # morphological filters, and a 2-element vector would be mistaken for two
+    # channels.
+    if isinstance(label, numpy.ndarray) and label.ndim == 1:
+        closed = ndimage.grey_closing(label, size=win)
+        return ndimage.grey_opening(closed, size=win)
+
+    channel_nb = len(label)
+    if channel_nb == 2:
+        # Frames labelled on both channels are removed from both
+        overlap_label = numpy.logical_and(label[0], label[1])
+        label[0] = numpy.logical_and(label[0], ~overlap_label)
+        label[1] = numpy.logical_and(label[1], ~overlap_label)
+
+    # Closing fills short gaps, opening then removes short isolated segments
+    for idx, lbl in enumerate(label):
+        cl = ndimage.grey_closing(lbl, size=win)
+        label[idx] = ndimage.grey_opening(cl, size=win)
+
+    return label
diff --git a/sidekit/sidekit/gmm_scoring.py b/sidekit/sidekit/gmm_scoring.py
new file mode 100755
index 0000000..2f9f2a4
--- /dev/null
+++ b/sidekit/sidekit/gmm_scoring.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+ :mod:`gmm_scoring` provides methods to test GMM models
+
+"""
+import numpy
+import warnings
+import multiprocessing
+import ctypes
+import logging
+
+import sidekit.sv_utils
+import sidekit.frontend
+from sidekit.mixture import Mixture
+from sidekit.statserver import StatServer
+from sidekit.features_server import FeaturesServer
+from sidekit.bosaris import Ndx
+from sidekit.bosaris import Scores
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def gmm_scoring_singleThread(ubm, enroll, ndx, feature_server, score_mat, seg_idx=None):
+    """Compute log-likelihood ratios for sequences of acoustic feature
+    frames between a Universal Background Model (UBM) and a list of Gaussian
+    Mixture Models (GMMs) which only mean vectors differ from the UBM.
+
+    For every test segment, the average per-frame log-likelihood is computed
+    for each enrolled model involved in a trial and for the UBM; their
+    difference is written into ``score_mat``.
+
+    :param ubm: a Mixture object used to compute the denominator
+        of the likelihood ratios
+    :param enroll: a StatServer object which stat1 attribute contains mean
+        super-vectors of the GMMs to use to compute the numerator of the
+        likelihood ratios.
+    :param ndx: an Ndx object which define the list of trials to compute
+    :param feature_server: sidekit.FeaturesServer used to load the acoustic parameters
+    :param score_mat: a ndarray of scores to fill, indexed like ``ndx.trialmask``
+        (model, segment); modified in place
+    :param seg_idx: the list of unique test segments to process.
+        Those test segments should belong to the list of test segments
+        in the ndx object. By setting seg_idx=None, all test segments
+        from the ndx object will be processed
+
+    :raise ValueError: if ``ubm.invcov`` is neither 2- nor 3-dimensional
+    """
+    assert isinstance(ubm, Mixture), 'First parameter should be a Mixture'
+    assert isinstance(enroll, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be a Ndx'
+    assert isinstance(feature_server, FeaturesServer), 'Fourth parameter should be a FeatureServer'
+
+    if seg_idx is None:
+        seg_idx = range(ndx.segset.shape[0])
+
+    # Pick the posterior computation once, according to the rank of invcov
+    if ubm.invcov.ndim == 2:
+        compute_lp = ubm.compute_log_posterior_probabilities
+    elif ubm.invcov.ndim == 3:
+        compute_lp = ubm.compute_log_posterior_probabilities_full
+    else:
+        raise ValueError('ubm.invcov should have 2 or 3 dimensions')
+
+    def mean_log_likelihood(lp):
+        # Log-sum-exp over the components (shifted by the row maximum for
+        # numerical stability), averaged over the frames
+        lp_max = numpy.max(lp, axis=1)
+        log_lk = lp_max + numpy.log(numpy.sum(numpy.exp((lp.transpose() - lp_max).transpose()), axis=1))
+        return log_lk.mean()
+
+    # Name -> row lookups; they do not depend on the segment
+    ndx_index = {name: i for i, name in enumerate(ndx.modelset)}
+    enroll_index = {name: i for i, name in enumerate(enroll.modelset)}
+
+    for ts in seg_idx:
+        logging.info('Compute trials involving test segment %d/%d', ts + 1, ndx.segset.shape[0])
+
+        # Models to test with the current segment that are also enrolled.
+        # Both index lists are derived from the same ordered list of names so
+        # that llr[k] is always written to the row of the model it was
+        # computed for.
+        models = ndx.modelset[ndx.trialmask[:, ts]]
+        common = [name for name in dict.fromkeys(models) if name in enroll_index]
+        idx_ndx = [ndx_index[name] for name in common]
+        idx_enroll = [enroll_index[name] for name in common]
+
+        # Load feature file
+        cep, _ = feature_server.load(ndx.segset[ts])
+
+        # Average log-likelihood of the segment for each selected model
+        llr = numpy.zeros(len(idx_enroll))
+        for m, enroll_row in enumerate(idx_enroll):
+            llr[m] = mean_log_likelihood(compute_lp(cep, enroll.stat1[enroll_row, :]))
+
+        # Subtract the average log-likelihood of the segment for the UBM
+        llr = llr - mean_log_likelihood(compute_lp(cep))
+
+        # Fill the score matrix
+        score_mat[idx_ndx, ts] = llr
+
+
+def gmm_scoring(ubm, enroll, ndx, feature_server, num_thread=1):
+    """Compute log-likelihood ratios for sequences of acoustic feature
+    frames between a Universal Background Model (UBM) and a list of
+    Gaussian Mixture Models (GMMs) which only mean vectors differ
+    from the UBM.
+
+    The trials are first restricted to the enrolled models and, when file
+    name patterns are available, to the test segments found by
+    ``sidekit.sv_utils.check_file_list``. The segments are then split across
+    ``num_thread`` processes that fill a shared score matrix.
+
+    :param ubm: a Mixture object used to compute the denominator of the
+        likelihood ratios
+    :param enroll: a StatServer object which stat1 attribute contains
+        mean super-vectors of the GMMs to use to compute the numerator
+        of the likelihood ratios.
+    :param ndx: an Ndx object which define the list of trials to compute
+    :param feature_server: a FeatureServer object to load the features
+    :param num_thread: number of processes to launch in parallel
+
+    :return: a Score object.
+
+    """
+    assert isinstance(ubm, Mixture), 'First parameter should be a Mixture'
+    assert isinstance(enroll, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be a Ndx'
+    assert isinstance(feature_server, FeaturesServer), 'Fourth parameter should be a FeatureServer'
+
+    # Remove missing models and test segments
+    if feature_server.features_extractor is None:
+        existing_test_seg, test_seg_idx = sidekit.sv_utils.check_file_list(ndx.segset,
+                                                                          feature_server.feature_filename_structure)
+        clean_ndx = ndx.filter(enroll.modelset, existing_test_seg, True)
+    elif feature_server.features_extractor.audio_filename_structure is not None:
+        existing_test_seg, test_seg_idx = \
+            sidekit.sv_utils.check_file_list(ndx.segset, feature_server.features_extractor.audio_filename_structure)
+        clean_ndx = ndx.filter(enroll.modelset, existing_test_seg, True)
+    else:
+        clean_ndx = ndx
+
+    # Score matrix backed by shared memory so that every process can fill it
+    s = numpy.zeros(clean_ndx.trialmask.shape)
+    dims = s.shape
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore', RuntimeWarning)
+        tmp_stat1 = multiprocessing.Array(ctypes.c_double, s.size)
+        s = numpy.ctypeslib.as_array(tmp_stat1.get_obj())
+        s = s.reshape(dims)
+
+    # Split the list of segment to process for multi-threading
+    los = numpy.array_split(numpy.arange(clean_ndx.segset.shape[0]), num_thread)
+    jobs = []
+    for idx in los:
+        # The workers must index with the filtered ndx: the score matrix and
+        # the segment indices above are both defined on clean_ndx, so passing
+        # the unfiltered ndx misaligns rows and columns whenever filtering
+        # removed a model or a segment
+        p = multiprocessing.Process(target=gmm_scoring_singleThread,
+                                    args=(ubm, enroll, clean_ndx, feature_server, s, idx))
+        jobs.append(p)
+        p.start()
+    for p in jobs:
+        p.join()
+
+    score = Scores()
+    score.scoremat = s
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+    return score
diff --git a/sidekit/sidekit/iv_scoring.py b/sidekit/sidekit/iv_scoring.py
new file mode 100755
index 0000000..85fbbed
--- /dev/null
+++ b/sidekit/sidekit/iv_scoring.py
@@ -0,0 +1,574 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+ :mod:`iv_scoring` provides methods to compare i-vectors
+"""
+import copy
+import logging
+import sys
+
+import numpy
+import scipy
+import torch
+
+from sidekit.bosaris import Ndx, Scores
+from sidekit.statserver import StatServer
+
+if sys.version_info.major > 2:
+ from functools import reduce
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def _check_missing_model(enroll, test, ndx):
+    """Restrict the trials to available data and align both StatServers on them.
+
+    :param enroll: StatServer providing the enrolled models (``modelset``);
+        its ``align_models`` method is called with the remaining models
+    :param test: StatServer providing the test segments (``segset``);
+        its ``align_segments`` method is called with the remaining segments
+    :param ndx: Ndx object defining the trials
+
+    :return: the Ndx returned by ``ndx.filter`` for the models of ``enroll``
+        and the segments of ``test``
+    """
+    # Keep only the trials whose model and segment are both available
+    filtered_ndx = ndx.filter(enroll.modelset, test.segset, True)
+
+    # Put both StatServers in the order of the filtered trials
+    enroll.align_models(filtered_ndx.modelset)
+    test.align_segments(filtered_ndx.segset)
+
+    return filtered_ndx
+
+
+def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True, device=None):
+    """Compute the cosine similarities between two sets of vectors. The list of
+    trials to perform is given in an Ndx object.
+
+    The inputs are deep-copied first, so ``enroll`` and ``test`` are left
+    untouched.
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param wccn: numpy.ndarray, if provided, the i-vectors are normalized by using a Within Class Covariance Matrix
+    :param check_missing: boolean, if True, check that all models and segments exist
+    :param device: torch device on which the score matrix is computed. When
+        None, "cuda:0" is used if CUDA is available and the float32 score
+        matrix is smaller than 3e9 bytes, the CPU otherwise. A given device is
+        replaced by the CPU under the same conditions.
+
+    :return: a score object
+    """
+    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
+    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
+    enroll_copy = copy.deepcopy(enroll)
+    test_copy = copy.deepcopy(test)
+
+    # NOTE: duplicated enrollment models are not averaged here
+
+    # Remove missing models and test segments
+    if check_missing:
+        clean_ndx = _check_missing_model(enroll_copy, test_copy, ndx)
+    else:
+        clean_ndx = ndx
+
+    # NOTE(review): the two copies are distinct objects, so the != guards
+    # below only skip the second call if StatServer defines its own equality
+    # - confirm
+    if wccn is not None:
+        enroll_copy.rotate_stat1(wccn)
+        if enroll_copy != test_copy:
+            test_copy.rotate_stat1(wccn)
+
+    # Cosine scoring: length-normalize, then take all the dot products
+    enroll_copy.norm_stat1()
+    if enroll_copy != test_copy:
+        test_copy.norm_stat1()
+
+    # Size of the float32 score matrix, used to decide whether it fits on the GPU
+    s_size_in_bytes = enroll_copy.stat1.shape[0] * test_copy.stat1.shape[0] * 4
+    gpu_usable = torch.cuda.is_available() and s_size_in_bytes < 3e9
+    if device is None:
+        device = torch.device("cuda:0" if gpu_usable else "cpu")
+    else:
+        device = device if gpu_usable else torch.device("cpu")
+
+    score = Scores()
+    # scoremat[i, k] = <enroll_i, test_k>
+    score.scoremat = torch.einsum('ij,kj->ik',
+                                  torch.FloatTensor(enroll_copy.stat1).to(device),
+                                  torch.FloatTensor(test_copy.stat1).to(device)).cpu().numpy()
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+    return score
+
+
+def mahalanobis_scoring(enroll, test, ndx, m, check_missing=True):
+    """Score every enrolled model against every test segment with a
+    Mahalanobis-type distance. The list of trials to perform is given in an
+    Ndx object.
+
+    The score of a (model, segment) pair is ``-0.5 * d.T m d`` where ``d`` is
+    the difference between the two i-vectors.
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param m: mahalanobis matrix as a ndarray
+    :param check_missing: boolean, default is True, set to False not to check missing models
+
+    :return: a score object
+    """
+    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
+    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
+    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
+    assert enroll.stat1.shape[1] == m.shape[0], 'I-vectors and Mahalanobis matrix dimension mismatch'
+
+    # Several sessions for one model: replace them by their average, with a warning
+    models_are_unique = numpy.unique(enroll.modelset).shape == enroll.modelset.shape
+    if not models_are_unique:
+        logging.warning("Enrollment models are not unique, average i-vectors")
+        enroll = enroll.mean_stat_per_model()
+
+    # Drop the trials involving missing models or test segments
+    clean_ndx = _check_missing_model(enroll, test, ndx) if check_missing else ndx
+
+    # One row of scores per enrolled model
+    model_count = enroll.modelset.shape[0]
+    scoremat = numpy.zeros((model_count, test.segset.shape[0]))
+    for row in range(model_count):
+        delta = enroll.stat1[row, :] - test.stat1
+        scoremat[row, :] = -0.5 * numpy.sum(numpy.dot(delta, m) * delta, axis=1)
+
+    result = Scores()
+    result.scoremat = scoremat
+    result.modelset = clean_ndx.modelset
+    result.segset = clean_ndx.segset
+    result.scoremask = clean_ndx.trialmask
+    return result
+
+
+def two_covariance_scoring(enroll, test, ndx, W, B, check_missing=True):
+    """Compute the 2-covariance scores between two sets of vectors. The list of
+    trials to perform is given in an Ndx object. Within and between class
+    co-variance matrices have to be pre-computed.
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param W: the within-class co-variance matrix to consider
+    :param B: the between-class co-variance matrix to consider
+    :param check_missing: boolean, default is True, set to False not to check missing models
+
+    :return: a score object
+    """
+    # The first message used to read "should be a directory" although the
+    # check is on StatServer
+    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
+    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
+    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
+    assert enroll.stat1.shape[1] == W.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
+    assert enroll.stat1.shape[1] == B.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
+
+    # If models are not unique, compute the mean per model, display a warning
+    if not numpy.unique(enroll.modelset).shape == enroll.modelset.shape:
+        logging.warning("Enrollment models are not unique, average i-vectors")
+        enroll = enroll.mean_stat_per_model()
+
+    # Remove missing models and test segments
+    if check_missing:
+        clean_ndx = _check_missing_model(enroll, test, ndx)
+    else:
+        clean_ndx = ndx
+
+    # Two covariance scoring
+    S = numpy.zeros((enroll.modelset.shape[0], test.segset.shape[0]))
+    iW = scipy.linalg.inv(W)
+    iB = scipy.linalg.inv(B)
+
+    # G weights the sum of an enrollment and a test vector, H a single vector
+    G = reduce(numpy.dot, [iW, scipy.linalg.inv(iB + 2*iW), iW])
+    H = reduce(numpy.dot, [iW, scipy.linalg.inv(iB + iW), iW])
+
+    # Quadratic terms of each vector taken alone: x.T H x
+    s2 = numpy.sum(numpy.dot(enroll.stat1, H) * enroll.stat1, axis=1)
+    s3 = numpy.sum(numpy.dot(test.stat1, H) * test.stat1, axis=1)
+
+    for ii in range(enroll.modelset.shape[0]):
+        # Quadratic term of the pair: (e + t).T G (e + t)
+        A = enroll.stat1[ii, :] + test.stat1
+        s1 = numpy.sum(numpy.dot(A, G) * A, axis=1)
+        S[ii, :] = s1 - s3 - s2[ii]
+
+    score = Scores()
+    score.scoremat = S
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+    return score
+
+
+def PLDA_scoring(enroll,
+                 test,
+                 ndx,
+                 mu,
+                 F,
+                 G,
+                 Sigma,
+                 test_uncertainty=None,
+                 Vtrans=None,
+                 p_known=0.0,
+                 scaling_factor=1.,
+                 full_model=False):
+    """Compute the PLDA scores between two sets of vectors. The list of
+    trials to perform is given in an Ndx object. PLDA matrices have to be
+    pre-computed. i-vectors are supposed to be whitened before.
+
+    This function validates its inputs and dispatches to
+    ``full_PLDA_scoring`` when ``full_model`` is True, and to
+    ``fast_PLDA_scoring`` otherwise.
+
+    Implements the approach described in [Lee13]_ including scoring
+    for partially open-set identification
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param mu: the mean vector of the PLDA gaussian
+    :param F: the between-class co-variance matrix of the PLDA
+    :param G: the within-class co-variance matrix of the PLDA
+    :param Sigma: the residual covariance matrix
+    :param test_uncertainty: forwarded to ``fast_PLDA_scoring`` only; not used
+        with a full model
+    :param Vtrans: forwarded to ``fast_PLDA_scoring`` only; not used with a
+        full model
+    :param p_known: probability of having a known speaker for open-set
+        identification case (=1 for the verification task and =0 for the
+        closed-set case)
+    :param scaling_factor: scaling factor to be multiplied by the sufficient statistics
+    :param full_model: boolean, set to True when using a complete PLDA model (including within class covariance matrix)
+
+    :return: a score object
+    """
+    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
+    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
+    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
+    assert enroll.stat1.shape[1] == F.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
+    assert enroll.stat1.shape[1] == G.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
+
+    # Complete model: the within-class matrix G is taken into account
+    if full_model:
+        return full_PLDA_scoring(enroll, test, ndx, mu, F, G, Sigma, p_known=p_known, scaling_factor=scaling_factor)
+
+    # Simplified model: G is not passed on
+    return fast_PLDA_scoring(enroll,
+                             test,
+                             ndx,
+                             mu,
+                             F,
+                             Sigma,
+                             test_uncertainty,
+                             Vtrans,
+                             p_known=p_known,
+                             scaling_factor=scaling_factor,
+                             check_missing=True)
+
+
+def full_PLDA_scoring(enroll, test, ndx, mu, F, G, Sigma, p_known=0.0, scaling_factor=1., check_missing=True):
+    """Compute PLDA scores with a complete model (F, G and Sigma are all used).
+
+    Both StatServers are deep-copied first, so the centering done here is applied
+    to the copies rather than to the objects passed by the caller.
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param mu: the mean vector of the PLDA gaussian
+    :param F: the between-class co-variance matrix of the PLDA
+    :param G: the within-class co-variance matrix of the PLDA
+    :param Sigma: the residual covariance matrix
+    :param p_known: probability of having a known speaker for open-set
+        identification case (=1 for the verification task and =0 for the
+        closed-set case)
+    :param scaling_factor: factor multiplied with the inverse of Sigma in the
+        intermediate matrices and with the final score matrix
+    :param check_missing: boolean, default is True, set to False not to check missing models
+
+    :return: a Scores object whose scoremat has the shape of the trial mask
+    """
+
+    enroll_copy = copy.deepcopy(enroll)
+    test_copy = copy.deepcopy(test)
+
+    # Disabled on purpose in this function: averaging of non-unique enrollment models.
+    # if not numpy.unique(enroll_copy.modelset).shape == enroll_copy.modelset.shape:
+    #     logging.warning("Enrollment models are not unique, average i-vectors")
+    #     enroll_copy = enroll_copy.mean_stat_per_model()
+
+    # Remove missing models and test segments
+    # NOTE(review): _check_missing_model presumably also aligns both copies on the
+    # filtered Ndx -- confirm against its definition in this module.
+    if check_missing:
+        clean_ndx = _check_missing_model(enroll_copy, test_copy, ndx)
+    else:
+        clean_ndx = ndx
+
+    # Center the i-vectors around the PLDA mean
+    enroll_copy.center_stat1(mu)
+    test_copy.center_stat1(mu)
+
+    # Compute temporary matrices
+    invSigma = scipy.linalg.inv(Sigma)
+    I_iv = numpy.eye(mu.shape[0], dtype='float')
+    I_ch = numpy.eye(G.shape[1], dtype='float')
+    I_spk = numpy.eye(F.shape[1], dtype='float')
+    A = numpy.linalg.inv(G.T.dot(invSigma * scaling_factor).dot(G) + I_ch)  # numpy kept here: the scipy interface differs
+    B = F.T.dot(invSigma * scaling_factor).dot(I_iv - G.dot(A).dot(G.T).dot(invSigma * scaling_factor))
+    K = B.dot(F)
+    K1 = scipy.linalg.inv(K + I_spk)
+    K2 = scipy.linalg.inv(2 * K + I_spk)
+
+    # Compute the Gaussian distribution constant (slogdet()[1] is the log of |det|)
+    alpha1 = numpy.linalg.slogdet(K1)[1]
+    alpha2 = numpy.linalg.slogdet(K2)[1]
+    constant = alpha2 / 2.0 - alpha1
+
+    # Compute verification scores
+    score = Scores()
+    score.scoremat = numpy.zeros(clean_ndx.trialmask.shape)
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+
+    # Project data in the space that maximizes the speaker separability
+    test_tmp = B.dot(test_copy.stat1.T)
+    enroll_tmp = B.dot(enroll_copy.stat1.T)
+
+    # Part of the score that depends only on the test segment
+    tmp1 = test_tmp.T.dot(K1)
+
+    # Compute the part of the score that is only dependent on the test segment
+    S1 = numpy.empty(test_copy.segset.shape[0])
+    for seg_idx in range(test_copy.segset.shape[0]):
+        S1[seg_idx] = tmp1[seg_idx, :].dot(test_tmp[:, seg_idx])/2.
+
+    # Compute the part of the score that depends only on the model (S2) and on both model and test segment
+    S2 = numpy.empty(enroll_copy.modelset.shape[0])
+
+    for model_idx in range(enroll_copy.modelset.shape[0]):
+        # Add the projected model vector to every projected test column
+        mod_plus_test_seg = test_tmp + numpy.atleast_2d(enroll_tmp[:, model_idx]).T
+        tmp2 = mod_plus_test_seg.T.dot(K2)
+
+        S2[model_idx] = enroll_tmp[:, model_idx].dot(K1).dot(enroll_tmp[:, model_idx])/2.
+        # Row-wise quadratic form: one value per test segment for this model
+        score.scoremat[model_idx, :] = numpy.einsum("ij, ji->i", tmp2, mod_plus_test_seg)/2.
+
+    # S1 is broadcast along the rows (segments), S2 along the columns (models)
+    score.scoremat += constant - (S1 + S2[:, numpy.newaxis])
+    score.scoremat *= scaling_factor
+
+    # Case of open-set identification, we compute the log-likelihood
+    # by taking into account the probability of having a known impostor
+    # or an out-of set class
+    if p_known != 0:
+        N = score.scoremat.shape[0]
+        open_set_scores = numpy.empty(score.scoremat.shape)
+        tmp = numpy.exp(score.scoremat)
+        for ii in range(N):
+            # open-set term: average over all the other models (row ii is masked out)
+            # NOTE(review): divides by (N - 1), so a single enrolled model gives a division by zero
+            open_set_scores[ii, :] = score.scoremat[ii, :] \
+                - numpy.log(p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / (N - 1) + (1 - p_known))
+        score.scoremat = open_set_scores
+
+    return score
+
+def fast_PLDA_scoring(enroll,
+                      test,
+                      ndx,
+                      mu,
+                      F,
+                      Sigma,
+                      test_uncertainty=None,
+                      Vtrans=None,
+                      p_known=0.0,
+                      scaling_factor=1.,
+                      check_missing=True):
+    """Score the trials of an Ndx with a simplified PLDA model (F and Sigma only).
+
+    PLDA matrices have to be pre-computed and the i-vectors are supposed to
+    be whitened beforehand. The two StatServers are deep-copied, so the
+    caller's objects are not the ones being centered or averaged.
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param mu: the mean vector of the PLDA gaussian
+    :param F: the between-class co-variance matrix of the PLDA
+    :param Sigma: the residual covariance matrix
+    :param test_uncertainty: accepted for signature compatibility, not read here
+    :param Vtrans: accepted for signature compatibility, not read here
+    :param p_known: probability of having a known speaker for open-set
+        identification case (=1 for the verification task and =0 for the
+        closed-set case)
+    :param scaling_factor: factor applied to the inverse of Sigma and to the final scores
+    :param check_missing: boolean, if True, check that all models and segments exist
+
+    :return: a score object
+    """
+    enroll_ctr = copy.deepcopy(enroll)
+    test_ctr = copy.deepcopy(test)
+
+    # Several sessions for one model: fall back on the mean i-vector per model
+    if numpy.unique(enroll_ctr.modelset).shape != enroll_ctr.modelset.shape:
+        logging.warning("Enrollment models are not unique, average i-vectors")
+        enroll_ctr = enroll_ctr.mean_stat_per_model()
+
+    # Drop the trials whose model or segment is missing
+    clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx) if check_missing else ndx
+
+    # Remove the PLDA mean from every i-vector
+    enroll_ctr.center_stat1(mu)
+    test_ctr.center_stat1(mu)
+
+    # Same uniqueness test as above, repeated after filtering and centering
+    if numpy.unique(enroll_ctr.modelset).shape != enroll_ctr.modelset.shape:
+        logging.warning("Enrollment models are not unique, average i-vectors")
+        enroll_ctr = enroll_ctr.mean_stat_per_model()
+
+    # Constant term of the PLDA distribution
+    inv_sigma = scipy.linalg.inv(Sigma)
+    eye_spk = numpy.eye(F.shape[1], dtype='float')
+
+    K = F.T.dot(inv_sigma * scaling_factor).dot(F)
+    K1 = scipy.linalg.inv(K + eye_spk)
+    K2 = scipy.linalg.inv(2 * K + eye_spk)
+
+    log_det_k1 = numpy.linalg.slogdet(K1)[1]
+    log_det_k2 = numpy.linalg.slogdet(K2)[1]
+    plda_cst = log_det_k2 / 2.0 - log_det_k1
+
+    # Intermediate covariance matrices
+    sigma_ac = numpy.dot(F, F.T)
+    sigma_tot = sigma_ac + Sigma
+    sigma_tot_inv = scipy.linalg.inv(sigma_tot)
+
+    schur_inv = numpy.linalg.inv(sigma_tot - sigma_ac.dot(sigma_tot_inv).dot(sigma_ac))
+    Phi = sigma_tot_inv - schur_inv
+    Psi = sigma_tot_inv.dot(sigma_ac).dot(schur_inv)
+
+    # Quadratic terms that depend on one side of the trial only
+    enroll_vectors = enroll_ctr.stat1
+    test_vectors = test_ctr.stat1
+    model_part = 0.5 * numpy.einsum('ij, ji->i', enroll_vectors.dot(Phi), enroll_vectors.T)
+    seg_part = 0.5 * numpy.einsum('ij, ji->i', test_vectors.dot(Phi), test_vectors.T)
+
+    # Assemble the verification scores
+    score = Scores()
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+
+    score.scoremat = model_part[:, numpy.newaxis] + seg_part + plda_cst
+    score.scoremat += enroll_vectors.dot(Psi).dot(test_vectors.T)
+    score.scoremat *= scaling_factor
+
+    # Closed-set case: the scores are final
+    if p_known == 0:
+        return score
+
+    # Open-set identification: normalise each model row by the average
+    # likelihood of all the other models and the out-of-set probability
+    N = score.scoremat.shape[0]
+    likelihoods = numpy.exp(score.scoremat)
+    open_set_scores = numpy.empty(score.scoremat.shape)
+    model_indices = numpy.arange(N)
+    for ii in range(N):
+        others = model_indices != ii
+        open_set_scores[ii, :] = score.scoremat[ii, :] \
+            - numpy.log(p_known * likelihoods[others].sum(axis=0) / (N - 1) + (1 - p_known))
+    score.scoremat = open_set_scores
+
+    return score
+
+
+# TODO: add multi-session handling (see Themos' function PLDA_scoring_with_test_uncertainty_by_the_book)
+# TODO: implement the "by the book" version so that the mean of the i-vectors is not used
+def PLDA_scoring_uncertainty(enroll,
+                             test,
+                             ndx,
+                             mu,
+                             F,
+                             Sigma,
+                             p_known=0.0,
+                             scaling_factor=1.,
+                             test_uncertainty=None,
+                             Vtrans=None,
+                             check_missing=True):
+    """Compute PLDA scores with a per-segment uncertainty term added to the covariances.
+
+    Both StatServers are deep-copied before being centered and aligned, so the
+    caller's objects are not the ones being modified.
+
+    :param enroll: a StatServer in which stat1 are i-vectors
+    :param test: a StatServer in which stat1 are i-vectors
+    :param ndx: an Ndx object defining the list of trials to perform
+    :param mu: the mean vector of the PLDA gaussian
+    :param F: the between-class co-variance matrix of the PLDA
+    :param Sigma: the residual covariance matrix
+    :param p_known: probability of having a known speaker for open-set
+        identification case (=1 for the verification task and =0 for the
+        closed-set case)
+    :param scaling_factor: factor applied to the inverse of Sigma and to the final scores
+    :param test_uncertainty: 2D array read one row per test segment; each row
+        becomes the diagonal of that segment's uncertainty matrix (required)
+    :param Vtrans: matrix applied on both sides of that diagonal matrix (required)
+    :param check_missing: boolean, if True, remove missing models and segments from the trials
+
+    :return: a Scores object
+    :raises ValueError: if test_uncertainty or Vtrans is not provided
+    """
+    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
+    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
+    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
+    assert enroll.stat1.shape[1] == F.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
+    # The former check against G was dropped: this function has no G parameter,
+    # so the assertion raised NameError before any score could be computed.
+
+    if test_uncertainty is None or Vtrans is None:
+        raise ValueError("test_uncertainty and Vtrans are required to score with uncertainty")
+
+    enroll_ctr = copy.deepcopy(enroll)
+    test_ctr = copy.deepcopy(test)
+
+    # Remove missing models and test segments
+    if check_missing:
+        clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx)
+    else:
+        clean_ndx = ndx
+
+    # Center the i-vectors around the PLDA mean
+    enroll_ctr.center_stat1(mu)
+    test_ctr.center_stat1(mu)
+
+    # Align StatServers to match the clean_ndx
+    enroll_ctr.align_models_average(clean_ndx.modelset)
+    test_ctr.align_segments(clean_ndx.segset)
+
+    # Compute constant component of the PLDA distribution
+    scoremat = numpy.zeros((enroll_ctr.stat1.shape[0], test_ctr.stat1.shape[0]), dtype='float')
+    invSigma = scipy.linalg.inv(Sigma)
+
+    K1 = scipy.linalg.inv(numpy.eye(F.shape[1]) + F.T.dot(invSigma * scaling_factor).dot(F))
+
+    FK1Ft = F.dot(K1).dot(F.T)
+    Xtilda_e = FK1Ft.dot(invSigma * scaling_factor).dot(enroll_ctr.stat1.T).T
+
+    # Terms that do not depend on the test segment, computed once
+    identity = numpy.eye(F.shape[1])
+    cov_den = F.dot(F.T) + Sigma
+    cov_num = FK1Ft + Sigma
+
+    for t in range(test_ctr.stat1.shape[0]):
+        xt = test_ctr.stat1[t, :]
+        # NOTE(review): this reads the caller's `test` (neither centered nor aligned
+        # on clean_ndx) while xt comes from the aligned copy; row t may therefore
+        # refer to a different segment once trials have been filtered -- confirm.
+        Pr = identity - numpy.outer(test.stat1[t, :], test.stat1[t, :])
+        Cunc = Pr.dot(Vtrans.transpose()).dot(numpy.diag(test_uncertainty[t, :]).dot(Vtrans)).dot(Pr)
+
+        # Denominator: the test segment alone
+        prec_den = scipy.linalg.inv(cov_den + Cunc)
+        denom = -0.5 * xt.dot(prec_den).dot(xt) + 0.5 * numpy.linalg.slogdet(prec_den)[1]
+
+        # Numerator: the test segment against every enrolled model at once
+        prec_num = scipy.linalg.inv(cov_num + Cunc)
+        Xec = Xtilda_e - xt
+        numer = -0.5 * numpy.einsum('ij, ji->i', Xec.dot(prec_num), Xec.T) + 0.5 * numpy.linalg.slogdet(prec_num)[1]
+        scoremat[:, t] = scaling_factor * (numer - denom)
+
+    # Compute verification scores
+    score = Scores()
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+
+    score.scoremat = scoremat
+
+    # Case of open-set identification, we compute the log-likelihood
+    # by taking into account the probability of having a known impostor
+    # or an out-of set class
+    if p_known != 0:
+        N = score.scoremat.shape[0]
+        open_set_scores = numpy.empty(score.scoremat.shape)
+        tmp = numpy.exp(score.scoremat)
+        for ii in range(N):
+            # open-set term: average over all the other models (row ii is masked out)
+            open_set_scores[ii, :] = score.scoremat[ii, :] \
+                - numpy.log(p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / (N - 1) + (1 - p_known))
+        score.scoremat = open_set_scores
+
+    return score
diff --git a/sidekit/sidekit/jfa_scoring.py b/sidekit/sidekit/jfa_scoring.py
new file mode 100755
index 0000000..0bdadd3
--- /dev/null
+++ b/sidekit/sidekit/jfa_scoring.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+ :mod:`jfa_scoring` provides methods to score using JFA model
+
+"""
+import copy
+from sidekit.mixture import Mixture
+from sidekit.statserver import StatServer
+from sidekit.bosaris import Ndx
+from sidekit.bosaris import Scores
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def _check_missing_model(enroll, test, ndx):
+    """Filter the trials on the available models and segments, then align both servers.
+
+    :param enroll: StatServer whose modelset lists the enrolled models
+    :param test: StatServer whose segset lists the test segments
+    :param ndx: Ndx object describing the trials to run
+
+    :return: the filtered Ndx; enroll and test are aligned on it through
+        their own align_models / align_segments methods
+    """
+    filtered = ndx.filter(enroll.modelset, test.segset, True)
+
+    # Put both StatServers in the order of the filtered Ndx
+    enroll.align_models(filtered.modelset)
+    test.align_segments(filtered.segset)
+    return filtered
+
+
+def jfa_scoring(ubm, enroll, test, ndx, mean, sigma, V, U, D, batch_size=100, num_thread=1, check_missing=True):
+    """Compute a verification score as a channel point estimate
+    of the log-likelihood ratio. Detail of this scoring can be found in
+    [Glembek09].
+
+    [Glembek09] Ondrej Glembek, Lukas Burget, Najim Dehak, Niko Brummer,
+    and Patrick Kenny, "Comparison of scoring methods used in speaker
+    recognition with joint factor analysis."
+    in Acoustics, Speech and Signal Processing, 2009. ICASSP 2009.
+    IEEE International Conference on. IEEE, 2009.
+
+    Note that input statistics should not be whitened as
+    it is done within this function.
+
+    NOTE(review): the alignment and the whitening of ``test`` are called on the
+    objects passed by the caller, not on copies -- confirm whether callers
+    rely on their statistics being left untouched.
+
+    :param ubm: the Mixture object used to compute sufficient statistics
+    :param enroll: a StatServer object which contains zero- and first-order
+        statistics.
+    :param test: a StatServer object which contains zero- and first-order
+        statistics.
+    :param ndx: an Ndx object which trial mask will be copied into the output
+        Scores object
+    :param mean: mean vector of the JFA model
+    :param sigma: residual covariance vector of the JFA model
+    :param V: between class covariance matrix of the JFA model
+    :param U: within class covariance matrix of the JFA model
+    :param D: MAP covariance matrix for the JFA model
+    :param batch_size: size of the batch to reduce memory footprint
+    :param num_thread: number of parallel process to run
+    :param check_missing: boolean, if True, check that all model exist
+
+    :return: a Scores object
+    """
+    import logging  # imported here because this module has no file-level logging import
+
+    assert isinstance(ubm, Mixture), '1st parameter must be a Mixture'
+    assert isinstance(enroll, StatServer), '2nd parameter must be a StatServer'
+    assert isinstance(test, StatServer), '3rd parameter must be a StatServer'
+    assert isinstance(ndx, Ndx), '4th parameter must be a Ndx'
+
+    # Remove missing models and test segments
+    if check_missing:
+        clean_ndx = _check_missing_model(enroll, test, ndx)
+    else:
+        clean_ndx = ndx
+
+    # Diagnostic only: reported through logging rather than printed on stdout
+    logging.debug("size of clean_ndx.trialmask = %s", clean_ndx.trialmask.shape)
+
+    # Sum enrolment statistics per model in case of multi-session
+    enroll = enroll.sum_stat_per_model()[0]
+
+    # Whiten enroll and test statistics
+    enroll.whiten_stat1(ubm.get_mean_super_vector(), ubm.get_invcov_super_vector())
+    test.whiten_stat1(ubm.get_mean_super_vector(), ubm.get_invcov_super_vector())
+
+    # Estimate Vy and DZ from the enrollment
+    trn_y, _, trn_z = enroll.estimate_hidden(mean, sigma, V, U, D, batch_size=batch_size, num_thread=num_thread)
+    M = ((trn_y.stat1.dot(V.T)) + (trn_z.stat1 * D))
+
+    # Estimate Ux from the test; a copy is used so that `test` keeps its statistics
+    tmp = copy.deepcopy(test)
+    _, test_x, _ = tmp.estimate_hidden(mean, sigma, None, U, None, batch_size, num_thread)
+
+    # remove Ux weighted from the test statistics
+    Ux = copy.deepcopy(test)
+    Ux.stat1 = test_x.stat1.dot(U.T)
+    test = test.subtract_weighted_stat1(Ux)
+
+    # sum zero-order statistics for each test segment
+    test_stat0_sum = test.stat0.sum(axis=1)
+
+    # Compute score as the dot product of the enrollment supervector and the first
+    # order statistics divided by the sum of the zero-order stats
+    scores = Scores()
+    scores.modelset = enroll.modelset
+    scores.segset = test.segset
+    scores.scoremask = clean_ndx.trialmask
+    scores.scoremat = M.dot((test.stat1 / test_stat0_sum[:, None]).T)
+
+    return scores
diff --git a/sidekit/sidekit/libsvm/__init__.py b/sidekit/sidekit/libsvm/__init__.py
new file mode 100755
index 0000000..a1fb154
--- /dev/null
+++ b/sidekit/sidekit/libsvm/__init__.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+"""
+
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+from .svm import *
+from .svmutil import *
diff --git a/sidekit/sidekit/libsvm/libsvm.so.2 b/sidekit/sidekit/libsvm/libsvm.so.2
new file mode 100755
index 0000000..a46f679
Binary files /dev/null and b/sidekit/sidekit/libsvm/libsvm.so.2 differ
diff --git a/sidekit/sidekit/libsvm/svm.py b/sidekit/sidekit/libsvm/svm.py
new file mode 100755
index 0000000..9abdad1
--- /dev/null
+++ b/sidekit/sidekit/libsvm/svm.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python
+
+"""
+Copyright (c) 2000-2014 Chih-Chung Chang and Chih-Jen Lin
+All rights reserved.
+"""
+
+from ctypes import *
+from ctypes import c_int, c_double
+from ctypes.util import find_library
+from os import path
+import sys
+
+
+# Load the LIBSVM shared library: first the copy shipped next to this module,
+# then whatever the system library search can find.
+try:
+    dirname = path.dirname(path.abspath(__file__))
+    if sys.platform == 'win32':
+        libsvm = CDLL(path.join(dirname, r'libsvm.dll'))
+    else:
+        libsvm = CDLL(path.join(dirname, 'libsvm.so.2'))
+except OSError:
+    # CDLL raises OSError when the bundled library is missing or cannot be loaded.
+    # For unix the prefix 'lib' is not considered.
+    _system_lib = find_library('svm') or find_library('libsvm')
+    if not _system_lib:
+        raise Exception('LIBSVM library not found.')
+    libsvm = CDLL(_system_lib)
+
+
+# Construct constants: every name below becomes a module-level integer equal
+# to its position in the list (C_SVC = 0, ..., LINEAR = 0, ...).
+SVM_TYPE = ['C_SVC', 'NU_SVC', 'ONE_CLASS', 'EPSILON_SVR', 'NU_SVR']
+KERNEL_TYPE = ['LINEAR', 'POLY', 'RBF', 'SIGMOID', 'PRECOMPUTED']
+for i, s in enumerate(SVM_TYPE):
+    # Direct assignment in the module namespace instead of exec on a built string
+    globals()[s] = i
+for i, s in enumerate(KERNEL_TYPE):
+    globals()[s] = i
+
+# C callback type used for LIBSVM's print function: takes a C string, returns nothing
+PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p)
+
+
+def print_null(s):
+    """Ignore the message *s*; used as the print callback in quiet mode."""
+    return None
+
+
+def genFields(names, types):
+    """Pair each field name with its type, in the list form expected by ``_fields_``."""
+    return [(field_name, field_type) for field_name, field_type in zip(names, types)]
+
+
+def fillprototype(f, restype, argtypes):
+    """Declare the return type and the argument types of the foreign function *f*."""
+    f.restype, f.argtypes = restype, argtypes
+
+
+class svm_node(Structure):
+    """One (index, value) entry of a sparse vector, as a ctypes structure."""
+    _names = ["index", "value"]
+    _types = [c_int, c_double]
+    _fields_ = genFields(_names, _types)
+
+    def __str__(self):
+        # Rendered as "index:value"
+        return '{:d}:{:g}'.format(self.index, self.value)
+
+
+def gen_svm_nodearray(xi, feature_max=None, isKernel=None):
+    """Convert one data instance into a ctypes array of svm_node.
+
+    :param xi: a dictionary mapping feature index to value, or a list/tuple of values
+    :param feature_max: if set (and non-zero), indices greater than it are dropped
+    :param isKernel: if true, the values are kept as given: no shift of the
+        indices and no removal of the zero entries
+
+    :return: a tuple (nodes, max_idx) where nodes ends with a node of index -1
+        and max_idx is the largest index kept (0 when nothing is kept)
+    :raises TypeError: if xi is not a dictionary, a list or a tuple
+    """
+    if isinstance(xi, dict):
+        index_range = xi.keys()
+    elif isinstance(xi, (list, tuple)):
+        if not isKernel:
+            # idx should start from 1; list() so that tuples can be prefixed too
+            xi = [0] + list(xi)
+        index_range = range(len(xi))
+    else:
+        raise TypeError('xi should be a dictionary, list or tuple')
+
+    if feature_max:
+        assert isinstance(feature_max, int)
+        index_range = [j for j in index_range if j <= feature_max]
+    if not isKernel:
+        # Sparse representation: zero-valued features are not stored
+        index_range = [j for j in index_range if xi[j] != 0]
+
+    index_range = sorted(index_range)
+    # One extra node for the terminator
+    ret = (svm_node * (len(index_range) + 1))()
+    ret[-1].index = -1
+    for idx, j in enumerate(index_range):
+        ret[idx].index = j
+        ret[idx].value = xi[j]
+    max_idx = index_range[-1] if index_range else 0
+    return ret, max_idx
+
+
+class svm_problem(Structure):
+    """ctypes structure holding a training set: its size, its labels and its instances."""
+    _names = ["l", "y", "x"]
+    _types = [c_int, POINTER(c_double), POINTER(POINTER(svm_node))]
+    _fields_ = genFields(_names, _types)
+
+    def __init__(self, y, x, isKernel=None):
+        """Build the problem from the labels *y* and the instances *x*.
+
+        :raises ValueError: if y and x do not have the same length
+        """
+        if len(y) != len(x):
+            raise ValueError("len(y) != len(x)")
+        self.l = l = len(y)
+
+        # x_space keeps a Python reference on every node array stored in self.x
+        largest_index = 0
+        node_arrays = self.x_space = []
+        for instance in x:
+            nodes, instance_max = gen_svm_nodearray(instance, isKernel=isKernel)
+            node_arrays.append(nodes)
+            largest_index = max(largest_index, instance_max)
+        self.n = largest_index
+
+        self.y = (c_double * l)()
+        for position, label in enumerate(y):
+            self.y[position] = label
+
+        self.x = (POINTER(svm_node) * l)()
+        for position, nodes in enumerate(self.x_space):
+            self.x[position] = nodes
+
+
+class svm_parameter(Structure):
+    """ctypes structure of the LIBSVM training parameters, filled from an option string."""
+    _names = ["svm_type", "kernel_type", "degree", "gamma", "coef0",
+              "cache_size", "eps", "C", "nr_weight", "weight_label", "weight",
+              "nu", "p", "shrinking", "probability"]
+    _types = [c_int, c_int, c_int, c_double, c_double,
+              c_double, c_double, c_double, c_int, POINTER(c_int), POINTER(c_double),
+              c_double, c_double, c_int, c_int]
+    _fields_ = genFields(_names, _types)
+
+    def __init__(self, options=None):
+        """Parse *options* (a string or a list of tokens); None means no option."""
+        self.parse_options('' if options is None else options)
+
+    def __str__(self):
+        """Return one " name: value" line per structure field and per extra attribute."""
+        attrs = svm_parameter._names + list(self.__dict__.keys())
+        lines = [' %s: %s\n' % (attr, getattr(self, attr)) for attr in attrs]
+        return ''.join(lines).strip()
+
+    def set_to_default_values(self):
+        """Reset every parameter to its default value."""
+        defaults = (
+            ("svm_type", C_SVC),
+            ("kernel_type", RBF),
+            ("degree", 3),
+            ("gamma", 0),
+            ("coef0", 0),
+            ("nu", 0.5),
+            ("cache_size", 100),
+            ("C", 1),
+            ("eps", 0.001),
+            ("p", 0.1),
+            ("shrinking", 1),
+            ("probability", 0),
+            ("nr_weight", 0),
+        )
+        for attr, value in defaults:
+            setattr(self, attr, value)
+        self.weight_label = (c_int * 0)()
+        self.weight = (c_double * 0)()
+        # The three attributes below are not fields of the C structure
+        self.cross_validation = False
+        self.nr_fold = 0
+        self.print_func = None
+
+    def parse_options(self, options):
+        """Set the parameters from command-line style options.
+
+        :param options: a list of tokens, or a string that is split on whitespace
+        :raises TypeError: if options is neither a list nor a str
+        :raises ValueError: on an unknown option or a fold number below 2
+        """
+        if isinstance(options, list):
+            argv = options
+        elif isinstance(options, str):
+            argv = options.split()
+        else:
+            raise TypeError("arg 1 should be a list or a str.")
+        self.set_to_default_values()
+        self.print_func = cast(None, PRINT_STRING_FUN)
+        weight_label = []
+        weight = []
+
+        # Options made of a flag followed by one value: (flag, attribute, converter)
+        valued_flags = (
+            ("-s", "svm_type", int),
+            ("-t", "kernel_type", int),
+            ("-d", "degree", int),
+            ("-g", "gamma", float),
+            ("-r", "coef0", float),
+            ("-n", "nu", float),
+            ("-m", "cache_size", float),
+            ("-c", "C", float),
+            ("-e", "eps", float),
+            ("-p", "p", float),
+            ("-h", "shrinking", int),
+            ("-b", "probability", int),
+        )
+
+        pos = 0
+        while pos < len(argv):
+            token = argv[pos]
+            spec = next((entry for entry in valued_flags if token == entry[0]), None)
+            if spec is not None:
+                pos += 1
+                setattr(self, spec[1], spec[2](argv[pos]))
+            elif token == "-q":
+                self.print_func = PRINT_STRING_FUN(print_null)
+            elif token == "-v":
+                pos += 1
+                self.cross_validation = 1
+                self.nr_fold = int(argv[pos])
+                if self.nr_fold < 2:
+                    raise ValueError("n-fold cross validation: n must >= 2")
+            elif token.startswith("-w"):
+                # "-wN value": the class label N is glued to the flag itself
+                pos += 1
+                self.nr_weight += 1
+                weight_label += [int(argv[pos - 1][2:])]
+                weight += [float(argv[pos])]
+            else:
+                raise ValueError("Wrong options")
+            pos += 1
+
+        libsvm.svm_set_print_string_function(self.print_func)
+        self.weight_label = (c_int * self.nr_weight)()
+        self.weight = (c_double * self.nr_weight)()
+        for slot in range(self.nr_weight):
+            self.weight[slot] = weight[slot]
+            self.weight_label[slot] = weight_label[slot]
+
+
+class svm_model(Structure):
+    """ctypes structure of a LIBSVM model, with accessors calling the C library."""
+    _names = ['param', 'nr_class', 'l', 'SV', 'sv_coef', 'rho',
+              'probA', 'probB', 'sv_indices', 'label', 'nSV', 'free_sv']
+    _types = [svm_parameter, c_int, c_int, POINTER(POINTER(svm_node)),
+              POINTER(POINTER(c_double)), POINTER(c_double),
+              POINTER(c_double), POINTER(c_double), POINTER(c_int),
+              POINTER(c_int), POINTER(c_int), c_int]
+    _fields_ = genFields(_names, _types)
+
+    def __init__(self):
+        # Marks the model as built on the Python side (see __del__)
+        self.__createfrom__ = 'python'
+
+    def __del__(self):
+        # free memory created by C to avoid memory leak
+        if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C':
+            libsvm.svm_free_and_destroy_model(pointer(self))
+
+    def get_svm_type(self):
+        """Return the value of the C function svm_get_svm_type for this model."""
+        return libsvm.svm_get_svm_type(self)
+
+    def get_nr_class(self):
+        """Return the value of the C function svm_get_nr_class for this model."""
+        return libsvm.svm_get_nr_class(self)
+
+    def get_svr_probability(self):
+        """Return the value of the C function svm_get_svr_probability for this model."""
+        return libsvm.svm_get_svr_probability(self)
+
+    def get_labels(self):
+        """Return the class labels as a Python list of get_nr_class() integers."""
+        nr_class = self.get_nr_class()
+        labels = (c_int * nr_class)()
+        libsvm.svm_get_labels(self, labels)
+        return labels[:nr_class]
+
+    def get_sv_indices(self):
+        """Return the support vector indices as a Python list of get_nr_sv() integers."""
+        total_sv = self.get_nr_sv()
+        sv_indices = (c_int * total_sv)()
+        libsvm.svm_get_sv_indices(self, sv_indices)
+        return sv_indices[:total_sv]
+
+    def get_nr_sv(self):
+        """Return the value of the C function svm_get_nr_sv for this model."""
+        return libsvm.svm_get_nr_sv(self)
+
+    def is_probability_model(self):
+        """Return True if svm_check_probability_model returns 1 for this model."""
+        return libsvm.svm_check_probability_model(self) == 1
+
+    def get_sv_coef(self):
+        """Return, for each support vector, the tuple of its nr_class - 1 coefficients."""
+        return [tuple(self.sv_coef[j][i] for j in range(self.nr_class - 1))
+                for i in range(self.l)]
+
+    def get_SV(self):
+        """Return the support vectors as a list of {index: value} dictionaries."""
+        result = []
+        for sparse_sv in self.SV[:self.l]:
+            row = dict()
+
+            i = 0
+            # Index -1 marks the end of a sparse vector: test it before storing,
+            # so that the terminator does not end up in the row as a feature.
+            while sparse_sv[i].index != -1:
+                row[sparse_sv[i].index] = sparse_sv[i].value
+                i += 1
+
+            result.append(row)
+        return result
+
+
+def toPyModel(model_ptr):
+    """
+    toPyModel(model_ptr) -> svm_model
+
+    Return the svm_model that a ctypes POINTER(svm_model) points to, flagged
+    as created by the C side.
+
+    :raises ValueError: if model_ptr is a null pointer
+    """
+    if not model_ptr:
+        raise ValueError("Null pointer")
+    model = model_ptr.contents
+    model.__createfrom__ = 'C'
+    return model
+
+# Signatures of the LIBSVM C functions used by this module:
+# (function name, return type, argument types)
+_PROTOTYPES = (
+    ('svm_train', POINTER(svm_model), [POINTER(svm_problem), POINTER(svm_parameter)]),
+    ('svm_cross_validation', None, [POINTER(svm_problem), POINTER(svm_parameter),
+                                    c_int, POINTER(c_double)]),
+
+    ('svm_save_model', c_int, [c_char_p, POINTER(svm_model)]),
+    ('svm_load_model', POINTER(svm_model), [c_char_p]),
+
+    ('svm_get_svm_type', c_int, [POINTER(svm_model)]),
+    ('svm_get_nr_class', c_int, [POINTER(svm_model)]),
+    ('svm_get_labels', None, [POINTER(svm_model), POINTER(c_int)]),
+    ('svm_get_sv_indices', None, [POINTER(svm_model), POINTER(c_int)]),
+    ('svm_get_nr_sv', c_int, [POINTER(svm_model)]),
+    ('svm_get_svr_probability', c_double, [POINTER(svm_model)]),
+
+    ('svm_predict_values', c_double, [POINTER(svm_model), POINTER(svm_node), POINTER(c_double)]),
+    ('svm_predict', c_double, [POINTER(svm_model), POINTER(svm_node)]),
+    ('svm_predict_probability', c_double, [POINTER(svm_model), POINTER(svm_node), POINTER(c_double)]),
+
+    ('svm_free_model_content', None, [POINTER(svm_model)]),
+    ('svm_free_and_destroy_model', None, [POINTER(POINTER(svm_model))]),
+    ('svm_destroy_param', None, [POINTER(svm_parameter)]),
+
+    ('svm_check_parameter', c_char_p, [POINTER(svm_problem), POINTER(svm_parameter)]),
+    ('svm_check_probability_model', c_int, [POINTER(svm_model)]),
+    ('svm_set_print_string_function', None, [PRINT_STRING_FUN]),
+)
+for _fn_name, _restype, _argtypes in _PROTOTYPES:
+    fillprototype(getattr(libsvm, _fn_name), _restype, _argtypes)
diff --git a/sidekit/sidekit/libsvm/svmutil.py b/sidekit/sidekit/libsvm/svmutil.py
new file mode 100755
index 0000000..e64743f
--- /dev/null
+++ b/sidekit/sidekit/libsvm/svmutil.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+
+"""
+Copyright (c) 2000-2014 Chih-Chung Chang and Chih-Jen Lin
+All rights reserved.
+"""
+
+ import os
+ import pickle
+ import sys
+ from ctypes import c_int, c_double
+
+ from .svm import svm_node, svm_problem, svm_parameter, svm_model, toPyModel, SVM_TYPE, KERNEL_TYPE, \
+     gen_svm_nodearray, print_null, libsvm
+
+sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
+
+ def save_svm(svm_file_name, w, b):
+     """
+     Save SVM weights and bias in PICKLE format.
+
+     The parent directory of ``svm_file_name`` is created when it is missing.
+     A bare file name (no directory component) is written to the current
+     working directory.
+
+     :param svm_file_name: name of the file to write to
+     :param w: SVM weights, stored as first element of the pickled tuple
+     :param b: SVM bias, stored as second element of the pickled tuple
+     :return: None
+     """
+     target_dir = os.path.dirname(svm_file_name)
+     # dirname is '' for a bare file name and os.makedirs('') would raise.
+     if target_dir:
+         # exist_ok avoids failing when the directory appears between a
+         # check and the creation.
+         os.makedirs(target_dir, exist_ok=True)
+     with open(svm_file_name, "wb") as f:
+         pickle.dump((w, b), f)
+
+
+ def read_svm(svm_file_name):
+     """Load SVM weights and bias stored in PICKLE format.
+
+     The file is expected to hold a pickled two-element sequence ``(w, b)``.
+     NOTE: unpickling can execute arbitrary code, only read trusted files.
+
+     :param svm_file_name: name of the file to read from
+     :return: the weights and the bias, in that order
+     """
+     with open(svm_file_name, "rb") as handle:
+         weights, bias = pickle.load(handle)
+     return weights, bias
+
+
+ def svm_read_problem(data_file_name):
+     """
+     svm_read_problem(data_file_name) -> [y, x]
+
+     Read LIBSVM-format data from data_file_name and return labels y
+     and data instances x.
+
+     Each non-blank line is ``label index:value index:value ...``. Blank
+     lines are skipped; a line holding only a label yields an empty instance.
+
+     :param data_file_name: name of the file to load from
+     :return: a tuple (prob_y, prob_x) where prob_y is a list of float labels
+         and prob_x a list of dictionaries mapping int index to float value
+     :raises ValueError: if a label, index or value cannot be parsed
+     """
+     prob_y = []
+     prob_x = []
+     # The context manager closes the file even if parsing fails.
+     with open(data_file_name) as data_file:
+         for line in data_file:
+             fields = line.split(None, 1)
+             if not fields:
+                 # Blank line: nothing to parse.
+                 continue
+             # In case an instance with all zero features
+             if len(fields) == 1:
+                 fields.append('')
+             label, features = fields
+             xi = {}
+             for e in features.split():
+                 ind, val = e.split(":")
+                 xi[int(ind)] = float(val)
+             prob_y.append(float(label))
+             prob_x.append(xi)
+     return prob_y, prob_x
+
+
+ def svm_load_model(model_file_name):
+     """
+     svm_load_model(model_file_name) -> model
+
+     Load a LIBSVM model from model_file_name and return it.
+
+     :param model_file_name: file name to load from (str, encoded before
+         being handed to the C library)
+     :return: the model converted with toPyModel, or None (after printing a
+         message) when the library returns a null pointer
+     """
+     # Call the C entry point: calling this wrapper again would pass bytes
+     # and fail on ``.encode()``.
+     model_ptr = libsvm.svm_load_model(model_file_name.encode())
+     if not model_ptr:
+         print("can't open model file %s" % model_file_name)
+         return None
+     return toPyModel(model_ptr)
+
+
+ def svm_save_model(model_file_name, model):
+     """
+     svm_save_model(model_file_name, model) -> None
+
+     Save a LIBSVM model to the file model_file_name.
+
+     :param model_file_name: file name to write to (str, encoded before
+         being handed to the C library)
+     :param model: model to save
+     """
+     # Call the C entry point: calling this wrapper again would pass bytes
+     # and fail on ``.encode()``.
+     libsvm.svm_save_model(model_file_name.encode(), model)
+
+
+ def evaluations(ty, pv):
+     """
+     evaluations(ty, pv) -> (ACC, MSE, SCC)
+
+     Calculate accuracy, mean squared error and squared correlation coefficient
+     using the true values (ty) and predicted values (pv).
+
+     :param ty: sequence of true values
+     :param pv: sequence of predicted values, same length as ty
+     :return: tuple (ACC, MSE, SCC); ACC is a percentage and SCC is NaN when
+         its denominator is zero (e.g. constant ty or pv)
+     :raises ValueError: if ty and pv have different lengths
+     :raises ZeroDivisionError: if ty and pv are empty
+     """
+     if len(ty) != len(pv):
+         raise ValueError("len(ty) must equal to len(pv)")
+     total_correct = total_error = 0
+     sumv = sumy = sumvv = sumyy = sumvy = 0
+     for v, y in zip(pv, ty):
+         if y == v:
+             total_correct += 1
+         total_error += (v-y)*(v-y)
+         sumv += v
+         sumy += y
+         sumvv += v*v
+         sumyy += y*y
+         sumvy += v*y
+     count = len(ty)
+     ACC = 100.0*total_correct/count
+     MSE = total_error/count
+     covariance = count*sumvy - sumv*sumy
+     try:
+         SCC = (covariance*covariance)/((count*sumvv-sumv*sumv)*(count*sumyy-sumy*sumy))
+     except ZeroDivisionError:
+         # Only the degenerate zero-variance case is expected here; any other
+         # error is a real problem and must propagate.
+         SCC = float('nan')
+     return ACC, MSE, SCC
+
+
+ def svm_train(arg1, arg2=None, arg3=None):
+     r"""
+     svm_train(y, x [, options]) -> model | ACC | MSE
+     svm_train(prob [, options]) -> model | ACC | MSE
+     svm_train(prob, param) -> model | ACC| MSE
+
+     Train an SVM model from data (y, x) or an svm_problem prob using
+     'options' or an svm_parameter param.
+     If '-v' is specified in 'options' (i.e., cross validation)
+     either accuracy (ACC) or mean-squared error (MSE) is returned.
+     options:
+
+         - -s svm_type : set type of SVM (default 0)
+
+             - 0 -- C-SVC (multi-class classification)
+             - 1 -- nu-SVC (multi-class classification)
+             - 2 -- one-class SVM
+             - 3 -- epsilon-SVR (regression)
+             - 4 -- nu-SVR (regression)
+
+         - -t kernel_type : set type of kernel function (default 2)
+
+             - 0 -- linear: u'\*v
+             - 1 -- polynomial: (gamma\*u'\*v + coef0)^degree
+             - 2 -- radial basis function: exp(-gamma\*|u-v|^2)
+             - 3 -- sigmoid: tanh(gamma\*u'\*v + coef0)
+             - 4 -- precomputed kernel (kernel values in training_set_file)
+
+         - -d degree : set degree in kernel function (default 3)
+         - -g gamma : set gamma in kernel function (default 1/num_features)
+         - -r coef0 : set coef0 in kernel function (default 0)
+         - -c cost : set the parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)
+         - -n nu : set the parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)
+         - -p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1)
+         - -m cachesize : set cache memory size in MB (default 100)
+         - -e epsilon : set tolerance of termination criterion (default 0.001)
+         - -h shrinking : whether to use the shrinking heuristics, 0 or 1 (default 1)
+         - -b probability_estimates : whether to train a SVC or SVR model for probability estimates, 0 or 1 (default 0)
+         - -wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)
+         - -v n: n-fold cross validation mode
+         - -q : quiet mode (no outputs)
+
+     :param arg1: list/tuple of labels y, or an svm_problem
+     :param arg2: list/tuple of instances x when arg1 is y; otherwise an
+         svm_parameter or an options string
+     :param arg3: options string, only used with the (y, x) form
+     :return: the trained model, or ACC / MSE in cross validation mode
+     :raises TypeError: if the arguments match none of the accepted forms
+     :raises ValueError: on a malformed precomputed kernel or when the
+         library rejects the parameters
+     """
+     prob, param = None, None
+     if isinstance(arg1, (list, tuple)):
+         assert isinstance(arg2, (list, tuple))
+         y, x, options = arg1, arg2, arg3
+         param = svm_parameter(options)
+         prob = svm_problem(y, x, isKernel=(param.kernel_type == 'PRECOMPUTED'))
+     elif isinstance(arg1, svm_problem):
+         prob = arg1
+         if isinstance(arg2, svm_parameter):
+             param = arg2
+         else:
+             param = svm_parameter(arg2)
+     if prob is None or param is None:
+         raise TypeError("Wrong types for the arguments")
+
+     if param.kernel_type == 'PRECOMPUTED':
+         # Every instance must start with 0:sample_serial_number.
+         for xi in prob.x_space:
+             idx, val = xi[0].index, xi[0].value
+             if idx != 0:
+                 raise ValueError('Wrong input format: first column must be 0:sample_serial_number')
+             if val <= 0 or val > prob.n:
+                 raise ValueError('Wrong input format: sample_serial_number out of range')
+
+     # gamma == 0 means "not set": fall back to 1 / prob.n.
+     if param.gamma == 0 and prob.n > 0:
+         param.gamma = 1.0 / prob.n
+     libsvm.svm_set_print_string_function(param.print_func)
+     err_msg = libsvm.svm_check_parameter(prob, param)
+     if err_msg:
+         raise ValueError('Error: %s' % err_msg)
+
+     if param.cross_validation:
+         nb_samples, nr_fold = prob.l, param.nr_fold
+         target = (c_double * nb_samples)()  # pytype: disable=not-callable
+         libsvm.svm_cross_validation(prob, param, nr_fold, target)
+         ACC, MSE, SCC = evaluations(prob.y[:nb_samples], target[:nb_samples])
+         if param.svm_type in ['EPSILON_SVR', 'NU_SVR']:
+             print("Cross Validation Mean squared error = %g" % MSE)
+             print("Cross Validation Squared correlation coefficient = %g" % SCC)
+             return MSE
+         print("Cross Validation Accuracy = %g%%" % ACC)
+         return ACC
+
+     # Train through the C library; calling this wrapper again with
+     # (prob, param) would recurse without end.
+     m = libsvm.svm_train(prob, param)
+     m = toPyModel(m)
+
+     # If prob is destroyed, data including SVs pointed by m can remain.
+     m.x_space = prob.x_space
+     return m
+
+
+ def svm_predict(y, x, m, options=""):
+     """svm_predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)
+
+     Predict data (y, x) with the SVM model m.
+     options:
+
+         - "-b" probability_estimates: whether to predict probability estimates,
+           0 or 1 (default 0); for one-class SVM only 0 is supported.
+         - "-q" : quiet mode (no outputs).
+
+     The return tuple contains
+
+         - p_labels: a list of predicted labels
+         - p_acc: a tuple including accuracy (for classification),
+           mean-squared error, and squared correlation coefficient (for regression).
+         - p_vals: a list of decision values or probability estimates
+           (if '-b 1' is specified). If k is the number of classes,
+           for decision values, each element includes results of predicting k(k-1)/2 binary-class SVMs.
+           For probabilities, each element contains k values indicating the probability that the testing instance
+           is in each class.
+
+     .. note:: that the order of classes here is the same as 'model.label' field in the model structure.
+
+     :param y: true labels, used only to compute p_acc
+     :param x: instances to classify
+     :param m: trained model
+     :param options: option string, see above
+     :raises ValueError: on an unknown or incomplete option, or when
+         probability estimates are requested from a model without them
+     """
+
+     def info(s):
+         print(s)
+
+     predict_probability = 0
+     argv = options.split()
+     i = 0
+     while i < len(argv):
+         if argv[i] == '-b':
+             i += 1
+             if i >= len(argv):
+                 # '-b' was given without its 0/1 value.
+                 raise ValueError("Wrong options")
+             predict_probability = int(argv[i])
+         elif argv[i] == '-q':
+             info = print_null
+         else:
+             raise ValueError("Wrong options")
+         i += 1
+
+     svm_type = m.get_svm_type()
+     is_prob_model = m.is_probability_model()
+     nr_class = m.get_nr_class()
+     is_kernel = (m.param.kernel_type == 'PRECOMPUTED')
+     pred_labels = []
+     pred_values = []
+
+     if predict_probability:
+         if not is_prob_model:
+             raise ValueError("Model does not support probabiliy estimates")
+
+         if svm_type in ['NU_SVR', 'EPSILON_SVR']:
+             # %-formatting: the template uses %g, which str.format ignores.
+             info("Prob. model for test data: target value = predicted value + z,\n" +
+                  "z: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g" % m.get_svr_probability())
+             nr_class = 0
+
+         prob_estimates = (c_double * nr_class)()  # pytype: disable=not-callable
+         for xi in x:
+             xi, _ = gen_svm_nodearray(xi, isKernel=is_kernel)
+             label = libsvm.svm_predict_probability(m, xi, prob_estimates)
+             pred_labels.append(label)
+             pred_values.append(prob_estimates[:nr_class])
+     else:
+         if is_prob_model:
+             info("Model supports probability estimates, but disabled in predicton.")
+         # One-class and regression models produce a single decision value;
+         # classifiers (including nu-SVC) produce one per pair of classes,
+         # so the buffer must be sized accordingly.
+         if svm_type in ('ONE_CLASS', 'EPSILON_SVR', 'NU_SVR'):
+             nr_classifier = 1
+         else:
+             nr_classifier = nr_class*(nr_class-1)//2
+         dec_values = (c_double * nr_classifier)()  # pytype: disable=not-callable
+         for xi in x:
+             xi, _ = gen_svm_nodearray(xi, isKernel=is_kernel)
+             label = libsvm.svm_predict_values(m, xi, dec_values)
+             if nr_class == 1:
+                 values = [1]
+             else:
+                 values = dec_values[:nr_classifier]
+             pred_labels.append(label)
+             pred_values.append(values)
+
+     ACC, MSE, SCC = evaluations(y, pred_labels)
+     nb_samples = len(y)
+     if svm_type in ['EPSILON_SVR', 'NU_SVR']:
+         info("Mean squared error = %g (regression)" % MSE)
+         info("Squared correlation coefficient = %g (regression)" % SCC)
+     else:
+         info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(nb_samples*ACC/100), nb_samples))
+
+     return pred_labels, (ACC, MSE, SCC), pred_values
diff --git a/sidekit/sidekit/lid_utils.py b/sidekit/sidekit/lid_utils.py
new file mode 100755
index 0000000..8359aec
--- /dev/null
+++ b/sidekit/sidekit/lid_utils.py
@@ -0,0 +1,263 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`lid_utils` provides utilities to perform Language identification.
+"""
+
+import numpy
+
+from sidekit.mixture import Mixture
+from sidekit.statserver import StatServer
+from sidekit.bosaris import Scores
+import sidekit.sv_utils
+import sidekit.frontend
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+ def log_sum_exp(x):
+     """
+     Compute log(sum(exp(x))) along the first axis.
+
+     The maximum along that axis is subtracted before exponentiating and
+     added back afterwards.
+
+     :param x: input vector
+     :return: the log of the summed exponentials, reduced over axis 0
+     """
+     peak = x.max(axis=0)
+     return peak + numpy.log(numpy.exp(x - peak).sum(axis=0))
+
+
+ def compute_log_likelihood_ratio(M, p_tar=0.5):
+     """
+     Compute log-likelihood ratios for closed-set identification.
+
+     For every model, its log-likelihood (plus log(p_tar)) is compared with
+     the log-sum-exp of all the other models, each weighted by an equal share
+     of the remaining prior mass (1 - p_tar).
+
+     :param M: a matrix of log-likelihood of shape nb_models x nb_test_segments
+     :param p_tar: probability of a trial to be from a target
+
+     :return: a matrix of log-likelihood ratio of shape
+         nb_models x nb_test_segments
+     """
+     nb_models = M.shape[0]
+     model_idx = numpy.arange(nb_models)
+     log_prior = numpy.ones((nb_models - 1, 1)) * numpy.log((1 - p_tar) / (nb_models - 1))
+
+     llr = numpy.empty(M.shape)
+     for ii in range(nb_models):
+         competitors = M[model_idx != ii]
+         llr[ii, :] = numpy.log(p_tar) + M[ii, :] - log_sum_exp(competitors + log_prior)
+
+     return llr
+
+
+ def gaussian_backend_train(train_ss):
+     """
+     Estimate the parameters of a Gaussian backend with a shared covariance.
+
+     :param train_ss: sidekit.StatServer containing training data
+     :return: a tuple (gb_mean, gb_sigma, gb_cst) holding the per-model mean
+         statistics, the within covariance of stat1 and the Gaussian
+         log-normalization constant
+     """
+     # Shared covariance, then the mean of each class
+     gb_sigma = train_ss.get_within_covariance_stat1()
+     gb_mean = train_ss.mean_stat_per_model()
+
+     # Normalization constant: -0.5 * (log|sigma| + dim * log(2 * pi))
+     _, log_det = numpy.linalg.slogdet(gb_sigma)
+     dim = train_ss.stat1.shape[1]
+     gb_cst = - 0.5 * (log_det + dim * numpy.log(2 * numpy.pi))
+
+     return gb_mean, gb_sigma, gb_cst
+
+
+ def gaussian_backend_train_hetero(train_ss, alpha=0.1):
+     """
+     Estimate the parameters of a Gaussian backend with one covariance per class.
+
+     Each class covariance is interpolated with the covariance pooled over
+     all classes: alpha * class_covariance + (1 - alpha) * pooled_covariance.
+
+     :param train_ss: sidekit.StatServer of input training data
+     :param alpha: weight given to the class-specific covariance
+     :return: a tuple (gb_mean, gb_sigma, gb_cst) where gb_sigma and gb_cst
+         are lists with one entry per unique model of train_ss
+     """
+     vect_size = train_ss.stat1.shape[1]
+     unique_language = numpy.unique(train_ss.modelset)
+
+     # Accumulate the pooled scatter while estimating each class covariance
+     W = numpy.zeros((vect_size, vect_size))
+     gb_sigma = []
+     for languageID in unique_language:
+         centered = train_ss.get_model_stat1(languageID) \
+             - numpy.mean(train_ss.get_model_stat1(languageID), axis=0)
+         scatter = numpy.dot(centered.transpose(), centered)
+         W += scatter
+         gb_sigma.append(scatter / centered.shape[0])
+     W /= train_ss.stat1.shape[0]
+
+     # Interpolate every class covariance with the pooled one
+     gb_sigma = [alpha * sigma + (1 - alpha) * W for sigma in gb_sigma]
+
+     # Mean of each class
+     gb_mean = train_ss.mean_stat_per_model()
+
+     # One normalization constant per class
+     gb_cst = [- 0.5 * (numpy.linalg.slogdet(sigma)[1] +
+                        train_ss.stat1.shape[1] * numpy.log(2 * numpy.pi))
+               for sigma in gb_sigma]
+
+     return gb_mean, gb_sigma, gb_cst
+
+
+ def _gaussian_backend_train(data, label):
+     """
+     Train a Gaussian backend from raw vectors and their labels.
+
+     A StatServer is filled with ``data`` as first order statistics and
+     ``label`` as both segment and model identifiers, then handed to
+     gaussian_backend_train.
+
+     :param data: array of training vectors, one per row
+     :param label: class label of each row of data
+     :return: the output of gaussian_backend_train
+     """
+     nb_vectors = data.shape[0]
+
+     train_ss = StatServer()
+     train_ss.segset = label
+     train_ss.modelset = label
+     train_ss.stat1 = data
+     train_ss.stat0 = numpy.ones((nb_vectors, 1))
+     train_ss.start = numpy.empty(nb_vectors, dtype="object")
+     train_ss.stop = numpy.empty(nb_vectors, dtype="object")
+
+     return gaussian_backend_train(train_ss)
+
+
+ def gaussian_backend_test(test_ss, params, diag=False, compute_llr=True):
+     """
+     Process data through a Gaussian-Backend which parameters (mean and variance)
+     have been estimated using Gaussian_Backend_Train.
+
+     If compute_llr is set to true, return the log-likelihood ratio, if not,
+     return the log-likelihood on each Gaussian distribution. Default is True.
+
+     :param test_ss: a StatServer which stat1 are vectors to classify
+     :param params: Gaussian Backend parameters, a tuple of mean, covariance
+         and constant computed with Gaussian_Backend_Train
+     :param diag: boolean, if true: use the diagonal version of the covariance
+         matrix (a full matrix is reduced to its diagonal, a 1-dimensional
+         covariance is taken as the vector of variances), if not the full
+         version
+     :param compute_llr: boolean, if true, return the log-likelihood ratio, if not,
+         return the log-likelihood on each Gaussian distribution.
+     :return: a Scores object of shape nb_models x nb_test_segments
+     """
+     gb_mean, gb_sigma, gb_cst = params
+     nb_models = gb_mean.modelset.shape[0]
+
+     scores = Scores()
+     scores.modelset = gb_mean.modelset
+     scores.segset = test_ss.segset
+     scores.scoremat = numpy.ones((nb_models, test_ss.segset.shape[0]))
+     scores.scoremask = numpy.ones(scores.scoremat.shape, dtype='bool')
+
+     if diag:
+         gb_gmm = Mixture()
+         gb_gmm.w = numpy.ones(nb_models, dtype='float') / nb_models
+         gb_gmm.mu = gb_mean.stat1
+         if gb_sigma.ndim == 2:
+             gb_gmm.invcov = numpy.tile(1 / numpy.diag(gb_sigma), (nb_models, 1))
+         elif gb_sigma.ndim == 1:
+             # Covariance already given as a vector of variances. The branch
+             # used to test ndim == 2 again and could never be reached.
+             gb_gmm.invcov = numpy.tile(1. / gb_sigma, (nb_models, 1))
+         gb_gmm._compute_all()
+
+         scores.scoremat = gb_gmm.compute_log_posterior_probabilities(test_ss.stat1).T
+
+     else:
+         assert gb_sigma.ndim == 2
+         scores.scoremat *= gb_cst
+
+         inv_sigma = numpy.linalg.inv(gb_sigma)
+
+         # Compute scores for all trials per language
+         for lang in range(nb_models):
+             scores.scoremat[lang, :] -= 0.5 * (gb_mean.stat1[lang, :].dot(inv_sigma).dot(gb_mean.stat1[lang, :].T) -
+                                                2 * numpy.sum(test_ss.stat1.dot(inv_sigma) * gb_mean.stat1[lang, :],
+                                                              axis=1) +
+                                                numpy.sum(test_ss.stat1.dot(inv_sigma) * test_ss.stat1, axis=1))
+
+     if compute_llr:
+         scores.scoremat = compute_log_likelihood_ratio(scores.scoremat)
+
+     assert scores.validate()
+     return scores
+
+
+ def gaussian_backend_test_hetero(test_ss, params, diag=False, compute_llr=True):
+     """
+     Score test vectors against a Gaussian backend with one covariance per class.
+
+     :param test_ss: a StatServer which stat1 are vectors to classify
+     :param params: Gaussian Backend parameters, a tuple of mean, list of
+         covariances and list of constants, one per class
+     :param diag: boolean, if true: use diagonal covariances (the diagonal of
+         each matrix, or each 1-dimensional entry as is), if not the full
+         matrices
+     :param compute_llr: boolean, if true, return the log-likelihood ratio, if not,
+         return the log-likelihood on each Gaussian distribution.
+     :return: a Scores object of shape nb_models x nb_test_segments
+     """
+     gb_mean, gb_sigma, gb_cst = params
+     nb_lang = gb_mean.modelset.shape[0]
+
+     scores = sidekit.Scores()
+     scores.modelset = gb_mean.modelset
+     scores.segset = test_ss.segset
+     scores.scoremat = numpy.ones((nb_lang, test_ss.segset.shape[0]))
+     scores.scoremask = numpy.ones(scores.scoremat.shape, dtype='bool')
+
+     if diag:
+         gb_gmm = sidekit.Mixture()
+         gb_gmm.w = numpy.ones(nb_lang, dtype='float') / nb_lang
+         gb_gmm.mu = gb_mean.stat1
+         gb_gmm.invcov = numpy.empty(gb_gmm.mu.shape)
+         # The layout of the first covariance decides how all are read
+         for idx in range(len(gb_sigma)):
+             if gb_sigma[0].ndim == 2:
+                 gb_gmm.invcov[idx, :] = 1 / numpy.diag(gb_sigma[idx])
+             elif gb_sigma[0].ndim == 1:
+                 gb_gmm.invcov[idx, :] = 1 / gb_sigma[idx]
+         gb_gmm._compute_all()
+
+         scores.scoremat = gb_gmm.compute_log_posterior_probabilities(test_ss.stat1).T
+
+     else:
+         assert gb_sigma[0].ndim == 2
+         for lang in range(nb_lang):
+             scores.scoremat[lang, :] *= gb_cst[lang]
+
+         inv_sigma = numpy.linalg.inv(gb_sigma)
+
+         # Subtract half the Mahalanobis distance, expanded term by term
+         for lang in range(nb_lang):
+             mean_vec = gb_mean.stat1[lang, :]
+             precision = inv_sigma[lang]
+             scores.scoremat[lang, :] -= 0.5 * (
+                 mean_vec.dot(precision).dot(mean_vec.T) -
+                 2 * numpy.sum(test_ss.stat1.dot(precision) * mean_vec, axis=1) +
+                 numpy.sum(test_ss.stat1.dot(precision) * test_ss.stat1, axis=1))
+
+     if compute_llr:
+         scores.scoremat = compute_log_likelihood_ratio(scores.scoremat)
+
+     assert scores.validate()
+     return scores
diff --git a/sidekit/sidekit/mixture.py b/sidekit/sidekit/mixture.py
new file mode 100755
index 0000000..0636a4c
--- /dev/null
+++ b/sidekit/sidekit/mixture.py
@@ -0,0 +1,1029 @@
+
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`mixture` provides methods to manage Gaussian mixture models
+
+"""
+import copy
+import h5py
+import numpy
+import struct
+import ctypes
+import multiprocessing
+import warnings
+from .sidekit_wrappers import *
+from .bosaris import IdMap
+from .sv_utils import mean_std_many
+import sys
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+ def sum_log_probabilities(lp):
+     """Normalize log-probabilities row by row and sum their log-likelihood.
+
+     The row maximum is subtracted before exponentiating. Rows whose maximum
+     is not finite keep that maximum as their log-likelihood.
+
+     :param lp: 2-dimensional numpy array of log-probabilities, summed along
+         axis 1
+     :return: a tuple (pp, llk) where pp holds the exponentials of lp minus
+         the log-likelihood of its row, and llk is the sum of the row
+         log-likelihoods
+     """
+     row_max = numpy.max(lp, axis=1)
+     shifted = (lp.transpose() - row_max).T
+     log_lk = row_max + numpy.log(numpy.sum(numpy.exp(shifted), axis=1))
+
+     non_finite = ~numpy.isfinite(row_max)
+     if non_finite.any():
+         log_lk[non_finite] = row_max[non_finite]
+
+     pp = numpy.exp((lp.transpose() - log_lk).transpose())
+     return pp, log_lk.sum()
+
+
+ def vad_energy(log_energy,
+ distrib_nb=3,
+ nb_train_it=8,
+ flooring=0.0001, ceiling=1.0,
+ alpha=2):
+ """
+ Select frames by thresholding their normalized log-energy.
+
+ The log-energy is centered and scaled to unit standard deviation, a
+ Mixture of ``distrib_nb`` one-dimensional Gaussians is fitted to it with
+ EM, and the threshold is set ``alpha`` standard deviations below the
+ highest mean, using the variance of that same component.
+
+ :param log_energy: array of log-energy values (presumably one per frame)
+ :param distrib_nb: number of Gaussian distributions in the Mixture
+ :param nb_train_it: number of EM iterations
+ :param flooring: variance floor, passed to Mixture._maximization
+ :param ceiling: variance ceiling, passed to Mixture._maximization
+ :param alpha: number of standard deviations subtracted from the highest
+ mean to obtain the threshold
+ :return: a tuple (label, threshold) where label is the boolean array
+ ``normalized log_energy > threshold``
+ """
+ # center and normalize the energy
+ log_energy = (log_energy - numpy.mean(log_energy)) / numpy.std(log_energy)
+
+ # Initialize a Mixture with distrib_nb distributions
+ world = Mixture()
+ # set the inverse covariance of each component to 1.0 and spread the means
+ # evenly from -2 to 2
+ # NOTE(review): distrib_nb == 1 divides by zero in the mean initialization
+ world.cst = numpy.ones(distrib_nb) / (numpy.pi / 2.0)
+ world.det = numpy.ones(distrib_nb)
+ world.mu = -2 + 4.0 * numpy.arange(distrib_nb) / (distrib_nb - 1)
+ world.mu = world.mu[:, numpy.newaxis]
+ world.invcov = numpy.ones((distrib_nb, 1))
+ # set equal weights for each component
+ world.w = numpy.ones(distrib_nb) / distrib_nb
+ world.cov_var_ctl = copy.deepcopy(world.invcov)
+
+ # Initialize the accumulator
+ accum = copy.deepcopy(world)
+
+ # Perform nb_train_it iterations of EM
+ for it in range(nb_train_it):
+ accum._reset()
+ # E-step
+ world._expectation(accum, log_energy)
+ # M-step
+ world._maximization(accum, ceiling, flooring)
+
+ # Compute threshold: highest mean minus alpha standard deviations of that component
+ threshold = world.mu.max() - alpha * numpy.sqrt(1.0 / world.invcov[world.mu.argmax(), 0])
+
+ # Apply frame selection with the current threshold
+ label = log_energy > threshold
+ return label, threshold
+
+
+class Mixture(object):
+ """
+ A class for Gaussian Mixture Model storage.
+ For more details about Gaussian Mixture Models (GMM) you can refer to
+ [Bimbot04]_.
+
+ :attr w: array of weight parameters
+ :attr mu: ndarray of mean parameters, each line is one distribution
+ :attr invcov: ndarray of inverse co-variance parameters, 2-dimensional
+ for diagonal co-variance distribution 3-dimensional for full co-variance
+ :attr invchol: 3-dimensional ndarray containing upper cholesky
+ decomposition of the inverse co-variance matrices
+ :attr cst: array of constant computed for each distribution
+ :attr det: array of determinant for each distribution
+
+ """
+ @staticmethod
+ def read_alize(file_name):
+ """
+
+ :param file_name:
+ :return:
+ """
+ """Read a Mixture in alize raw format
+
+ :param mixtureFileName: name of the file to read from
+ """
+ mixture = Mixture()
+
+ with open(file_name, 'rb') as f:
+ distrib_nb = struct.unpack("I", f.read(4))[0]
+ vect_size = struct.unpack("':
+ distrib_nb = int(w[1])
+ mixture.w.resize(distrib_nb)
+ mixture.cst.resize(distrib_nb)
+ mixture.det.resize(distrib_nb)
+
+ if w[0] == '':
+ begin_hmm = True
+
+ if w[0] == '':
+ state2 = True
+
+ if begin_hmm & state2:
+
+ if w[0].upper() == '':
+ distrib = int(w[1]) - 1
+ mixture.w[distrib] = numpy.double(w[2])
+
+ elif w[0].upper() == '':
+ if vect_size == 0:
+ vect_size = int(w[1])
+ mixture.mu.resize(distrib_nb, vect_size)
+ i += 1
+ mixture.mu[distrib, :] = numpy.double(lines[i].split())
+
+ elif w[0].upper() == '':
+ if mixture.invcov.shape[0] == 0:
+ vect_size = int(w[1])
+ mixture.invcov.resize(distrib_nb, vect_size)
+ i += 1
+ C = numpy.double(lines[i].split())
+ mixture.invcov[distrib, :] = 1 / C
+
+ elif w[0].upper() == '':
+ raise Exception("we don't manage full covariance model")
+ elif w[0].upper() == '':
+ mixture.cst[distrib] = numpy.exp(-.05 * numpy.double(w[1]))
+ mixture._compute_all()
+ return mixture
+
+ def __init__(self,
+              mixture_file_name='',
+              name='empty'):
+     """Create an empty Mixture, optionally filled from a file.
+
+     :param mixture_file_name: name of the file to read from, if empty, initialize
+         an empty mixture
+     :param name: name given to the mixture
+     """
+     # Every array parameter starts as its own empty array
+     for attr in ('w', 'mu', 'invcov', 'invchol', 'cov_var_ctl', 'cst', 'det'):
+         setattr(self, attr, numpy.array([]))
+     self.name = name
+     self.A = 0
+
+     if mixture_file_name != '':
+         self.read(mixture_file_name)
+
+ @accepts('Mixture', 'Mixture', debug=2)
+ def __add__(self, other):
+     """Add two mixtures parameter by parameter.
+
+     Weights, means and inverse covariances are summed; every other
+     attribute keeps the value set by the Mixture constructor.
+
+     :param other: the Mixture to add to this one
+     :return: a new Mixture holding the summed parameters
+     """
+     summed = Mixture()
+     summed.w = self.w + other.w
+     summed.mu = self.mu + other.mu
+     summed.invcov = self.invcov + other.invcov
+     return summed
+
+ def init_from_diag(self, diag_mixture):
+     """
+     Initialize this full-covariance Mixture from a diagonal one.
+
+     Weights, constants, determinants and means are taken from the diagonal
+     mixture as is; each inverse covariance vector becomes a diagonal matrix,
+     stored together with its Cholesky decomposition.
+
+     :param diag_mixture: Mixture with diagonal (2-dimensional) invcov
+     """
+     distrib_nb = diag_mixture.w.shape[0]
+     dim = diag_mixture.mu.shape[1]
+
+     for attr in ('w', 'cst', 'det', 'mu'):
+         setattr(self, attr, getattr(diag_mixture, attr))
+
+     full_shape = (distrib_nb, dim, dim)
+     self.invcov = numpy.empty(full_shape)
+     self.invchol = numpy.empty(full_shape)
+     for gg in range(distrib_nb):
+         self.invcov[gg] = numpy.diag(diag_mixture.invcov[gg, :])
+         self.invchol[gg] = numpy.linalg.cholesky(self.invcov[gg])
+     self.cov_var_ctl = numpy.diag(diag_mixture.cov_var_ctl)
+     self.name = diag_mixture.name
+     # we keep zero here as it is not used for full covariance distributions
+     self.A = numpy.zeros(self.cst.shape)
+
+ def _serialize(self):
+     """
+     Rebind the array parameters to multiprocessing shared memory.
+
+     w, mu, invcov, cov_var_ctl, cst and det are each replaced by a numpy
+     view, with the original shape, on a freshly allocated
+     multiprocessing.Array of doubles of the same size. The previous
+     contents are not copied into the new buffers.
+     """
+     with warnings.catch_warnings():
+         warnings.simplefilter('ignore', RuntimeWarning)
+
+         for attr in ('w', 'mu', 'invcov', 'cov_var_ctl', 'cst', 'det'):
+             current = getattr(self, attr)
+             shared = multiprocessing.Array(ctypes.c_double, current.size)
+             view = numpy.ctypeslib.as_array(shared.get_obj())
+             setattr(self, attr, view.reshape(current.shape))
+
+ def get_distrib_nb(self):
+     """
+     Give the number of Gaussian distributions in the mixture, read from
+     the length of the weight vector.
+
+     :return: the number of distributions
+     """
+     distrib_nb = self.w.shape[0]
+     return distrib_nb
+
+ def read(self, mixture_file_name, prefix=''):
+     """Load the parameters of the Mixture from a file in hdf5 format.
+
+     :param mixture_file_name: name of the file to read from
+     :param prefix: string prepended to every dataset name looked up in
+         the file
+     """
+     with h5py.File(mixture_file_name, 'r') as f:
+         # The weights are flattened to the length of their longest axis
+         self.w = f.get(prefix+'w')[()]
+         self.w.resize(numpy.max(self.w.shape))
+         for attr in ('mu', 'invcov', 'invchol', 'cov_var_ctl', 'cst', 'det'):
+             setattr(self, attr, f.get(prefix + attr)[()])
+         # The constant term A is stored under the lower-case name 'a'
+         self.A = f.get(prefix+'a')[()]
+
+ @check_path_existance
+ def write_alize(self, mixture_file_name):
+ """Save a mixture in alize raw format
+
+ :param mixture_file_name: name of the file to write in
+ """
+ with open(mixture_file_name, 'wb') as of:
+ # write the number of distributions per state
+ of.write(struct.pack("ijm', tmp, self.invchol)
+ lp = numpy.log(self.w[:, numpy.newaxis]) + numpy.log(self.cst[:, numpy.newaxis]) - 0.5 * (a * a).sum(-1)
+
+ return lp.T
+
+ def compute_log_posterior_probabilities(self, cep, mu=None):
+     """ Compute log posterior probabilities for a set of feature frames.
+
+     :param cep: a set of feature frames in a ndarray, one feature per row;
+         a 1-dimensional input is treated as a single frame
+     :param mu: a mean super-vector to replace the ubm's one. If it is None,
+         the means of the Mixture are used together with its stored term A
+
+     :return: A ndarray of log-posterior probabilities corresponding to the
+         input feature set.
+     """
+     if cep.ndim == 1:
+         cep = cep[numpy.newaxis, :]
+
+     if mu is None:
+         mu = self.mu
+         A = self.A
+     else:
+         # for MAP, recompute the data independent term from the new means
+         A = (numpy.square(mu.reshape(self.mu.shape)) * self.invcov).sum(1) \
+             - 2.0 * (numpy.log(self.w) + numpy.log(self.cst))
+
+     # Compute the data dependent term
+     means = mu.reshape(self.mu.shape)
+     B = numpy.dot(numpy.square(cep), self.invcov.T) \
+         - 2.0 * numpy.dot(cep, numpy.transpose(means * self.invcov))
+
+     # Compute the exponential term
+     return -0.5 * (B + A)
+
+ @staticmethod
+ def variance_control(cov, flooring, ceiling, cov_ctl):
+     """Clamp covariance values between a floor and a ceiling.
+
+     The bounds are ``flooring * cov_ctl`` and ``ceiling * cov_ctl``. The
+     input array is modified in place and also returned.
+
+     :param cov: covariance to control
+     :param flooring: float, flooring value
+     :param ceiling: float, ceiling value
+     :param cov_ctl: co-variance to consider for flooring and ceiling
+     :return: cov, after clamping
+     """
+     lower = flooring * cov_ctl
+     upper = ceiling * cov_ctl
+
+     # Both masks are taken before any value is overwritten
+     too_small = cov <= lower
+     too_large = cov >= upper
+
+     cov[too_small] = lower[too_small]
+     cov[too_large] = upper[too_large]
+     return cov
+
+ def _reset(self):
+     """Zero the array parameters in place and set A to 0.0.
+
+     cst, det, w, mu and invcov are filled with zeros; the arrays keep
+     their shape.
+     """
+     for params in (self.cst, self.det, self.w, self.mu, self.invcov):
+         params.fill(0.0)
+     self.A = 0.0
+
+ def _split_ditribution(self):
+     """Double the number of distributions by splitting each one in two.
+
+     Every mean is shifted by plus and minus one standard deviation along
+     the dimension where its variance is the largest. Inverse covariances
+     and variance control values are duplicated, weights are halved, cst
+     and det are zeroed, and _compute_all is called afterwards."""
+     sigma = 1.0 / self.invcov
+     widest_axis = numpy.argmax(sigma, axis=1)
+     widest_var = numpy.max(sigma, axis=1)
+
+     # One non-zero entry per distribution, on its widest dimension
+     shift = numpy.zeros(self.mu.shape)
+     shift[numpy.arange(widest_axis.shape[0]), widest_axis] = numpy.sqrt(widest_var)
+
+     self.mu = numpy.vstack((self.mu - shift, self.mu + shift))
+     self.invcov = numpy.vstack((self.invcov, self.invcov))
+     self.w = numpy.concatenate([self.w, self.w]) * 0.5
+     self.cst = numpy.zeros(self.w.shape)
+     self.det = numpy.zeros(self.w.shape)
+     self.cov_var_ctl = numpy.vstack((self.cov_var_ctl, self.cov_var_ctl))
+
+     self._compute_all()
+
+ def _expectation(self, accum, cep):
+ """Expectation step of the EM algorithm. Compute the posterior
+ probabilities of the frames and add the resulting zero, first and
+ second order statistics to the accumulator.
+
+ :param accum: a Mixture object to store the accumulated statistics; its
+ w, mu and invcov attributes are incremented in place
+ :param cep: a set of input feature frames; a 1-dimensional array is
+ reshaped into a single column
+
+ :return loglk: float, the log-likelihood computed over the input set of
+ feature frames.
+ """
+ if cep.ndim == 1:
+ cep = cep[:, numpy.newaxis]
+ # Diagonal (2-D invcov) or full (3-D invcov) covariance model
+ # NOTE(review): lp stays undefined for any other invcov.ndim
+ if self.invcov.ndim == 2:
+ lp = self.compute_log_posterior_probabilities(cep)
+ elif self.invcov.ndim == 3:
+ lp = self.compute_log_posterior_probabilities_full(cep)
+ pp, loglk = sum_log_probabilities(lp)
+
+ # zero order statistics
+ accum.w += pp.sum(0)
+ # first order statistics
+ accum.mu += numpy.dot(cep.T, pp).T
+ # second order statistics (stored in accum.invcov)
+ if self.invcov.ndim == 2:
+ accum.invcov += numpy.dot(numpy.square(cep.T), pp).T # version for diagonal covariance
+ elif self.invcov.ndim == 3:
+ # outer product of every frame with itself, then weighted sum per distribution
+ tmp = numpy.einsum('ijk,ilk->ijl', cep[:, :, numpy.newaxis], cep[:, :, numpy.newaxis])
+ accum.invcov += numpy.einsum('ijk,im->mjk', tmp, pp)
+
+ # return the log-likelihood
+ return loglk
+
+    @process_parallel_lists
+    def _expectation_list(self, stat_acc, feature_list, feature_server, llk_acc=None, num_thread=1):
+        """
+        Expectation step of the EM algorithm over a list of feature files.
+
+        The accumulator is reset, then every file of ``feature_list`` is
+        loaded through ``feature_server`` and passed to ``_expectation``.
+
+        :param stat_acc: a Mixture object used to accumulate the statistics;
+            it is reset at the beginning of the call
+        :param feature_list: list of the files to load
+        :param feature_server: object used to load the features; its
+            ``keep_all_features`` attribute is set to False
+        :param llk_acc: one-element array whose first value is incremented by
+            the log-likelihood of each file; a fresh array is created when
+            None is given
+        :param num_thread: not used in the body; presumably consumed by the
+            ``process_parallel_lists`` decorator (to be confirmed)
+        :return: None, results are stored in ``stat_acc`` and ``llk_acc``
+        """
+        # A default array created in the signature would be shared (and keep
+        # accumulating) across calls, so it is created per call instead.
+        if llk_acc is None:
+            llk_acc = numpy.zeros(1)
+
+        stat_acc._reset()
+        feature_server.keep_all_features = False
+        for feat in feature_list:
+            cep = feature_server.load(feat)[0]
+            llk_acc[0] += self._expectation(stat_acc, cep)
+
+    @process_parallel_lists
+    def _expectation_list_idmap(self, stat_acc,
+                                feature_list,
+                                start_list,
+                                stop_list,
+                                feature_server,
+                                llk_acc=None,
+                                num_thread=1):
+        """
+        Expectation step of the EM algorithm over a list of segments.
+
+        The accumulator is reset, then every segment described by the
+        aligned ``feature_list``, ``start_list`` and ``stop_list`` is loaded
+        through ``feature_server`` and passed to ``_expectation``.
+
+        :param stat_acc: a Mixture object used to accumulate the statistics;
+            it is reset at the beginning of the call
+        :param feature_list: list of the files to load
+        :param start_list: ``start`` value given to the loader for each file
+        :param stop_list: ``stop`` value given to the loader for each file
+        :param feature_server: object used to load the features; its
+            ``keep_all_features`` attribute is set to False
+        :param llk_acc: one-element array whose first value is incremented by
+            the log-likelihood of each segment; a fresh array is created when
+            None is given
+        :param num_thread: not used in the body; presumably consumed by the
+            ``process_parallel_lists`` decorator (to be confirmed)
+        :return: None, results are stored in ``stat_acc`` and ``llk_acc``
+        """
+        # A default array created in the signature would be shared (and keep
+        # accumulating) across calls, so it is created per call instead.
+        if llk_acc is None:
+            llk_acc = numpy.zeros(1)
+
+        stat_acc._reset()
+        feature_server.keep_all_features = False
+        for feat, start, stop in zip(feature_list, start_list, stop_list):
+            cep = feature_server.load(feat, start=start, stop=stop)[0]
+            llk_acc[0] += self._expectation(stat_acc, cep)
+
+    def _maximization(self, accum, ceil_cov=10, floor_cov=1e-2):
+        """M step: re-estimate the parameters from the accumulated statistics.
+
+        Weights and means are always updated. For diagonal models
+        (``accum.invcov`` has 2 dimensions) the covariance goes through
+        ``Mixture.variance_control`` before being inverted. For full models
+        (3 dimensions) each covariance matrix is inverted and the transposed
+        Cholesky factor of the inverse is stored in ``invchol``; no variance
+        control is applied in that case.
+
+        :param accum: a Mixture in which the statistics computed during the
+            E step are stored
+        :param ceil_cov: upper bound passed to the variance control, default is 10
+        :param floor_cov: lower bound passed to the variance control, default is 1e-2
+        """
+        occupancy = accum.w
+        self.w = occupancy / numpy.sum(occupancy)
+        self.mu = accum.mu / occupancy[:, numpy.newaxis]
+
+        stat_rank = accum.invcov.ndim
+        if stat_rank == 2:
+            second_moment = accum.invcov / occupancy[:, numpy.newaxis]
+            cov = second_moment - numpy.square(self.mu)
+            cov = Mixture.variance_control(cov, floor_cov, ceil_cov, self.cov_var_ctl)
+            self.invcov = 1.0 / cov
+        elif stat_rank == 3:
+            mean_col = self.mu[:, :, numpy.newaxis]
+            second_moment = accum.invcov / occupancy[:, numpy.newaxis, numpy.newaxis]
+            cov = second_moment - numpy.einsum('ijk,ilk->ijl', mean_col, mean_col)
+            # ADD VARIANCE CONTROL
+            for gg in range(self.w.shape[0]):
+                precision = numpy.linalg.inv(cov[gg])
+                self.invcov[gg] = precision
+                self.invchol[gg] = numpy.linalg.cholesky(self.invcov[gg]).T
+        self._compute_all()
+
+    def _init(self, features_server, feature_list, start_list=None, stop_list=None, num_thread=1):
+        """
+        Initialize the Mixture as a single Gaussian distribution estimated
+        on a set of feature frames.
+
+        :param features_server: object used to stack the features of all files
+        :param feature_list: list of the files to load
+        :param start_list: optional ``start_list`` forwarded to the features server
+        :param stop_list: optional ``stop_list`` forwarded to the features server
+        :param num_thread: when different from 1, the parallel stacking
+            method of the features server is used
+        :return: None, the model is updated in place
+        """
+        # Stack all the frames, sequentially or in parallel
+        segments = dict(start_list=start_list, stop_list=stop_list)
+        if num_thread == 1:
+            features = features_server.stack_features(feature_list, **segments)
+        else:
+            features = features_server.stack_features_parallel(feature_list,
+                                                               num_thread=num_thread,
+                                                               **segments)
+
+        mean = features.mean(0)
+        # NOTE(review): this is the mean of the squared features (raw second
+        # moment); it equals the variance only for zero-mean features -- confirm.
+        second_moment = (features**2).mean(0)
+
+        self.mu = mean[None]
+        self.invcov = 1./second_moment[None]
+        self.w = numpy.asarray([1.0])
+        self.cst = numpy.zeros(self.w.shape)
+        self.det = numpy.zeros(self.w.shape)
+        self.cov_var_ctl = 1.0 / copy.deepcopy(self.invcov)
+        self._compute_all()
+
+    def EM_split(self,
+                 features_server,
+                 feature_list,
+                 distrib_nb,
+                 iterations=(1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8),
+                 num_thread=1,
+                 llk_gain=0.01,
+                 save_partial=False,
+                 output_file_name="ubm",
+                 ceil_cov=10,
+                 floor_cov=1e-2):
+        """Expectation-Maximization estimation of the Mixture parameters,
+        doubling the number of distributions at each step.
+
+        The model is initialized as a single Gaussian on the first 20
+        sessions, then split ``int(log2(distrib_nb))`` times; after each
+        split the number of EM iterations given by ``iterations`` is run.
+
+        :param features_server: sidekit.FeaturesServer used to load data
+        :param feature_list: list of feature files to train the GMM, or an
+            IdMap whose ``rightids``, ``start`` and ``stop`` are used
+        :param distrib_nb: final number of distributions
+        :param iterations: list of iteration number for each step of the learning process
+        :param num_thread: number of thread to launch for parallel computing
+        :param llk_gain: limit of the training gain; currently NOT used, all
+            the requested iterations are always run
+        :param save_partial: if True, save the current model before each
+            split of the distributions
+        :param output_file_name: prefix of the files written when
+            ``save_partial`` is set; files are named ``<prefix>_<N>g.h5``
+            where N is the current number of distributions
+        :param ceil_cov: upper bound forwarded to the M step
+        :param floor_cov: lower bound forwarded to the M step
+
+        :return llk: a list of log-likelihoods obtained after each iteration
+        """
+        llk = []
+
+        process_idmap = isinstance(feature_list, IdMap)
+        start_list = None
+        stop_list = None
+        if process_idmap:
+            start_list = feature_list.start
+            stop_list = feature_list.stop
+            session_list = feature_list.rightids
+        else:
+            session_list = feature_list
+
+        # The initial Gaussian is estimated on the first 20 sessions only
+        init_session_list = session_list[:20]
+        init_start_list = None
+        init_stop_list = None
+        if start_list is not None:
+            init_start_list = start_list[:20]
+            init_stop_list = stop_list[:20]
+        self._init(features_server,
+                   init_session_list,
+                   start_list=init_start_list,
+                   stop_list=init_stop_list,
+                   num_thread=num_thread)
+
+        # One entry of `iterations` is consumed per split
+        for it in iterations[:int(numpy.log2(distrib_nb))]:
+            # Save current model before splitting
+            if save_partial:
+                self.write('{}_{}g.h5'.format(output_file_name, self.get_distrib_nb()), prefix='')
+
+            self._split_ditribution()
+
+            # initialize the accumulator
+            accum = copy.deepcopy(self)
+
+            for _ in range(it):
+                accum._reset()
+
+                # serialize the accum
+                accum._serialize()
+
+                # The log-likelihood accumulator is backed by a
+                # multiprocessing.Array so that it can be handed to the
+                # parallel E step.
+                llk_acc = numpy.zeros(1)
+                sh = llk_acc.shape
+                with warnings.catch_warnings():
+                    warnings.simplefilter('ignore', RuntimeWarning)
+                    tmp = multiprocessing.Array(ctypes.c_double, llk_acc.size)
+                    llk_acc = numpy.ctypeslib.as_array(tmp.get_obj())
+                    llk_acc = llk_acc.reshape(sh)
+
+                logging.debug('Expectation')
+                # E step
+                if process_idmap:
+                    self._expectation_list_idmap(stat_acc=accum,
+                                                 feature_list=session_list,
+                                                 start_list=start_list,
+                                                 stop_list=stop_list,
+                                                 feature_server=features_server,
+                                                 llk_acc=llk_acc,
+                                                 num_thread=num_thread)
+                else:
+                    self._expectation_list(stat_acc=accum,
+                                           feature_list=session_list,
+                                           feature_server=features_server,
+                                           llk_acc=llk_acc,
+                                           num_thread=num_thread)
+                # Log-likelihood divided by the total accumulated occupancy
+                llk.append(llk_acc[0] / numpy.sum(accum.w))
+
+                # M step
+                logging.debug('Maximisation')
+                self._maximization(accum, ceil_cov=ceil_cov, floor_cov=floor_cov)
+
+        return llk
+
+    def EM_uniform(self, cep, distrib_nb, iteration_min=3, iteration_max=10,
+                   llk_gain=0.01, do_init=True):
+
+        """Expectation-Maximization estimation of the Mixture parameters.
+
+        :param cep: set of feature frames to consider
+        :param distrib_nb: number of distributions
+        :param iteration_min: minimum number of iterations to perform
+        :param iteration_max: maximum number of iterations to perform
+        :param llk_gain: gain in term of likelihood, stop the training when the gain is less than this value
+        :param do_init: boolean, if True initialize the GMM from the training data
+
+        :return llk: a list of log-likelihoods obtained after each iteration
+
+        """
+        llk = []
+
+        if do_init:
+            self._init_uniform(cep, distrib_nb)
+        accum = copy.deepcopy(self)
+
+        for i in range(0, iteration_max):
+            accum._reset()
+            # serialize the accum
+            accum._serialize()
+
+            # E step: the log-likelihood is divided by the number of frames.
+            # No shared log-likelihood buffer is needed here because the
+            # E step runs directly in this process.
+            llk.append(self._expectation(accum, cep) / cep.shape[0])
+
+            # M step
+            self._maximization(accum)
+            if i > 0:
+                gain = llk[-1] - llk[-2]
+                # Stop only once the minimum number of iterations is reached
+                if gain < llk_gain and i >= iteration_min:
+                    logging.debug(
+                        'EM (break) distrib_nb: %d %i/%d gain: %f -- %s, %d',
+                        self.mu.shape[0], i + 1, iteration_max, gain, self.name,
+                        len(cep))
+                    break
+                else:
+                    logging.debug(
+                        'EM (continu) distrib_nb: %d %i/%d gain: %f -- %s, %d',
+                        self.mu.shape[0], i + 1, iteration_max, gain, self.name,
+                        len(cep))
+            else:
+                logging.debug(
+                    'EM (start) distrib_nb: %d %i/%i llk: %f -- %s, %d',
+                    self.mu.shape[0], i + 1, iteration_max, llk[-1],
+                    self.name, len(cep))
+        return llk
+
+    def _init_uniform(self, cep, distrib_nb):
+        """
+        Initialize ``distrib_nb`` distributions with uniform weights from
+        consecutive portions of the training frames.
+
+        For the i-th distribution, the mean and the mean of the squared
+        features are computed on ``cep[start:end]`` where
+        ``start = nb // distrib_nb * i``. ``invcov`` stores the inverse of
+        the mean of squares for EVERY distribution.
+
+        :param cep: matrix of acoustic frames, one frame per row
+        :param distrib_nb: number of distributions
+        """
+        nb = cep.shape[0]
+        self.w = numpy.full(distrib_nb, 1.0 / distrib_nb, "d")
+        self.cst = numpy.zeros(distrib_nb, "d")
+        self.det = numpy.zeros(distrib_nb, "d")
+
+        means = []
+        inv_covs = []
+        for i in range(0, distrib_nb):
+            start = nb // distrib_nb * i
+            # NOTE(review): with `max` the slice always runs to the last
+            # frame; `min(start + 10, nb)` may have been intended -- confirm.
+            end = max(start + 10, nb)
+            frames = cep[start:end, :]
+            means.append(numpy.mean(frames, axis=0))
+            # NOTE(review): raw second moment, equal to the variance only
+            # for zero-mean frames -- confirm.
+            cov = (frames**2).mean(0)
+            # Store the INVERSE for all the distributions, not only the first one
+            inv_covs.append(1. / cov)
+
+        # Always 2-D, one row per distribution, even when distrib_nb == 1
+        self.mu = numpy.vstack(means)
+        self.invcov = numpy.vstack(inv_covs)
+        self.cov_var_ctl = 1.0 / copy.deepcopy(self.invcov)
+
+        self._compute_all()
+
+    def EM_diag2full(self, diagonal_mixture, features_server, featureList, iterations=2, num_thread=1):
+        """Expectation-Maximization estimation of a full covariance Mixture
+        initialized from a diagonal one.
+
+        :param diagonal_mixture: Mixture with diagonal covariances whose
+            weights, means, constants and name are reused; its inverse
+            covariances are expanded into diagonal matrices
+        :param features_server: sidekit.FeaturesServer used to load data
+        :param featureList: list of feature files to train the GMM
+        :param iterations: number of EM iterations to run
+        :param num_thread: number of thread to launch for parallel computing
+
+        :return llk: a list of log-likelihoods obtained after each iteration
+        """
+        llk = []
+
+        # Convert the covariance matrices into full ones
+        distrib_nb = diagonal_mixture.w.shape[0]
+        dim = diagonal_mixture.mu.shape[1]
+
+        self.w = diagonal_mixture.w
+        self.cst = diagonal_mixture.cst
+        self.det = diagonal_mixture.det
+        self.mu = diagonal_mixture.mu
+
+        self.invcov = numpy.empty((distrib_nb, dim, dim))
+        self.invchol = numpy.empty((distrib_nb, dim, dim))
+        for gg in range(distrib_nb):
+            self.invcov[gg] = numpy.diag(diagonal_mixture.invcov[gg, :])
+            self.invchol[gg] = numpy.linalg.cholesky(self.invcov[gg])
+        # NOTE(review): when given a 2-D array numpy.diag returns its main
+        # diagonal, not one matrix per distribution -- confirm this is intended.
+        self.cov_var_ctl = numpy.diag(diagonal_mixture.cov_var_ctl)
+        self.name = diagonal_mixture.name
+        self.A = numpy.zeros(self.cst.shape)  # we keep zero here as it is not used for full covariance distributions
+
+        # Create Accumulator
+        accum = copy.deepcopy(self)
+
+        # Run iterations of EM
+        for it in range(iterations):
+            logging.debug('EM convert full it: %d', it)
+
+            accum._reset()
+
+            # serialize the accum
+            accum._serialize()
+
+            # The log-likelihood accumulator is backed by a
+            # multiprocessing.Array so that it can be handed to the
+            # parallel E step.
+            llk_acc = numpy.zeros(1)
+            sh = llk_acc.shape
+            with warnings.catch_warnings():
+                warnings.simplefilter('ignore', RuntimeWarning)
+                tmp = multiprocessing.Array(ctypes.c_double, llk_acc.size)
+                llk_acc = numpy.ctypeslib.as_array(tmp.get_obj())
+                llk_acc = llk_acc.reshape(sh)
+
+            logging.debug('Expectation')
+            # E step
+            self._expectation_list(stat_acc=accum,
+                                   feature_list=featureList,
+                                   feature_server=features_server,
+                                   llk_acc=llk_acc,
+                                   num_thread=num_thread)
+            # Log-likelihood divided by the total accumulated occupancy
+            llk.append(llk_acc[0] / numpy.sum(accum.w))
+
+            # M step
+            logging.debug('Maximisation')
+            self._maximization(accum)
+
+        return llk
+
+    def merge(self, model_list):
+        """
+        Replace the content of this Mixture by the concatenation of several
+        Mixtures.
+
+        The weights of all models are concatenated and divided by their
+        total sum; the other parameters are stacked in the order of
+        ``model_list`` and the name is the ``_``-joined list of the names.
+
+        :param model_list: a list of Mixture objects to merge
+        """
+        def gather(field, stack):
+            # Collect one attribute over all the models and stack the values
+            return stack([getattr(model, field) for model in model_list])
+
+        weights = gather("w", numpy.hstack)
+        weights /= weights.sum()
+        self.w = weights
+
+        self.mu = gather("mu", numpy.vstack)
+        self.invcov = gather("invcov", numpy.vstack)
+        self.invchol = gather("invchol", numpy.vstack)
+        self.cov_var_ctl = gather("cov_var_ctl", numpy.vstack)
+        self.cst = gather("cst", numpy.hstack)
+        self.det = gather("det", numpy.hstack)
+        self.name = "_".join([model.name for model in model_list])
+        self.A = gather("A", numpy.hstack)
+
+        self._compute_all()
+        assert self.validate(), "Error while merging models"
diff --git a/sidekit/sidekit/nnet/__init__.py b/sidekit/sidekit/nnet/__init__.py
new file mode 100755
index 0000000..22222a8
--- /dev/null
+++ b/sidekit/sidekit/nnet/__init__.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
+
+:mod:`nnet` provides methods to manage Neural Networks using PyTorch
+"""
+
+
+from .xsets import IdMapSetPerSpeaker
+from .xsets import SideSet
+from .xsets import SideSampler
+from .xvector import Xtractor
+from .xvector import xtrain
+from .xvector import extract_embeddings
+from .pooling import MeanStdPooling
+from .pooling import AttentivePooling
+from .pooling import GruPooling
+from .res_net import ResBlock
+from .res_net import PreResNet34
+from .res_net import PreFastResNet34
+from .res_net import PreHalfResNet34
+from .sincnet import SincNet
+from .preprocessor import RawPreprocessor
+from .preprocessor import MfccFrontEnd
+from .preprocessor import MelSpecFrontEnd
+
+
+# Metadata of the nnet sub-package.
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
diff --git a/sidekit/sidekit/nnet/augmentation.py b/sidekit/sidekit/nnet/augmentation.py
new file mode 100644
index 0000000..7f19dfc
--- /dev/null
+++ b/sidekit/sidekit/nnet/augmentation.py
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+"""
+
+import collections
+import numpy
+import random
+import torch
+import torchaudio
+
+from scipy import signal
+
+
+# Metadata of the augmentation module.
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+# Record with three fields: type, file_id and duration.
+Noise = collections.namedtuple('Noise', 'type file_id duration')
+
+
+class PreEmphasis(torch.nn.Module):
+    """
+    Apply pre-emphasis filtering, i.e. ``y[t] = x[t] - coef * x[t - 1]``,
+    implemented as a 1-D convolution.
+    """
+
+    def __init__(self, coef: float = 0.97):
+        """
+        :param coef: pre-emphasis coefficient, default is 0.97
+        """
+        super().__init__()
+        self.coef = coef
+        # make kernel
+        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
+        self.register_buffer(
+            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
+        )
+
+    def forward(self, input_signal: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the pre-emphasis filtering
+
+        :param input_signal: the input signal, a tensor with exactly two
+            dimensions; the filter runs along the last one
+        :return: the filtered signal, with the same shape as the input
+        """
+        assert len(input_signal.size()) == 2, 'The number of dimensions of input tensor must be 2!'
+        # A channel dimension is inserted for conv1d, and one reflected
+        # sample is added on the left so that in/out lengths match
+        input_signal = input_signal.unsqueeze(1)
+        input_signal = torch.nn.functional.pad(input_signal, (1, 0), 'reflect')
+        return torch.nn.functional.conv1d(input_signal, self.flipped_filter).squeeze(1)
+
+
+class FrequencyMask:
+    """Overwrite a random band of rows of the features with the value 10.
+
+    The band height is drawn in ``[1, max_size)`` and its first row in
+    ``[0, feature_size - max_size)``.
+
+    Args:
+        max_size (int): exclusive upper bound of the band height.
+        feature_size (int): number of rows considered when drawing the band.
+    """
+    def __init__(self, max_size, feature_size):
+        self.max_size = max_size
+        self.feature_size = feature_size
+
+    def __call__(self, sample):
+        """Mask ``sample[0]`` in place when ``sample[2]`` is true.
+
+        Returns a 6-tuple made of the (possibly masked) data followed by
+        ``sample[1]`` to ``sample[5]`` unchanged.
+        """
+        features = sample[0]
+        apply_mask = sample[2]
+        if apply_mask:
+            height = numpy.random.randint(1, self.max_size)
+            first_row = numpy.random.randint(0, self.feature_size - self.max_size)
+            last_row = first_row + height
+            features[first_row:last_row, :] = 10.
+        return (features, sample[1], sample[2], sample[3], sample[4], sample[5])
+
+
+class TemporalMask:
+    """Overwrite a random band of columns of the features with the value 10.
+
+    The band width is drawn in ``[1, max_size)`` and its first column in
+    ``[0, n_columns - max_size)``.
+
+    Args:
+        max_size (int): exclusive upper bound of the band width.
+    """
+    def __init__(self, max_size):
+        self.max_size = max_size
+
+    def __call__(self, sample):
+        """Mask ``sample[0]`` in place when ``sample[3]`` is true.
+
+        Returns a 6-tuple made of the (possibly masked) data followed by
+        ``sample[1]`` to ``sample[5]`` unchanged.
+        """
+        features = sample[0]
+        apply_mask = sample[3]
+        if apply_mask:
+            width = numpy.random.randint(1, self.max_size)
+            first_col = numpy.random.randint(0, sample[0].shape[1] - self.max_size)
+            last_col = first_col + width
+            features[:, first_col:last_col] = 10.
+        return (features, sample[1], sample[2], sample[3], sample[4], sample[5])
+
+
+def normalize(wav):
+    """
+    Scale a waveform by its root mean square value.
+
+    The mean is not removed; a small constant (1e-8) is added to the
+    divisor so that an all-zero input does not divide by zero.
+
+    :param wav: the input waveform
+    :return: the waveform divided by (RMS + 1e-8)
+    """
+    rms = numpy.sqrt(numpy.mean(wav ** 2))
+    return wav / (rms + 1e-8)
+
+
+def crop(input_signal, duration):
+    """
+    Select a chunk at a random position in an audio segment
+
+    :param input_signal: signal to select a chunk from; its first
+        dimension is the one being cropped
+    :param duration: length of the chunk to select, expressed in elements
+        of the first dimension
+    :return: the ``duration`` consecutive elements starting at a position
+        drawn uniformly among the valid ones
+    """
+    last_start = input_signal.shape[0] - duration
+    start = random.randint(0, last_start)
+    return input_signal[start: start + duration]
+
+
+def data_augmentation(speech,
+                      sample_rate,
+                      transform_dict,
+                      transform_number,
+                      noise_df=None,
+                      rir_df=None,
+                      babble_noise=True):
+    """
+    Perform data augmentation on an input signal.
+    Each speech chunk is augmented by using 'transform_number' transformations that are picked up randomly from a
+    dictionary of possible transformations.
+
+    :param speech: the input signal to be augmented
+    :param sample_rate: sampling rate of the input signal to augment
+    :param transform_dict: the dictionary of possibles augmentations to apply
+    :param transform_number: the number of transformations to apply on each chunk
+    :param rir_df: a pandas dataframe object including the list of RIR signals to chose from; default is None
+    :param noise_df: a pandas dataframe object including the list of NOISE signals to chose from; default is None
+    :param babble_noise: boolean that enable the use of babble noise, True by default (typically turned to False when
+        the task includes overlapping speech detection).
+
+    :return: augmented signal
+
+    transformation
+        pipeline: add_noise,add_reverb
+        add_noise:
+            noise_db_csv: filename.csv
+            snr: 5,6,7,8,9,10,11,12,13,14,15
+        add_reverb:
+            rir_db_csv: filename.csv
+        codec: true
+        phone_filtering: true
+    """
+    # Select the data augmentation randomly
+    available = list(transform_dict.keys())
+    aug_idx = random.sample(range(len(available)), k=transform_number)
+    augmentations = [available[idx] for idx in aug_idx]
+
+    if "stretch" in augmentations:
+        # torchaudio.functional has no TimeStretch and the TimeStretch
+        # transform works on spectrograms, so the waveform is stretched
+        # with the sox "tempo" effect instead.
+        rate = random.uniform(0.8, 1.2)
+        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+            speech,
+            sample_rate,
+            effects=[["tempo", "{:.3f}".format(rate)]],
+        )
+
+    if "add_reverb" in augmentations:
+        rir_nfo = rir_df.iloc[random.randrange(rir_df.shape[0])].file_id
+        rir_fn = transform_dict["add_reverb"]["data_path"] + rir_nfo  # TODO harmonize with noise
+        rir, rir_fs = torchaudio.load(rir_fn)
+        assert rir_fs == sample_rate
+        # The convolved signal is cut back to the original number of samples
+        speech = torch.tensor(signal.convolve(speech, rir, mode='full')[:, :speech.shape[1]])
+
+    if "add_noise" in augmentations:
+        noise_path = transform_dict["add_noise"]["data_path"]
+
+        # Pick a noise type: 0 speech, 1 music, 2 noise, 3 babble.
+        # Types 0 and 3 are excluded when babble_noise is False.
+        noise = torch.zeros_like(speech)
+        if not babble_noise:
+            noise_idx = random.randrange(1, 3)
+        else:
+            noise_idx = random.randrange(0, 4)
+
+        # speech
+        if noise_idx == 0:
+            # Pick a SNR level
+            # TODO make SNRs configurable by noise type
+            snr_db = random.randint(13, 20)
+            pick_count = random.randint(3, 7)
+            index_list = random.sample(range(noise_df.loc['speech'].shape[0]), k=pick_count)
+            for idx in index_list:
+                noise_row = noise_df.loc['speech'].iloc[idx]
+                noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
+            noise /= pick_count
+        # music
+        elif noise_idx == 1:
+            snr_db = random.randint(5, 15)
+            noise_row = noise_df.loc['music'].iloc[random.randrange(noise_df.loc['music'].shape[0])]
+            noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
+        # noise
+        elif noise_idx == 2:
+            snr_db = random.randint(0, 15)
+            noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
+            noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
+        # babble noise with different volume
+        elif noise_idx == 3:
+            snr_db = random.randint(13, 20)
+            pick_count = random.randint(5, 10)  # Randomly select 5 to 10 speakers
+            index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)
+
+            noise = torch.zeros(1, speech.shape[1])
+            for idx in index_list:
+                noise_row = noise_df.loc['speech'].iloc[idx]
+                noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
+                # Randomly select volume level (5-15 dB)
+                transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')
+                noise += transform(noise_)
+            noise /= pick_count
+
+        # Scale the speech relative to the noise for the drawn SNR, then mix
+        speech_power = speech.norm(p=2)
+        noise_power = noise.norm(p=2)
+        snr = 10 ** (snr_db / 20)
+        scale = snr * noise_power / speech_power
+        speech = (scale * speech + noise) / 2
+
+    if "phone_filtering" in augmentations:
+        final_shape = speech.shape[1]
+        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+            speech,
+            sample_rate,
+            effects=[
+                ["lowpass", "4000"],
+                ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"],
+                ["rate", "16000"],
+            ])
+        speech = speech[:, :final_shape]
+
+    if "filtering" in augmentations:
+        effects = [
+            ["bandpass", "2000", "3500"],
+            ["bandstop", "200", "500"]]
+        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+            speech,
+            sample_rate,
+            effects=[effects[random.randint(0, 1)]],
+        )
+
+    if "codec" in augmentations:
+        final_shape = speech.shape[1]
+        configs = [
+            ({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"),
+            ({"format": "gsm"}, "GSM-FR"),
+            ({"format": "mp3", "compression": -9}, "MP3"),
+            ({"format": "vorbis", "compression": -1}, "Vorbis")
+        ]
+        # The second item of each config is only a human readable label
+        param, _ = random.choice(configs)
+        speech = torchaudio.functional.apply_codec(speech, sample_rate, **param)
+        speech = speech[:, :final_shape]
+
+    return speech
+
+
+def load_noise_seg(noise_row, speech_shape, sample_rate, data_path):
+    """
+    Pick a noise signal to add while performing data augmentation
+
+    When the noise is longer than the speech, a random window of the
+    length of the speech is loaded; otherwise the whole noise is loaded
+    and repeated (numpy.resize) to fill ``speech_shape``.
+
+    :param noise_row: a row from a Pandas dataframe object, with the
+        fields 'start', 'duration' and 'file_id'
+    :param speech_shape: shape of the speech signal to be augmented
+    :param sample_rate: sampling rate of the speech signal to be augmented
+    :param data_path: directory where to load the noise file from
+    :return: the loaded noise segment
+    """
+    noise_start = noise_row['start']
+    noise_duration = noise_row['duration']
+    noise_file_id = noise_row['file_id']
+
+    target_len = int(speech_shape[1])
+    # random.randrange only accepts integers, so both bounds are cast first
+    first_frame = int(noise_start * sample_rate)
+
+    if noise_duration * sample_rate > target_len:
+        # It is recommended to split noise files (especially speech noise type) in shorter subfiles
+        # When frame_offset is too high, loading the segment can take much longer
+        last_offset = int((noise_start + noise_duration) * sample_rate - target_len)
+        if last_offset > first_frame:
+            frame_offset = random.randrange(first_frame, last_offset)
+        else:
+            # The integer casts left no room to move the window
+            frame_offset = first_frame
+        num_frames = target_len
+    else:
+        frame_offset = first_frame
+        num_frames = int(noise_duration * sample_rate)
+
+    noise_fn = data_path + "/" + noise_file_id + ".wav"
+    noise_seg, noise_sr = torchaudio.load(noise_fn,
+                                          frame_offset=frame_offset,
+                                          num_frames=num_frames)
+    assert noise_sr == sample_rate
+
+    if noise_seg.shape[1] < speech_shape[1]:
+        # Too short: repeat the noise until the shape of the speech is filled
+        noise_seg = torch.tensor(numpy.resize(noise_seg.numpy(), speech_shape))
+    return noise_seg
diff --git a/sidekit/sidekit/nnet/loss.py b/sidekit/sidekit/nnet/loss.py
new file mode 100644
index 0000000..832bf14
--- /dev/null
+++ b/sidekit/sidekit/nnet/loss.py
@@ -0,0 +1,437 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+"""
+
+
+import math
+import numpy
+import torch
+
+from collections import OrderedDict
+from torch.nn import Parameter
+
+
+#from .classification import Classification
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2015-2020 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reS'
+
+
class ArcMarginModel(torch.nn.Module):
    """
    Additive angular margin classification layer configured from an ``args`` object.

    The layer computes the cosine between normalised inputs and normalised class
    weights, replaces the target-class cosine by ``cos(theta + m)`` and scales all
    logits by ``s``.
    """
    def __init__(self, args):
        """

        :param args: object exposing the attributes ``num_classes``, ``emb_size``,
            ``easy_margin``, ``margin_m`` and ``margin_s``
        """
        super(ArcMarginModel, self).__init__()

        # The number of classes is read from args: no module-level num_classes exists
        self.weight = Parameter(torch.FloatTensor(args.num_classes, args.emb_size))
        torch.nn.init.xavier_uniform_(self.weight)

        self.easy_margin = args.easy_margin
        self.m = args.margin_m
        self.s = args.margin_s

        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        self.th = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

    def forward(self, input, label):
        """

        :param input: batch of embeddings, one row per sample
        :param label: class index of each sample
        :return: scaled logits with the angular margin applied to the target class
        """
        x = torch.nn.functional.normalize(input)
        W = torch.nn.functional.normalize(self.weight)
        cosine = torch.nn.functional.linear(x, W)
        # clamp before the square root: rounding can push 1 - cos^2 slightly below zero
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # zeros_like keeps the one-hot mask on the same device and dtype as the logits
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output
+
+
def l2_norm(input, axis=1):
    """
    Divide a tensor by its L2 norm computed along one dimension.

    :param input: tensor to normalise
    :param axis: dimension along which the norm is computed
    :return: tensor of the same shape as ``input``
    """
    return input / input.norm(p=2, dim=axis, keepdim=True)
+
+
class ArcFace(torch.nn.Module):
    """
    Additive angular margin head (ArcFace, https://arxiv.org/abs/1801.07698).

    The target-class cosine is replaced by ``cos(theta + m)``; every logit is then
    multiplied by the scale ``s``.
    """
    def __init__(self, embedding_size, classnum, s=64., m=0.5):
        """

        :param embedding_size: dimension of the input embeddings
        :param classnum: number of classes
        :param s: scale applied to the logits, see normface https://arxiv.org/abs/1704.06369
        :param m: angular margin
        """
        super(ArcFace, self).__init__()
        self.classnum = classnum
        self.kernel = Parameter(torch.Tensor(embedding_size, classnum))
        # random initial kernel, rescaled in place
        self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5)
        self.m = m
        self.s = s
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.mm = self.sin_m * m
        self.threshold = math.cos(math.pi - m)

    def forward(self, embbedings, target):
        """

        :param embbedings: batch of embeddings, one row per sample
        :param target: class index of each sample
        :return: scaled logits with the margin applied on the target class
        """
        batch_size = len(embbedings)
        # each column of the kernel is brought to unit L2 norm
        unit_kernel = l2_norm(self.kernel, axis=0)
        # clamp for numerical stability
        cos_theta = torch.mm(embbedings, unit_kernel).clamp(-1, 1)
        sin_theta = torch.sqrt(1 - torch.pow(cos_theta, 2))
        # cos(theta + m)
        cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m
        # theta + m must stay within [0, pi]; elsewhere fall back to cos(theta) - mm
        outside = (cos_theta - self.threshold) <= 0
        fallback = cos_theta - self.mm
        cos_theta_m[outside] = fallback[outside]
        # work on a copy so that cos_theta itself is not modified in place
        output = cos_theta * 1.0
        rows = torch.arange(0, batch_size, dtype=torch.long)
        output[rows, target] = cos_theta_m[rows, target]
        # scale up in order to make softmax work, first introduced in normface
        output *= self.s
        return output
+
+
+################################## Cosface head #############################################################
+
class Am_softmax(torch.nn.Module):
    """
    Additive margin softmax head (https://arxiv.org/abs/1801.05599).

    The margin ``m`` is subtracted from the target-class cosine and every logit is
    multiplied by the scale ``s``.
    """
    def __init__(self, embedding_size=512, classnum=51332, m=0.35, s=30.):
        """

        :param embedding_size: dimension of the input embeddings
        :param classnum: number of classes
        :param m: additive margin (0.35 is the value recommended by the paper)
        :param s: scale applied to the logits, see normface https://arxiv.org/abs/1704.06369
        """
        super(Am_softmax, self).__init__()
        self.classnum = classnum
        self.kernel = Parameter(torch.Tensor(embedding_size, classnum))
        # random initial kernel, rescaled in place
        self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5)
        self.m = m
        self.s = s

    def forward(self, embbedings, label):
        """

        :param embbedings: batch of embeddings, one row per sample
        :param label: class index of each sample
        :return: scaled logits with the margin subtracted on the target class
        """
        # each column of the kernel is brought to unit L2 norm
        kernel_norm = self.kernel / torch.norm(self.kernel, 2, 0, True)
        # clamp for numerical stability
        cos_theta = torch.mm(embbedings, kernel_norm).clamp(-1, 1)
        # boolean mask of the target class of each row (uint8 masks are deprecated)
        target_mask = torch.zeros_like(cos_theta).scatter_(1, label.view(-1, 1).long(), 1.0).bool()
        # only the logit of the target class receives the margin
        output = torch.where(target_mask, cos_theta - self.m, cos_theta)
        # scale up in order to make softmax work, first introduced in normface
        return output * self.s
+
+
class ArcLinear(torch.nn.Module):
    """Additive Angular Margin linear module (ArcFace)

    Parameters
    ----------
    nfeat : int
        Embedding dimension
    nclass : int
        Number of classes
    margin : float
        Angular margin to penalize distances between embeddings and centers
    s : float
        Scaling factor for the logits
    """

    def __init__(self, nfeat, nclass, margin, s):
        super(ArcLinear, self).__init__()
        eps = 1e-4
        # bounds keeping the target cosine strictly inside the domain of acos
        self.min_cos = eps - 1
        self.max_cos = 1 - eps
        self.nclass = nclass
        self.margin = margin
        self.s = s
        self.W = torch.nn.Parameter(torch.Tensor(nclass, nfeat))
        torch.nn.init.xavier_uniform_(self.W)

    def forward(self, x, target=None):
        """Apply the angular margin transformation

        Parameters
        ----------
        x : `torch.Tensor`
            an embedding batch
        target : `torch.Tensor`, optional
            a non one-hot label batch; when omitted no margin is applied

        Returns
        -------
        fX : `torch.Tensor`
            logits after the angular margin transformation, or the scaled
            cosine logits when ``target`` is None
        """
        # the feature vectors has been normalized before calling this layer
        xnorm = x
        # normalize W
        Wnorm = torch.nn.functional.normalize(self.W)
        # calculate cosθj (the logits)
        cos_theta_j = torch.matmul(xnorm, torch.transpose(Wnorm, 0, 1))
        if target is None:
            # without labels there is no class to penalise: return the scaled logits
            return self.s * cos_theta_j
        target = target.long().view(-1, 1)
        # get the cosθ corresponding to the classes
        cos_theta_yi = cos_theta_j.gather(1, target)
        # for numerical stability
        cos_theta_yi = cos_theta_yi.clamp(min=self.min_cos, max=self.max_cos)
        # get the angle separating xi and Wyi
        theta_yi = torch.acos(cos_theta_yi)
        # apply the margin to the angle
        cos_theta_yi_margin = torch.cos(theta_yi + self.margin)
        # one hot encode y
        one_hot = torch.zeros_like(cos_theta_j)
        one_hot.scatter_(1, target, 1.0)
        # project margin differences into cosθj
        return self.s * (cos_theta_j + one_hot * (cos_theta_yi_margin - cos_theta_yi))
+
+
class ArcMarginProduct(torch.nn.Module):
    """
    Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: scale applied to the cosine logits
            m: margin
            easy_margin: when True the margin is only applied where cos(theta) > 0
        cos(theta + m)
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        torch.nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self._update_margin_terms()

    def _update_margin_terms(self):
        """Recompute the constants derived from the current margin ``self.m``."""
        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        self.th = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

    def change_params(self, s=None, m=None):
        """
        Update the scale and/or the margin; a value left to None is kept unchanged.

        :param s: new scale
        :param m: new margin
        """
        if s is not None:
            self.s = s
        if m is not None:
            self.m = m
        self._update_margin_terms()

    def forward(self, input, target=None):
        """

        :param input: batch of embeddings, one row per sample
        :param target: class index of each sample, or None
        :return: the scaled cosine logits when ``target`` is None, otherwise the tuple
            (scaled logits with margin on the target class, scaled cosine logits)
        """
        # cos(theta)
        cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input),
                                            torch.nn.functional.normalize(self.weight))
        # identity test: comparing a tensor with == is not a reliable None check
        if target is None:
            return cosine * self.s
        # cos(theta + m)
        sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m

        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)

        one_hot = torch.zeros_like(cosine)
        # scatter_ requires int64 indices
        one_hot.scatter_(1, target.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output = output * self.s

        return output, cosine * self.s
+
+
class SoftmaxAngularProto(torch.nn.Module):
    """
    Sum of an angular prototypical loss computed on pairs of embeddings and of a
    cross-entropy loss computed on a linear classifier.

    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    def __init__(self, spk_count, emb_dim=256, init_w=10.0, init_b=-5.0, **kwargs):
        """

        :param spk_count: number of classes of the linear classifier
        :param emb_dim: dimension of the input embeddings
        :param init_w: initial value of the learnable scale of the similarity matrix
        :param init_b: initial value of the learnable bias of the similarity matrix
        :param kwargs: accepted and ignored
        """
        super(SoftmaxAngularProto, self).__init__()

        self.test_normalize = True

        self.w = torch.nn.Parameter(torch.tensor(init_w))
        self.b = torch.nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

        self.cce_backend = torch.nn.Sequential(OrderedDict([
            ("linear8", torch.nn.Linear(emb_dim, spk_count))
        ]))

    def forward(self, x, target=None):
        """

        :param x: batch of embeddings; rows are grouped two by two to form the pairs
        :param target: class index of each row, or None
        :return: the classifier predictions when ``target`` is None, otherwise the
            tuple (loss, classifier predictions)
        """
        assert x.size()[1] >= 2

        cce_prediction = self.cce_backend(x)

        if target is None:
            return cce_prediction

        # consecutive rows form one pair: (nb_pairs, 2, emb_dim)
        x = x.reshape(-1, 2, x.size()[-1])

        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]

        cos_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                               out_anchor.unsqueeze(-1).transpose(0, 2))
        # torch.clamp is not in place: clamp the parameter itself so the scale stays
        # positive while gradients still reach it
        with torch.no_grad():
            self.w.clamp_(min=1e-6)
        cos_sim_matrix = cos_sim_matrix * self.w + self.b
        # the i-th positive has to match the i-th anchor
        pair_labels = torch.arange(0, cos_sim_matrix.shape[0], device=x.device)
        loss = self.criterion(cos_sim_matrix, pair_labels) + self.criterion(cce_prediction, target)
        return loss, cce_prediction
+
+
class AngularProximityMagnet(torch.nn.Module):
    """
    Sum of an angular prototypical loss and of a binary cross-entropy ("magnet") loss,
    both computed on pairs of embeddings. A linear classifier provides predictions
    that are returned alongside the loss.

    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    def __init__(self, spk_count, emb_dim=256, batch_size=512, init_w=10.0, init_b=-5.0, **kwargs):
        """

        :param spk_count: number of classes of the linear classifier
        :param emb_dim: dimension of the input embeddings
        :param batch_size: accepted for compatibility, not used
        :param init_w: initial value of the learnable scale of the angular similarity matrix
        :param init_b: initial value of the learnable bias of the angular similarity matrix
        :param kwargs: accepted and ignored
        """
        super(AngularProximityMagnet, self).__init__()

        self.test_normalize = True

        self.w = torch.nn.Parameter(torch.tensor(init_w))
        self.b1 = torch.nn.Parameter(torch.tensor(init_b))
        self.b2 = torch.nn.Parameter(torch.tensor(+5.54))

        self.cce_backend = torch.nn.Sequential(OrderedDict([
            ("linear8", torch.nn.Linear(emb_dim, spk_count))
        ]))

        self.criterion = torch.nn.CrossEntropyLoss()
        self.magnet_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

    def forward(self, x, target=None):
        """

        :param x: batch of embeddings; rows are grouped two by two to form the pairs
        :param target: class index of each row, or None; it only selects the branch
            and does not enter the loss
        :return: the tuple (x, classifier predictions) when ``target`` is None,
            otherwise the tuple (loss, classifier predictions)
        """
        assert x.size()[1] >= 2

        cce_prediction = self.cce_backend(x)

        if target is None:
            return x, cce_prediction

        # consecutive rows form one pair: (nb_pairs, 2, emb_dim)
        x = x.reshape(-1, 2, x.size()[-1])
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        nb_pairs = int(out_positive.shape[0])

        ap_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                              out_anchor.unsqueeze(-1).transpose(0, 2))
        # torch.clamp is not in place: clamp the parameter itself so the scale stays
        # positive while gradients still reach it
        with torch.no_grad():
            self.w.clamp_(min=1e-6)
        ap_sim_matrix = ap_sim_matrix * self.w + self.b1

        # index tensors follow the device of the input instead of a fixed GPU
        pair_labels = torch.arange(0, nb_pairs, device=x.device)
        cos_sim_matrix = torch.mm(out_positive, out_anchor.T)
        cos_sim_matrix = cos_sim_matrix + self.b2
        # constant offset derived from the number of pairs
        cos_sim_matrix = cos_sim_matrix + math.log(1 / nb_pairs / (1 - 1 / nb_pairs))
        # 1 where positive and anchor come from the same pair, 0 elsewhere
        mask = torch.eye(nb_pairs, dtype=torch.float, device=x.device)
        batch_loss = self.criterion(ap_sim_matrix, pair_labels) \
            + self.magnet_criterion(cos_sim_matrix.flatten().unsqueeze(1), mask.flatten().unsqueeze(1))
        return batch_loss, cce_prediction
diff --git a/sidekit/sidekit/nnet/pooling.py b/sidekit/sidekit/nnet/pooling.py
new file mode 100644
index 0000000..d2c7938
--- /dev/null
+++ b/sidekit/sidekit/nnet/pooling.py
@@ -0,0 +1,144 @@
+# coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
+"""
+
+
+import os
+import torch
+
+
+os.environ['MKL_THREADING_LAYER'] = 'GNU'
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2015-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reS'
+
+
class MeanStdPooling(torch.nn.Module):
    """
    Pooling layer concatenating the mean and the standard deviation of its input
    """
    def __init__(self):
        """
        The layer has no parameter.
        """
        super(MeanStdPooling, self).__init__()

    def forward(self, x):
        """

        :param x: tensor reduced along its dimension 2
        :return: mean followed by standard deviation, concatenated along dimension 1
        """
        statistics = (x.mean(dim=2), x.std(dim=2))
        return torch.cat(statistics, dim=1)
+
+
class AttentivePooling(torch.nn.Module):
    """
    Attentive pooling returning a weighted mean and a weighted standard deviation
    """
    def __init__(self, num_channels, n_mels, reduction=2, global_context=False):
        """

        :param num_channels: number of channels, used with ``n_mels`` to size the convolutions
        :param n_mels: number of mel bands, used with ``num_channels`` to size the convolutions
        :param reduction: division factor giving the width of the attention bottleneck
        :param global_context: when True, the mean and standard deviation of the input
            are appended to every frame before computing the attention weights
        """
        # TODO Make convolution parameters configurable
        super(AttentivePooling, self).__init__()
        feature_dim = num_channels * (n_mels//8)
        bottleneck_dim = num_channels//reduction
        in_factor = 3 if global_context else 1
        self.attention = torch.nn.Sequential(
            torch.nn.Conv1d(feature_dim * in_factor, bottleneck_dim, kernel_size=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(bottleneck_dim),
            torch.nn.Tanh(),
            torch.nn.Conv1d(bottleneck_dim, feature_dim, kernel_size=1),
            torch.nn.Softmax(dim=2),
        )
        self.global_context = global_context
        self.gc = MeanStdPooling()

    def new_parameter(self, *size):
        """
        Create a parameter of the given size initialised with ``xavier_normal_``.

        :param size: dimensions of the parameter
        :return: the new parameter
        """
        param = torch.nn.Parameter(torch.FloatTensor(*size))
        torch.nn.init.xavier_normal_(param)
        return param

    def forward(self, x):
        """

        :param x: tensor pooled along its dimension 2
        :return: weighted mean followed by weighted standard deviation, one row per sample
        """
        if self.global_context:
            # repeat the global statistics on every frame and stack them with the input
            context = self.gc(x).unsqueeze(2).repeat(1, 1, x.shape[-1])
            attention_input = torch.cat([x, context], dim=1)
        else:
            attention_input = x
        w = self.attention(attention_input)

        mu = torch.sum(x * w, dim=2)
        second_moment = torch.sum((x**2) * w, dim=2)
        # the variance is floored before the square root
        rh = torch.sqrt((second_moment - mu**2).clamp(min=1e-5))
        pooled = torch.cat((mu, rh), 1)
        return pooled.view(pooled.size()[0], -1)
+
+
class GruPooling(torch.nn.Module):
    """
    Pooling done by using a recurrent network
    """
    def __init__(self, input_size, gru_node, nb_gru_layer):
        """

        :param input_size: number of input features of the batch-norm and of the GRU
        :param gru_node: hidden size of the GRU
        :param nb_gru_layer: number of GRU layers
        """
        super(GruPooling, self).__init__()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope=0.3)
        self.bn_before_gru = torch.nn.BatchNorm1d(num_features=input_size)
        self.gru = torch.nn.GRU(input_size=input_size,
                                hidden_size=gru_node,
                                num_layers=nb_gru_layer,
                                batch_first=True)

    def forward(self, x):
        """

        :param x: tensor of shape (batch, filt, time)
        :return: output of the GRU at the last time step
        """
        activated = self.lrelu_keras(self.bn_before_gru(x))
        # (batch, filt, time) >> (batch, time, filt)
        sequence = activated.permute(0, 2, 1)
        self.gru.flatten_parameters()
        outputs, _ = self.gru(sequence)
        return outputs[:, -1, :]
diff --git a/sidekit/sidekit/nnet/preprocessor.py b/sidekit/sidekit/nnet/preprocessor.py
new file mode 100644
index 0000000..009565f
--- /dev/null
+++ b/sidekit/sidekit/nnet/preprocessor.py
@@ -0,0 +1,248 @@
+# coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
+"""
+
+
+import logging
+import numpy
+import os
+import torch
+import torchaudio
+
+from .augmentation import PreEmphasis
+from .sincnet import SincConv1d
+from .res_net import LayerNorm
+
+
+os.environ['MKL_THREADING_LAYER'] = 'GNU'
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2015-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reS'
+
+
+logging.basicConfig(format='%(asctime)s %(message)s')
+
+
+# Make PyTorch Deterministic
+torch.manual_seed(0)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+numpy.random.seed(0)
+
+
class MfccFrontEnd(torch.nn.Module):
    """
    Front-end module extracting normalised MFCC coefficients from an audio signal
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=2048,
                 f_min=133.333,
                 f_max=6855.4976,
                 win_length=1024,
                 window_fn=torch.hann_window,
                 hop_length=512,
                 power=2.0,
                 n_mels=100,
                 n_mfcc=80):
        """

        :param pre_emphasis: coefficient given to the PreEmphasis module
        :param sample_rate: sampling rate given to the MFCC transform
        :param n_fft: forwarded to the mel spectrogram through ``melkwargs``
        :param f_min: forwarded to the mel spectrogram through ``melkwargs``
        :param f_max: forwarded to the mel spectrogram through ``melkwargs``
        :param win_length: forwarded to the mel spectrogram through ``melkwargs``
        :param window_fn: forwarded to the mel spectrogram through ``melkwargs``
        :param hop_length: forwarded to the mel spectrogram through ``melkwargs``
        :param power: forwarded to the mel spectrogram through ``melkwargs``
        :param n_mels: forwarded to the mel spectrogram through ``melkwargs``
        :param n_mfcc: number of MFCC coefficients, also the size of the normalisation layer
        """
        super(MfccFrontEnd, self).__init__()

        # keep the configuration available as attributes
        self.pre_emphasis = pre_emphasis
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.win_length = win_length
        self.window_fn = window_fn
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels
        self.n_mfcc = n_mfcc

        self.PreEmphasis = PreEmphasis(self.pre_emphasis)

        self.melkwargs = dict(n_fft=self.n_fft,
                              f_min=self.f_min,
                              f_max=self.f_max,
                              win_length=self.win_length,
                              window_fn=self.window_fn,
                              hop_length=self.hop_length,
                              power=self.power,
                              n_mels=self.n_mels)

        self.MFCC = torchaudio.transforms.MFCC(sample_rate=self.sample_rate,
                                               n_mfcc=self.n_mfcc,
                                               dct_type=2,
                                               log_mels=True,
                                               melkwargs=self.melkwargs)

        self.CMVN = torch.nn.InstanceNorm1d(self.n_mfcc)

    def forward(self, x):
        """

        :param x: audio signal
        :return: the MFCC after instance normalisation
        """
        # features are computed without gradient and with CUDA autocast disabled
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
            emphasized = self.PreEmphasis(x)
            coefficients = self.MFCC(emphasized)
            mfcc = self.CMVN(coefficients)
        return mfcc
+
+
class MelSpecFrontEnd(torch.nn.Module):
    """
    Front-end module computing a normalised log Mel spectrogram of an audio signal
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=1024,
                 f_min=90,
                 f_max=7600,
                 win_length=400,
                 window_fn=torch.hann_window,
                 hop_length=160,
                 power=2.0,
                 n_mels=80):
        """

        :param pre_emphasis: coefficient given to the PreEmphasis module
        :param sample_rate: sampling rate given to the Mel spectrogram transform
        :param n_fft: forwarded to the Mel spectrogram transform
        :param f_min: forwarded to the Mel spectrogram transform
        :param f_max: forwarded to the Mel spectrogram transform
        :param win_length: forwarded to the Mel spectrogram transform
        :param window_fn: forwarded to the Mel spectrogram transform
        :param hop_length: forwarded to the Mel spectrogram transform
        :param power: forwarded to the Mel spectrogram transform
        :param n_mels: number of mel bands, also the size of the normalisation layer
        """
        super(MelSpecFrontEnd, self).__init__()

        # keep the configuration available as attributes
        self.pre_emphasis = pre_emphasis
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.win_length = win_length
        self.window_fn = window_fn
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels

        self.PreEmphasis = PreEmphasis(self.pre_emphasis)

        self.melkwargs = dict(n_fft=self.n_fft,
                              f_min=self.f_min,
                              f_max=self.f_max,
                              win_length=self.win_length,
                              window_fn=self.window_fn,
                              hop_length=self.hop_length,
                              power=self.power,
                              n_mels=self.n_mels)

        self.MelSpec = torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate,
                                                            **self.melkwargs)

        self.CMVN = torch.nn.InstanceNorm1d(self.n_mels)
        self.time_masking = torchaudio.transforms.TimeMasking(time_mask_param=5)
        self.freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=10)

    def forward(self, x, is_eval=False):
        """

        :param x: audio signal; a 1-dimensional input receives a leading dimension
        :param is_eval: when False, frequency then time masking are applied to the features
        :return: the normalised log Mel spectrogram
        """
        # features are computed without gradient and with CUDA autocast disabled
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
            if x.dim() == 1:
                x = x.unsqueeze(0)
            emphasized = self.PreEmphasis(x)
            # small offset so that the logarithm never receives zero
            log_mel = torch.log(self.MelSpec(emphasized)+1e-6)
            out = self.CMVN(log_mel)
            if not is_eval:
                out = self.time_masking(self.freq_masking(out))
        return out
+
+
class RawPreprocessor(torch.nn.Module):
    """
    Pre-process the raw audio signal by using a SincNet architecture
    [ADD REF]
    """
    def __init__(self, nb_samp, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50, sample_rate=16000):
        """

        :param nb_samp: size given to the layer normalisation applied on the input
        :param in_channels: forwarded to the SincConv1d layer
        :param out_channels: forwarded to the SincConv1d layer, also the size of the batch-norm
        :param kernel_size: forwarded to the SincConv1d layer
        :param stride: forwarded to the SincConv1d layer
        :param padding: forwarded to the SincConv1d layer
        :param dilation: forwarded to the SincConv1d layer
        :param bias: forwarded to the SincConv1d layer
        :param groups: forwarded to the SincConv1d layer
        :param min_low_hz: forwarded to the SincConv1d layer
        :param min_band_hz: forwarded to the SincConv1d layer
        :param sample_rate: forwarded to the SincConv1d layer
        """
        super(RawPreprocessor, self).__init__()
        self.ln = LayerNorm(nb_samp)
        sinc_options = dict(in_channels=in_channels,
                            out_channels=out_channels,
                            kernel_size=kernel_size,
                            sample_rate=sample_rate,
                            stride=stride,
                            padding=padding,
                            dilation=dilation,
                            bias=bias,
                            groups=groups,
                            min_low_hz=min_low_hz,
                            min_band_hz=min_band_hz)
        self.first_conv = SincConv1d(**sinc_options)
        self.first_bn = torch.nn.BatchNorm1d(num_features=out_channels)
        self.lrelu = torch.nn.LeakyReLU()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope=0.3)

    def forward(self, x):
        """

        :param x: batch of signals, indexed as (sample, time)
        :return: the filtered signal after max-pooling, batch-norm and leaky ReLU
        """
        nb_samp, len_seq = x.shape[0], x.shape[1]
        normalized = self.ln(x).view(nb_samp, 1, len_seq)
        # magnitude of the filter responses, max-pooled with a window of 3
        filtered = torch.abs(self.first_conv(normalized))
        pooled = torch.nn.functional.max_pool1d(filtered, 3)
        return self.lrelu_keras(self.first_bn(pooled))
diff --git a/sidekit/sidekit/nnet/res_net.py b/sidekit/sidekit/nnet/res_net.py
new file mode 100644
index 0000000..4f3c5c6
--- /dev/null
+++ b/sidekit/sidekit/nnet/res_net.py
@@ -0,0 +1,607 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+"""
+
+import torch
+import torchaudio
+
+
+__author__ = "Anthony Larcher and Sylvain Meignier"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
+__license__ = "LGPL"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
class FeatureMapScaling(torch.nn.Module):
    """
    Scale and/or shift each feature map by a sigmoid gate computed from its average
    """
    def __init__(self, nb_dim, do_add = True, do_mul = True):
        """

        :param nb_dim: input and output size of the linear layer producing the gate
        :param do_add: when True the gate is added to the feature maps
        :param do_mul: when True the feature maps are multiplied by the gate
        """
        super(FeatureMapScaling, self).__init__()
        self.fc = torch.nn.Linear(nb_dim, nb_dim)
        self.sig = torch.nn.Sigmoid()
        self.do_add = do_add
        self.do_mul = do_mul

    def forward(self, x):
        """

        :param x: feature maps averaged along their last dimension to compute the gate
        :return: the gated feature maps
        """
        batch, channels = x.size(0), x.size(1)
        pooled = torch.nn.functional.adaptive_avg_pool1d(x, 1).view(batch, -1)
        gate = self.sig(self.fc(pooled)).view(batch, channels, -1)

        # the multiplication comes first, the addition second
        if self.do_mul:
            x = x * gate

        if self.do_add:
            x = x + gate

        return x
+
+
class ResBlockWFMS(torch.nn.Module):
    """
    One-dimensional residual block followed by max-pooling and feature map scaling
    """
    def __init__(self, nb_filts, first=False):
        """

        :param nb_filts: pair (input channels, output channels)
        :param first: when True the block does not start with a batch-norm and an activation
        """
        super(ResBlockWFMS, self).__init__()
        self.first = first
        in_filts, out_filts = nb_filts[0], nb_filts[1]

        if not self.first:
            self.bn1 = torch.nn.BatchNorm1d(num_features=in_filts)

        self.lrelu = torch.nn.LeakyReLU()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope=0.3)

        self.conv1 = torch.nn.Conv1d(in_channels=in_filts,
                                     out_channels=out_filts,
                                     kernel_size=3,
                                     padding=1,
                                     stride=1)

        self.bn2 = torch.nn.BatchNorm1d(num_features=out_filts)

        self.conv2 = torch.nn.Conv1d(in_channels=out_filts,
                                     out_channels=out_filts,
                                     padding=1,
                                     kernel_size=3,
                                     stride=1)

        # a 1x1 convolution adapts the shortcut when the number of channels changes
        self.downsample = in_filts != out_filts
        if self.downsample:
            self.conv_downsample = torch.nn.Conv1d(in_channels=in_filts,
                                                   out_channels=out_filts,
                                                   padding=0,
                                                   kernel_size=1,
                                                   stride=1)

        self.mp = torch.nn.MaxPool1d(3)

        self.fms = FeatureMapScaling(nb_dim=out_filts,
                                     do_add=True,
                                     do_mul=True)

    def forward(self, x):
        """

        :param x: input feature maps
        :return: output of the block after max-pooling and feature map scaling
        """
        shortcut = self.conv_downsample(x) if self.downsample else x

        out = x if self.first else self.lrelu_keras(self.bn1(x))
        out = self.conv1(out)
        out = self.lrelu_keras(self.bn2(out))
        out = self.conv2(out)

        out += shortcut
        return self.fms(self.mp(out))
+
+
class LayerNorm(torch.nn.Module):
    """
    Normalisation along the last dimension with a learnable gain and offset
    """
    def __init__(self, features, eps=1e-6):
        """

        :param features: size of the learnable gain ``gamma`` and offset ``beta``
        :param eps: constant added to the standard deviation before dividing
        """
        super(LayerNorm, self).__init__()
        self.gamma = torch.nn.Parameter(torch.ones(features))
        self.beta = torch.nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """

        :param x: tensor normalised along its last dimension
        :return: gamma * (x - mean) / (std + eps) + beta
        """
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta
+
+
class ResBlock(torch.nn.Module):
    """
    Two-dimensional residual block made of two 3x3 convolutions
    """
    def __init__(self, in_channels, out_channels, stride, is_first=False):
        """

        :param in_channels: number of input channels
        :param out_channels: number of output channels
        :param stride: stride given to both 3x3 convolutions
        :param is_first: boolean, True if this block is the first of the model, if not, apply a BatchNorm layer first
        """
        super(ResBlock, self).__init__()
        self.is_first = is_first
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.expansion = self.out_channels // self.in_channels

        # A 1x1 convolution adapts the shortcut whenever the number of channels changes.
        # Its batch-norm is sized on the output of that convolution, which also covers
        # the case where out_channels is not a multiple of in_channels.
        self.resample = None
        if self.in_channels != self.out_channels:
            self.resample = torch.nn.Sequential(
                torch.nn.Conv2d(in_channels=self.in_channels,
                                out_channels=self.out_channels,
                                kernel_size=1),
                torch.nn.BatchNorm2d(self.out_channels),
            )

        if not self.is_first:
            self.batch_norm1 = torch.nn.BatchNorm2d(num_features=self.in_channels)

        self.activation = torch.nn.LeakyReLU()

        # NOTE(review): both convolutions receive `stride` while the shortcut is not
        # strided, so the residual addition looks valid for stride=1 only - confirm
        self.conv1 = torch.nn.Conv2d(in_channels=self.in_channels,
                                     out_channels=self.out_channels,
                                     kernel_size=(3, 3),
                                     stride=stride,
                                     padding=1,
                                     padding_mode='zeros',
                                     dilation=1)
        self.conv2 = torch.nn.Conv2d(in_channels=self.out_channels,
                                     out_channels=self.out_channels,
                                     stride=stride,
                                     kernel_size=(3, 3),
                                     padding=1,
                                     padding_mode='zeros',
                                     dilation=1)

        # NOTE(review): this single batch-norm is applied after both convolutions
        self.batch_norm2 = torch.nn.BatchNorm2d(num_features=self.out_channels)

    def forward(self, x):
        """

        :param x: input feature maps
        :return: activated sum of the convolution branch and of the shortcut
        """
        identity = x

        if not self.is_first:
            out = self.activation(self.batch_norm1(x))
        else:
            out = x

        out = self.conv1(out)
        out = self.batch_norm2(out)
        out = self.activation(out)

        out = self.conv2(out)
        out = self.batch_norm2(out)

        # apply the shortcut projection whenever one was built
        if self.resample is not None:
            identity = self.resample(identity)
        out += identity

        out = self.activation(out)
        return out
+
+
class SELayer(torch.nn.Module):
    """
    Channel gating layer: each channel is multiplied by a sigmoid gate computed from
    the spatial average of all channels
    """
    def __init__(self, channel, reduction=16):
        """

        :param channel: number of channels of the input
        :param reduction: division factor giving the width of the bottleneck
        """
        super(SELayer, self).__init__()
        hidden = channel // reduction
        self.avg_pool = torch.nn.AdaptiveAvgPool2d(1)
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(channel, hidden, bias=False),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(hidden, channel, bias=False),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        """

        :param x: 4-dimensional tensor whose first two dimensions are batch and channel
        :return: ``x`` multiplied by the per-channel gate
        """
        batch, channels, _, _ = x.size()
        squeezed = self.avg_pool(x).view(batch, channels)
        gate = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gate.expand_as(x)
+
+
class BasicBlock(torch.nn.Module):
    """
    Residual block made of two 3x3 convolutions followed by an SELayer
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        """

        :param in_planes: number of input channels
        :param planes: number of channels produced by both convolutions
        :param stride: stride of the first convolution and of the shortcut projection
        """
        super(BasicBlock, self).__init__()
        out_planes = self.expansion*planes

        self.conv1 = torch.nn.Conv2d(in_planes, planes, kernel_size=3,
                                     stride=stride, padding=1, bias=False)
        self.bn1 = torch.nn.BatchNorm2d(planes)
        self.conv2 = torch.nn.Conv2d(planes, planes, kernel_size=3,
                                     stride=1, padding=1, bias=False)
        self.bn2 = torch.nn.BatchNorm2d(planes)

        self.se = SELayer(planes)

        # identity shortcut unless the shape changes, in which case a 1x1 projection is used
        if stride != 1 or in_planes != out_planes:
            self.shortcut = torch.nn.Sequential(
                torch.nn.Conv2d(in_planes, out_planes,
                                kernel_size=1, stride=stride, bias=False),
                torch.nn.BatchNorm2d(out_planes)
            )
        else:
            self.shortcut = torch.nn.Sequential()

    def forward(self, x):
        """

        :param x: input feature maps
        :return: ReLU of the sum of the gated convolution branch and of the shortcut
        """
        relu = torch.nn.functional.relu
        out = relu(self.bn1(self.conv1(x)))
        out = self.se(self.bn2(self.conv2(out)))
        out += self.shortcut(x)
        return relu(out)
+
+
class Bottleneck(torch.nn.Module):
    """
    Bottleneck residual block: 1x1, 3x3 and 1x1 convolutions; the last one outputs
    ``expansion * planes`` channels
    """
    # Class-level default so that code reading ``block.expansion`` on the class itself
    # works, as it does for BasicBlock; instances may override it through the constructor
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, expansion=4):
        """

        :param in_planes: number of input channels
        :param planes: number of channels of the first two convolutions
        :param stride: stride of the 3x3 convolution and of the shortcut projection
        :param expansion: multiplier giving the number of output channels
        """
        super(Bottleneck, self).__init__()

        self.expansion = expansion

        self.conv1 = torch.nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = torch.nn.BatchNorm2d(planes)
        self.conv2 = torch.nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = torch.nn.BatchNorm2d(planes)
        self.conv3 = torch.nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = torch.nn.BatchNorm2d(self.expansion*planes)

        # identity shortcut unless the shape changes, in which case a 1x1 projection is used
        self.shortcut = torch.nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = torch.nn.Sequential(
                torch.nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                torch.nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        """

        :param x: input feature maps
        :return: ReLU of the sum of the convolution branch and of the shortcut
        """
        out = torch.nn.functional.relu(self.bn1(self.conv1(x)))
        out = torch.nn.functional.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = torch.nn.functional.relu(out)
        return out
+
+
+class ResNet(torch.nn.Module):
+    """
+    MFCC front-end, seven residual stages, statistics pooling and a 256-dimensional linear output layer.
+    """
+    def __init__(self, block, num_blocks, speaker_number=10):
+        super(ResNet, self).__init__()
+        self.in_planes = 128
+        self.speaker_number = speaker_number  # only stored: no layer of this class uses it
+
+        # Feature extraction
+        n_fft = 2048
+        # (no window length is passed to the transform, torchaudio picks its own default)
+        hop_length = 512
+        n_mels = 80
+        n_mfcc = 80
+
+        # todo modify the front-end like for other architectures
+        self.MFCC = torchaudio.transforms.MFCC(
+            sample_rate=16000,
+            n_mfcc=n_mfcc, melkwargs={'n_fft': n_fft, 'n_mels': n_mels, 'hop_length': hop_length})
+
+        self.CMVN = torch.nn.InstanceNorm1d(n_mfcc)
+
+        self.conv1 = torch.nn.Conv2d(1, 128, kernel_size=3,
+                                     stride=1, padding=1, bias=False)
+        self.bn1 = torch.nn.BatchNorm2d(128)
+
+        # num_blocks holds one entry per stage (seven entries), e.g. [3, 1, 3, 1, 5, 1, 2]
+        self.layer1 = self._make_layer(block, 128, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=1)
+        self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=2)
+        self.layer5 = self._make_layer(block, 256, num_blocks[4], stride=1)
+        self.layer6 = self._make_layer(block, 256, num_blocks[5], stride=2)
+        self.layer7 = self._make_layer(block, 256, num_blocks[6], stride=1)  # 7th entry (index 5 was used twice before)
+        self.stat_pooling = MeanStdPooling()
+        self.before_embedding = torch.nn.Linear(5120, 256)  # NOTE(review): 5120 presumably 256 ch x 10 bins x 2 stats - confirm
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        """
+        Build one stage and update self.in_planes for the next one.
+        :param block: block class, called as block(in_planes, planes, stride); must expose ``expansion``
+        :param planes: number of planes given to every block of the stage
+        :param num_blocks: number of blocks in the stage
+        :param stride: stride of the first block, the following blocks use 1
+        :return: torch.nn.Sequential holding the blocks
+        """
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes * block.expansion
+        return torch.nn.Sequential(*layers)
+
+    def forward(self, x):
+        """
+        Compute MFCC from x, normalise them and run the residual stages, the pooling and the linear layer.
+        :param x: input given to the torchaudio MFCC transform (presumably a batch of waveforms - confirm)
+        :return: output of the last linear layer (256 values per input)
+        """
+        out = self.MFCC(x)
+        out = self.CMVN(out)
+        out = out.unsqueeze(1)  # add the single input channel expected by conv1
+        out = torch.nn.functional.relu(self.bn1(self.conv1(out)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = self.layer5(out)
+        out = self.layer6(out)
+        out = self.layer7(out)
+        out = torch.flatten(out, start_dim=1, end_dim=2)  # merge dimensions 1 and 2 before pooling
+        out = self.stat_pooling(out)
+        out = self.before_embedding(out)
+        return out
+
+
+class PreResNet34(torch.nn.Module):
+    """
+    Network that contains only the ResNet part (up to, not including, pooling), with NO classification layers
+    """
+    def __init__(self, block=BasicBlock, num_blocks=[3, 1, 3, 1, 5, 1, 2], speaker_number=10):
+        super(PreResNet34, self).__init__()
+        self.in_planes = 128
+
+        self.speaker_number = speaker_number  # only stored: no layer of this class uses it
+        self.conv1 = torch.nn.Conv2d(1, 128, kernel_size=3,
+                                     stride=1, padding=1, bias=False)
+
+        self.bn1 = torch.nn.BatchNorm2d(128)
+
+        # num_blocks holds one entry per stage (seven entries), default [3, 1, 3, 1, 5, 1, 2]
+        self.layer1 = self._make_layer(block, 128, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+        self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=1)
+        self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=2)
+        self.layer5 = self._make_layer(block, 256, num_blocks[4], stride=1)
+        self.layer6 = self._make_layer(block, 256, num_blocks[5], stride=2)
+        self.layer7 = self._make_layer(block, 256, num_blocks[6], stride=1)  # 7th entry (index 5 was used twice before)
+
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        """
+        Build one stage and update self.in_planes for the next one.
+        :param block: block class, called as block(in_planes, planes, stride); must expose ``expansion``
+        :param planes: number of planes given to every block of the stage
+        :param num_blocks: number of blocks in the stage
+        :param stride: stride of the first block, the following blocks use 1
+        :return: torch.nn.Sequential holding the blocks
+        """
+        strides = [stride] + [1]*(num_blocks-1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes * block.expansion
+        return torch.nn.Sequential(*layers)
+
+    def forward(self, x):
+        """
+        Add a channel axis to x and run it through the first convolution and the seven stages.
+        :param x: input tensor without channel axis (the 2D convolutions need 4 dimensions after unsqueeze)
+        :return: output of the last stage with dimensions 1 and 2 flattened into one
+        """
+        out = x.unsqueeze(1)
+        out = torch.nn.functional.relu(self.bn1(self.conv1(out)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = self.layer4(out)
+        out = self.layer5(out)
+        out = self.layer6(out)
+        out = self.layer7(out)
+        out = torch.flatten(out, start_dim=1, end_dim=2)
+        return out
+
+
+class PreHalfResNet34(torch.nn.Module):
+    """
+    ResNet trunk only (32 to 256 planes): everything up to, not including, pooling and classification layers
+    """
+    def __init__(self, block=BasicBlock, num_blocks=[3, 4, 6, 3], speaker_number=10):
+        super().__init__()
+        self.in_planes = 32
+        self.speaker_number = speaker_number
+
+        # stem: one input channel -> 32 feature maps
+        self.conv1 = torch.nn.Conv2d(1, 32, kernel_size=3, stride=(1, 1), padding=1, bias=False)
+        self.bn1 = torch.nn.BatchNorm2d(32)
+
+        # num_blocks gives the number of blocks of each of the four stages, default [3, 4, 6, 3]
+        self.layer1 = self._make_layer(block, 32, num_blocks[0], stride=(1, 1))
+        self.layer2 = self._make_layer(block, 64, num_blocks[1], stride=(2, 2))
+        self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=(2, 2))
+        self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=(2, 2))
+
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        """
+        Build one stage and update self.in_planes for the next one.
+        :param block: block class, called as block(in_planes, planes, stride); must expose ``expansion``
+        :param planes: number of planes given to every block of the stage
+        :param num_blocks: number of blocks in the stage
+        :param stride: stride of the first block, the following blocks use 1
+        :return: torch.nn.Sequential holding the blocks
+        """
+        stage = []
+        # only the first block of the stage receives the requested stride
+        for block_stride in [stride] + [1] * (num_blocks - 1):
+            stage.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return torch.nn.Sequential(*stage)
+
+    def forward(self, x):
+        """
+        Add a channel axis to x and run it through the stem and the four stages.
+        :param x: input tensor without channel axis (channels_last needs 4 dimensions after unsqueeze)
+        :return: output of the last stage with dimensions 1 and 2 flattened into one
+        """
+        # the convolutions run on a channels_last copy of the input
+        out = x.unsqueeze(1).contiguous(memory_format=torch.channels_last)
+        out = torch.nn.functional.relu(self.bn1(self.conv1(out)))
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            out = stage(out)
+
+        # back to the default memory format before merging dimensions 1 and 2
+        out = out.contiguous(memory_format=torch.contiguous_format)
+        flattened = torch.flatten(out, start_dim=1, end_dim=2)
+        return flattened
+
+
+class PreFastResNet34(torch.nn.Module):
+    """
+    ResNet trunk only (16 to 128 planes): everything up to, not including, pooling and classification layers
+    """
+    def __init__(self, block=BasicBlock, num_blocks=[3, 4, 6, 3], speaker_number=10):
+        super().__init__()
+        self.in_planes = 16
+        self.speaker_number = speaker_number
+
+        # stem: one input channel -> 16 feature maps, stride 2 on the first spatial axis only
+        self.conv1 = torch.nn.Conv2d(1, 16, kernel_size=7, stride=(2, 1), padding=3, bias=False)
+        self.bn1 = torch.nn.BatchNorm2d(16)
+
+        # num_blocks gives the number of blocks of each of the four stages, default [3, 4, 6, 3]
+        self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
+        self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=(2, 2))
+        self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=(2, 2))
+        self.layer4 = self._make_layer(block, 128, num_blocks[3], stride=(1, 1))
+
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        """
+        Build one stage and update self.in_planes for the next one.
+        :param block: block class, called as block(in_planes, planes, stride); must expose ``expansion``
+        :param planes: number of planes given to every block of the stage
+        :param num_blocks: number of blocks in the stage
+        :param stride: stride of the first block, the following blocks use 1
+        :return: torch.nn.Sequential holding the blocks
+        """
+        stage = []
+        # only the first block of the stage receives the requested stride
+        for block_stride in [stride] + [1] * (num_blocks - 1):
+            stage.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return torch.nn.Sequential(*stage)
+
+    def forward(self, x):
+        """
+        Add a channel axis to x and run it through the stem and the four stages.
+        :param x: input tensor without channel axis (channels_last needs 4 dimensions after unsqueeze)
+        :return: output of the last stage with dimensions 1 and 2 flattened into one
+        """
+        # the convolutions run on a channels_last copy of the input
+        out = x.unsqueeze(1).contiguous(memory_format=torch.channels_last)
+        out = torch.nn.functional.relu(self.bn1(self.conv1(out)))
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            out = stage(out)
+
+        # back to the default memory format before merging dimensions 1 and 2
+        out = out.contiguous(memory_format=torch.contiguous_format)
+        flattened = torch.flatten(out, start_dim=1, end_dim=2)
+        return flattened
+
+
+def ResNet34():  # factory: ResNet made of BasicBlock with the stage sizes [3, 1, 3, 1, 5, 1, 2]
+    return ResNet(block=BasicBlock, num_blocks=[3, 1, 3, 1, 5, 1, 2])
+
+# TODO create a flexible class that allows to generate different RESNET sequential classes and manage the sizes
\ No newline at end of file
diff --git a/sidekit/sidekit/nnet/sincnet.py b/sidekit/sidekit/nnet/sincnet.py
new file mode 100644
index 0000000..8110ba9
--- /dev/null
+++ b/sidekit/sidekit/nnet/sincnet.py
@@ -0,0 +1,427 @@
+# The MIT License (MIT)
+#
+# Copyright (c) 2019 Mirco Ravanelli
+# Copyright (c) 2019-2020 CNRS
+# Copyright (c) 2020-2021 LIUM
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# AUTHOR
+# Hervé Bredin - http://herve.niderb.fr
+# Anthony Larcher
+
+# Part of this code was taken from https://github.com/mravanelli/SincNet
+# (see above license terms).
+
+# Please give proper credit to the authors if you are using SincNet-based
+# models by citing their paper:
+
+# Mirco Ravanelli, Yoshua Bengio.
+# "Speaker Recognition from raw waveform with SincNet".
+# SLT 2018. https://arxiv.org/abs/1808.00158
+
+from typing import List
+import numpy
+import torch
+import math
+
+
+class SincConv1d(torch.nn.Module):
+    """Sinc-based 1D convolution
+
+    Parameters
+    ----------
+    out_channels : `int`
+        Number of filters.
+    kernel_size : `int`
+        Filter length; an even value is incremented by one.
+    sample_rate : `int`, optional
+        Sample rate. Defaults to 16000.
+    in_channels : `int`, optional
+        Must be 1 (ValueError otherwise). Defaults to 1.
+    stride, padding, dilation : `int`, optional
+        Passed to `torch.nn.functional.conv1d`. Default to 1, 0 and 1.
+    bias, groups : optional
+        A truthy `bias` or `groups` > 1 raises ValueError. Default to False and 1.
+    min_low_hz, min_band_hz : `int`, optional
+        Offsets (Hz) added to the learned low cut-off and band width. Default to 50.
+
+    Usage
+    -----
+    Called like `torch.nn.Conv1d`, but the constructor starts with `out_channels, kernel_size`.
+
+    Reference
+    ---------
+    Mirco Ravanelli, Yoshua Bengio. "Speaker Recognition from raw waveform with
+    SincNet". SLT 2018. https://arxiv.org/abs/1808.00158
+    """
+
+    @staticmethod
+    def to_mel(hz):  # Hz -> mel
+        return 2595 * numpy.log10(1 + hz / 700)
+
+    @staticmethod
+    def to_hz(mel):  # mel -> Hz (inverse of to_mel)
+        return 700 * (10 ** (mel / 2595) - 1)
+
+    def __init__(self,
+                 out_channels,
+                 kernel_size,
+                 sample_rate=16000,
+                 in_channels=1,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 bias=False,
+                 groups=1,
+                 min_low_hz=50,
+                 min_band_hz=50):
+
+        super(SincConv1d, self).__init__()
+
+        if in_channels != 1:
+            msg = f"SincConv1d only supports one input channel. (Here, in_channels = {in_channels})."
+            raise ValueError(msg)
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.sample_rate = sample_rate
+        self.min_low_hz = min_low_hz
+        self.min_band_hz = min_band_hz
+
+        self.kernel_size = kernel_size
+        if kernel_size % 2 == 0:  # force an odd length: forward builds left half + centre + mirrored half
+            self.kernel_size=self.kernel_size+1
+
+        if bias:
+            raise ValueError("SincConv1d does not support bias.")
+        if groups > 1:
+            raise ValueError("SincConv1d does not support groups.")
+
+        # initialize filterbanks such that they are equally spaced in Mel scale
+        low_hz = 30
+        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)
+
+        mel = numpy.linspace(self.to_mel(low_hz),
+                             self.to_mel(high_hz),
+                             self.out_channels + 1)  # out_channels + 1 band edges
+        hz = self.to_hz(mel)
+
+        # filter lower frequency (out_channels, 1)
+        self.low_hz_ = torch.nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))
+
+        # filter frequency band (out_channels, 1)
+        self.band_hz_ = torch.nn.Parameter(torch.Tensor(numpy.diff(hz)).view(-1, 1))
+
+        # Half Hamming window
+        n_lin = torch.linspace(0, (self.kernel_size / 2) - 1, steps=int((self.kernel_size / 2)))
+        self.window_ = 0.54 - 0.46 * torch.cos(2 * math.pi * n_lin / self.kernel_size)
+
+        # (1, (kernel_size - 1) / 2): plain tensor (not a Parameter), moved to the input device in forward
+        # Due to symmetry, I only need half of the time axes
+        n = (self.kernel_size - 1) / 2.0
+        self.n_ = 2 * math.pi * torch.arange(-n, 0).view(1, -1) / self.sample_rate
+
+    def forward(self, waveforms):
+        """Get sinc filters activations
+
+        Parameters
+        ----------
+        waveforms : `torch.Tensor` (batch_size, 1, n_samples)
+            Batch of waveforms.
+
+        Returns
+        -------
+        features : `torch.Tensor` (batch_size, out_channels, n_samples_out)
+            Batch of sinc filters activations.
+        """
+        self.n_ = self.n_.to(waveforms.device)  # these two are plain attributes, so they are moved by hand
+        self.window_ = self.window_.to(waveforms.device)
+
+        low = self.min_low_hz + torch.abs(self.low_hz_)  # low cut-off is at least min_low_hz
+
+        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_),
+                           self.min_low_hz,
+                           self.sample_rate / 2)  # high cut-off is capped at sample_rate / 2
+        band = (high - low)[:, 0]
+
+        f_times_t_low = torch.matmul(low, self.n_)  # (out_channels, 1) x (1, half length)
+        f_times_t_high = torch.matmul(high, self.n_)
+
+        # Equivalent to Eq.4 of the reference paper
+        # expanded the sinc and simplified the terms.
+        # This way I avoid several useless computations.
+        band_pass_left = ((torch.sin(f_times_t_high) - torch.sin(f_times_t_low)) / (self.n_ / 2)) * self.window_
+        band_pass_center = 2 * band.view(-1, 1)
+        band_pass_right = torch.flip(band_pass_left, dims=[1])  # mirror of the left half
+
+        band_pass = torch.cat([band_pass_left, band_pass_center, band_pass_right], dim=1)
+        band_pass = band_pass / (2 * band[:, None])  # the centre tap becomes 1
+
+        self.filters = band_pass.view(self.out_channels, 1, self.kernel_size)
+
+        # the filters are rebuilt from low_hz_ / band_hz_ at every call and kept in self.filters
+        return torch.nn.functional.conv1d(waveforms,
+                                          self.filters,
+                                          stride=self.stride,
+                                          padding=self.padding,
+                                          dilation=self.dilation,
+                                          bias=None,
+                                          groups=1)
+
+
+class SincNet(torch.nn.Module):
+    """SincNet (learnable) feature extraction
+
+    Parameters
+    ----------
+    waveform_normalize, instance_normalize : `bool`, optional
+        Standardize the waveform / each layer output with a learnable-affine InstanceNorm1d. Default to True.
+    sample_rate, min_low_hz, min_band_hz : `int`, optional
+        Forwarded to the first (SincConv1d) layer. Default to 16000, 50 and 50.
+    out_channels, kernel_size, stride, max_pool : list of `int`, optional
+        One entry per layer; the four lists must have the same length (ValueError otherwise).
+    activation, dropout : optional
+        Only "leaky_relu" is supported; dropout is the probability applied after each activation (0 disables it).
+
+    Reference
+    ---------
+    Mirco Ravanelli, Yoshua Bengio. "Speaker Recognition from raw waveform with
+    SincNet". SLT 2018. https://arxiv.org/abs/1808.00158
+    """
+    # NOTE: the string below is disabled code (a bare literal, never run); Task and SlidingWindow are not imported here
+    '''
+    @staticmethod
+    def get_alignment(task: Task, **kwargs):
+        """Get frame alignment"""
+        return "strict"
+
+    @staticmethod
+    def get_resolution(
+        task: Task,
+        sample_rate: int = 16000,
+        kernel_size: List[int] = [251, 5, 5],
+        stride: List[int] = [1, 1, 1],
+        max_pool: List[int] = [3, 3, 3],
+        **kwargs,
+    ) -> SlidingWindow:
+        """Get frame resolution
+
+        Parameters
+        ----------
+        task : Task
+        sample_rate : int, optional
+        kerne_size : list of int, optional
+        stride : list of int, optional
+        max_pool : list of int, optional
+
+        Returns
+        -------
+        resolution : SlidingWindow
+            Frame resolution.
+        """
+
+        # https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807
+        padding = 0
+        receptive_field, jump, start = 1, 1, 0.5
+        for ks, s, mp in zip(kernel_size, stride, max_pool):
+            # increase due to (Sinc)Conv1d
+            receptive_field += (ks - 1) * jump
+            start += ((ks - 1) / 2 - padding) * jump
+            jump *= s
+            # increase in receptive field due to MaxPool1d
+            receptive_field += (mp - 1) * jump
+            start += ((mp - 1) / 2 - padding) * jump
+            jump *= mp
+
+        return SlidingWindow(
+            duration=receptive_field / sample_rate, step=jump / sample_rate, start=0.0
+        )
+    '''
+
+    def __init__(
+        self,
+        waveform_normalize=True,
+        sample_rate=16000,
+        min_low_hz=50,
+        min_band_hz=50,
+        out_channels=[80, 60, 60],
+        kernel_size: List[int] = [251, 5, 5],
+        stride=[1, 1, 1],
+        max_pool=[3, 3, 3],
+        instance_normalize=True,
+        activation="leaky_relu",
+        dropout=0.0,
+    ):
+        super().__init__()
+
+        # check parameters values
+        n_layers = len(out_channels)
+        if len(kernel_size) != n_layers:
+            msg = (
+                f"out_channels ({len(out_channels):d}) and kernel_size "
+                f"({len(kernel_size):d}) should have the same length."
+            )
+            raise ValueError(msg)
+        if len(stride) != n_layers:
+            msg = (
+                f"out_channels ({len(out_channels):d}) and stride "
+                f"({len(stride):d}) should have the same length."
+            )
+            raise ValueError(msg)
+        if len(max_pool) != n_layers:
+            msg = (
+                f"out_channels ({len(out_channels):d}) and max_pool "
+                f"({len(max_pool):d}) should have the same length."
+            )
+            raise ValueError(msg)
+
+        # Waveform normalization
+        self.waveform_normalize = waveform_normalize
+        if self.waveform_normalize:
+            self.waveform_normalize_ = torch.nn.InstanceNorm1d(1, affine=True)
+
+        # SincNet-specific parameters
+        self.sample_rate = sample_rate
+        self.min_low_hz = min_low_hz
+        self.min_band_hz = min_band_hz
+
+        # Conv1D parameters
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.conv1d_ = torch.nn.ModuleList([])
+
+        # Max-pooling parameters
+        self.max_pool = max_pool
+        self.max_pool1d_ = torch.nn.ModuleList([])
+
+        # Instance normalization
+        self.instance_normalize = instance_normalize
+        if self.instance_normalize:
+            self.instance_norm1d_ = torch.nn.ModuleList([])
+
+        config = zip(self.out_channels, self.kernel_size, self.stride, self.max_pool)
+
+        in_channels = None
+        for i, (out_channels, kernel_size, stride, max_pool) in enumerate(config):  # these names now hold per-layer values
+
+            # 1D convolution
+            if i > 0:
+                conv1d = torch.nn.Conv1d(
+                    in_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=stride,
+                    padding=0,
+                    dilation=1,
+                    groups=1,
+                    bias=True,
+                )
+            else:
+                conv1d = SincConv1d(
+                    out_channels,
+                    kernel_size,
+                    sample_rate=self.sample_rate,
+                    in_channels=1,
+                    min_low_hz=self.min_low_hz,
+                    min_band_hz=self.min_band_hz,
+                    stride=stride,
+                    padding=0,
+                    dilation=1,
+                    bias=False,
+                    groups=1,
+                )
+            self.conv1d_.append(conv1d)
+
+            # 1D max-pooling
+            max_pool1d = torch.nn.MaxPool1d(max_pool, stride=max_pool, padding=0, dilation=1)
+            self.max_pool1d_.append(max_pool1d)
+
+            # 1D instance normalization
+            if self.instance_normalize:
+                instance_norm1d = torch.nn.InstanceNorm1d(out_channels, affine=True)
+                self.instance_norm1d_.append(instance_norm1d)
+
+            in_channels = out_channels  # input size of the next layer
+
+        # Activation function
+        self.activation = activation
+        if self.activation == "leaky_relu":
+            self.activation_ = torch.nn.LeakyReLU(negative_slope=0.2)
+        else:
+            msg = f'Only "leaky_relu" activation is supported.'
+            raise ValueError(msg)
+
+        # Dropout
+        self.dropout = dropout
+        if self.dropout:  # dropout_ only exists when the probability is non-zero
+            self.dropout_ = torch.nn.Dropout(p=self.dropout)
+
+    def forward(self, waveforms):
+        """Extract SincNet features
+
+        Parameters
+        ----------
+        waveforms : (batch_size, n_samples)
+            Batch of waveforms; the channel axis is added below.
+
+        Returns
+        -------
+        features : (batch_size, out_channels[-1], n_frames)
+        """
+        waveforms = waveforms[:, :, None]  # (batch_size, n_samples, 1)
+        output = waveforms.transpose(1, 2)  # (batch_size, 1, n_samples)
+
+        # standardize waveforms
+        if self.waveform_normalize:
+            output = self.waveform_normalize_(output)
+
+        layers = zip(self.conv1d_, self.max_pool1d_)
+        for i, (conv1d, max_pool1d) in enumerate(layers):
+
+            output = conv1d(output)
+            if i == 0:  # absolute value of the SincConv1d output only
+                output = torch.abs(output)
+
+            output = max_pool1d(output)
+
+            if self.instance_normalize:
+                output = self.instance_norm1d_[i](output)
+
+            output = self.activation_(output)
+
+            if self.dropout:
+                output = self.dropout_(output)
+
+        #return output.transpose(1, 2)
+        return output  # channel axis stays at position 1 (the transpose above is disabled)
+
+    def dimension():  # helper consumed once below to build the read-only `dimension` property
+        doc = "Output features dimension."
+
+        def fget(self):
+            return self.out_channels[-1]
+
+        return locals()  # {"doc": ..., "fget": ...}, used as keyword arguments of property()
+
+    dimension = property(**dimension())
diff --git a/sidekit/sidekit/nnet/xsets.py b/sidekit/sidekit/nnet/xsets.py
new file mode 100644
index 0000000..094bb6e
--- /dev/null
+++ b/sidekit/sidekit/nnet/xsets.py
@@ -0,0 +1,589 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+"""
+
+import math
+import numpy
+import pandas
+import random
+import torch
+import torchaudio
+import tqdm
+import soundfile
+import yaml
+
+from torch.utils.data import Dataset
+from .augmentation import data_augmentation
+from ..bosaris.idmap import IdMap
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2015-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+class SideSampler(torch.utils.data.Sampler):
+    """
+    Data Sampler used to generate uniformly distributed batches
+    """
+
+    def __init__(self,
+                 data_source,
+                 spk_count,
+                 examples_per_speaker,
+                 samples_per_speaker,
+                 batch_size,
+                 seed=0,
+                 rank=0,
+                 num_process=1,
+                 num_replicas=1):
+        """Index every segment per speaker and shuffle the segment list of each speaker.
+
+        Args:
+            data_source: speaker index (0 .. spk_count - 1) of every segment, in dataset order
+            spk_count: number of speakers
+            examples_per_speaker: number of consecutive segments drawn each time a speaker is picked
+            samples_per_speaker: number of passes over the whole speaker set in one epoch
+            batch_size: must be a multiple of examples_per_speaker (checked with an assert)
+            seed, rank, num_process, num_replicas: random seed, slice read by this process, number of processes, number of GPUs
+        """
+        self.train_sessions = data_source
+        self.labels_to_indices = dict()
+        self.spk_count = spk_count
+        self.examples_per_speaker = examples_per_speaker
+        self.samples_per_speaker = samples_per_speaker
+        self.epoch = 0
+        self.seed = seed
+        self.rank = rank
+        self.num_process = num_process
+        self.num_replicas = num_replicas
+
+        assert batch_size % examples_per_speaker == 0
+        assert (self.samples_per_speaker * self.spk_count * self.examples_per_speaker) % self.num_process == 0
+
+        self.batch_size = batch_size // (self.examples_per_speaker * self.num_replicas)
+        # from here on, self.batch_size counts speakers per batch and per replica, not segments
+
+        # reference all segment indexes per speaker
+        for idx in range(self.spk_count):
+            self.labels_to_indices[idx] = list()
+        for idx, value in enumerate(self.train_sessions):
+            self.labels_to_indices[value].append(idx)
+        # shuffle segments per speaker
+        g = torch.Generator()
+        g.manual_seed(self.seed + self.epoch)
+        for idx, ldlist in enumerate(self.labels_to_indices.values()):
+            ldlist = numpy.array(ldlist)
+            self.labels_to_indices[idx] = ldlist[torch.randperm(ldlist.shape[0], generator=g).numpy()]
+
+        self.segment_cursors = numpy.zeros((len(self.labels_to_indices),), dtype=int)  # numpy.int was removed in NumPy 1.24
+
+
+    def __iter__(self):  # builds and returns the ordered segment indexes of one epoch for this rank
+        g = torch.Generator()
+        g.manual_seed(self.seed + self.epoch)
+        numpy.random.seed(self.seed + self.epoch)
+
+        # Generate batches per speaker
+        straight = numpy.arange(self.spk_count)
+        indices = numpy.ones((self.samples_per_speaker, self.spk_count), dtype=int) * straight
+        batch_cursor = 0
+        # each line of "indices" represents all speaker indexes (shuffled in a different way)
+        for idx in range(self.samples_per_speaker):
+            if batch_cursor == 0:
+                indices[idx, :] = numpy.random.permutation(straight)
+            else:
+                # if one batch is split between the end of previous line and the beginning of current line
+                # we make sure no speaker is present twice in this batch
+                probs = numpy.ones_like(straight)
+                probs[indices[idx-1, -batch_cursor:]] = 0
+                probs = probs/numpy.sum(probs)
+                indices[idx, :self.batch_size - batch_cursor] = numpy.random.choice(self.spk_count, self.batch_size - batch_cursor, replace=False, p=probs)
+                probs = numpy.ones_like(straight)
+                probs[indices[idx, :self.batch_size - batch_cursor]] = 0
+                to_pick = numpy.sum(probs).astype(int)
+                probs = probs/numpy.sum(probs)
+                indices[idx, self.batch_size - batch_cursor:] = numpy.random.choice(self.spk_count, to_pick, replace=False, p=probs)
+
+            assert numpy.sum(indices[idx, :]) == numpy.sum(straight)
+            batch_cursor = (batch_cursor + indices.shape[1]) % self.batch_size
+
+        # now we have the speaker indexes to sample in batches
+        batch_matrix = numpy.repeat(indices, self.examples_per_speaker, axis=1).flatten()
+
+        # we want to convert the speaker indexes into segment indexes
+        self.index_iterator = numpy.zeros_like(batch_matrix)
+
+        # keep track of next segment index to sample for each speaker
+        for idx, value in enumerate(batch_matrix):
+            if self.segment_cursors[value] > len(self.labels_to_indices[value]) - 1:
+                self.labels_to_indices[value] = self.labels_to_indices[value][torch.randperm(self.labels_to_indices[value].shape[0], generator=g)]
+                self.segment_cursors[value] = 0
+            self.index_iterator[idx] = self.labels_to_indices[value][self.segment_cursors[value]]
+            self.segment_cursors[value] += 1
+        # every index is repeated num_replicas times, then only the columns of this rank are kept
+
+        self.index_iterator = numpy.repeat(self.index_iterator, self.num_replicas)
+        self.index_iterator = self.index_iterator.reshape(-1, self.num_process * self.examples_per_speaker * self.num_replicas)[:, self.rank * self.examples_per_speaker * self.num_replicas:(self.rank + 1) * self.examples_per_speaker * self.num_replicas].flatten()
+
+        return iter(self.index_iterator)
+
+    def __len__(self) -> int:
+        # number of indexes yielded to one process during one epoch
+        return (self.samples_per_speaker * self.spk_count * self.examples_per_speaker * self.num_replicas) // self.num_process
+
+
+
+    def set_epoch(self, epoch: int) -> None:  # the epoch is added to the seed used by __iter__
+        self.epoch = epoch
+
+
+class SideSet(Dataset):
+
+ def __init__(self,
+ dataset,
+ set_type="train",
+ chunk_per_segment=1,
+ transform_number=1,
+ overlap=0.,
+ dataset_df=None,
+ min_duration=0.165,
+ output_format="pytorch",
+ ):
+ """
+
+ :param dataset_yaml: name of the YAML file describing the dataset
+ :param set_type: string, can be "train" or "validation"
+ :param chunk_per_segment: number of chunks to select for each segment
+ default is 1 and -1 means select all possible chunks
+ """
+ self.data_path = dataset["data_path"]
+ self.sample_rate = int(dataset["sample_rate"])
+ self.data_file_extension = dataset["data_file_extension"]
+ self.transformation = ''
+ self.min_duration = min_duration
+ self.output_format = output_format
+ self.transform_number = transform_number
+
+ if set_type == "train":
+ self.duration = dataset["train"]["duration"]
+ self.transformation = dataset["train"]["transformation"]
+ else:
+ self.duration = dataset["valid"]["duration"]
+ self.transformation = dataset["valid"]["transformation"]
+
+ self.sample_number = int(self.duration * self.sample_rate)
+ self.overlap = int(overlap * self.sample_rate)
+
+ # Load the dataset description as pandas.dataframe
+ if dataset_df is None:
+ df = pandas.read_csv(dataset["dataset_description"])
+ else:
+ assert isinstance(dataset_df, pandas.DataFrame)
+ df = dataset_df
+
+ # From each segment which duration is longer than the chosen one
+ # select the requested segments
+ if set_type == "train":
+ tmp_sessions = df.loc[df['duration'] > self.duration]
+ else:
+ if not "duration" == '':
+ tmp_sessions = df.loc[df['duration'] > self.duration]
+ else:
+ self.sessions = df
+
+ # Create lists for each column of the dataframe
+ df_dict = dict(zip(df.columns, [[], [], [], [], [], [], []]))
+ df_dict["file_start"] = list()
+ df_dict["file_duration"] = list()
+
+ # For each segment, get all possible segments with the current overlap
+ for idx in tqdm.trange(len(tmp_sessions), desc='indexing all ' + set_type + ' segments', mininterval=1, disable=None):
+ current_session = tmp_sessions.iloc[idx]
+
+ # Compute possible starts
+ possible_starts = numpy.arange(0,
+ int(self.sample_rate * (current_session.duration - self.duration)),
+ self.sample_number
+ ) + int(self.sample_rate * (current_session.duration % self.duration / 2))
+ possible_starts += int(self.sample_rate * current_session.start)
+
+ # Select max(seg_nb, possible_segments) segments
+ if chunk_per_segment == -1:
+ starts = possible_starts
+ chunk_nb = len(possible_starts)
+ else:
+ chunk_nb = min(len(possible_starts), chunk_per_segment)
+ starts = numpy.random.permutation(possible_starts)[:chunk_nb]
+
+ # Once we know how many segments are selected, create the other fields to fill the DataFrame
+ for ii in range(chunk_nb):
+ df_dict["database"].append(current_session.database)
+ df_dict["speaker_id"].append(current_session.speaker_id)
+ df_dict["file_id"].append(current_session.file_id)
+ df_dict["start"].append(starts[ii])
+ df_dict["duration"].append(self.duration)
+ df_dict["file_start"].append(current_session.start)
+ df_dict["file_duration"].append(current_session.duration)
+ df_dict["speaker_idx"].append(current_session.speaker_idx)
+ df_dict["gender"].append(current_session.gender)
+
+ self.sessions = pandas.DataFrame.from_dict(df_dict)
+ self.len = len(self.sessions)
+
+ self.transform = dict()
+ if (self.transformation["pipeline"] != '') and (self.transformation["pipeline"] is not None):
+ transforms = self.transformation["pipeline"].split(',')
+ if "add_noise" in transforms:
+ self.transform["add_noise"] = self.transformation["add_noise"]
+ if "add_reverb" in transforms:
+ self.transform["add_reverb"] = self.transformation["add_reverb"]
+ if "codec" in transforms:
+ self.transform["codec"] = []
+ if "phone_filtering" in transforms:
+ self.transform["phone_filtering"] = []
+
+ self.noise_df = None
+ if "add_noise" in self.transform:
+ noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
+ noise_df = noise_df.loc[noise_df.duration > self.duration]
+ self.noise_df = noise_df.set_index(noise_df.type)
+
+ self.rir_df = None
+ if "add_reverb" in self.transform:
+ tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
+ tmp_rir_df = tmp_rir_df.loc[tmp_rir_df["type"] == "simulated_rirs"]
+ # load the RIR database
+ self.rir_df = tmp_rir_df.set_index(tmp_rir_df.type)
+
+
+    def __getitem__(self, index):
+        """
+        Load one training segment together with its speaker label.
+
+        The start of the segment is randomly shifted when ``self.overlap`` is positive, the
+        samples are read from disk, resampled to ``self.sample_rate`` when the file uses another
+        rate, dithered with a small Gaussian noise and finally augmented when ``self.transform``
+        is not empty.
+
+        :param index: row of ``self.sessions`` to load
+        :return: a tuple (speech, speaker_idx); speaker_idx is wrapped in a torch.Tensor when
+            ``self.output_format`` is "pytorch"
+        """
+        current_session = self.sessions.iloc[index]
+        audio_file = f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}"
+
+        # The file header gives the native sample rate and the number of frames available
+        nfo = torchaudio.info(audio_file)
+        original_start = int(current_session['start'])
+        if self.overlap > 0:
+            lowest_shift = self.overlap/2
+            highest_shift = self.overlap/2
+            # Shrink the random shift range next to the file boundaries
+            if original_start < (current_session['file_start']*self.sample_rate + self.sample_number/2):
+                lowest_shift = int(original_start - current_session['file_start']*self.sample_rate)
+            if original_start + self.sample_number > (current_session['file_start'] + current_session['file_duration'])*self.sample_rate - self.sample_number/2:
+                highest_shift = int((current_session['file_start'] + current_session['file_duration'])*self.sample_rate - (original_start + self.sample_number))
+            start_frame = original_start + int(random.uniform(-lowest_shift, highest_shift))
+        else:
+            start_frame = original_start
+
+        conversion_rate = nfo.sample_rate // self.sample_rate
+
+        # start_frame and sample_number are expressed at self.sample_rate while nfo.num_frames is
+        # expressed at the native rate of the file: convert both before comparing them.
+        frame_offset = conversion_rate * start_frame
+        frame_count = conversion_rate * self.sample_number
+        if frame_offset + frame_count >= nfo.num_frames:
+            # Move the window back so that it ends before the end of the file and never
+            # starts before its first frame (files shorter than the window)
+            frame_offset = max(0, nfo.num_frames - frame_count - 1)
+
+        speech, speech_fs = torchaudio.load(audio_file,
+                                            frame_offset=frame_offset,
+                                            num_frames=frame_count)
+
+        if nfo.sample_rate != self.sample_rate:
+            speech = torchaudio.transforms.Resample(nfo.sample_rate, self.sample_rate).forward(speech)
+
+        # Dithering
+        speech += 10e-6 * torch.randn(speech.shape)
+
+        if len(self.transform) > 0:
+            speech = data_augmentation(speech,
+                                       self.sample_rate,
+                                       self.transform,
+                                       self.transform_number,
+                                       noise_df=self.noise_df,
+                                       rir_df=self.rir_df)
+
+        speaker_idx = current_session["speaker_idx"]
+
+        if self.output_format == "pytorch":
+            return speech, torch.tensor(speaker_idx)
+        else:
+            return speech, speaker_idx
+
+    def __len__(self):
+        """
+        Give the number of items exposed by the dataset.
+
+        :return: the value stored in ``self.len``
+        """
+        return self.len
+
+def get_sample(path, resample=None):
+    """
+    Read an audio file through a sox effect chain applied by torchaudio.
+
+    The chain always contains ``remix 1``; a ``rate`` effect is appended when ``resample``
+    is truthy.
+
+    :param path: path of the audio file, forwarded to torchaudio
+    :param resample: optional target rate, formatted into the ``rate`` effect
+    :return: the value returned by ``torchaudio.sox_effects.apply_effects_file``
+    """
+    chain = [["remix", "1"]]
+    if resample:
+        chain += [["rate", f'{resample}']]
+    return torchaudio.sox_effects.apply_effects_file(path, effects=chain)
+
+
+class IdMapSet(Dataset):
+    """
+    DataSet that provides one item per entry of a sidekit.IdMap object
+    """
+
+    def __init__(self,
+                 idmap_name,
+                 data_path,
+                 file_extension,
+                 transform_pipeline=None,
+                 transform_number=1,
+                 sliding_window=False,
+                 window_len=3.,
+                 window_shift=1.5,
+                 sample_rate=16000,
+                 min_duration=0.165
+                 ):
+        """
+
+        :param idmap_name: an IdMap object or the name of a file to load the IdMap from
+        :param data_path: directory that contains the audio files
+        :param file_extension: extension of the audio files, without the leading dot
+        :param transform_pipeline: dictionary describing the augmentations (None means no augmentation)
+        :param transform_number: forwarded to data_augmentation
+        :param sliding_window: if True, each item is cut into windows with ``Tensor.unfold``
+        :param window_len: length of a window, multiplied by sample_rate to get samples
+        :param window_shift: shift between two windows, multiplied by sample_rate to get samples
+        :param sample_rate: sample rate used to convert the IdMap boundaries into samples
+        :param min_duration: segments that are not longer than this value are enlarged to it
+        """
+        if isinstance(idmap_name, IdMap):
+            self.idmap = idmap_name
+        else:
+            self.idmap = IdMap(idmap_name)
+
+        self.data_path = data_path
+        self.file_extension = file_extension
+        self.len = self.idmap.leftids.shape[0]
+        # A fresh dictionary per instance: a mutable default argument would be shared by all instances
+        self.transformation = {} if transform_pipeline is None else transform_pipeline
+        self.min_duration = min_duration
+        self.sample_rate = sample_rate
+        self.sliding_window = sliding_window
+        self.window_len = int(window_len * self.sample_rate)
+        self.window_shift = int(window_shift * self.sample_rate)
+        self.transform_number = transform_number
+
+        self.noise_df = None
+        if "add_noise" in self.transformation:
+            # Load the noise dataset, filter according to the duration
+            noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
+            # NOTE(review): the filter used to read self.duration, which this class never defines
+            # (AttributeError). min_duration is the only duration known here - confirm the threshold.
+            tmp_df = noise_df.loc[noise_df['duration'] > self.min_duration]
+            self.noise_df = tmp_df['file_id'].tolist()
+
+        self.rir_df = None
+        if "add_reverb" in self.transformation:
+            # load the RIR database
+            tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
+            # Materialise the pairs: a bare zip object is exhausted after the first pass
+            self.rir_df = list(zip(tmp_rir_df['file_id'].tolist(), tmp_rir_df['channel'].tolist()))
+
+    def __getitem__(self, index):
+        """
+        Load the audio of one IdMap entry.
+
+        :param index: position of the entry in the IdMap
+        :return: a tuple (speech, leftid, rightid, start, stop) where start and stop are
+            expressed in samples
+        """
+        audio_file = f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}"
+
+        # Read start and stop and convert them to a number of samples
+        # (IdMap values are scaled by 0.01 * sample_rate - presumably centiseconds, TODO confirm)
+        if self.idmap.start[index] is None:
+            start = 0
+        else:
+            start = int(self.idmap.start[index] * 0.01 * self.sample_rate)
+
+        if self.idmap.stop[index] is None:
+            # No end boundary: keep everything from start to the end of the file so that the
+            # returned audio matches the returned (start, stop)
+            speech, speech_fs = torchaudio.load(audio_file)
+            speech = speech[:, start:]
+            duration = int(speech.shape[1])
+        else:
+            duration = int(self.idmap.stop[index] * 0.01 * self.sample_rate) - start
+            # add this in case the segment is too short
+            if duration <= self.min_duration * self.sample_rate:
+                middle = start + duration // 2
+                start = int(max(0, int(middle - (self.min_duration * self.sample_rate / 2))))
+                duration = int(self.min_duration * self.sample_rate)
+            speech, speech_fs = torchaudio.load(audio_file,
+                                                frame_offset=start,
+                                                num_frames=duration)
+
+        # Dithering
+        speech += 10e-6 * torch.randn(speech.shape)
+
+        if self.sliding_window:
+            speech = speech.squeeze().unfold(0, self.window_len, self.window_shift)
+        stop = start + duration
+
+        if len(self.transformation.keys()) > 0:
+            speech = data_augmentation(speech,
+                                       speech_fs,
+                                       self.transformation,
+                                       self.transform_number,
+                                       noise_df=self.noise_df,
+                                       rir_df=self.rir_df)
+
+        speech = speech.squeeze()
+
+        return speech, self.idmap.leftids[index], self.idmap.rightids[index], start, stop
+
+    def __len__(self):
+        """
+
+        :return: the number of entries of the IdMap
+        """
+        return self.len
+
+
+class IdMapSetPerSpeaker(Dataset):
+    """
+    DataSet that provides, for each speaker (left id) of a sidekit.IdMap object,
+    the concatenation of all the segments of this speaker
+    """
+
+    def __init__(self,
+                 idmap_name,
+                 data_path,
+                 file_extension,
+                 transform_pipeline=None,
+                 transform_number=1,
+                 sample_rate=16000,
+                 min_duration=0.165
+                 ):
+        """
+
+        :param idmap_name: an IdMap object or the name of a file to load the IdMap from
+        :param data_path: directory that contains the audio files
+        :param file_extension: extension of the audio files, without the leading dot
+        :param transform_pipeline: dictionary describing the augmentations (None means no augmentation)
+        :param transform_number: forwarded to data_augmentation
+        :param sample_rate: sample rate used to convert the IdMap boundaries into samples
+        :param min_duration: segments that are not longer than this value are enlarged to it
+        """
+        if isinstance(idmap_name, IdMap):
+            self.idmap = idmap_name
+        else:
+            self.idmap = IdMap(idmap_name)
+
+        self.data_path = data_path
+        self.file_extension = file_extension
+        self.len = len(set(self.idmap.leftids))
+        # A fresh dictionary per instance: a mutable default argument would be shared by all instances
+        self.transformation = {} if transform_pipeline is None else transform_pipeline
+        self.transform_number = transform_number
+        self.min_duration = min_duration
+        self.sample_rate = sample_rate
+        self.speaker_list = list(set(self.idmap.leftids))
+        # IdMap with one entry per speaker; items of the dataset are indexed on this map
+        self.output_im = IdMap()
+        self.output_im.leftids = numpy.unique(self.idmap.leftids)
+        self.output_im.rightids = self.output_im.leftids
+        self.output_im.start = numpy.empty(self.output_im.rightids.shape[0], "|O")
+        self.output_im.stop = numpy.empty(self.output_im.rightids.shape[0], "|O")
+
+        self.noise_df = None
+        if "add_noise" in self.transformation:
+            # Load the noise dataset, filter according to the duration
+            noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
+            # NOTE(review): the filter used to read self.duration, which this class never defines
+            # (AttributeError). min_duration is the only duration known here - confirm the threshold.
+            tmp_df = noise_df.loc[noise_df['duration'] > self.min_duration]
+            self.noise_df = tmp_df['file_id'].tolist()
+
+        self.rir_df = None
+        if "add_reverb" in self.transformation:
+            # load the RIR database
+            tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
+            # Materialise the pairs: a bare zip object is exhausted after the first pass
+            self.rir_df = list(zip(tmp_rir_df['file_id'].tolist(), tmp_rir_df['channel'].tolist()))
+
+    def __getitem__(self, index):
+        """
+        Load and concatenate all the segments of one speaker.
+
+        :param index: position of the speaker in ``self.output_im``
+        :return: a tuple (speech, leftid, rightid, start, stop); both ids are the speaker id,
+            start and stop are the boundaries (in samples) of the last segment that was loaded
+        """
+
+        # Loop on all segments from the given speaker to load data
+        spk_id = self.output_im.leftids[index]
+        tmp_data = []
+        for sid, seg_id, seg_start, seg_stop in zip(self.idmap.leftids, self.idmap.rightids,
+                                                    self.idmap.start, self.idmap.stop):
+            if sid != spk_id:
+                continue
+
+            # The file of the current segment; `index` addresses a speaker, not a segment,
+            # so it must not be used to look up a file name in self.idmap
+            audio_file = f"{self.data_path}/{seg_id}.{self.file_extension}"
+
+            # Read start and stop and convert them to a number of samples
+            if seg_start is None:
+                start = 0
+            else:
+                start = int(seg_start * 0.01 * self.sample_rate)
+
+            if seg_stop is None:
+                # No end boundary: keep everything from start to the end of the file
+                speech, speech_fs = torchaudio.load(audio_file)
+                speech = speech[:, start:]
+                duration = int(speech.shape[1])
+            else:
+                duration = int(seg_stop * 0.01 * self.sample_rate) - start
+                # add this in case the segment is too short
+                if duration <= self.min_duration * self.sample_rate:
+                    middle = start + duration // 2
+                    start = int(max(0, int(middle - (self.min_duration * self.sample_rate / 2))))
+                    duration = int(self.min_duration * self.sample_rate)
+
+                speech, speech_fs = torchaudio.load(audio_file,
+                                                    frame_offset=start,
+                                                    num_frames=duration)
+
+            speech += 10e-6 * torch.randn(speech.shape)
+            tmp_data.append(speech)
+
+        speech = torch.cat(tmp_data, dim=1)
+        speech += 10e-6 * torch.randn(speech.shape)
+
+        if len(self.transformation.keys()) > 0:
+            speech = data_augmentation(speech,
+                                       speech_fs,
+                                       self.transformation,
+                                       self.transform_number,
+                                       noise_df=self.noise_df,
+                                       rir_df=self.rir_df)
+
+        stop = start + duration
+        speech = speech.squeeze()
+
+        # Ids come from the per-speaker map: self.idmap holds one entry per segment and
+        # cannot be addressed with a speaker index
+        return speech, spk_id, self.output_im.rightids[index], start, stop
+
+    def __len__(self):
+        """
+
+        :return: the number of distinct speakers of the IdMap
+        """
+        return self.len
+
diff --git a/sidekit/sidekit/nnet/xvector.py b/sidekit/sidekit/nnet/xvector.py
new file mode 100755
index 0000000..01c5137
--- /dev/null
+++ b/sidekit/sidekit/nnet/xvector.py
@@ -0,0 +1,1928 @@
+#coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo
+"""
+
+
+import logging
+import math
+import os
+import numpy
+import random
+import pandas
+import shutil
+import torch
+import tqdm
+import yaml
+import pickle
+
+from collections import OrderedDict
+from torch.utils.data import DataLoader
+from sklearn.model_selection import train_test_split
+from .pooling import MeanStdPooling
+from .pooling import AttentivePooling
+from .pooling import GruPooling
+from .preprocessor import MfccFrontEnd
+from .preprocessor import MelSpecFrontEnd
+from .preprocessor import BNFrontEnd
+from .preprocessor import RawPreprocessor
+from .xsets import SideSet
+from .xsets import IdMapSet
+from .xsets import IdMapSetPerSpeaker
+from .xsets import SideSampler
+from .res_net import ResBlockWFMS
+from .res_net import ResBlock
+from .res_net import PreFastResNet34
+from .res_net import PreHalfResNet34
+from .res_net import PreResNet34
+from ..bosaris import IdMap
+from ..bosaris import Key
+from ..bosaris import Ndx
+from ..statserver import StatServer
+from ..iv_scoring import cosine_scoring
+from .sincnet import SincNet
+from ..bosaris.detplot import rocch, rocch2eer
+from .loss import SoftmaxAngularProto
+from .loss import l2_norm
+from .loss import ArcMarginProduct
+from .loss import ArcLinear
+from .loss import AngularProximityMagnet
+
+
+# Set the MKL_THREADING_LAYER environment variable to 'GNU' for the current process
+# as soon as this module is imported
+os.environ['MKL_THREADING_LAYER'] = 'GNU'
+
+#torch.backends.cudnn.benchmark = True
+
+# Module-level metadata: license, authorship, contact, status and documentation format
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2015-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reS'
+
+
+def seed_worker(seed_val):
+    """
+    Re-seed the ``numpy`` and ``random`` generators from the current torch seed.
+
+    :param seed_val: not read by the function; kept so that it can be called with one argument
+    :return: None
+    """
+    derived_seed = torch.initial_seed() % (1 << 32)
+    for reseed in (numpy.random.seed, random.seed):
+        reseed(derived_seed)
+
+
+def eer(negatives, positives):
+    """
+    Logarithmic complexity EER computation
+
+    Both score arrays are sorted first (positives in ascending order, negatives in descending
+    order). A search with step sizes halved at each iteration then looks for the position where
+    the two sorted lists cross, and a local refinement keeps the candidate for which
+    ``abs(tfr - tfa)`` is the smallest.
+
+    :param negatives: negative_scores (numpy array): impostor scores
+    :param positives: positive_scores (numpy array): genuine scores
+    :return: float: Equal Error Rate (EER), computed as ``(tfr + tfa) / 2``; the guard
+        clauses of the search loop return 0 or 100 instead
+    """
+    # Ascending genuine scores, descending impostor scores
+    positives = numpy.sort(positives)
+    negatives = numpy.sort(negatives)[::-1]
+
+    pos_count = positives.shape[0]
+    neg_count = negatives.shape[0]
+
+    p_score = positives[0]
+    n_score = negatives[0]
+
+    p_index = 0
+    n_index = 0
+
+    # Initial step of the search: half of each list
+    next_p_jump = pos_count//2
+    next_n_jump = neg_count//2
+
+    # kdx counts the iterations; it is not read anywhere else in the function
+    kdx = 0
+    while True:
+        kdx += 1
+        # Guard clauses: indices that left the valid range end the computation
+        if p_index < 0 or n_index < 0:
+            return 0
+        if p_index > pos_count or n_index > neg_count:
+            return 100
+        # Move both indices forward or backward depending on the order of the current scores;
+        # the loop stops once both steps have shrunk to zero
+        if p_score < n_score:
+            p_index = p_index + next_p_jump
+            n_index = n_index + next_n_jump
+            if next_p_jump == 0 and next_n_jump == 0:
+                break
+        elif p_score >= n_score:
+            p_index = p_index - next_p_jump
+            n_index = n_index - next_n_jump
+            if next_p_jump == 0 and next_n_jump == 0:
+                break
+
+        p_score = positives[p_index]
+        n_score = negatives[n_index]
+        next_p_jump = next_p_jump//2
+        next_n_jump = next_n_jump//2
+
+    # Smallest abs(tfr - tfa) seen so far; 100 is larger than any value compared against it
+    eer_predicate = 100
+
+    # tfr / tfa: ratios of the current indices to the list sizes
+    # (presumably the false rejection and false acceptance rates - TODO confirm)
+    tfr = (abs(p_index))/pos_count
+    tfa = (1+abs(n_index))/neg_count
+    if (p_score == n_score and tfr == tfa):
+        return tfr
+
+    # Advance while the genuine score is below the impostor score, positives first
+    while positives[p_index] < negatives[n_index]:
+        if p_index < pos_count - 1:
+            p_index += 1
+        elif n_index < neg_count - 1:
+            n_index += 1
+        else:
+            break
+
+    # Step back in the impostor list while the genuine score is above the impostor score
+    while positives[p_index] > negatives[n_index] and n_index >= 1:
+        n_index -= 1
+
+    tfr = (1+p_index)/pos_count
+    tfa = (1+n_index)/neg_count
+
+    # Increase p_index until tfr is at least tfa
+    while tfa > tfr:
+        p_index += 1
+        while positives[p_index] > negatives[n_index] and n_index >= 1:
+            n_index -= 1
+        tfr = (1+p_index)/pos_count
+        tfa = (1+n_index)/neg_count
+
+    # First candidate: always accepted because eer_predicate starts at 100
+    if abs(tfr - tfa) <= eer_predicate:
+        eer_predicate = abs(tfr - tfa)
+        eer = (tfr + tfa) / 2
+    else:
+        return eer
+
+    # Second candidate, with tfr computed without the +1 offset
+    tfr = p_index/pos_count
+    tfa = (1+n_index)/neg_count
+    if abs(tfr - tfa) <= eer_predicate:
+        eer_predicate = abs(tfr - tfa)
+        eer = (tfr + tfa) / 2
+    else:
+        return eer
+
+    # Keep moving p_index down and n_index up; the function returns as soon as a
+    # candidate has a larger abs(tfr - tfa) than the best one found so far
+    while True:
+        while negatives[n_index + 1] <= positives[p_index - 1]:
+            p_index -= 1
+            tfr = p_index/pos_count
+            tfa = (1+n_index)/neg_count
+            if abs(tfr - tfa) <= eer_predicate:
+                eer_predicate = abs(tfr - tfa)
+                eer = (tfr + tfa) / 2
+            else:
+                return eer
+        while negatives[n_index + 1] > positives[p_index - 1]:
+            n_index += 1
+            tfr = p_index/pos_count
+            tfa = (1+n_index)/neg_count
+            if abs(tfr - tfa) <= eer_predicate:
+                eer_predicate = abs(tfr - tfa)
+                eer = (tfr + tfa) / 2
+            else:
+                return eer
+
+    return eer
+
+
+def test_metrics(model,
+                 device,
+                 model_opts,
+                 data_opts,
+                 train_opts):
+    """
+    Compute the equal error rate of a model on the test set described in ``data_opts["test"]``.
+
+    Embeddings are extracted without any augmentation, scored against themselves with the
+    cosine scoring on the test Ndx, and the EER is derived from the ROC convex hull of the
+    target and non-target scores.
+
+    :param model: forwarded to extract_embeddings as ``model_filename``
+    :param device: device used for the extraction and the scoring
+    :param model_opts: not read by this function
+    :param data_opts: dictionary whose "test" entry provides "idmap", "data_path", "ndx" and "key"
+    :param train_opts: dictionary providing "num_cpu" and "mixed_precision"
+    :return: the value returned by rocch2eer
+    """
+    test_cfg = data_opts["test"]
+
+    embeddings = extract_embeddings(idmap_name=test_cfg["idmap"],
+                                    model_filename=model,
+                                    data_root_name=test_cfg["data_path"],
+                                    device=device,
+                                    transform_pipeline={},
+                                    num_thread=train_opts["num_cpu"],
+                                    mixed_precision=train_opts["mixed_precision"])
+
+    # The same embeddings serve as enrolment and test data
+    scores = cosine_scoring(embeddings,
+                            embeddings,
+                            Ndx(test_cfg["ndx"]),
+                            wccn=None,
+                            check_missing=True,
+                            device=device)
+    tar, non = scores.get_tar_non(Key(test_cfg["key"]))
+
+    pmiss, pfa = rocch(tar, non)
+    return rocch2eer(pmiss, pfa)
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', best_filename='model_best.pth.tar'):
+    """
+    Serialise a training state with ``torch.save`` and duplicate it when it is the best so far.
+
+    :param state: object to serialise
+    :param is_best: when true, the file that was just written is copied to ``best_filename``
+    :param filename: path of the checkpoint to write
+    :param best_filename: path of the copy kept for the best model
+    :return: None
+    """
+    torch.save(state, filename)
+    if not is_best:
+        return
+    shutil.copyfile(filename, best_filename)
+
+
+class TrainingMonitor():
+    """
+    Store the metrics of a training run, log them and manage the early-stopping patience.
+
+    The model selection relies on the EER: the test EER when ``compute_test_eer`` is true
+    and a test EER is given to ``update``, the validation EER otherwise.
+    """
+    def __init__(self,
+                 output_file,
+                 log_interval=10,
+                 patience=numpy.inf,
+                 best_accuracy=0.0,
+                 best_eer_epoch=1,
+                 best_eer=100,
+                 compute_test_eer=False
+                 ):
+        """
+
+        :param output_file: file that receives the log messages
+        :param log_interval: stored as is, not used by the class itself
+        :param patience: number of updates without improvement of the EER that are tolerated
+        :param best_accuracy: best validation accuracy known when the monitor is created
+        :param best_eer_epoch: epoch of the best EER known when the monitor is created
+        :param best_eer: best EER known when the monitor is created
+        :param compute_test_eer: if True, the test EER drives the model selection
+        """
+        self.current_epoch = 0
+        self.log_interval = log_interval
+        self.init_patience = patience
+        self.current_patience = patience
+        self.best_accuracy = best_accuracy
+        # Epoch of the best validation accuracy; read by display_final, which used to fail
+        # with an AttributeError because the attribute was never created
+        self.best_accuracy_epoch = self.current_epoch
+        self.best_eer_epoch = best_eer_epoch
+        self.best_eer = best_eer
+        self.compute_test_eer = compute_test_eer
+        self.test_eer = []
+
+        self.training_loss = []
+        self.training_acc = []
+
+        self.val_loss = []
+        self.val_acc = []
+        self.val_eer = []
+
+        self.is_best = True
+
+        # Initialize the logger
+        logging_format = '%(asctime)-15s %(message)s'
+        logging.basicConfig(level=logging.DEBUG, format=logging_format, datefmt='%m-%d %H:%M')
+        self.logger = logging.getLogger('Monitoring')
+        self.logger.setLevel(logging.DEBUG)
+        # create file handler which logs even debug messages
+        fh = logging.FileHandler(output_file)
+        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        fh.setFormatter(formatter)
+        fh.setLevel(logging.DEBUG)
+        self.logger.addHandler(fh)
+
+    def display(self):
+        """
+        Log the most recent validation and test metrics; metrics that have not been
+        recorded yet are skipped instead of raising an IndexError.
+
+        :return: None
+        """
+        if self.val_acc and self.val_eer:
+            self.logger.critical(f"***Validation metrics - Cross validation accuracy = {self.val_acc[-1]} %, EER = {self.val_eer[-1] * 100} %")
+        if self.test_eer:
+            self.logger.critical(f"***Test metrics - Test EER = {self.test_eer[-1] * 100} %")
+
+    def display_final(self):
+        """
+        Log the best validation accuracy and the epoch at which it was recorded.
+
+        :return: None
+        """
+        self.logger.critical(f"Best accuracy {self.best_accuracy * 100.} obtained at epoch {self.best_accuracy_epoch}")
+
+    def update(self,
+               epoch=None,
+               training_acc=None,
+               training_loss=None,
+               test_eer=None,
+               val_eer=None,
+               val_loss=None,
+               val_acc=None):
+        """
+        Record the metrics that are given and update the best EER and the patience.
+
+        :param epoch: current epoch
+        :param training_acc: training accuracy to append to the history
+        :param training_loss: training loss to append to the history
+        :param test_eer: EER on the test set, only recorded when ``compute_test_eer`` is true
+        :param val_eer: EER on the validation set
+        :param val_loss: validation loss to append to the history
+        :param val_acc: validation accuracy to append to the history
+        :return: None
+        """
+        if epoch is not None:
+            self.current_epoch = epoch
+        if training_acc is not None:
+            self.training_acc.append(training_acc)
+        if training_loss is not None:
+            self.training_loss.append(training_loss)
+        if val_eer is not None:
+            self.val_eer.append(val_eer)
+        if val_loss is not None:
+            self.val_loss.append(val_loss)
+        if val_acc is not None:
+            self.val_acc.append(val_acc)
+            # Remember the best accuracy so that display_final reports real values
+            if val_acc > self.best_accuracy:
+                self.best_accuracy = val_acc
+                self.best_accuracy_epoch = self.current_epoch
+
+        # Select the EER that drives the model selection and the patience
+        if self.compute_test_eer and test_eer is not None:
+            self.test_eer.append(test_eer)
+            reference_eer = test_eer
+        elif val_eer is not None:
+            reference_eer = val_eer
+        else:
+            return
+
+        self.is_best = reference_eer < self.best_eer
+        if self.is_best:
+            self.best_eer = reference_eer
+            # current_epoch keeps the last known epoch when `epoch` is omitted
+            self.best_eer_epoch = self.current_epoch
+            self.current_patience = self.init_patience
+        else:
+            self.current_patience -= 1
+
+
+class Xtractor(torch.nn.Module):
+ """
+ Class that defines an x-vector extractor based on 5 convolutional layers and a mean standard deviation pooling
+ """
+
+ def __init__(self,
+ speaker_number,
+ model_archi="xvector",
+ loss=None,
+ norm_embedding=False,
+ aam_margin=0.2,
+ aam_s=30,
+ embedding_size=256):
+ """
+ If config is None, default architecture is created
+ :param model_archi:
+ """
+ super(Xtractor, self).__init__()
+ self.speaker_number = speaker_number
+ self.feature_size = None
+ self.norm_embedding = norm_embedding
+
+ if model_archi == "xvector":
+
+ self.input_nbdim = 2
+
+ if loss not in ["cce", 'aam']:
+ raise NotImplementedError(f"The valid loss are for now cce and aam ")
+ else:
+ self.loss = loss
+
+ self.activation = torch.nn.LeakyReLU(0.2)
+
+ self.preprocessor = MfccFrontEnd()
+ self.feature_size = self.preprocessor.n_mfcc
+
+ self.sequence_network = torch.nn.Sequential(OrderedDict([
+ ("conv1", torch.nn.Conv1d(self.feature_size, 512, 5, dilation=1)),
+ ("activation1", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm1", torch.nn.BatchNorm1d(512)),
+ ("conv2", torch.nn.Conv1d(512, 512, 3, dilation=2)),
+ ("activation2", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm2", torch.nn.BatchNorm1d(512)),
+ ("conv3", torch.nn.Conv1d(512, 512, 3, dilation=3)),
+ ("activation3", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm3", torch.nn.BatchNorm1d(512)),
+ ("conv4", torch.nn.Conv1d(512, 512, 1)),
+ ("activation4", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm4", torch.nn.BatchNorm1d(512)),
+ ("conv5", torch.nn.Conv1d(512, 1536, 1)),
+ ("activation5", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm5", torch.nn.BatchNorm1d(1536))
+ ]))
+
+ self.embedding_size = embedding_size
+
+ self.stat_pooling = MeanStdPooling()
+ self.stat_pooling_weight_decay = 0
+ self.before_speaker_embedding = torch.nn.Sequential(OrderedDict([
+ ("linear6", torch.nn.Linear(3072, self.embedding_size))
+ ]))
+
+ if self.loss == "aam":
+ self.after_speaker_embedding = ArcMarginProduct(self.embedding_size,
+ int(self.speaker_number),
+ s=64,
+ m=0.2,
+ easy_margin=False)
+ elif self.loss == "cce":
+ self.after_speaker_embedding = torch.nn.Sequential(OrderedDict([
+ ("activation6", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm6", torch.nn.BatchNorm1d(512)),
+ ("dropout6", torch.nn.Dropout(p=0.05)),
+ ("linear7", torch.nn.Linear(512, 512)),
+ ("activation7", torch.nn.LeakyReLU(0.2)),
+ ("batch_norm7", torch.nn.BatchNorm1d(512)),
+ ("linear8", torch.nn.Linear(512, int(self.speaker_number)))
+ ]))
+
+ self.preprocessor_weight_decay = 0.0002
+ self.sequence_network_weight_decay = 0.0002
+ self.before_speaker_embedding_weight_decay = 0.002
+ self.after_speaker_embedding_weight_decay = 0.002
+
+ elif model_archi == "resnet34":
+
+ self.preprocessor = MelSpecFrontEnd(n_mels=80)
+ self.sequence_network = PreResNet34()
+ self.embedding_size = embedding_size
+
+ self.before_speaker_embedding = torch.nn.Linear(in_features=5120,
+ out_features=self.embedding_size)
+
+ self.stat_pooling = AttentivePooling(256, 80, global_context=True)
+
+ self.loss = "aam"
+ self.after_speaker_embedding = ArcMarginProduct(self.embedding_size,
+ int(self.speaker_number),
+ s = 30.0,
+ m = 0.20,
+ easy_margin = False)
+
+ self.preprocessor_weight_decay = 0.00002
+ self.sequence_network_weight_decay = 0.00002
+ self.stat_pooling_weight_decay = 0.00002
+ self.before_speaker_embedding_weight_decay = 0.00002
+ self.after_speaker_embedding_weight_decay = 0.0002
+
+ elif model_archi == "fastresnet34":
+ self.preprocessor = MelSpecFrontEnd()
+ self.sequence_network = PreFastResNet34()
+ self.embedding_size = embedding_size
+
+ self.before_speaker_embedding = torch.nn.Linear(in_features = 2560,
+ out_features = self.embedding_size)
+
+ self.stat_pooling = AttentivePooling(128, 80, global_context=False)
+ self.stat_pooling_weight_decay = 0
+
+ self.loss = loss
+ if self.loss == "aam":
+ self.after_speaker_embedding = ArcMarginProduct(self.embedding_size,
+ int(self.speaker_number),
+ s = 30,
+ m = 0.2,
+ easy_margin = False)
+
+ elif self.loss == 'aps':
+ self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number))
+ elif self.loss == 'smn':
+ self.after_speaker_embedding = AngularProximityMagnet(int(self.speaker_number))
+
+ self.preprocessor_weight_decay = 0.00002
+ self.sequence_network_weight_decay = 0.00002
+ self.stat_pooling_weight_decay = 0.00002
+ self.before_speaker_embedding_weight_decay = 0.00002
+ self.after_speaker_embedding_weight_decay = 0.0002
+
+ elif model_archi == "bn_halfresnet34":
+ self.preprocessor = BNFrontEnd()
+ self.sequence_network = PreHalfResNet34()
+
+ self.embedding_size = embedding_size
+ #self.embedding_size = 256
+ #self.before_speaker_embedding = torch.nn.Linear(in_features = 5120,
+ # out_features = self.embedding_size)
+
+ ars_bn_dim = 256
+ self.before_speaker_embedding = torch.nn.Sequential(OrderedDict([
+ ("lin_be", torch.nn.Linear(in_features = int((5120/80)*ars_bn_dim), out_features = self.embedding_size, bias=False)),
+ ("bn_be", torch.nn.BatchNorm1d(self.embedding_size))
+ ]))
+
+ self.stat_pooling = AttentivePooling(256, ars_bn_dim, global_context=True)
+
+ self.loss = loss
+ if self.loss == "aam":
+ self.after_speaker_embedding = ArcMarginProduct(self.embedding_size,
+ int(self.speaker_number),
+ s = 30,
+ m = 0.2,
+ easy_margin = False)
+ elif self.loss == 'aps':
+ self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number))
+ self.preprocessor_weight_decay = 0.00002
+ self.sequence_network_weight_decay = 0.00002
+ self.stat_pooling_weight_decay = 0.00002
+ self.before_speaker_embedding_weight_decay = 0.00002
+ self.after_speaker_embedding_weight_decay = 0.000
+
+
+
+ elif model_archi == "halfresnet34":
+ self.preprocessor = MelSpecFrontEnd(n_fft=1024,
+ win_length=400,
+ hop_length=160,
+ n_mels=80)
+ self.sequence_network = PreHalfResNet34()
+
+ self.embedding_size = embedding_size
+ #self.embedding_size = 256
+ #self.before_speaker_embedding = torch.nn.Linear(in_features = 5120,
+ # out_features = self.embedding_size)
+
+ self.before_speaker_embedding = torch.nn.Sequential(OrderedDict([
+ ("lin_be", torch.nn.Linear(in_features = 5120, out_features = self.embedding_size, bias=False)),
+ ("bn_be", torch.nn.BatchNorm1d(self.embedding_size))
+ ]))
+
+ self.stat_pooling = AttentivePooling(256, 80, global_context=True)
+
+ self.loss = loss
+ if self.loss == "aam":
+ self.after_speaker_embedding = ArcMarginProduct(self.embedding_size,
+ int(self.speaker_number),
+ s = 30,
+ m = 0.2,
+ easy_margin = False)
+ elif self.loss == 'aps':
+ self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number))
+ self.preprocessor_weight_decay = 0.00002
+ self.sequence_network_weight_decay = 0.00002
+ self.stat_pooling_weight_decay = 0.00002
+ self.before_speaker_embedding_weight_decay = 0.00002
+ self.after_speaker_embedding_weight_decay = 0.000
+
+ elif model_archi == "rawnet2":
+
+ if loss not in ["cce", 'aam']:
+ raise NotImplementedError(f"The valid loss are for now cce and aam ")
+ else:
+ self.loss = loss
+
+ self.input_nbdim = 2
+
+ filts = [128, [128, 128], [128, 256], [256, 256]]
+ self.norm_embedding = True
+
+ self.preprocessor = RawPreprocessor(nb_samp=48000,
+ in_channels=1,
+ out_channels=filts[0],
+ kernel_size=3)
+
+ self.sequence_network = torch.nn.Sequential(OrderedDict([
+ ("block0", ResBlockWFMS(nb_filts=filts[1], first=True)),
+ ("block1", ResBlockWFMS(nb_filts=filts[1])),
+ ("block2", ResBlockWFMS(nb_filts=filts[2])),
+ ("block3", ResBlockWFMS(nb_filts=[filts[2][1], filts[2][1]])),
+ ("block4", ResBlockWFMS(nb_filts=[filts[2][1], filts[2][1]])),
+ ("block5", ResBlockWFMS(nb_filts=[filts[2][1], filts[2][1]]))
+ ]))
+
+ self.stat_pooling = GruPooling(input_size=filts[2][-1],
+ gru_node=1024,
+ nb_gru_layer=1)
+
+ self.before_speaker_embedding = torch.nn.Linear(in_features = 1024,
+ out_features = 1024)
+
+ if self.loss == "aam":
+ if loss == 'aam':
+ self.after_speaker_embedding = ArcLinear(1024,
+ int(self.speaker_number),
+ margin=aam_margin, s=aam_s)
+ elif self.loss == "cce":
+ self.after_speaker_embedding = torch.nn.Linear(in_features = 1024,
+ out_features = int(self.speaker_number),
+ bias = True)
+
+ self.preprocessor_weight_decay = 0.000
+ self.sequence_network_weight_decay = 0.000
+ self.stat_pooling_weight_decay = 0.000
+ self.before_speaker_embedding_weight_decay = 0.00
+ self.after_speaker_embedding_weight_decay = 0.00
+
+ else:
+ is_first_resblock = True
+
+ if isinstance(model_archi, dict):
+ cfg = model_archi
+ else:
+ # Load Yaml configuration
+ with open(model_archi, 'r') as fh:
+ cfg = yaml.load(fh, Loader=yaml.FullLoader)
+
+ self.loss = cfg["loss"]["type"]
+ if self.loss == "aam":
+ self.aam_margin = cfg["loss"]["aam_margin"]
+ self.aam_s = cfg["loss"]["aam_s"]
+
+ """
+ Prepare Preprocessor
+ """
+ self.preprocessor = None
+ if "preprocessor" in cfg:
+ if cfg['preprocessor']["type"] == "sincnet":
+ self.preprocessor = SincNet(
+ waveform_normalize=cfg['preprocessor']["waveform_normalize"],
+ sample_rate=cfg['preprocessor']["sample_rate"],
+ min_low_hz=cfg['preprocessor']["min_low_hz"],
+ min_band_hz=cfg['preprocessor']["min_band_hz"],
+ out_channels=cfg['preprocessor']["out_channels"],
+ kernel_size=cfg['preprocessor']["kernel_size"],
+ stride=cfg['preprocessor']["stride"],
+ max_pool=cfg['preprocessor']["max_pool"],
+ instance_normalize=cfg['preprocessor']["instance_normalize"],
+ activation=cfg['preprocessor']["activation"],
+ dropout=cfg['preprocessor']["dropout"]
+ )
+ self.feature_size = self.preprocessor.dimension
+ elif cfg['preprocessor']["type"] == "rawnet2":
+ self.preprocessor = RawPreprocessor(nb_samp=int(cfg['preprocessor']["sampling_frequency"] * cfg['preprocessor']["duration"]),
+ in_channels=1,
+ out_channels=cfg["feature_size"],
+ kernel_size=cfg['preprocessor']["kernel_size"],
+ stride=cfg['preprocessor']["stride"],
+ padding=cfg['preprocessor']["padding"],
+ dilation=cfg['preprocessor']["dilation"])
+ self.feature_size = cfg["feature_size"]
+ self.preprocessor_weight_decay = 0.000
+
+ """
+ Prepare sequence network
+ """
+ # Get Feature size
+ if self.feature_size is None:
+ self.feature_size = cfg["preprocessor"]["feature_size"]
+
+ input_size = self.feature_size
+
+ # Get activation function
+ if cfg["activation"] == 'LeakyReLU':
+ self.activation = torch.nn.LeakyReLU(0.2)
+ elif cfg["activation"] == 'PReLU':
+ self.activation = torch.nn.PReLU()
+ elif cfg["activation"] == 'ReLU6':
+ self.activation = torch.nn.ReLU6()
+ else:
+ self.activation = torch.nn.ReLU()
+
+ # Create sequential object for the first part of the network
+ segmental_layers = []
+ for k in cfg["segmental"].keys():
+ if k.startswith("lin"):
+ segmental_layers.append((k, torch.nn.Linear(input_size,
+ cfg["segmental"][k]["output"])))
+ input_size = cfg["segmental"][k]["output"]
+
+ elif k.startswith("conv2D"):
+ segmental_layers.append((k, torch.nn.Conv2d(in_channels=1,
+ out_channels=entry_conv_out_channels,
+ kernel_size=entry_conv_kernel_size,
+ padding=3,
+ stride=1)))
+
+ elif k.startswith("conv"):
+ segmental_layers.append((k, torch.nn.Conv1d(input_size,
+ cfg["segmental"][k]["output_channels"],
+ kernel_size=cfg["segmental"][k]["kernel_size"],
+ dilation=cfg["segmental"][k]["dilation"])))
+ input_size = cfg["segmental"][k]["output_channels"]
+
+ elif k.startswith("ctrans"):
+ segmental_layers.append((k, torch.nn.ConvTranspose1d(input_size,
+ cfg["segmental"][k]["output_channels"],
+ kernel_size=cfg["segmental"][k]["kernel_size"],
+ dilation=cfg["segmental"][k]["dilation"])))
+ elif k.startswith("activation"):
+ segmental_layers.append((k, self.activation))
+
+ elif k.startswith('batch_norm'):
+ segmental_layers.append((k, torch.nn.BatchNorm1d(input_size)))
+
+ elif k.startswith('resblock'):
+ segmental_layers.append((ResBlock(cfg["segmental"][k]["input_channel"],
+ cfg["segmental"][k]["output_channel"],
+ is_first_resblock)))
+ is_first_resblock = False
+
+ self.sequence_network = torch.nn.Sequential(OrderedDict(segmental_layers))
+ self.sequence_network_weight_decay = cfg["segmental"]["weight_decay"]
+
+ """
+ Pooling
+ """
+ self.stat_pooling = MeanStdPooling()
+ tmp_input_size = input_size * 2
+ if cfg["stat_pooling"]["type"] == "GRU":
+ self.stat_pooling = GruPooling(input_size=cfg["stat_pooling"]["input_size"],
+ gru_node=cfg["stat_pooling"]["gru_node"],
+ nb_gru_layer=cfg["stat_pooling"]["nb_gru_layer"])
+ tmp_input_size = cfg["stat_pooling"]["gru_node"]
+
+ self.stat_pooling_weight_decay = cfg["stat_pooling"]["weight_decay"]
+
+ """
+ Prepare last part of the network (after pooling)
+ """
+ # Create sequential object for the second part of the network
+ input_size = tmp_input_size
+ before_embedding_layers = []
+ for k in cfg["before_embedding"].keys():
+ if k.startswith("lin"):
+ if cfg["before_embedding"][k]["output"] == "speaker_number":
+ before_embedding_layers.append((k, torch.nn.Linear(input_size, self.speaker_number)))
+ else:
+ before_embedding_layers.append((k, torch.nn.Linear(input_size,
+ cfg["before_embedding"][k]["output"])))
+ input_size = cfg["before_embedding"][k]["output"]
+
+ elif k.startswith("activation"):
+ before_embedding_layers.append((k, self.activation))
+
+ elif k.startswith('batch_norm'):
+ before_embedding_layers.append((k, torch.nn.BatchNorm1d(input_size)))
+
+ elif k.startswith('dropout'):
+ before_embedding_layers.append((k, torch.nn.Dropout(p=cfg["before_embedding"][k])))
+
+ self.embedding_size = input_size
+ self.before_speaker_embedding = torch.nn.Sequential(OrderedDict(before_embedding_layers))
+ self.before_speaker_embedding_weight_decay = cfg["before_embedding"]["weight_decay"]
+
+ # if loss_criteria is "cce"
+ # Create sequential object for the second part of the network
+ if self.loss == "cce":
+ after_embedding_layers = []
+ for k in cfg["after_embedding"].keys():
+ if k.startswith("lin"):
+ if cfg["after_embedding"][k]["output"] == "speaker_number":
+ after_embedding_layers.append((k, torch.nn.Linear(input_size, self.speaker_number)))
+ else:
+ after_embedding_layers.append((k, torch.nn.Linear(input_size,
+ cfg["after_embedding"][k]["output"])))
+ input_size = cfg["after_embedding"][k]["output"]
+
+ elif k.startswith('arc'):
+ after_embedding_layers.append((k, ArcLinear(input_size,
+ self.speaker_number,
+ margin=self.aam_margin,
+ s=self.aam_s)))
+
+ elif k.startswith("activation"):
+ after_embedding_layers.append((k, self.activation))
+
+ elif k.startswith('batch_norm'):
+ after_embedding_layers.append((k, torch.nn.BatchNorm1d(input_size)))
+
+ elif k.startswith('dropout'):
+ after_embedding_layers.append((k, torch.nn.Dropout(p=cfg["after_embedding"][k])))
+
+ self.after_speaker_embedding = torch.nn.Sequential(OrderedDict(after_embedding_layers))
+
+ elif self.loss == "aam":
+ self.norm_embedding = True
+ self.after_speaker_embedding = ArcMarginProduct(input_size,
+ int(self.speaker_number),
+ s=64,
+ m=0.2,
+ easy_margin=True)
+
+ self.after_speaker_embedding_weight_decay = cfg["after_embedding"]["weight_decay"]
+
+    def forward(self, x, is_eval=False, target=None, norm_embedding=True):
+        """Compute the speaker embedding and, depending on the loss and mode, the classifier output.
+        :param x: input batch; goes through the preprocessor (when set), then the sequence network
+        :param is_eval: False for training; forwarded to the preprocessor, read by "cce" and "smn"
+        :param target: labels forwarded to after_speaker_embedding for the aam, aps and smn losses
+        :param norm_embedding: when True, l2_norm is applied to the before_speaker_embedding output
+        :return: a tensor, or a (after_speaker_embedding output, embedding) tuple; see branches"""
+        if self.preprocessor is not None:
+            x = self.preprocessor(x, is_eval)
+
+        x = self.sequence_network(x)
+
+        # Mean and Standard deviation pooling
+        x = self.stat_pooling(x)
+
+        x = self.before_speaker_embedding(x)
+
+        if norm_embedding:
+            x = l2_norm(x)
+        # NOTE(review): "cce" in eval mode returns one tensor, while cross_validation unpacks two values - confirm
+        if self.loss == "cce":
+            if is_eval:
+                return x
+            else:
+                return self.after_speaker_embedding(x), x
+
+        elif self.loss in ['aam', 'aps']:
+            x = self.after_speaker_embedding(x, target=target), torch.nn.functional.normalize(x, dim=1)
+        elif self.loss == 'smn':
+            if not is_eval:
+                x = self.after_speaker_embedding(x, target=target), x
+
+        # Reached by aam/aps (tuple), by "smn" (tuple in training, embedding alone in eval) and by any other loss
+        return x
+
+    def context_size(self):
+        """
+        Sum ``dilation[0] * (kernel_size[0] - 1)`` over the sub-modules named "conv*".
+        :return: 1 plus that sum
+        """
+        # Any other instance is expected to expose the network through ``.module``
+        holder = self if isinstance(self, Xtractor) else self.module
+        spans = [layer.dilation[0] * (layer.kernel_size[0] - 1)
+                 for layer_name, layer in holder.sequence_network.named_modules()
+                 if layer_name.startswith("conv")]
+        return 1 + sum(spans)
+
+
+def fill_dict(target_dict, source_dict, prefix = ""):
+    """
+    Overwrite, in place, the values of target_dict with those found in source_dict.
+
+    Only keys already present in target_dict are considered; nested dictionaries are
+    processed recursively and None values of source_dict are ignored.
+    :param target_dict: output dictionary that is initialized with default values
+    :param source_dict: input dictionary providing the overriding values
+    :param prefix: extended by one tab per nesting level; not used otherwise
+    :return:
+    """
+    for key, default in target_dict.items():
+        if key not in source_dict:
+            continue
+        candidate = source_dict[key]
+        if isinstance(default, dict):
+            if isinstance(candidate, dict):
+                fill_dict(default, candidate, prefix + "\t")
+        elif candidate is not None:
+            target_dict[key] = candidate
+
+
+def update_training_dictionary(dataset_description,
+                               model_description,
+                               training_description,
+                               kwargs=None):
+    """
+    Build the dataset, model and training option dictionaries from defaults, YAML files and kwargs.
+    :param dataset_description: name of a YAML file, or dictionary, of dataset options
+    :param model_description: name of a YAML file, or dictionary, of model options
+    :param training_description: name of a YAML file, or dictionary, of training options
+    :param kwargs: optional dictionary of overrides (lr, batch_size, optimizer, scheduler, margin, aam_s)
+    :return: the tuple (dataset_opts, model_opts, training_opts)
+    """
+    dataset_opts=dict()
+    model_opts=dict()
+    training_opts=dict()
+
+    if isinstance(dataset_description, str) and os.path.isfile(dataset_description):
+        with open(dataset_description, 'r') as fh:
+            tmp_data_dict = yaml.load(fh, Loader=yaml.FullLoader)
+    else:
+        tmp_data_dict = dataset_description
+
+    if isinstance(model_description, str) and os.path.isfile(model_description):
+        with open(model_description, 'r') as fh:
+            tmp_model_dict = yaml.load(fh, Loader=yaml.FullLoader)
+    else:
+        tmp_model_dict = model_description
+
+    if isinstance(training_description, str) and os.path.isfile(training_description):
+        with open(training_description, 'r') as fh:
+            tmp_train_dict = yaml.load(fh, Loader=yaml.FullLoader)
+    else:
+        tmp_train_dict = training_description
+
+    # Initialize default dictionaries
+    dataset_opts["data_path"] = None
+    dataset_opts["dataset_csv"] = None
+    dataset_opts["stratify"] = False
+    dataset_opts["data_file_extension"] = ".wav"
+    dataset_opts["sample_rate"] = 16000
+
+    dataset_opts["validation_ratio"] = 0.1
+    dataset_opts["batch_size"] = 64
+
+    dataset_opts["train"] = dict()
+    dataset_opts["train"]["duration"] = 4.
+    dataset_opts["train"]["chunk_per_segment"] = -1
+    dataset_opts["train"]["overlap"] = 3.9
+    dataset_opts["train"]["sampler"] = dict()
+    dataset_opts["train"]["sampler"]["examples_per_speaker"] = 1
+    dataset_opts["train"]["sampler"]["samples_per_speaker"] = 100
+    dataset_opts["train"]["sampler"]["augmentation_replica"] = 1
+    dataset_opts["train"]["transform_number"] = 2
+    dataset_opts["train"]["transformation"] = dict()
+    dataset_opts["train"]["transformation"]["pipeline"] = ""
+    dataset_opts["train"]["transformation"]["add_noise"] = dict()
+    dataset_opts["train"]["transformation"]["add_noise"]["noise_db_csv"] = ""
+    dataset_opts["train"]["transformation"]["add_noise"]["data_path"] = ""
+    dataset_opts["train"]["transformation"]["add_reverb"] = dict()
+    dataset_opts["train"]["transformation"]["add_reverb"]["rir_db_csv"] = ""
+    dataset_opts["train"]["transformation"]["add_reverb"]["data_path"] = ""
+
+    dataset_opts["valid"] = dict()
+    dataset_opts["valid"]["duration"] = 2.
+    dataset_opts["valid"]["transformation"] = dict()
+    dataset_opts["valid"]["transformation"]["pipeline"] = ""
+    dataset_opts["valid"]["transformation"]["add_noise"] = dict()
+    dataset_opts["valid"]["transformation"]["add_noise"]["noise_db_csv"] = ""
+    dataset_opts["valid"]["transformation"]["add_noise"]["data_path"] = ""
+    dataset_opts["valid"]["transformation"]["add_reverb"] = dict()
+    dataset_opts["valid"]["transformation"]["add_reverb"]["noise_db_csv"] = ""  # NOTE(review): train uses "rir_db_csv" - confirm key
+    dataset_opts["valid"]["transformation"]["add_reverb"]["data_path"] = ""
+
+    dataset_opts["test"] = dict()
+    dataset_opts["test"]["idmap"] = ""
+    dataset_opts["test"]["ndx"] = ""
+    dataset_opts["test"]["key"] = ""
+    dataset_opts["test"]["data_path"] =""
+
+    # Initialize model options
+    model_opts["speaker_number"] = None
+    model_opts["embedding_size"] = 256
+    model_opts["loss"] = dict()
+    model_opts["loss"]["type"] ="aam"
+    model_opts["loss"]["aam_margin"] = 0.2
+    model_opts["loss"]["aam_s"] = 30
+
+    model_opts["initial_model_name"] = None
+    model_opts["reset_parts"] = []
+    model_opts["freeze_parts"] = []
+
+    model_opts["model_type"] = "fastresnet"  # NOTE(review): get_network only lists "fastresnet34" - confirm
+
+    model_opts["preprocessor"] = dict()
+    model_opts["preprocessor"]["type"] = "mel_spec"
+    model_opts["preprocessor"]["feature_size"] = 80
+
+    # Initialize training options
+    training_opts["log_file"] = "sidekit.log"
+    training_opts["numpy_seed"] = 0
+    training_opts["torch_seed"] = 0
+    training_opts["random_seed"] = 0
+    training_opts["deterministic"] = False
+    training_opts["epochs"] = 100
+    training_opts["lr"] = 1e-3
+    training_opts["patience"] = 50
+    training_opts["multi_gpu"] = False
+    training_opts["num_cpu"] = 5
+    training_opts["mixed_precision"] = False
+    training_opts["clipping"] = False
+
+    training_opts["optimizer"] = dict()
+    training_opts["optimizer"]["type"] = "sgd"
+    training_opts["optimizer"]["options"] = None
+
+    training_opts["scheduler"] = dict()
+    training_opts["scheduler"]["type"] = "ReduceLROnPlateau"
+    training_opts["scheduler"]["step_size_up"] = 10
+    training_opts["scheduler"]["base_lr"] = 1e-8
+    training_opts["scheduler"]["mode"] = "triangular2"
+
+    training_opts["compute_test_eer"] = False
+    training_opts["log_interval"] = 10
+    training_opts["validation_frequency"] = 1
+
+    training_opts["tmp_model_name"] = "tmp_model.pt"
+    training_opts["best_model_name"] = "best_model.pt"
+    training_opts["checkpoint_frequency"] = "10"  # NOTE(review): a string, unlike the other numeric options - confirm
+
+    # Use options from the YAML config files (only keys that have a default above are taken)
+    fill_dict(dataset_opts, tmp_data_dict)
+    fill_dict(model_opts, tmp_model_dict)
+    fill_dict(training_opts, tmp_train_dict)
+
+    kwargs = kwargs or {}  # manually given parameters overwrite the options; the default is None
+    if "lr" in kwargs:
+        training_opts["lr"] = kwargs['lr']
+    if "batch_size" in kwargs:
+        dataset_opts["batch_size"] = kwargs["batch_size"]
+    if "optimizer" in kwargs:
+        training_opts["optimizer"]["type"] = kwargs["optimizer"]
+    if "scheduler" in kwargs:
+        training_opts["scheduler"]["type"] = kwargs["scheduler"]
+    if "margin" in kwargs:
+        model_opts["loss"]["aam_margin"] = kwargs["margin"]
+    if "aam_s" in kwargs:
+        model_opts["loss"]["aam_s"] = kwargs["aam_s"]
+
+    return dataset_opts, model_opts, training_opts
+
+
+def get_network(model_opts, local_rank):
+    """
+    Create the Xtractor described by model_opts and optionally initialize it from a checkpoint.
+    :param model_opts: model options (type, loss, sizes, initial model, reset and freeze parts)
+    :param local_rank: rank of the process; the model and its size are logged when it is < 1
+    :return: the model
+    """
+
+    if model_opts["model_type"] in ["xvector", "rawnet2", "resnet34", "fastresnet34", "halfresnet34", "bn_halfresnet34"]:
+        model = Xtractor(model_opts["speaker_number"], model_opts["model_type"], loss=model_opts["loss"]["type"], embedding_size=model_opts["embedding_size"])
+    else:
+        # Custom type of model
+        model = Xtractor(model_opts["speaker_number"], model_opts, loss=model_opts["loss"]["type"], embedding_size=model_opts["embedding_size"])
+
+    # Load the model if it exists
+    if model_opts["initial_model_name"] is not None and os.path.isfile(model_opts["initial_model_name"]):
+        logging.critical(f"*** Load model from = {model_opts['initial_model_name']}")
+        checkpoint = torch.load(model_opts["initial_model_name"])
+
+        # Remove from the checkpoint all the layers that we don't want to reload:
+        # every parameter whose name starts with one of the "reset_parts" prefixes
+        # is dropped, so the corresponding layers keep the values set when the
+        # model was created above.
+        pretrained_dict = checkpoint["model_state_dict"]
+        for part in model_opts["reset_parts"]:
+            pretrained_dict = {k: v for k, v in pretrained_dict.items() if not k.startswith(part)}
+
+        new_model_dict = model.state_dict()
+        new_model_dict.update(pretrained_dict)
+        model.load_state_dict(new_model_dict)
+
+        # Freeze the layers listed in "freeze_parts" (matched on the first component of the name)
+        for name, param in model.named_parameters():
+            if name.split(".")[0] in model_opts["freeze_parts"]:
+                param.requires_grad = False
+
+    if model_opts["loss"]["type"] == "aam" and not (model_opts["loss"]["aam_margin"] == 0.2 and model_opts["loss"]["aam_s"] == 30):
+        model.after_speaker_embedding.change_params(model_opts["loss"]["aam_s"], model_opts["loss"]["aam_margin"])
+        print(f"Modified AAM: margin = {model.after_speaker_embedding.m} and s = {model.after_speaker_embedding.s}")
+
+    if local_rank < 1:
+
+
+        logging.info(model)
+        logging.info("Model_parameters_count: {:d}".format(
+            sum(p.numel()
+                for p in model.sequence_network.parameters()
+                if p.requires_grad) + \
+            sum(p.numel()
+                for p in model.before_speaker_embedding.parameters()
+                if p.requires_grad) + \
+            sum(p.numel()
+                for p in model.stat_pooling.parameters()
+                if p.requires_grad)))
+
+    return model
+
+
+def get_loaders(dataset_opts, training_opts, model_opts, local_rank=0):
+    """
+    Create the training and validation loaders, the training sampler and the validation trials.
+    :param dataset_opts: dataset options (CSV file, batch size, train/validation settings)
+    :param training_opts: training options ("torch_seed", "multi_gpu" and "num_cpu" are read)
+    :param model_opts: model options ("loss" type and "speaker_number" are read)
+    :param local_rank: rank of the process, added to the torch seed; given to the sampler in multi-GPU mode
+    :return: training_loader, validation_loader, side_sampler, tar_indices, non_indices
+    """
+    """
+    Set the dataloaders according to the dataset_yaml
+
+    First we load the dataframe from CSV file in order to split it for training and validation purpose
+    Then we provide those two
+    """
+    df = pandas.read_csv(dataset_opts["dataset_csv"])
+
+    stratify = None
+    if dataset_opts["stratify"]:
+        stratify = df["speaker_idx"]
+    training_df, validation_df = train_test_split(df,
+                                                  test_size=dataset_opts["validation_ratio"],
+                                                  stratify=stratify)
+
+    torch.manual_seed(training_opts['torch_seed'] + local_rank)
+    torch.cuda.manual_seed(training_opts['torch_seed'] + local_rank)
+    # NOTE(review): with DEV4S == 'True' the datasets are cached as pickles under /tmp; pickle.load trusts these files
+    train_DEV4S = "/tmp/DEV4S_train_side_set.pl"
+    val_DEV4S = "/tmp/DEV4S_val_side_set.pl"
+    if os.environ.get('DEV4S') == 'True':
+        dataset_opts["batch_size"] = 4
+    if os.environ.get('DEV4S') == 'True' and os.path.isfile(train_DEV4S) and os.environ.get('DEV4S_KeepDataPrep') != "True":
+        with open(train_DEV4S, "rb") as f:
+            training_set = pickle.load(f)
+        with open(val_DEV4S, "rb") as f:
+            validation_set = pickle.load(f)
+    else:
+        training_set = SideSet(dataset_opts,
+                               set_type="train",
+                               chunk_per_segment=-1,
+                               transform_number=dataset_opts['train']['transform_number'],
+                               overlap=dataset_opts['train']['overlap'],
+                               dataset_df=training_df,
+                               output_format="pytorch",
+                               )
+
+        validation_set = SideSet(dataset_opts,
+                                 set_type="validation",
+                                 dataset_df=validation_df,
+                                 output_format="pytorch")
+
+    if os.environ.get('DEV4S') == 'True' and not os.path.isfile(train_DEV4S):
+        with open(train_DEV4S, "wb") as f:
+            pickle.dump(training_set, f)
+        with open(val_DEV4S, "wb") as f:
+            pickle.dump(validation_set, f)
+
+    if model_opts["loss"]["type"] == 'aps':
+        samples_per_speaker = 2
+    else:
+        samples_per_speaker = 1
+
+    if training_opts["multi_gpu"]:
+        assert dataset_opts["batch_size"] % int(os.environ['WORLD_SIZE']) == 0
+        assert dataset_opts["batch_size"] % samples_per_speaker == 0
+        batch_size = dataset_opts["batch_size"]//(int(os.environ['WORLD_SIZE']) * dataset_opts["train"]["sampler"]["examples_per_speaker"])
+
+        side_sampler = SideSampler(data_source=training_set.sessions['speaker_idx'],
+                                   spk_count=model_opts["speaker_number"],
+                                   examples_per_speaker=dataset_opts["train"]["sampler"]["examples_per_speaker"],
+                                   samples_per_speaker=dataset_opts["train"]["sampler"]["samples_per_speaker"],
+                                   batch_size=batch_size,
+                                   seed=training_opts['torch_seed'],
+                                   rank=local_rank,
+                                   num_process=int(os.environ['WORLD_SIZE']),
+                                   num_replicas=dataset_opts["train"]["sampler"]["augmentation_replica"]
+                                   )
+    else:
+        batch_size = dataset_opts["batch_size"] // dataset_opts["train"]["sampler"]["examples_per_speaker"]
+        side_sampler = SideSampler(data_source=training_set.sessions['speaker_idx'],
+                                   spk_count=model_opts["speaker_number"],
+                                   examples_per_speaker=dataset_opts["train"]["sampler"]["examples_per_speaker"],
+                                   samples_per_speaker=dataset_opts["train"]["sampler"]["samples_per_speaker"],
+                                   batch_size=batch_size,
+                                   seed=training_opts['torch_seed'],
+                                   rank=0,
+                                   num_process=1,
+                                   num_replicas=dataset_opts["train"]["sampler"]["augmentation_replica"]
+                                   )
+
+    training_loader = DataLoader(training_set,
+                                 batch_size=batch_size * dataset_opts["train"]["sampler"]["augmentation_replica"],
+                                 shuffle=False,
+                                 drop_last=True,
+                                 pin_memory=True,
+                                 sampler=side_sampler,
+                                 num_workers=training_opts["num_cpu"],
+                                 persistent_workers=False,
+                                 worker_init_fn=seed_worker)
+
+    validation_loader = DataLoader(validation_set,
+                                   batch_size=batch_size,
+                                   drop_last=False,
+                                   pin_memory=True,
+                                   num_workers=training_opts["num_cpu"],
+                                   persistent_workers=False,
+                                   worker_init_fn=seed_worker)
+
+    # Compute indices for target and non-target trials once only to avoid recomputing for each epoch
+    classes = torch.ShortTensor(validation_set.sessions['speaker_idx'].to_numpy())
+    mask = classes.unsqueeze(1) == classes.unsqueeze(1).T
+    tar_indices = torch.tril(mask, -1).numpy()
+    non_indices = torch.tril(~mask, -1).numpy()
+
+    # Select a subset of non-target trials to reduce the number of tests
+    tar_non_ratio = numpy.sum(tar_indices)/numpy.sum(non_indices)
+    non_indices *= (numpy.random.rand(*non_indices.shape) < tar_non_ratio)
+
+    return training_loader, validation_loader, side_sampler, tar_indices, non_indices
+
+
+def get_optimizer(model, model_opts, train_opts, training_loader):
+    """
+    Create the optimizer, with one weight decay per part of the network, and the LR scheduler.
+    :param model: an Xtractor, or an object exposing the Xtractor as ``.module``
+    :param model_opts: model options (not used in this function)
+    :param train_opts: training options ("optimizer", "scheduler" and "lr" are read)
+    :param training_loader: its length is the step_size of the "StepLR" and "StepLR2" schedulers
+    :return: the tuple (optimizer, scheduler)
+    """
+    if train_opts["optimizer"]["type"] == 'adam':
+        _optimizer = torch.optim.Adam
+        _options = {'lr': train_opts["lr"]}
+    elif train_opts["optimizer"]["type"] == 'rmsprop':
+        _optimizer = torch.optim.RMSprop
+        _options = {'lr': train_opts["lr"]}
+    else:  # train_opts["optimizer"]["type"] == 'sgd'
+        _optimizer = torch.optim.SGD
+        _options = {'lr': train_opts["lr"], 'momentum': 0.9}
+
+    param_list = []
+    if type(model) is Xtractor:
+        if model.preprocessor is not None:
+            param_list.append({'params': model.preprocessor.parameters(),
+                               'weight_decay': model.preprocessor_weight_decay})
+        param_list.append({'params': model.sequence_network.parameters(),
+                           'weight_decay': model.sequence_network_weight_decay})
+        param_list.append({'params': model.stat_pooling.parameters(),
+                           'weight_decay': model.stat_pooling_weight_decay})
+        param_list.append({'params': model.before_speaker_embedding.parameters(),
+                           'weight_decay': model.before_speaker_embedding_weight_decay})
+        param_list.append({'params': model.after_speaker_embedding.parameters(),
+                           'weight_decay': model.after_speaker_embedding_weight_decay})
+
+    else:
+        if model.module.preprocessor is not None:
+            param_list.append({'params': model.module.preprocessor.parameters(),
+                               'weight_decay': model.module.preprocessor_weight_decay})
+        param_list.append({'params': model.module.sequence_network.parameters(),
+                           'weight_decay': model.module.sequence_network_weight_decay})
+        param_list.append({'params': model.module.stat_pooling.parameters(),
+                           'weight_decay': model.module.stat_pooling_weight_decay})
+        param_list.append({'params': model.module.before_speaker_embedding.parameters(),
+                           'weight_decay': model.module.before_speaker_embedding_weight_decay})
+        param_list.append({'params': model.module.after_speaker_embedding.parameters(),
+                           'weight_decay': model.module.after_speaker_embedding_weight_decay})
+
+    optimizer = _optimizer(param_list, **_options)
+
+    if train_opts["scheduler"]["type"] == 'CyclicLR':
+        cycle_momentum = True
+        if train_opts["optimizer"]["type"] == "adam":
+            cycle_momentum = False
+        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer=optimizer,
+                                                      base_lr=train_opts["scheduler"]["base_lr"],
+                                                      max_lr=train_opts["lr"],
+                                                      step_size_up=train_opts["scheduler"]["step_size_up"],
+                                                      step_size_down=None,
+                                                      cycle_momentum=cycle_momentum,
+                                                      mode=train_opts["scheduler"]["mode"])
+
+    elif train_opts["scheduler"]["type"] == "MultiStepLR":
+        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
+                                                         milestones=[10000,50000,100000],
+                                                         gamma=0.5)
+
+    elif train_opts["scheduler"]["type"] == "StepLR":
+        scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,
+                                                    step_size=1 * training_loader.__len__(),
+                                                    gamma=0.95)
+
+    elif train_opts["scheduler"]["type"] == "StepLR2":
+        scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,
+                                                    step_size=1 * training_loader.__len__(),
+                                                    gamma=0.5)
+    else:
+        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
+                                                               mode='min',
+                                                               factor=0.5,
+                                                               patience=3000,
+                                                               verbose=True)
+
+    return optimizer, scheduler
+
+
+def save_model(model, training_monitor, model_opts, training_opts, optimizer, scheduler, epoch):
+    """
+    Store the current training state through save_checkpoint.
+    The state holds the epoch, model/optimizer states, accuracy, scheduler, speaker number, options and loss.
+
+    :param model: an Xtractor, or an object giving access to it through ``.module``
+    :param training_monitor: provides current_epoch, best_accuracy and is_best
+    :param model_opts: model options, saved as "model_archi" (loss type saved as "loss")
+    :param training_opts: provides "best_model_name" and "tmp_model_name"
+    :param optimizer: optimizer whose state_dict() is saved
+    :param scheduler: scheduler, saved as is
+    :param epoch: when not None, "_epoch<epoch>" is appended to the best model name
+    :return:
+    """
+    best_filename = training_opts["best_model_name"]
+    checkpoint_filename = training_opts["tmp_model_name"]
+    if epoch is not None:
+        best_filename += f"_epoch{epoch}"
+
+    # TODO to be reworked
+    # Anything that is not exactly an Xtractor is expected to wrap one as ``.module``
+    if type(model) is Xtractor:
+        network = model
+    else:
+        network = model.module
+
+    state = {
+        'epoch': training_monitor.current_epoch,
+        'model_state_dict': network.state_dict(),
+        'optimizer_state_dict': optimizer.state_dict(),
+        'accuracy': training_monitor.best_accuracy,
+        'scheduler': scheduler,
+        'speaker_number': network.speaker_number,
+        'model_archi': model_opts,
+        'loss': model_opts["loss"]["type"],
+    }
+
+    save_checkpoint(state,
+                    training_monitor.is_best,
+                    filename=checkpoint_filename,
+                    best_filename=best_filename)
+
+
+class AAMScheduler():
+    """
+    Schedule the AAM margin from an original value towards a final one (for now only the margin is updated)
+    """
+    def __init__(self, original_margin, final_margin, final_steps_nb, update_frequency, mode='lin', Tau=1, verbose=True):
+        """
+        :param original_margin: margin returned until the first update
+        :param final_margin: margin the schedule moves towards
+        :param final_steps_nb: number of steps over which the margin is moved
+        :param update_frequency: the margin is recomputed every update_frequency steps
+        :param mode: "lin" for a linear ramp, any other value for an exponential approach
+        :param Tau: stored, not used for now
+        :param verbose: stored, not used for now
+        """
+        self.current_margin = original_margin
+        self.original_margin = original_margin
+        self.final_margin = final_margin
+        self.final_steps_nb = final_steps_nb
+        self.update_frequency = update_frequency
+        self.mode = mode
+        self.Tau = Tau
+        self.verbose = verbose
+        self._counter = 0
+
+    def __step__(self):
+        """Count one step and return the current margin, recomputed every update_frequency steps."""
+        self._counter += 1
+        if self._counter % self.update_frequency == 0:
+            if self.mode == "lin":
+                # Clamp so that the margin stays at final_margin once final_steps_nb is passed
+                progress = min(1.0, self._counter / self.final_steps_nb)
+            else:
+                progress = 1 - numpy.exp(-self._counter / (self.final_steps_nb/7))
+            self.current_margin = self.original_margin + \
+                (self.final_margin - self.original_margin) * progress
+
+        return self.current_margin
+
+
+def xtrain(dataset_description,
+           model_description,
+           training_description,
+           **kwargs):
+    """Train the network described by the given configurations and return monitor.best_eer.
+    The three *_description arguments and kwargs are forwarded to update_training_dictionary.
+    TODO (refactoring): when restarting from an existing model, also reload the optimizer
+    and the scheduler.
+    """
+    torch.multiprocessing.set_start_method('forkserver', force=True)
+
+    local_rank = -1
+    if "RANK" in os.environ:
+        local_rank = int(os.environ['RANK'])
+
+    # Test to optimize
+    # torch.backends.cudnn.benchmark = True
+    torch.autograd.profiler.emit_nvtx(enabled=False)
+
+    dataset_opts, model_opts, training_opts = update_training_dictionary(dataset_description,
+                                                                         model_description,
+                                                                         training_description,
+                                                                         kwargs)
+    if os.environ.get('DEV4S') == 'True':
+        training_opts["multi_gpu"] = False
+
+    # Initialize the training monitor
+    monitor = TrainingMonitor(output_file=training_opts["log_file"],
+                              patience=training_opts["patience"],
+                              best_accuracy=0.0,
+                              best_eer_epoch=1,
+                              best_eer=100,
+                              compute_test_eer=training_opts["compute_test_eer"])
+
+    # Make PyTorch Deterministic
+    torch.backends.cudnn.deterministic = False
+    if training_opts["deterministic"]:
+        torch.backends.cudnn.deterministic = True
+
+    # Set all the seeds
+    random.seed(training_opts["random_seed"])
+    numpy.random.seed(training_opts["numpy_seed"])  # Set the random seed of numpy for the data split.
+    torch.manual_seed(training_opts["torch_seed"])
+    torch.cuda.manual_seed(training_opts["torch_seed"])
+
+    # Display the entire configurations as YAML dictionaries
+    if local_rank < 1:
+        monitor.logger.info("\n*********************************\nDataset options\n*********************************\n")
+        monitor.logger.info(yaml.dump(dataset_opts, default_flow_style=False))
+        monitor.logger.info("\n*********************************\nModel options\n*********************************\n")
+        monitor.logger.info(yaml.dump(model_opts, default_flow_style=False))
+        monitor.logger.info("\n*********************************\nTraining options\n*********************************\n")
+        monitor.logger.info(yaml.dump(training_opts, default_flow_style=False))
+
+    # Initialize the model
+    model = get_network(model_opts, local_rank)
+    if local_rank < 1:
+        monitor.logger.info(model)
+
+    embedding_size = model.embedding_size
+    aam_scheduler = None
+    #if model.loss == "aam":
+    #    aam_scheduler = AAMScheduler(model_opts["loss"]["aam_margin"],
+    #                                 final_margin=0.5,
+    #                                 final_steps_nb=120000,
+    #                                 update_frequency=25000,
+    #                                 mode='exp',
+    #                                 Tau=1,
+    #                                 verbose=True)
+
+    # Set the device and manage parallel processing
+    torch.cuda.set_device(local_rank)
+    if local_rank >= 0:
+        device = torch.device(local_rank)
+    else:
+        device = torch.device("cuda")
+
+    if training_opts["multi_gpu"]:
+        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+    model.to(device)
+
+
+    """ [HOW TO] from https://gist.github.com/sgraaf/5b0caa3a320f28c27c12b5efeb35aa4c
+        - Add the following line right after "if __name__ == '__main__':" in your main script :
+        parser.add_argument('--local_rank', type=int, default=-1, metavar='N', help='Local process rank.')
+        - Then, in your shell :
+        export NUM_NODES=1
+        export NUM_GPUS_PER_NODE=2
+        export NODE_RANK=0
+        export WORLD_SIZE=$(($NUM_NODES * $NUM_GPUS_PER_NODE))
+        python -m torch.distributed.launch \
+            --nproc_per_node=$NUM_GPUS_PER_NODE \
+            --nnodes=$NUM_NODES \
+            --node_rank $NODE_RANK \
+            train_xvector.py ...
+    """
+    if training_opts["multi_gpu"]:
+        if local_rank < 1:
+            print("Let's use", int(os.environ['WORLD_SIZE']), "GPUs!")
+        torch.distributed.init_process_group(backend='nccl', init_method='env://')
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[local_rank],
+                                                          output_device=local_rank)
+    else:
+        print("Train on a single GPU")
+
+    # Initialise data loaders
+    training_loader, validation_loader,\
+    sampler, validation_tar_indices, validation_non_indices = get_loaders(dataset_opts,
+                                                                          training_opts,
+                                                                          model_opts,
+                                                                          local_rank)
+
+    if local_rank < 1:
+        monitor.logger.info(f"Start training process")
+        monitor.logger.info(f"Use \t{int(os.environ.get('WORLD_SIZE', 1))} \tgpus")
+        monitor.logger.info(f"Use \t{training_opts['num_cpu']} \tcpus")
+
+        monitor.logger.info(f"Validation EER will be measured using")
+        monitor.logger.info(f"\t {numpy.sum(validation_tar_indices)} target trials and")
+        monitor.logger.info(f"\t {numpy.sum(validation_non_indices)} non-target trials")
+
+    # Create optimizer and scheduler
+    optimizer, scheduler = get_optimizer(model, model_opts, training_opts, training_loader)
+
+    scaler = None
+    if training_opts["mixed_precision"]:
+        scaler = torch.cuda.amp.GradScaler()
+
+    for epoch in range(1, training_opts["epochs"] + 1):
+
+        monitor.update(epoch=epoch)
+
+        # Process one epoch and return the current model
+        if monitor.current_patience == 0:
+            print(f"Stopping at epoch {epoch} for cause of patience")
+            break
+
+        sampler.set_epoch(epoch)
+        if training_opts["multi_gpu"]:
+            torch.distributed.barrier()
+
+        model = train_epoch(model,
+                            training_opts,
+                            monitor,
+                            training_loader,
+                            optimizer,
+                            scheduler,
+                            device,
+                            scaler=scaler,
+                            clipping=training_opts["clipping"])
+
+        # Cross validation
+        if math.fmod(epoch, training_opts["validation_frequency"]) == 0:
+            val_acc, val_loss, val_eer = cross_validation(model,
+                                                          validation_loader,
+                                                          device,
+                                                          [validation_loader.dataset.__len__(), embedding_size],
+                                                          validation_tar_indices,
+                                                          validation_non_indices,
+                                                          training_opts["mixed_precision"])
+
+            test_eer = None
+            if training_opts["compute_test_eer"] and local_rank < 1:
+                test_eer = test_metrics(model, device, model_opts, dataset_opts, training_opts)
+
+            monitor.update(test_eer=test_eer,
+                           val_eer=val_eer,
+                           val_loss=val_loss,
+                           val_acc=val_acc)
+
+            if local_rank < 1:
+                monitor.display()
+
+            # Save the current model and if needed update the best one
+            # TODO add an option that keeps the models of given epochs (for instance before a change of LR)
+            if local_rank < 1:
+                save_model(model, monitor, model_opts, training_opts, optimizer, scheduler, epoch)
+
+
+    for ii in range(int(os.environ.get('WORLD_SIZE', 1))):  # WORLD_SIZE may be unset on a single GPU
+        monitor.logger.info(torch.cuda.memory_summary(ii))
+
+    # TODO manage the display through the training_monitor
+    if local_rank < 1:
+        monitor.display_final()
+
+    return monitor.best_eer
+
+
+def train_epoch(model,
+                training_opts,
+                training_monitor,
+                training_loader,
+                optimizer,
+                scheduler,
+                device,
+                scaler=None,
+                clipping=False,
+                aam_scheduler=None):
+    """
+    Train the model for one pass over training_loader and return it.
+    :param model: an Xtractor, or an object exposing the Xtractor as ``.module``
+    :param training_opts: training options; "log_interval" is read here
+    :param training_monitor: receives the training loss and accuracy, provides the logger
+    :param training_loader: iterable of (data, target) batches
+    :param optimizer: optimizer stepped after every batch
+    :param scheduler: learning rate scheduler stepped after every batch
+    :param device: device the batches are moved to
+    :param scaler: optional gradient scaler; when given the forward pass runs under autocast
+    :param clipping: when True, the gradient norm is clipped to 1 before the optimizer step
+    :param aam_scheduler: optional margin scheduler stepped after every batch
+    :return: the model
+    """
+    model.train()
+    criterion = torch.nn.CrossEntropyLoss(reduction='mean')
+
+    if isinstance(model, Xtractor):
+        loss_criteria = model.loss
+    else:
+        loss_criteria = model.module.loss
+
+    accuracy = 0.0
+    running_loss = 0.0
+    batch_count = 0
+    for batch_idx, (data, target) in enumerate(training_loader):
+        data = data.squeeze().to(device)
+        target = target.squeeze().to(device)
+        optimizer.zero_grad(set_to_none=True)
+        no_margin_output = None  # only the aam loss provides scores without margin
+        if scaler is not None:
+            with torch.cuda.amp.autocast():
+                if loss_criteria == 'aam':
+                    output_tuple, _ = model(data, target=target)
+                    output, no_margin_output = output_tuple
+                    loss = criterion(output, target)
+                elif loss_criteria == 'smn':
+                    output_tuple, _ = model(data, target=target)
+                    loss, output = output_tuple
+                    loss += criterion(output, target)
+                elif loss_criteria == 'aps':
+                    output_tuple, _ = model(data, target=target)
+                    loss, output = output_tuple
+                else:
+                    output, _ = model(data, target=None)
+                    loss = criterion(output, target)
+
+            scaler.scale(loss).backward()
+            if clipping:
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
+            scaler.step(optimizer)
+            scaler.update()
+
+        else:
+            if loss_criteria == 'aam':
+                output, _ = model(data, target=target)
+                if isinstance(output, tuple):  # same (margin, no-margin) pair as in the autocast branch
+                    output, no_margin_output = output
+                loss = criterion(output, target)
+            elif loss_criteria == 'aps':
+                output_tuple, _ = model(data, target=target)
+                cos_sim_matx, output = output_tuple
+                loss = criterion(cos_sim_matx, torch.arange(0, int(data.shape[0]/2), device=device)) + criterion(output, target)
+            else:
+                output, _ = model(data, target=None)
+                loss = criterion(output, target)
+            loss.backward()
+            if clipping:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
+            optimizer.step()
+
+        # Accuracy is measured on the scores without margin when the loss provides them
+        predictions = output if no_margin_output is None else no_margin_output
+        running_loss += loss.item()
+        accuracy += (torch.argmax(predictions.data, 1) == target).sum().cpu()
+        batch_count += 1
+
+        if math.fmod(batch_idx, training_opts["log_interval"]) == 0:
+            batch_size = target.shape[0]
+            # accuracy only covers the batch_count batches seen since the last reset
+            training_monitor.update(training_loss=loss.item(),
+                                    training_acc=100.0 * accuracy.item() / (batch_count * batch_size))
+            training_monitor.logger.info('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.3f}\tLr: {}'.format(
+                training_monitor.current_epoch,
+                batch_idx + 1,
+                training_loader.__len__(),
+                100. * batch_idx / training_loader.__len__(),
+                running_loss / batch_count,
+                100.0 * accuracy / (batch_count*target.shape[0]),
+                scheduler.get_last_lr()[0],  # TODO only working for some schedulers
+            ))
+            running_loss = 0.0
+            accuracy = 0.0
+            batch_count = 0
+        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
+            scheduler.step(training_monitor.best_eer)
+        else:
+            scheduler.step()
+        if aam_scheduler is not None:
+            model.after_speaker_embedding.margin = aam_scheduler.__step__()
+    return model
+
+def cross_validation(model, validation_loader, device, validation_shape, tar_indices, non_indices, mixed_precision=False):
+    """Evaluate the model on the validation set: accuracy, loss and equal error rate.
+    :param model: an Xtractor, or an object exposing the Xtractor as ``.module``
+    :param validation_loader: iterable of (data, target) validation batches
+    :param device: device the batches are moved to
+    :param validation_shape: [number of validation items, embedding size], shape of the embedding buffer
+    :param tar_indices: index of the target trials in the score matrix; non_indices: same for non-target trials
+    :param mixed_precision: enables autocast for the forward pass
+    :return: the tuple (accuracy in percent, loss, equal error rate)"""
+    model.eval()
+    if isinstance(model, Xtractor):
+        loss_criteria = model.loss
+    else:
+        loss_criteria = model.module.loss
+
+    accuracy = 0.0
+    loss = 0.0
+    criterion = torch.nn.CrossEntropyLoss()
+    embeddings = torch.zeros(validation_shape)
+    cursor = 0
+    with torch.no_grad():
+        for batch_idx, (data, target) in enumerate(tqdm.tqdm(validation_loader, desc='validation compute', mininterval=1, disable=None)):
+            if target.dim() != 1:
+                target = target.squeeze()
+            target = target.to(device)
+            batch_size = target.shape[0]
+            data = data.squeeze().to(device)
+            with torch.cuda.amp.autocast(enabled=mixed_precision):
+                output, batch_embeddings = model(data, target=None, is_eval=True)
+                if loss_criteria == 'cce':
+                    batch_embeddings = l2_norm(batch_embeddings)
+                if loss_criteria == 'smn':
+                    batch_embeddings, batch_predictions = output
+                else:
+                    batch_predictions = output
+                accuracy += (torch.argmax(batch_predictions.data, 1) == target).sum()
+                loss += criterion(batch_predictions, target)
+            embeddings[cursor:cursor + batch_size,:] = batch_embeddings.detach().cpu()
+            cursor += batch_size
+    # All-pairs scores are computed on the CPU when there are more than 3e4 embeddings, on device otherwise
+    local_device = "cpu" if embeddings.shape[0] > 3e4 else device
+    embeddings = embeddings.to(local_device)
+    scores = torch.einsum('ij,kj', embeddings, embeddings).cpu().numpy()
+    negatives = scores[non_indices]
+    positives = scores[tar_indices]
+
+    pmiss, pfa = rocch(positives, negatives)
+    equal_error_rate = rocch2eer(pmiss, pfa)
+
+    return (100. * accuracy.cpu().numpy() / validation_shape[0],
+            loss.cpu().numpy() / ((batch_idx + 1) * batch_size),
+            equal_error_rate)
+
+
+def extract_embeddings(idmap_name,
+                       model_filename,
+                       data_root_name,
+                       device,
+                       batch_size=1,
+                       file_extension="wav",
+                       transform_pipeline={},
+                       sliding_window=False,
+                       win_duration=3.,
+                       win_shift=1.5,
+                       num_thread=1,
+                       sample_rate=16000,
+                       mixed_precision=False,
+                       norm_embeddings=True):
+    """Extract the embeddings of the segments listed in an IdMap and return them in a StatServer.
+    :param idmap_name: an IdMap object, or the argument given to IdMap() to create one
+    :param model_filename: name of a checkpoint file (str), or an already created model
+    :param data_root_name: given to IdMapSet as data_path
+    :param device: device used to run the model (and to map the checkpoint)
+    :param batch_size: batch size of the data loader, forced to 1 when sliding_window is True
+    :param file_extension: given to IdMapSet
+    :param transform_pipeline: given to IdMapSet
+    :param sliding_window: given to IdMapSet; when True start times are computed per window
+    :param win_duration: given to IdMapSet as window_len and min_duration, added to start to get stop
+    :param win_shift: given to IdMapSet as window_shift, used to compute the window start times
+    :param num_thread: number of workers of the data loader
+    :param sample_rate: given to IdMapSet, used to compute the window start times
+    :param mixed_precision: enables autocast for the forward pass
+    :param norm_embeddings: forwarded to the model as norm_embedding
+    :return: a StatServer whose stat1 holds the embeddings
+    """
+    if sliding_window:
+        batch_size = 1
+
+    # Load the model
+    if isinstance(model_filename, str):
+        checkpoint = torch.load(model_filename, map_location=device)
+        speaker_number = checkpoint["speaker_number"]
+        model_opts = checkpoint["model_archi"]
+        model_opts["embedding_size"] = 256  # NOTE(review): overrides the value stored in the checkpoint - confirm
+        model = Xtractor(speaker_number,
+                         model_archi=model_opts["model_type"],
+                         loss=model_opts["loss"]["type"],
+                         embedding_size=model_opts["embedding_size"])
+        model.load_state_dict(checkpoint["model_state_dict"])
+    else:
+        model = model_filename
+
+    if isinstance(idmap_name, IdMap):
+        idmap = idmap_name
+    else:
+        idmap = IdMap(idmap_name)
+
+    # Create dataset to load the data
+    dataset = IdMapSet(idmap_name=idmap,
+                       data_path=data_root_name,
+                       file_extension=file_extension,
+                       transform_pipeline=transform_pipeline,
+                       transform_number=0,
+                       sliding_window=sliding_window,
+                       window_len=win_duration,
+                       window_shift=win_shift,
+                       sample_rate=sample_rate,
+                       min_duration=win_duration
+                       )
+
+    dataloader = DataLoader(dataset,
+                            batch_size=batch_size,
+                            shuffle=False,
+                            drop_last=False,
+                            pin_memory=True,
+                            num_workers=num_thread)
+
+    with torch.no_grad():
+        model.eval()
+        model.to(device)
+
+        embed = []
+        modelset= []
+        segset = []
+        starts = []
+
+        for idx, (data, mod, seg, start, stop) in enumerate(tqdm.tqdm(dataloader,
+                                                                      desc='xvector extraction',
+                                                                      mininterval=1,
+                                                                      disable=None)):
+
+            if data.dim() > 2:
+                data = data.squeeze()
+
+            with torch.cuda.amp.autocast(enabled=mixed_precision):
+                tmp_data = torch.split(data,data.shape[0]//(max(1, data.shape[0]//100)))
+                for td in tmp_data:
+                    _, vec = model(x=td.to(device), is_eval=True, norm_embedding=norm_embeddings)
+                    embed.append(vec.detach().cpu())
+            modelset.extend(mod * data.shape[0])
+            segset.extend(seg * data.shape[0])
+            if sliding_window:
+                tmp_start = numpy.arange(0, data.shape[0] * win_shift, win_shift)
+                starts.extend(tmp_start * sample_rate + start.detach().cpu().numpy())
+            else:
+                starts.append(start.numpy())
+
+    embeddings = StatServer()
+    embeddings.stat1 = numpy.concatenate(embed)
+    embeddings.modelset = numpy.array(modelset).astype('>U')
+    embeddings.segset = numpy.array(segset).astype('>U')
+    embeddings.start = numpy.array(starts).squeeze()
+    embeddings.stop = embeddings.start + win_duration
+    embeddings.stat0 = numpy.ones((embeddings.modelset.shape[0], 1))
+
+    return embeddings
+
+
+def extract_embeddings_per_speaker(idmap_name,
+                                   model_filename,
+                                   data_root_name,
+                                   device,
+                                   file_extension="wav",
+                                   transform_pipeline={},
+                                   sample_rate=16000,
+                                   mixed_precision=False,
+                                   num_thread=1):
+    """
+    Extract one embedding per item of an IdMapSetPerSpeaker and return them in a StatServer.
+    :param idmap_name: an IdMap object, or the argument given to IdMap() to create one
+    :param model_filename: name of a checkpoint file (str), or an already created model
+    :param data_root_name: given to IdMapSetPerSpeaker as data_path
+    :param device: device used to run the model (and to map the checkpoint)
+    :param file_extension: given to IdMapSetPerSpeaker
+    :param transform_pipeline: given to IdMapSetPerSpeaker
+    :param sample_rate: given to IdMapSetPerSpeaker
+    :param mixed_precision: enables autocast for the forward pass
+    :param num_thread: number of workers of the data loader
+    :return: a StatServer described by dataset.output_im, whose stat1 holds the embeddings
+    """
+    # Load the model
+    if isinstance(model_filename, str):
+        checkpoint = torch.load(model_filename, map_location=device)
+        speaker_number = checkpoint["speaker_number"]
+        model_opts = checkpoint["model_archi"]
+        model = Xtractor(speaker_number, model_archi=model_opts["model_type"], loss=model_opts["loss"]["type"])
+        model.load_state_dict(checkpoint["model_state_dict"])
+    else:
+        model = model_filename
+
+    if isinstance(idmap_name, IdMap):
+        idmap = idmap_name
+    else:
+        idmap = IdMap(idmap_name)
+
+    # Create dataset to load the data
+    dataset = IdMapSetPerSpeaker(idmap_name=idmap,
+                                 data_path=data_root_name,
+                                 file_extension=file_extension,
+                                 transform_pipeline=transform_pipeline,
+                                 sample_rate=sample_rate,
+                                 min_duration=1.)
+
+    dataloader = DataLoader(dataset,
+                            batch_size=1,
+                            shuffle=False,
+                            drop_last=False,
+                            pin_memory=True,
+                            num_workers=num_thread)
+
+    with torch.no_grad():
+        model.eval()
+        model.to(device)
+
+        # Get the size of embeddings to extract
+        emb_size = model.embedding_size
+
+        # Create the StatServer
+        embeddings = StatServer()
+        embeddings.modelset = dataset.output_im.leftids
+        embeddings.segset = dataset.output_im.rightids
+        embeddings.start = dataset.output_im.start
+        embeddings.stop = dataset.output_im.stop
+        embeddings.stat0 = numpy.ones((embeddings.modelset.shape[0], 1))
+        embeddings.stat1 = numpy.ones((embeddings.modelset.shape[0], emb_size))
+
+        # Process the data (inputs longer than 20000000 along the last axis are truncated)
+        with torch.cuda.amp.autocast(enabled=mixed_precision):
+            for idx, (data, mod, seg, start, stop) in enumerate(tqdm.tqdm(dataloader,
+                                                                          desc='xvector extraction',
+                                                                          mininterval=1)):
+                if data.shape[1] > 20000000:
+                    data = data[..., :20000000]
+                _, vec = model(x=data.to(device), is_eval=True, norm_embedding=True)
+                embeddings.stat1[idx, :] = vec.detach().cpu()
+
+    return embeddings
+
diff --git a/sidekit/sidekit/score_normalization.py b/sidekit/sidekit/score_normalization.py
new file mode 100644
index 0000000..65360e9
--- /dev/null
+++ b/sidekit/sidekit/score_normalization.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2019 Anthony Larcher
+
+ :mod:`score_normalization` provides methods to normalize verification scores
+ (Z-norm, T-norm, ZT-norm and S-norm).
+"""
+
+import copy
+import numpy
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2019 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+ def znorm(enrol_test_scores, enrol_imp_scores, sym=False):
+     """
+     Apply Z-norm to a set of enrolment-versus-test scores.
+
+     imp_scores are formed by scoring each enrollment utterance with all files
+     from the impostor cohort, thus enrol_test_scores.modelset and
+     enrol_imp_scores.modelset must be the same.
+
+     This function assumes that all models from enrol_test_scores are in
+     enrol_imp_scores. Both score objects only need a ``scoremat`` attribute
+     (one row per model) and a ``sort()`` method.
+
+     :param enrol_test_scores: scores to normalize; left untouched, a sorted
+         deep copy is normalized and returned
+     :param enrol_imp_scores: scores of the same models against the impostor
+         cohort; sorted IN PLACE so that its rows line up with the sorted copy
+     :param sym: if True, enrol_imp_scores is a cohort-versus-cohort matrix and
+         its diagonal (each file scored against itself) is excluded from the
+         statistics
+     :return: a sorted copy of enrol_test_scores with normalized scoremat
+     """
+     # Align enrol_test_scores.modelset and enrol_imp_scores.modelset
+     scores_znorm = copy.deepcopy(enrol_test_scores)
+     scores_znorm.sort()
+     enrol_imp_scores.sort()
+
+     imp_scoremat = enrol_imp_scores.scoremat
+
+     # Compute one mean and one standard deviation per model (i.e. per row)
+     if sym:
+         nb_others = imp_scoremat.shape[1] - 1
+         mean_per_model = (imp_scoremat.sum(1) - numpy.diag(imp_scoremat)) / nb_others
+         # The mean is a per-row statistic: make it a column before subtracting
+         squared_dev = numpy.square(imp_scoremat - mean_per_model[:, None])
+         # Square root turns the variance into the standard deviation used below
+         std_per_model = numpy.sqrt((squared_dev.sum(1) - numpy.diag(squared_dev)) / nb_others)
+     else:
+         mean_per_model = imp_scoremat.mean(1)
+         std_per_model = imp_scoremat.std(1)
+
+     # Per-model statistics are broadcast along the segment axis (columns)
+     scores_znorm.scoremat = (scores_znorm.scoremat - mean_per_model[:, None]) / std_per_model[:, None]
+
+     return scores_znorm
+
+
+ def tnorm(enrol_test_scores, imp_test_scores):
+     """
+     Apply T-norm to a set of enrolment-versus-test scores.
+
+     Each column of the score matrix is centered and scaled with the mean and
+     standard deviation of the matching column of imp_test_scores.scoremat.
+
+     :param enrol_test_scores: scores to normalize; left untouched, a sorted
+         deep copy is normalized and returned
+     :param imp_test_scores: cohort scores providing the per-column statistics;
+         sorted IN PLACE
+     :return: a sorted copy of enrol_test_scores with normalized scoremat
+     """
+     # Work on a sorted copy and sort the cohort scores the same way
+     normalized = copy.deepcopy(enrol_test_scores)
+     normalized.sort()
+     imp_test_scores.sort()
+
+     # Column-wise statistics of the cohort scores
+     cohort = imp_test_scores.scoremat
+     column_mean = cohort.mean(0)
+     column_std = cohort.std(0)
+
+     centered = normalized.scoremat - column_mean
+     normalized.scoremat = centered / column_std
+     return normalized
+
+
+ def ztnorm(enrol_test_scores, enrol_imp_scores, imp_test_scores, imp_imp_scores):
+     """
+     Apply ZT-norm: Z-norm followed by T-norm.
+
+     :param enrol_test_scores: scores to normalize
+     :param enrol_imp_scores: scores used to Z-normalize enrol_test_scores
+     :param imp_test_scores: scores that are Z-normalized and then used as the
+         T-norm cohort
+     :param imp_imp_scores: scores used to Z-normalize imp_test_scores
+         (passed to znorm with sym=True)
+     :return: the scores returned by tnorm applied to the two Z-normalized sets
+     """
+     # Z-norm both score sets first ...
+     z_enrol = znorm(enrol_test_scores, enrol_imp_scores)
+     z_cohort = znorm(imp_test_scores, imp_imp_scores, sym=True)
+
+     # ... then T-norm the Z-normalized trial scores with the Z-normalized cohort
+     return tnorm(z_enrol, z_cohort)
+
+
+ def snorm(enrol_test_scores, enrol_imp_scores, imp_test_scores):
+     """
+     Apply S-norm: the average of the Z-normalized and T-normalized scores.
+
+     :param enrol_test_scores: scores to normalize (left untouched)
+     :param enrol_imp_scores: scores passed to znorm as normalization cohort
+     :param imp_test_scores: scores passed to tnorm as normalization cohort
+     :return: the object returned by znorm, whose scoremat is replaced by the
+         average of the Z-norm and T-norm score matrices
+     """
+     # Compute z-norm scores
+     z_enrol_test_scores = znorm(enrol_test_scores, enrol_imp_scores)
+
+     # Compute t-norm scores
+     t_enrol_test_scores = tnorm(enrol_test_scores, imp_test_scores)
+
+     # znorm and tnorm both return SORTED copies of enrol_test_scores, so the
+     # average must be stored in one of those copies: putting it back into an
+     # unsorted copy of the input would pair scores with the wrong labels.
+     s_enrol_test_scores = z_enrol_test_scores
+     s_enrol_test_scores.scoremat = 0.5 * (z_enrol_test_scores.scoremat + t_enrol_test_scores.scoremat)
+
+     return s_enrol_test_scores
+
+
+ def asnorm(enrol_test_scores):
+     """
+     Placeholder, not implemented: does nothing and returns None.
+
+     NOTE(review): the name suggests adaptive S-norm -- confirm before
+     implementing.
+
+     :param enrol_test_scores: currently ignored
+     """
diff --git a/sidekit/sidekit/sidekit_io.py b/sidekit/sidekit/sidekit_io.py
new file mode 100755
index 0000000..5adbad4
--- /dev/null
+++ b/sidekit/sidekit/sidekit_io.py
@@ -0,0 +1,469 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2019 Anthony Larcher
+
+:mod:`sidekit_io` provides methods to read and write from and to different
+formats.
+"""
+
+import h5py
+import array
+import numpy
+import os
+import pickle
+import struct
+import gzip
+import logging
+from sidekit.sidekit_wrappers import check_path_existance
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2019 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+ def read_vect(filename):
+     """Read vector in ALIZE binary format and return an array
+
+     The first 8 bytes of the file (two little-endian 32-bit integers) are
+     read and ignored; everything after them is decoded as doubles.
+
+     :param filename: name of the file to read from
+
+     :return: a numpy.ndarray object
+     """
+     with open(filename, 'rb') as f:
+         # Skip the header
+         struct.unpack("<2l", f.read(8))
+         data = array.array("d")
+         # array.fromstring() was removed in Python 3.9; frombytes() is the
+         # equivalent call for binary data
+         data.frombytes(f.read())
+     return numpy.array(data)
+
+
+ def read_matrix(filename):
+     """Read matrix in ALIZE binary format and return a ndarray
+
+     The file starts with two little-endian 32-bit integers giving the number
+     of rows and columns, followed by the coefficients stored as doubles.
+
+     :param filename: name of the file to read from
+
+     :return: a numpy.ndarray object
+     """
+     with open(filename, 'rb') as f:
+         m_dim = struct.unpack("<2l", f.read(8))
+         data = array.array("d")
+         # array.fromstring() was removed in Python 3.9; frombytes() is the
+         # equivalent call for binary data
+         data.frombytes(f.read())
+     T = numpy.array(data)
+     # Shape the flat buffer in place to the dimensions read from the header
+     T.resize(m_dim[0], m_dim[1])
+     return T
+
+
+@check_path_existance
+def write_matrix(m, filename):
+ """Write a matrix in ALIZE binary format
+
+ :param m: a 2-dimensional ndarray
+ :param filename: name of the file to write in
+
+ :exception: TypeError if m is not a 2-dimensional ndarray
+ """
+ if not m.ndim == 2:
+ raise TypeError("To write vector, use write_vect")
+ else:
+ with open(filename, 'wb') as mf:
+ data = numpy.array(m.flatten())
+ mf.write(struct.pack(".
+"""
+Copyright 2014-2021 Sylvain Meignier and Anthony Larcher
+
+ :mod:`sidekit_mpi` provides methods to run using Message Passing interface
+
+"""
+
+import copy
+import numpy
+import os
+import logging
+import h5py
+import scipy
+import sys
+from sidekit.statserver import StatServer
+from sidekit.factor_analyser import FactorAnalyser
+from sidekit.mixture import Mixture
+from sidekit.sidekit_io import write_matrix_hdf5, read_matrix_hdf5
+
+from sidekit.sv_utils import serialize
+from sidekit.factor_analyser import e_on_batch
+from mpi4py import MPI
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+data_type = numpy.float32
+
+ def total_variability(stat_server_file_name,
+                       ubm,
+                       tv_rank,
+                       nb_iter=20,
+                       min_div=True,
+                       tv_init=None,
+                       save_init=False,
+                       output_file_name=None):
+     """
+     Train a total variability model using multiple process on multiple nodes with MPI.
+
+     Example of how to train a total variability matrix using MPI.
+     Here is what your script should look like:
+
+     ----------------------------------------------------------------
+
+     import sidekit
+
+     fa = sidekit.FactorAnalyser()
+     fa.total_variability_mpi("/lium/spk1/larcher/expe/MPI_TV/data/statserver.h5",
+                              ubm,
+                              tv_rank,
+                              nb_iter=tv_iteration,
+                              min_div=True,
+                              tv_init=tv_new_init2,
+                              output_file_name="data/TV_mpi")
+
+     ----------------------------------------------------------------
+
+     This script should be run using mpirun command (see MPI4PY website for
+     more information about how to use it
+     http://pythonhosted.org/mpi4py/
+     )
+
+         mpirun --hostfile hostfile ./my_script.py
+
+     The communicator is always MPI.COMM_WORLD. Nothing is returned: the model
+     is written to disk by the process of rank 0 after every iteration.
+
+     :param stat_server_file_name: name of the StatServer file to load, or list of such names
+         (make sure you provide absolute path and that it is accessible from all your nodes).
+     :param ubm: a Mixture object
+     :param tv_rank: rank of the total variability model
+     :param nb_iter: number of EM iteration
+     :param min_div: boolean, if True, apply minimum divergence re-estimation
+     :param tv_init: initial matrix to start the EM iterations with; if None, a random matrix
+         is drawn on rank 0 and shared with all processes
+     :param save_init: boolean, if True, rank 0 writes the initial model to
+         output_file_name + "_init.h5"
+     :param output_file_name: name of the file where to save the matrix (without extension);
+         defaults to "temporary_factor_analyser"
+     """
+     comm = MPI.COMM_WORLD
+
+     comm.Barrier()
+
+     # this lines allows to process a single StatServer or a list of StatServers
+     if not isinstance(stat_server_file_name, list):
+         stat_server_file_name = [stat_server_file_name]
+
+     # Initialize useful variables
+     sv_size = ubm.get_mean_super_vector().shape[0]
+     nb_distrib, feature_size = ubm.mu.shape
+     upper_triangle_indices = numpy.triu_indices(tv_rank)
+
+     # Initialize the FactorAnalyser, mean and Sigma are initialized at ZEROS as statistics are centered
+     factor_analyser = FactorAnalyser()
+     factor_analyser.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
+     if tv_init is None:
+         # Draw the random starting point on rank 0 only and share it: if every
+         # process drew its own matrix, the first E-step would accumulate
+         # statistics computed with different models.
+         random_init = None
+         if comm.rank == 0:
+             random_init = numpy.random.randn(sv_size, tv_rank).astype(data_type)
+         factor_analyser.F = comm.bcast(random_init, root=0)
+     else:
+         factor_analyser.F = tv_init
+     factor_analyser.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)
+
+     # Save init if required
+     if comm.rank == 0:
+         if output_file_name is None:
+             output_file_name = "temporary_factor_analyser"
+         if save_init:
+             factor_analyser.write(output_file_name + "_init.h5")
+
+     # Iterative training of the FactorAnalyser
+     for it in range(nb_iter):
+         if comm.rank == 0:
+             logging.critical("Start it {}".format(it))
+
+         # Local accumulators; _A and _R only store the upper triangle of symmetric matrices
+         _A = numpy.zeros((nb_distrib, tv_rank * (tv_rank + 1) // 2), dtype=data_type)
+         _C = numpy.zeros((tv_rank, sv_size), dtype=data_type)
+         _R = numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=data_type)
+
+         if comm.rank == 0:
+             total_session_nb = 0
+
+         # E-step
+         for stat_server_file in stat_server_file_name:
+
+             with h5py.File(stat_server_file, 'r') as fh:
+                 nb_sessions = fh["segset"].shape[0]
+
+                 if comm.rank == 0:
+                     total_session_nb += nb_sessions
+
+                 comm.Barrier()
+                 if comm.rank == 0:
+                     logging.critical("Process file: {}".format(stat_server_file))
+
+                 # Allocate a list of sessions to process to each node
+                 local_session_idx = numpy.array_split(range(nb_sessions), comm.size)
+                 stat0 = fh['stat0'][local_session_idx[comm.rank], :]
+                 stat1 = fh['stat1'][local_session_idx[comm.rank], :]
+                 e_h, e_hh = e_on_batch(stat0, stat1, ubm, factor_analyser.F)
+
+                 _A += stat0.T.dot(e_hh)
+                 _C += e_h.T.dot(stat1)
+                 _R += numpy.sum(e_hh, axis=0)
+
+             comm.Barrier()
+
+         comm.Barrier()
+
+         # Sum all statistics
+         if comm.rank == 0:
+             # only processor 0 will actually get the data
+             total_A = numpy.zeros_like(_A)
+             total_C = numpy.zeros_like(_C)
+             total_R = numpy.zeros_like(_R)
+         else:
+             total_A = [None] * _A.shape[0]
+             total_C = None
+             total_R = None
+
+         # Accumulate _A, using a list in order to avoid limitations of MPI (impossible to reduce matrices bigger
+         # than 4GB)
+         for ii in range(_A.shape[0]):
+             _tmp = copy.deepcopy(_A[ii])
+             if comm.rank == 0:
+                 _total_A = numpy.zeros_like(total_A[ii])
+             else:
+                 _total_A = None
+
+             comm.Reduce(
+                 [_tmp, MPI.FLOAT],
+                 [_total_A, MPI.FLOAT],
+                 op=MPI.SUM,
+                 root=0
+             )
+             if comm.rank == 0:
+                 total_A[ii] = copy.deepcopy(_total_A)
+
+         comm.Reduce(
+             [_C, MPI.FLOAT],
+             [total_C, MPI.FLOAT],
+             op=MPI.SUM,
+             root=0
+         )
+
+         comm.Reduce(
+             [_R, MPI.FLOAT],
+             [total_R, MPI.FLOAT],
+             op=MPI.SUM,
+             root=0
+         )
+
+         comm.Barrier()
+
+         # M-step
+         if comm.rank == 0:
+
+             total_R /= total_session_nb
+             _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
+             for c in range(nb_distrib):
+                 distrib_idx = range(c * feature_size, (c + 1) * feature_size)
+                 # Rebuild the full symmetric matrix from its stored upper triangle
+                 _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = total_A[c, :]
+                 factor_analyser.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, total_C[:, distrib_idx]).T
+
+             # minimum divergence
+             if min_div:
+                 _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
+                 _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = total_R
+                 ch = scipy.linalg.cholesky(_R_tmp)
+                 factor_analyser.F = factor_analyser.F.dot(ch)
+
+             # Save the current FactorAnalyser
+             if output_file_name is not None:
+                 if it < nb_iter - 1:
+                     factor_analyser.write(output_file_name + "_it-{}.h5".format(it))
+                 else:
+                     factor_analyser.write(output_file_name + ".h5")
+         # Share the matrix re-estimated on rank 0 with every process
+         factor_analyser.F = comm.bcast(factor_analyser.F, root=0)
+         comm.Barrier()
+
+
+ def extract_ivector(tv,
+                     stat_server_file_name,
+                     ubm,
+                     output_file_name,
+                     uncertainty=False,
+                     prefix=''):
+     """
+     Estimate i-vectors for a given StatServer using multiple process on multiple nodes.
+
+     The sessions are split between the processes of MPI.COMM_WORLD, each process
+     estimates the i-vectors of its own sessions and the process of rank 0
+     gathers and writes them. Nothing is returned.
+
+     :param tv: object exposing the total variability matrix as its 2-D attribute ``F``
+     :param stat_server_file_name: file name of the sufficient statistics StatServer HDF5 file
+     :param ubm: Mixture object (the UBM)
+     :param output_file_name: name of the file to save the i-vectors StatServer in HDF5 format
+     :param uncertainty: boolean, if True, saves a matrix with uncertainty matrices (diagonal of the matrices)
+     :param prefix: prefix of the dataset to read from in HDF5 file
+     """
+     assert(isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"
+
+     comm = MPI.COMM_WORLD
+
+     comm.Barrier()
+
+     gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"
+
+     # Set useful variables
+     tv_rank = tv.F.shape[1]
+     feature_size = ubm.mu.shape[1]
+     nb_distrib = ubm.w.shape[0]
+
+     # Get the number of sessions to process
+     with h5py.File(stat_server_file_name, 'r') as fh:
+         nb_sessions = fh["segset"].shape[0]
+
+     # Work on each node with different data
+     indices = numpy.array_split(numpy.arange(nb_sessions), comm.size, axis=0)
+     # Number of values sent by each process and their offsets in the gathered buffer
+     sendcounts = numpy.array([idx.shape[0] * tv.F.shape[1] for idx in indices])
+     displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))
+
+     stat_server = StatServer.read_subset(stat_server_file_name, indices[comm.rank])
+
+     # Whiten the statistics for diagonal or full models
+     if gmm_covariance == "diag":
+         stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. / ubm.get_invcov_super_vector())
+     elif gmm_covariance == "full":
+         stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)
+
+     # Estimate i-vectors
+     # Only the root process allocates the buffers that receive the gathered results
+     if comm.rank == 0:
+         iv = numpy.zeros((nb_sessions, tv_rank))
+         iv_sigma = numpy.zeros((nb_sessions, tv_rank))
+     else:
+         iv = None
+         iv_sigma = None
+
+     local_iv = numpy.zeros((stat_server.modelset.shape[0], tv_rank))
+     local_iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))
+
+     # Replicate stat0
+     # index_map repeats each distribution index feature_size times
+     index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
+     for sess in range(stat_server.segset.shape[0]):
+
+         inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + (tv.F.T * stat_server.stat0[sess, index_map]).dot(tv.F))
+
+         Aux = tv.F.T.dot(stat_server.stat1[sess, :])
+         local_iv[sess, :] = Aux.dot(inv_lambda)
+         local_iv_sigma[sess, :] = numpy.diag(inv_lambda + numpy.outer(local_iv[sess, :], local_iv[sess, :]))
+     comm.Barrier()
+
+     # Collect the local results of every process on rank 0
+     comm.Gatherv(local_iv,[iv, sendcounts, displacements,MPI.DOUBLE], root=0)
+     comm.Gatherv(local_iv_sigma,[iv_sigma, sendcounts, displacements,MPI.DOUBLE], root=0)
+
+     if comm.rank == 0:
+
+         # Copy the session labels from the input file into the output StatServer
+         with h5py.File(stat_server_file_name, 'r') as fh:
+             iv_stat_server = StatServer()
+             iv_stat_server.modelset = fh.get(prefix+"modelset")[()]
+             iv_stat_server.segset = fh.get(prefix+"segset")[()]
+
+             # if running python 3, need a conversion to unicode
+             if sys.version_info[0] == 3:
+                 iv_stat_server.modelset = iv_stat_server.modelset.astype('U', copy=False)
+                 iv_stat_server.segset = iv_stat_server.segset.astype('U', copy=False)
+
+             tmpstart = fh.get(prefix+"start")[()]
+             tmpstop = fh.get(prefix+"stop")[()]
+             iv_stat_server.start = numpy.empty(fh[prefix+"start"].shape, '|O')
+             iv_stat_server.stop = numpy.empty(fh[prefix+"stop"].shape, '|O')
+             # Entries equal to -1 in the file are not copied
+             iv_stat_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
+             iv_stat_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
+             iv_stat_server.stat0 = numpy.ones((nb_sessions, 1))
+             iv_stat_server.stat1 = iv
+
+             iv_stat_server.write(output_file_name)
+             if uncertainty:
+                 # Written next to the output file, with "_uncertainty" inserted before the extension
+                 path = os.path.splitext(output_file_name)
+                 write_matrix_hdf5(iv_sigma, path[0] + "_uncertainty" + path[1])
+
+
+ def EM_split(ubm,
+              features_server,
+              feature_list,
+              distrib_nb,
+              output_filename,
+              iterations=(1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8),
+              llk_gain=0.01,
+              save_partial=False,
+              ceil_cov=10,
+              floor_cov=1e-2,
+              num_thread=1):
+     """Expectation-Maximization estimation of the Mixture parameters.
+
+     The features are loaded, then scattered by rank 0 between the processes of
+     MPI.COMM_WORLD. Each process accumulates statistics on its own frames, the
+     statistics are summed on rank 0 which re-estimates the mixture and
+     broadcasts it. Nothing is returned: the mixture is written to disk by rank 0.
+
+     :param ubm: Mixture object to train; its parameters are initialized on rank 0
+     :param features_server: server whose stack_features_parallel method loads the frames
+     :param feature_list: list of shows passed to features_server.stack_features_parallel
+     :param distrib_nb: final number of distributions
+     :param output_filename: prefix of the files to write; '_{N}g.h5' is appended, where N is
+         the number of distributions
+     :param iterations: list of iteration number for each step of the learning process
+     :param llk_gain: limit of the training gain. Currently unused: no early stopping is applied
+     :param save_partial: if True, save the mixture before each split of the distributions
+     :param ceil_cov: passed to the maximization step
+     :param floor_cov: passed to the maximization step
+     :param num_thread: number of threads used to load the features
+     """
+
+     # Load the features
+     # NOTE(review): every process loads all the features although only rank 0 keeps them.
+     features = features_server.stack_features_parallel(feature_list, num_thread=num_thread)
+
+     comm = MPI.COMM_WORLD
+     comm.Barrier()
+
+     if comm.rank == 0:
+         llk = []
+
+         # Initialize the mixture
+         n_frames, feature_size = features.shape
+         mu = features.mean(axis=0)
+         cov = numpy.mean(features**2, axis=0)
+         ubm.mu = mu[None]
+         ubm.invcov = 1./cov[None]
+         ubm.w = numpy.asarray([1.0])
+         ubm.cst = numpy.zeros(ubm.w.shape)
+         ubm.det = numpy.zeros(ubm.w.shape)
+         ubm.cov_var_ctl = 1.0 / copy.deepcopy(ubm.invcov)
+         ubm._compute_all()
+
+     else:
+         n_frames = None
+         feature_size = None
+         features = None
+
+     comm.Barrier()
+
+     # Broadcast the UBM on each process
+     ubm = comm.bcast(ubm, root=0)
+
+     # Send n_frames and feature_size to all process
+     n_frames = comm.bcast(n_frames, root=0)
+     feature_size = comm.bcast(feature_size, root=0)
+
+     # Compute the size of all matrices to scatter to each process
+     indices = numpy.array_split(numpy.arange(n_frames), comm.size, axis=0)
+     sendcounts = numpy.array([idx.shape[0] * feature_size for idx in indices])
+     displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))
+
+     # Scatter features on all process
+     local_features = numpy.empty((indices[comm.rank].shape[0], feature_size))
+
+     comm.Scatterv([features, tuple(sendcounts), tuple(displacements), MPI.DOUBLE], local_features)
+     comm.Barrier()
+
+     # for N iterations:
+     for nbg, it in enumerate(iterations[:int(numpy.log2(distrib_nb))]):
+
+         if comm.rank == 0:
+             logging.critical("Start training model with {} distributions".format(2**nbg))
+             # Save current model before spliting
+             if save_partial:
+                 ubm.write(output_filename + '_{}g.h5'.format(ubm.get_distrib_nb()), prefix='')
+
+         ubm._split_ditribution()
+
+         if comm.rank == 0:
+             accum = copy.deepcopy(ubm)
+         else:
+             # Receive buffers are only needed on the root process
+             accum = Mixture()
+             accum.w = accum.mu = accum.invcov = None
+
+         # Create one accumulator for each process
+         local_accum = copy.deepcopy(ubm)
+         for i in range(it):
+
+             local_accum._reset()
+
+             if comm.rank == 0:
+                 logging.critical("\titeration {} / {}".format(i+1, it))
+                 # Must be a float array: it is filled by a Reduce of MPI.DOUBLE
+                 # values, an integer buffer would be read back as garbage
+                 _tmp_llk = numpy.array(0.0)
+                 accum._reset()
+
+             else:
+                 _tmp_llk = numpy.array([None])
+
+             # E step
+             logging.critical("\nStart E-step, rank {}".format(comm.rank))
+             local_llk = numpy.array(ubm._expectation(local_accum, local_features))
+
+             # Reduce all accumulators in process 0
+             comm.Barrier()
+             comm.Reduce(
+                 [local_accum.w, MPI.DOUBLE],
+                 [accum.w, MPI.DOUBLE],
+                 op=MPI.SUM,
+                 root=0
+             )
+
+             comm.Reduce(
+                 [local_accum.mu, MPI.DOUBLE],
+                 [accum.mu, MPI.DOUBLE],
+                 op=MPI.SUM,
+                 root=0
+             )
+
+             comm.Reduce(
+                 [local_accum.invcov, MPI.DOUBLE],
+                 [accum.invcov, MPI.DOUBLE],
+                 op=MPI.SUM,
+                 root=0
+             )
+
+             comm.Reduce(
+                 [local_llk, MPI.DOUBLE],
+                 [_tmp_llk, MPI.DOUBLE],
+                 op=MPI.SUM,
+                 root=0
+             )
+             comm.Barrier()
+
+             if comm.rank == 0:
+                 llk.append(_tmp_llk / numpy.sum(accum.w))
+
+                 # M step
+                 logging.critical("\nStart M-step, rank {}".format(comm.rank))
+                 ubm._maximization(accum, ceil_cov=ceil_cov, floor_cov=floor_cov)
+
+                 # TODO: stop iterating when llk[-1] - llk[-2] < llk_gain
+                 # (every process would have to be told to stop).
+
+             # Send the new Mixture to all process
+             comm.Barrier()
+             ubm = comm.bcast(ubm, root=0)
+             comm.Barrier()
+     if comm.rank == 0:
+         ubm.write(output_filename + '_{}g.h5'.format(ubm.get_distrib_nb()), prefix='')
+
diff --git a/sidekit/sidekit/sidekit_wrappers.py b/sidekit/sidekit/sidekit_wrappers.py
new file mode 100755
index 0000000..046bc9b
--- /dev/null
+++ b/sidekit/sidekit/sidekit_wrappers.py
@@ -0,0 +1,289 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`sidekit_wrappers` provides wrappers for different purposes.
+The aim when using wrappers is to simplify the development of new function
+in an efficient manner
+"""
+import os
+import numpy
+import copy
+import logging
+from sidekit import PARALLEL_MODULE
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+ def coroutine(func):
+     """
+     Decorator that allows to forget about the first call of a coroutine .next()
+     method or .send(None)
+     This call is done inside the decorator
+
+     :param func: the coroutine to decorate
+     :return: a function that creates the coroutine, advances it to its first
+         ``yield`` and returns it
+     """
+     import functools
+
+     # functools.wraps keeps the name and docstring of the decorated coroutine
+     @functools.wraps(func)
+     def start(*args, **kwargs):
+         cr = func(*args, **kwargs)
+         # Prime the generator so that the caller can use .send() right away
+         next(cr)
+         return cr
+     return start
+
+
+ def deprecated(func):
+     """
+     Decorator that logs a warning the first time a deprecated function is called.
+
+     :param func: the deprecated function
+     :return: a wrapper that forwards every call to func and returns its result
+     """
+     import functools
+
+     # Mutable counter shared by all the calls of this wrapper
+     count = [0]
+
+     # functools.wraps keeps the name and docstring of the decorated function
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         count[0] += 1
+         # Only warn once to avoid flooding the log
+         if count[0] == 1:
+             logging.warning(func.__name__ + ' is deprecated')
+         return func(*args, **kwargs)
+     return wrapper
+
+
+ def check_path_existance(func):
+     """ Decorator for a function which prototype is:
+
+         func(features, outputFileName)
+
+     This decorator gets the path included in 'outputFileName' if any
+     and check if this path exists; if not the path is created.
+     The file name is the second argument of the call, whether it is given
+     positionally or by keyword.
+
+     :param func: function to decorate
+     :return: a wrapper that creates the directory, calls func and returns its result
+     """
+     import functools
+     import inspect
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         if len(args) > 1:
+             output_file_name = args[1]
+         else:
+             # The file name was given by keyword: find the value bound to
+             # the second parameter of func
+             bound = inspect.signature(func).bind(*args, **kwargs)
+             bound.apply_defaults()
+             output_file_name = list(bound.arguments.values())[1]
+
+         dir_name = os.path.dirname(output_file_name)  # get the path
+         # Create the directory if it doesn't exist; exist_ok avoids failing
+         # when another process creates it at the same time
+         if dir_name != '':
+             os.makedirs(dir_name, exist_ok=True)
+         # Do the job
+         return func(*args, **kwargs)
+     return wrapper
+
+
+ def process_parallel_lists(func):
+     """
+     Decorator that is used to parallelize process.
+     This decorator takes a function with any list of arguments including
+     "num_thread" and parallelize the process by creating "num_thread" number of
+     parallel process or threads.
+
+     The choice of process or threads depends on the value of the global variable
+     "PARALLEL_MODULE" that is defined in ./sidekit/__init__.py
+
+     Parallelization is done as follow:
+         - all arguments have to be given to the decorator with their names
+           any other case might limit the parallelization.
+         - the function that is decorated is called by "num_thread" concurrent
+           process (or threads) with the list of arguments that is given
+           to the decorator except special arguments (see below)
+
+     Special arguments:
+         Special arguments are the one that lead to parallelization.
+         There are 3 types of special arguments which name end with a special
+         suffix:
+
+         - arguments which names are "*_list" or "*_indices" are lists
+           or numpy arrays that will be split (equally or almost) and
+           each sub-list will be passed as an argument for a process/thread
+
+         - arguments which names are "*_acc" are given to each thread.
+           At the end of the function, all
+           accumulators will be summed to return a unique accumulator; thus
+           any object passed as a "*_acc" argument has to implement
+           a "+" operator
+
+         - arguments which names are "*_server" or "*_extractor" are duplicated
+           using a deepcopy of the original argument. This is mostly used to
+           pass servers such as FeaturesServer as arguments
+
+     The wrapper does not return the result of the decorated function.
+
+     :param func: function to decorate
+
+     """
+     def wrapper(*args, **kwargs):
+         """
+         Run func, split over several threads or processes when possible.
+
+         :param args: positional arguments, passed unchanged to every call of func
+         :param kwargs: named arguments, dispatched according to their suffix
+         :return: None
+         """
+
+         if len(args) > 1:
+             print("Warning, some arguments are not named, computation might not be parallelized")
+
+         num_thread = 1
+         if "num_thread" in kwargs.keys():
+             num_thread = kwargs["num_thread"]
+
+         # Create one dictionary of kwargs parameters for each thread
+         if PARALLEL_MODULE in ['threading', 'multiprocessing'] and num_thread > 1:
+
+             # If arguments end with _list or _indices,
+             # set number of Threads to the minimum length of the lists and raise a warning
+             list_length = numpy.inf
+             for k, v in kwargs.items():
+                 # If v is a list or a numpy.array
+                 if k.endswith("_list") or k.endswith("_indices"):
+                     list_length = min(list_length, len(list(v)))
+             num_thread = min(num_thread, list_length)
+
+             # Create a list of dictionaries, one per thread, and initialize
+             # them with the keys
+             parallel_kwargs = []
+             for ii in range(num_thread):
+                 parallel_kwargs.append(dict(zip(kwargs.keys(),
+                                                 [None]*len(kwargs.keys()))))
+
+             for k, v in kwargs.items():
+
+                 # If v is a list or a numpy.array
+                 if k.endswith("_list") or k.endswith("_indices"):
+                     sub_lists = numpy.array_split(v, num_thread)
+                     for ii in range(num_thread):
+                         parallel_kwargs[ii][k] = sub_lists[ii]  # the ii-th sub_list is used for the thread ii
+
+                 # Each worker runs sequentially
+                 elif k == "num_thread":
+                     for ii in range(num_thread):
+                         parallel_kwargs[ii][k] = 1
+
+                 # If v is an accumulator (meaning k ends with "_acc")
+                 # v is given to each thread
+                 # NOTE(review): the same object is stored for every thread, no copy is made here
+                 elif k.endswith("_acc"):
+                     for ii in range(num_thread):
+                         parallel_kwargs[ii][k] = v
+
+                 # Duplicate servers for each thread
+                 elif k.endswith("_server") or k.endswith("_extractor"):
+                     for ii in range(num_thread):
+                         parallel_kwargs[ii][k] = copy.deepcopy(v)
+
+                 # All other parameters are just given to each thread
+                 else:
+                     for ii in range(num_thread):
+                         parallel_kwargs[ii][k] = v
+
+             if PARALLEL_MODULE == 'multiprocessing':
+                 import multiprocessing
+                 jobs = []
+                 multiprocessing.freeze_support()
+                 for idx in range(num_thread):
+                     p = multiprocessing.Process(target=func, args=args, kwargs=parallel_kwargs[idx])
+                     jobs.append(p)
+                     p.start()
+                 # Wait for all the processes to finish
+                 for p in jobs:
+                     p.join()
+
+             elif PARALLEL_MODULE == 'threading':
+                 import threading
+                 jobs = []
+                 for idx in range(num_thread):
+                     p = threading.Thread(target=func, args=args, kwargs=parallel_kwargs[idx])
+                     jobs.append(p)
+                     p.start()
+                 # Wait for all the threads to finish
+                 for p in jobs:
+                     p.join()
+
+             # NOTE(review): unreachable, the enclosing test only accepts
+             # 'threading' and 'multiprocessing'
+             elif PARALLEL_MODULE == 'MPI':
+                 # TODO
+                 print("ParallelProcess using MPI is not implemented yet")
+                 pass
+
+             # Sum accumulators if any
+             # NOTE(review): parallel_kwargs[ii][k] is the object stored above, i.e. kwargs[k]
+             # itself, so this adds the accumulator to itself num_thread times -- confirm intent
+             for k, v in kwargs.items():
+                 if k.endswith("_acc"):
+                     for ii in range(num_thread):
+                         if isinstance(kwargs[k], list):
+                             kwargs[k][0] += parallel_kwargs[ii][k][0]
+                         else:
+                             kwargs[k] += parallel_kwargs[ii][k]
+
+         else:
+             logging.debug("No Parallel processing with this module")
+             func(*args, **kwargs)
+
+     return wrapper
+
+
+ def accepts(*types, **kw):
+     """Function decorator. Checks decorated function's arguments are
+     of the expected types.
+
+     The check compares the class NAME of each argument
+     (``arg.__class__.__name__``) with the matching entry of ``types``, so the
+     expected types have to be given as class names, e.g. ``@accepts('int', 'str')``.
+
+     Sources: https://wiki.python.org/moin/PythonDecoratorLibrary#Type_Enforcement_.28accepts.2Freturns.29
+
+     Parameters:
+     types -- The expected types of the inputs to the decorated function.
+              Must specify type for each parameter.
+     kw    -- Optional specification of 'debug' level (this is the only valid
+              keyword argument, no other should be given).
+              debug = ( 0 | 1 | 2 )
+              0: no check, 1: print a warning (default), 2: raise TypeError
+
+     Raises KeyError if keyword arguments are given without 'debug'.
+     """
+     if not kw:
+         # default level: MEDIUM
+         debug = 1
+     else:
+         try:
+             debug = kw['debug']
+         except KeyError:
+             raise KeyError("{} is not a valid keyword argument".format(", ".join(sorted(kw)))) from None
+
+     def decorator(f):
+         def newf(*args):
+             if debug == 0:
+                 return f(*args)
+             assert len(args) == len(types)
+             argtypes = tuple([a.__class__.__name__ for a in args])
+             if argtypes != types:
+                 print("argtypes = {} and types = {}".format(argtypes, types))
+                 msg = info(f.__name__, types, argtypes, 0)
+                 if debug == 1:
+                     print('TypeWarning: ', msg)
+                 elif debug == 2:
+                     raise TypeError(msg)
+             return f(*args)
+         newf.__name__ = f.__name__
+         return newf
+     return decorator
+
+
+ def info(fname, expected, actual, flag):
+     """Build the error/warning message reported on a type mismatch.
+
+     :param fname: name of the checked function
+     :param expected: iterable of expected types
+     :param actual: iterable of types actually observed
+     :param flag: 0 to describe the arguments ("accepts" / "was given"),
+         1 to describe the result ("returns" / "result is")
+     :return: the formatted message
+     """
+     def _describe(types):
+         # Keep what comes before the first quote of each type's string form
+         return ', '.join([str(t).split("'")[0] for t in types])
+
+     expected_txt = _describe(expected)
+     actual_txt = _describe(actual)
+     pieces = [
+         "'{}' method ".format(fname),
+         ("accepts", "returns")[flag],
+         " ({}), but ".format(expected_txt),
+         ("was given", "result is")[flag],
+         " ({})".format(actual_txt),
+     ]
+     return "".join(pieces)
diff --git a/sidekit/sidekit/statserver.py b/sidekit/sidekit/statserver.py
new file mode 100755
index 0000000..2b57922
--- /dev/null
+++ b/sidekit/sidekit/statserver.py
@@ -0,0 +1,2041 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`statserver` provides methods to manage zero- and first-order statistics.
+"""
+import copy
+import ctypes
+import h5py
+import logging
+import multiprocessing
+import numpy
+import os
+import scipy
+import sys
+import tqdm
+import warnings
+
+from sidekit.bosaris import IdMap
+from sidekit.mixture import Mixture
+from sidekit.features_server import FeaturesServer
+from sidekit.sidekit_wrappers import process_parallel_lists, deprecated, check_path_existance
+import sidekit.frontend
+from sidekit import STAT_TYPE
+
+
# ctypes element type used for the shared buffers: single precision when
# STAT_TYPE is numpy.float32, double precision otherwise.
ct = ctypes.c_float if STAT_TYPE == numpy.float32 else ctypes.c_double
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
def compute_llk(stat, V, sigma, U=None):
    """Compute the Gaussian log-likelihood of the first-order statistics.

    The statistics are centered on their mean and scored against the total
    covariance ``V.V^T + sigma (+ U.U^T)``.

    :param stat: object exposing ``stat1`` (2-D ndarray) and ``get_mean_stat1()``
    :param V: factor loading matrix
    :param sigma: covariance, either a full matrix (``ndim == 2``) or its
        diagonal as a vector
    :param U: optional second factor loading matrix

    :return: the log-likelihood as a scalar
    """
    n, d = stat.stat1.shape
    centered_data = stat.stat1 - stat.get_mean_stat1()

    noise_cov = sigma if sigma.ndim == 2 else numpy.diag(sigma)
    sigma_tot = numpy.dot(V, V.T) + noise_cov
    if U is not None:
        sigma_tot += numpy.dot(U, U.T)

    # Only the eigenvalues are needed for the log-determinant
    log_det = numpy.sum(numpy.log(scipy.linalg.eigvalsh(sigma_tot)))

    # Solve the linear system rather than forming the explicit inverse;
    # sigma_tot is symmetric so (sigma_tot^-1 X^T)^T == X sigma_tot^-1
    whitened = scipy.linalg.solve(sigma_tot, centered_data.T).T
    quadratic_term = numpy.sum(whitened * centered_data)

    return -0.5 * (n * d * numpy.log(2 * numpy.pi) + n * log_det + quadratic_term)
+
+
def sum_log_probabilities(lp):
    """Sum log-probabilities row by row while avoiding extreme values.

    The maximum of each row is subtracted before exponentiating and added
    back after taking the logarithm.

    :param lp: 2-D ndarray of log-probabilities, one row per frame

    :return: a tuple ``(pp, log_lk)`` where ``log_lk`` is the log of the sum
        of each row and ``pp`` is ``exp(lp - log_lk)`` computed row-wise
    """
    row_max = numpy.max(lp, axis=1)
    shifted = lp - row_max[:, numpy.newaxis]
    log_lk = row_max + numpy.log(numpy.exp(shifted).sum(axis=1))

    # Rows whose maximum is not finite keep that maximum as log-likelihood
    not_finite = numpy.logical_not(numpy.isfinite(row_max))
    if not_finite.any():
        log_lk[not_finite] = row_max[not_finite]

    pp = numpy.exp(lp - log_lk[:, numpy.newaxis])
    return pp, log_lk
+
+
@process_parallel_lists
def fa_model_loop(batch_start,
                  mini_batch_indices,
                  r,
                  phi_white,
                  phi,
                  sigma,
                  stat0,
                  stat1,
                  e_h,
                  e_hh,
                  num_thread=1):
    """Fill ``e_h`` and ``e_hh`` for every session of a mini-batch.

    :param batch_start: index to start at in the list
    :param mini_batch_indices: indices of the elements in the list (should start at zero)
    :param r: rank of the matrix
    :param phi_white: whitened version of the factor matrix
    :param phi: non-whitened version of the factor matrix
    :param sigma: covariance matrix (1-D or 2-D)
    :param stat0: matrix of zero order statistics
    :param stat1: matrix of first order statistics
    :param e_h: accumulator, row ``idx`` is overwritten in place
    :param e_hh: accumulator, entry ``idx`` is overwritten in place
    :param num_thread: number of parallel process to run
    """
    if sigma.ndim == 2:
        A = phi.T.dot(scipy.linalg.solve(sigma, phi)).astype(dtype=STAT_TYPE)

    # Scratch buffer receiving the outer product of each e_h row
    outer_buffer = numpy.zeros((phi.shape[1], phi.shape[1]), dtype=STAT_TYPE)

    for idx in mini_batch_indices:
        session = idx + batch_start
        if sigma.ndim == 1:
            to_invert = numpy.eye(r) + (phi_white.T * stat0[session, :]).dot(phi_white)
        else:
            to_invert = stat0[session, 0] * A + numpy.eye(A.shape[0])
        inv_lambda = scipy.linalg.inv(to_invert)

        projected = phi_white.T.dot(stat1[session, :])
        numpy.dot(projected, inv_lambda, out=e_h[idx])
        e_hh[idx] = inv_lambda + numpy.outer(e_h[idx], e_h[idx], outer_buffer)
+
+
@process_parallel_lists
def fa_distribution_loop(distrib_indices, _A, stat0, batch_start, batch_stop, e_hh, num_thread=1):
    """Add to ``_A[c]`` the sum of the ``e_hh`` matrices weighted by ``stat0``.

    :param distrib_indices: indices of the distributions to iterate on
    :param _A: accumulator, updated in place
    :param stat0: matrix of zero order statistics
    :param batch_start: index of the first session to process
    :param batch_stop: end of the session slice (used as ``batch_start:batch_stop``)
    :param e_hh: accumulator
    :param num_thread: number of parallel process to run
    """
    side = e_hh.shape[1]
    weighted_sum = numpy.zeros((side, side), dtype=STAT_TYPE)
    for distrib in distrib_indices:
        # Equivalent to:
        #   tmp = (e_hh.T * stat0[batch_start:batch_stop, distrib]).T
        #   _A[distrib] += numpy.sum(tmp, axis=0)
        _A[distrib] += numpy.einsum('ijk,i->jk',
                                    e_hh,
                                    stat0[batch_start:batch_stop, distrib],
                                    out=weighted_sum)
+
+
def load_existing_statistics_hdf5(statserver, statserver_file_name):
    """Load required statistics into the StatServer by reading from a file
    in hdf5 format.

    Statistics are copied for every segment of ``statserver`` that also
    exists in the file; other segments are left untouched.

    :param statserver: sidekit.StatServer to fill
    :param statserver_file_name: name of the file to read from

    :raises Exception: if the statistics stored in the file do not have the
        same number of columns as the ones already held by ``statserver``
    """
    assert os.path.isfile(statserver_file_name), "statserver_file_name does not exist"

    # Load the StatServer
    ss = StatServer(statserver_file_name)

    if statserver.stat0.shape[0] > 0:
        # Dimension consistency: both servers must have the same number of
        # columns (the row counts legitimately differ)
        if (ss.stat0.shape[1] != statserver.stat0.shape[1]
                or ss.stat1.shape[1] != statserver.stat1.shape[1]):
            raise Exception('Mismatched statistic dimensions')
    else:
        session_nb = statserver.modelset.shape[0]
        statserver.stat0 = numpy.zeros((session_nb, ss.stat0.shape[1]), dtype=STAT_TYPE)
        statserver.stat1 = numpy.zeros((session_nb, ss.stat1.shape[1]), dtype=STAT_TYPE)

    # For each segment of the target, find the first matching row in the file
    seg_idx = []
    stat_idx = []
    for i, seg in enumerate(statserver.segset):
        matches = numpy.flatnonzero(ss.segset == seg)
        if matches.size:
            seg_idx.append(i)
            stat_idx.append(matches[0])

    # Copy statistics
    statserver.stat0[seg_idx, :] = ss.stat0[stat_idx, :]
    statserver.stat1[seg_idx, :] = ss.stat1[stat_idx, :]
+
+
+class StatServer:
+ """A class for statistic storage and processing
+
+ :attr modelset: list of model IDs for each session as an array of strings
+ :attr segset: the list of session IDs as an array of strings
+ :attr start: index of the first frame of the segment
+ :attr stop: index of the last frame of the segment
+ :attr stat0: a ndarray of float64. Each line contains 0-order statistics
+ from the corresponding session
+ :attr stat1: a ndarray of float64. Each line contains 1-order statistics
+ from the corresponding session
+
+ """
+
+ #def __init__(self, statserver_file_name=None, ubm=None, index=None):$
def __init__(self, statserver_file_name=None, distrib_nb=0, feature_size=0, index=None, ubm=None):
    """Initialize an empty StatServer or load a StatServer from an existing
    file.

    :param statserver_file_name: source of the StatServer.
        If None, the StatServer is initialized empty.
        If it is an IdMap object, the StatServer is initialized to match
        the structure of the IdMap and its statistics are allocated.
        If it is a string, the StatServer is read from that HDF5 file.
    :param distrib_nb: number of distributions, used to size the statistics
        when initializing from an IdMap
    :param feature_size: dimension of the features, used to size the
        first-order statistics when initializing from an IdMap
    :param index: optional index forwarded to ``StatServer.read_subset``
        when reading from a file
    :param ubm: optional object whose ``w`` and ``mu`` attributes override
        ``distrib_nb`` and ``feature_size``
    """
    self.modelset = numpy.empty(0, dtype="|O")
    self.segset = numpy.empty(0, dtype="|O")
    self.start = numpy.empty(0, dtype="|O")
    self.stop = numpy.empty(0, dtype="|O")
    self.stat0 = numpy.array([], dtype=STAT_TYPE)
    self.stat1 = numpy.array([], dtype=STAT_TYPE)

    if ubm is not None:
        distrib_nb = ubm.w.shape[0]
        feature_size = ubm.mu.shape[1]

    if isinstance(statserver_file_name, IdMap):
        # Mirror the IdMap and allocate the statistics in multiprocessing arrays
        self.modelset = statserver_file_name.leftids
        self.segset = statserver_file_name.rightids
        self.start = statserver_file_name.start
        self.stop = statserver_file_name.stop

        session_nb = self.segset.shape[0]
        stat0_shape = (session_nb, distrib_nb)
        stat1_shape = (session_nb, distrib_nb * feature_size)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', RuntimeWarning)
            tmp_stat0 = multiprocessing.Array(ct, stat0_shape[0] * stat0_shape[1])
            self.stat0 = numpy.ctypeslib.as_array(tmp_stat0.get_obj()).reshape(stat0_shape)

            tmp_stat1 = multiprocessing.Array(ct, stat1_shape[0] * stat1_shape[1])
            self.stat1 = numpy.ctypeslib.as_array(tmp_stat1.get_obj()).reshape(stat1_shape)

    elif isinstance(statserver_file_name, str):
        # Read the whole file, or only the requested subset
        if index is None:
            tmp = StatServer.read(statserver_file_name)
        else:
            tmp = StatServer.read_subset(statserver_file_name, index)

        self.modelset = tmp.modelset
        self.segset = tmp.segset
        self.start = tmp.start
        self.stop = tmp.stop
        # astype returns a fresh copy; the shapes come from the file itself,
        # so distrib_nb and feature_size are not needed here
        self.stat0 = tmp.stat0.astype(STAT_TYPE)
        self.stat1 = tmp.stat1.astype(STAT_TYPE)
+
+
def __repr__(self):
    """Return a multi-line description of the StatServer attributes,
    framed by two lines of 30 dashes."""
    rule = '-' * 30
    labelled = (
        ('modelset: ', self.modelset),
        ('segset: ', self.segset),
        ('seg start:', self.start),
        ('seg stop:', self.stop),
        ('stat0:', self.stat0),
        ('stat1:', self.stat1),
    )
    rows = [rule]
    rows.extend(label + value.__repr__() for label, value in labelled)
    rows.append(rule)
    return '\n'.join(rows) + '\n'
+
def validate(self, warn=False):
    """Validate the structure and content of the StatServer.
    Check consistency between the different attributes of
    the StatServer:
    - dimension of the modelset
    - dimension of the segset
    - length of the modelset and segset
    - consistency of stat0 and stat1

    A StatServer whose statistics are not 2-D, or whose stat0 has no
    column, is reported as invalid instead of raising an error.

    :param warn: boolean optional, if True, display possible warning

    :return: True if the StatServer is consistent, False otherwise
    """
    ids_ok = self.modelset.ndim == 1 \
        and (self.modelset.shape == self.segset.shape == self.start.shape == self.stop.shape)

    # The ndim and width guards protect the shape[1] accesses and the modulo
    stats_ok = self.stat0.ndim == 2 and self.stat1.ndim == 2 \
        and (self.stat0.shape[0] == self.stat1.shape[0] == self.modelset.shape[0]) \
        and self.stat0.shape[1] > 0 \
        and self.stat1.shape[1] % self.stat0.shape[1] == 0

    if warn and (self.segset.shape != numpy.unique(self.segset).shape):
        logging.warning('Duplicated segments in StatServer')
    return ids_ok and stats_ok
+
def merge(*arg):
    """
    Merge a variable number of StatServers into one.
    If a (model, segment, start, stop) entry is duplicated, keep only one
    of them (the last one encountered) and log a warning.

    Rows of the result follow the order in which each distinct entry is
    first encountered in the arguments.

    :param arg: the StatServers to merge, at least one is required

    :return: a new StatServer holding the merged content
    """
    assert arg, "At least one StatServer is required"

    # Check consistency of StatServers (dimension of the stat0 and stat1)
    dim_stat0 = dim_stat1 = None
    line_number = 0
    for ss in arg:
        assert (isinstance(ss, sidekit.StatServer) and ss.validate()), "Arguments must be proper StatServers"
        if dim_stat0 is None:
            dim_stat0 = ss.stat0.shape[1]
            dim_stat1 = ss.stat1.shape[1]
        else:
            assert (dim_stat0 == ss.stat0.shape[1] and
                    dim_stat1 == ss.stat1.shape[1]), "Stat dimensions are not consistent"
        line_number += ss.modelset.shape[0]

    def entry_key(m, s, start, stop):
        # A tuple cannot collide the way dash-joined strings can
        # ("a-b" + "c" versus "a" + "b-c")
        return str(m), str(s), str(start), str(stop)

    # Map each distinct entry to its row in the merged StatServer
    row_of = {}
    for ss in arg:
        for fields in zip(ss.modelset, ss.segset, ss.start, ss.stop):
            row_of.setdefault(entry_key(*fields), len(row_of))
    if line_number != len(row_of):
        logging.warning("duplicated segmentID in input StatServers")

    # Initialize the new StatServer with unique set of entries
    row_nb = len(row_of)
    new_stat_server = sidekit.StatServer()
    new_stat_server.modelset = numpy.empty(row_nb, dtype='object')
    new_stat_server.segset = numpy.empty(row_nb, dtype='object')
    new_stat_server.start = numpy.empty(row_nb, 'object')
    new_stat_server.stop = numpy.empty(row_nb, dtype='object')
    new_stat_server.stat0 = numpy.zeros((row_nb, dim_stat0), dtype=STAT_TYPE)
    new_stat_server.stat1 = numpy.zeros((row_nb, dim_stat1), dtype=STAT_TYPE)

    for ss in arg:
        for idx, fields in enumerate(zip(ss.modelset, ss.segset, ss.start, ss.stop)):
            row = row_of[entry_key(*fields)]
            new_stat_server.modelset[row] = ss.modelset[idx]
            new_stat_server.segset[row] = ss.segset[idx]
            new_stat_server.start[row] = ss.start[idx]
            new_stat_server.stop[row] = ss.stop[idx]
            new_stat_server.stat0[row, :] = ss.stat0[idx, :].astype(STAT_TYPE)
            new_stat_server.stat1[row, :] = ss.stat1[idx, :].astype(STAT_TYPE)

    assert (new_stat_server.validate()), "Problem in StatServer Merging"
    return new_stat_server
+
@staticmethod
def read(statserver_file_name, prefix=''):
    """Read StatServer in hdf5 format

    :param statserver_file_name: name of the file to read from
    :param prefix: prefix of the datasets to read from in HDF5 file

    :return: the StatServer read from the file
    """
    with h5py.File(statserver_file_name, "r") as f:
        statserver = StatServer()
        statserver.modelset = f.get(prefix + "modelset")[()]
        statserver.segset = f.get(prefix + "segset")[()]

        # if running python 3, need a conversion to unicode
        if sys.version_info[0] == 3:
            statserver.modelset = statserver.modelset.astype('U', copy=False)
            statserver.segset = statserver.segset.astype('U', copy=False)

        # -1 marks an undefined boundary on disk; those entries are left as
        # created by numpy.empty with an object dtype
        raw_start = f.get(prefix + "start")[()]
        raw_stop = f.get(prefix + "stop")[()]
        start_known = raw_start != -1
        stop_known = raw_stop != -1
        statserver.start = numpy.empty(f[prefix + "start"].shape, '|O')
        statserver.stop = numpy.empty(f[prefix + "stop"].shape, '|O')
        statserver.start[start_known] = raw_start[start_known]
        statserver.stop[stop_known] = raw_stop[stop_known]

        statserver.stat0 = f.get(prefix + "stat0")[()].astype(dtype=STAT_TYPE)
        statserver.stat1 = f.get(prefix + "stat1")[()].astype(dtype=STAT_TYPE)

        assert statserver.validate(), "Error: wrong StatServer format"
        return statserver
+
@check_path_existance
def write(self, output_file_name, prefix='', mode='w'):
    """Write the StatServer to disk in hdf5 format.

    When the file is not opened in "w" mode and already holds the datasets
    for ``prefix``, they are extended and the content of this StatServer is
    appended; otherwise the datasets are created.

    :param output_file_name: name of the file to write in.
    :param prefix: prefix prepended to the name of every dataset
    :param mode: mode used to open the HDF5 file
    """
    assert self.validate(), "Error: wrong StatServer format"

    # Undefined boundaries (converted to NaN by the float cast) are stored as -1
    start = copy.deepcopy(self.start)
    start[numpy.isnan(self.start.astype('float'))] = -1
    start = start.astype('int32', copy=False)

    stop = copy.deepcopy(self.stop)
    stop[numpy.isnan(self.stop.astype('float'))] = -1
    stop = stop.astype('int32', copy=False)

    datasets = (
        ("modelset", self.modelset.astype('S')),
        ("segset", self.segset.astype('S')),
        ("stat0", self.stat0.astype(numpy.float32)),
        ("stat1", self.stat1.astype(numpy.float32)),
        ("start", start),
        ("stop", stop),
    )

    with h5py.File(output_file_name, mode) as f:

        # Test for one of the datasets themselves: the bare prefix is not a
        # dataset name and is empty by default
        ds_already_exist = (prefix + "modelset") in f

        # If the datasets don't exist before, create them
        if mode == "w" or not ds_already_exist:
            for name, data in datasets:
                f.create_dataset(prefix + name, data=data,
                                 maxshape=(None,) + data.shape[1:],
                                 compression="gzip",
                                 fletcher32=True)

        # If they already exist, extend all datasets and add the new data
        else:
            previous_size = f[prefix + "modelset"].shape[0]
            for name, data in datasets:
                dataset = f[prefix + name]
                dataset.resize((previous_size + data.shape[0],) + data.shape[1:])
                # WARNING: no check is done on the new data, beware of duplicated entries
                dataset[previous_size:] = data
+
def to_hdf5(self, h5f, prefix='', mode='w'):
    """Write the StatServer into an already opened HDF5 file handle.

    :param h5f: opened HDF5 file (or group) to write in
    :param prefix: prefix prepended to the name of every dataset
    :param mode: "w" to create the datasets, any other value to extend the
        existing datasets and append the content of this StatServer
    """
    assert self.validate(), "Error: wrong StatServer format"

    # Undefined boundaries (converted to NaN by the float cast) are stored as -1
    start = copy.deepcopy(self.start)
    start[numpy.isnan(self.start.astype('float'))] = -1
    start = start.astype('int32', copy=False)

    stop = copy.deepcopy(self.stop)
    stop[numpy.isnan(self.stop.astype('float'))] = -1
    stop = stop.astype('int32', copy=False)

    datasets = (
        ("modelset", self.modelset.astype('S')),
        ("segset", self.segset.astype('S')),
        ("stat0", self.stat0.astype(numpy.float32)),
        ("stat1", self.stat1.astype(numpy.float32)),
        ("start", start),
        ("stop", stop),
    )

    # Create the datasets
    if mode == "w":
        for name, data in datasets:
            h5f.create_dataset(prefix + name, data=data,
                               maxshape=(None,) + data.shape[1:],
                               compression="gzip",
                               fletcher32=True)

    # Otherwise extend all datasets and add the new data
    else:
        previous_size = h5f[prefix + "modelset"].shape[0]
        for name, data in datasets:
            dataset = h5f[prefix + name]
            dataset.resize((previous_size + data.shape[0],) + data.shape[1:])
            # WARNING: no check is done on the new data, beware of duplicated entries
            dataset[previous_size:] = data
+
def get_model_stat0(self, mod_id):
    """Return zero-order statistics of a given model

    :param mod_id: ID of the model which stat0 will be returned

    :return: a matrix of zero-order statistics as a ndarray
    """
    model_rows = self.modelset == mod_id
    return self.stat0[model_rows, :]
+
def get_model_stat1(self, mod_id):
    """Return first-order statistics of a given model

    :param mod_id: string, ID of the model which stat1 will be returned

    :return: a matrix of first-order statistics as a ndarray
    """
    model_rows = self.modelset == mod_id
    return self.stat1[model_rows, :]
+
def get_model_stat0_by_index(self, mod_idx):
    """Return zero-order statistics of model number mod_idx

    :param mod_idx: integer, index of the model in the sorted list of
        unique model IDs

    :return: a matrix of zero-order statistics as a ndarray
    """
    unique_models = numpy.unique(self.modelset)
    model_rows = self.modelset == unique_models[mod_idx]
    return self.stat0[model_rows, :]
+
def get_model_stat1_by_index(self, mod_idx):
    """Return first-order statistics of model number mod_idx

    :param mod_idx: integer, index of the model in the sorted list of
        unique model IDs

    :return: a matrix of first-order statistics as a ndarray
    """
    wanted_model = numpy.unique(self.modelset)[mod_idx]
    return self.stat1[self.modelset == wanted_model, :]
+
def get_segment_stat0(self, seg_id):
    """Return zero-order statistics of the segment which ID is seg_id

    :param seg_id: string, ID of the segment which stat0 will be
        returned

    :return: a matrix of zero-order statistics as a ndarray
    """
    segment_rows = self.segset == seg_id
    return self.stat0[segment_rows, :]
+
def get_segment_stat1(self, seg_id):
    """Return first-order statistics of the segment which ID is seg_id

    :param seg_id: string, ID of the segment which stat1 will be
        returned

    :return: a matrix of first-order statistics as a ndarray
    """
    segment_rows = self.segset == seg_id
    return self.stat1[segment_rows, :]
+
def get_segment_stat0_by_index(self, seg_idx):
    """Return zero-order statistics of segment number seg_idx

    :param seg_idx: integer, row index of the segment which stat0 will be
        returned

    :return: the selected row(s) of zero-order statistics as a ndarray
    """
    zero_order = self.stat0
    return zero_order[seg_idx, :]
+
def get_segment_stat1_by_index(self, seg_idx):
    """Return first-order statistics of segment number seg_idx

    :param seg_idx: integer, row index of the segment which stat1 will be
        returned

    :return: the selected row(s) of first-order statistics as a ndarray
    """
    first_order = self.stat1
    return first_order[seg_idx, :]
+
def get_model_segments(self, mod_id):
    """Return the segments belonging to model mod_id

    :param mod_id: string, ID of the model which belonging segments will be
        returned

    :return: an array of the segments belonging to the model
    """
    model_rows = self.modelset == mod_id
    return self.segset[model_rows]
+
def get_model_segments_by_index(self, mod_idx):
    """Return the segments belonging to model number mod_idx

    :param mod_idx: index of the model, in the sorted list of unique model
        IDs, which list of segments will be returned

    :return: an array of the segments belonging to the model
    """
    select_seg = (self.modelset == numpy.unique(self.modelset)[mod_idx])
    # segset is one-dimensional: index it with the mask alone
    return self.segset[select_seg]
+
def align_segments(self, segment_list):
    """Align segments of the current StatServer to match a list of segment
    provided as input parameter. The size of the StatServer might be
    reduced to match the input list of segments.

    For each requested segment the first matching row is kept.

    :param segment_list: ndarray of strings, list of segments to match

    :raises IndexError: if a requested segment is not in the StatServer
    """
    # dtype=int keeps the index array usable when segment_list is empty
    indx = numpy.array([numpy.argwhere(self.segset == v)[0][0] for v in segment_list], dtype=int)
    self.segset = self.segset[indx]
    self.modelset = self.modelset[indx]
    self.start = self.start[indx]
    self.stop = self.stop[indx]
    self.stat0 = self.stat0[indx, :]
    self.stat1 = self.stat1[indx, :]
+
def align_models(self, model_list):
    """Align models of the current StatServer to match a list of models
    provided as input parameter. The size of the StatServer might be
    reduced to match the input list of models.

    For each requested model the first matching row is kept.

    :param model_list: ndarray of strings, list of models to match

    :raises IndexError: if a requested model is not in the StatServer
    """
    # dtype=int keeps the index array usable when model_list is empty
    indx = numpy.array([numpy.argwhere(self.modelset == v)[0][0] for v in model_list], dtype=int)
    self.segset = self.segset[indx]
    self.modelset = self.modelset[indx]
    self.start = self.start[indx]
    self.stop = self.stop[indx]
    self.stat0 = self.stat0[indx, :]
    self.stat1 = self.stat1[indx, :]
+
@process_parallel_lists
def accumulate_stat(self, ubm, feature_server, seg_indices=None, channel_extension=("", "_b"), num_thread=1):
    """Compute statistics for a list of sessions which indices
    are given in seg_indices.

    :param ubm: a Mixture object used to compute the statistics
    :param feature_server: FeaturesServer object
    :param seg_indices: list of indices of segments to process;
        if None, or if the statistics do not have one row per segment,
        the statistics are reset and all segments are processed.
    :param channel_extension: tuple of strings, extension of first and second channel for stereo files, default
        is ("", "_b")
    :param num_thread: number of parallel process to run

    :raises Exception: if the dimension of the features differs from the
        dimension of the ubm
    """
    assert isinstance(ubm, Mixture), 'First parameter has to be a Mixture'
    assert isinstance(feature_server, FeaturesServer), 'Second parameter has to be a FeaturesServer'

    session_nb = self.segset.shape[0]
    must_reset = (seg_indices is None) \
        or (self.stat0.shape[0] != session_nb) \
        or (self.stat1.shape[0] != session_nb)
    if must_reset:
        self.stat0 = numpy.zeros((session_nb, ubm.distrib_nb()), dtype=STAT_TYPE)
        self.stat1 = numpy.zeros((session_nb, ubm.sv_size()), dtype=STAT_TYPE)
        seg_indices = range(session_nb)
    feature_server.keep_all_features = True

    for idx in seg_indices:
        show = self.segset[idx]
        logging.debug('Compute statistics for {}'.format(show))

        # If using a FeaturesExtractor, get the channel number by checking the extension of the show
        is_second_channel = feature_server.features_extractor is not None \
            and show.endswith(channel_extension[1])
        channel = 1 if is_second_channel else 0
        show = show[:show.rfind(channel_extension[channel])]

        cep, vad = feature_server.load(show, channel=channel)
        if self.stop[idx] is None:
            stop = vad.shape[0]
        else:
            stop = min(self.stop[idx], vad.shape[0])
        logging.info('{} start: {} stop: {}'.format(show, self.start[idx], stop))

        # Keep the requested frame range, then only the frames selected by vad
        data = cep[self.start[idx]:stop, :]
        data = data[vad[self.start[idx]:stop], :]

        # Verify that frame dimension is equal to gmm dimension
        if not ubm.dim() == data.shape[1]:
            raise Exception('dimension of ubm and features differ: {:d} / {:d}'.format(ubm.dim(), data.shape[1]))

        if ubm.invcov.ndim == 2:
            lp = ubm.compute_log_posterior_probabilities(data)
        else:
            lp = ubm.compute_log_posterior_probabilities_full(data)
        pp, _ = sum_log_probabilities(lp)

        # Compute 0th-order statistics
        self.stat0[idx, :] = pp.sum(0)
        # Compute 1st-order statistics
        first_order = numpy.transpose(numpy.dot(data.transpose(), pp))
        self.stat1[idx, :] = numpy.reshape(first_order, ubm.sv_size()).astype(STAT_TYPE)
+
def accumulate_stat_per_speaker(self, ubm, input_feature_filename, idmap, feature_server, channel_extension=("", "_b")):
    """Compute statistics for every speaker of one show.

    The features of the show are split per speaker by
    ``feature_server.get_features_per_speaker``; the statistics of the
    i-th speaker returned are stored in row i of stat0 and stat1.

    :param ubm: a Mixture object used to compute the statistics
    :param input_feature_filename: name of the show to process
    :param idmap: passed to ``feature_server.get_features_per_speaker``
    :param feature_server: FeaturesServer object
    :param channel_extension: tuple of strings, extension of first and second channel for stereo files, default
        is ("", "_b")

    :raises Exception: if the dimension of the features differs from the
        dimension of the ubm
    """
    assert isinstance(ubm, Mixture), 'First parameter has to be a Mixture'
    assert isinstance(feature_server, FeaturesServer), 'Second parameter has to be a FeaturesServer'

    show = input_feature_filename

    # If using a FeaturesExtractor, get the channel number by checking the extension of the show
    channel = 0
    if feature_server.features_extractor is not None and show.endswith(channel_extension[1]):
        channel = 1

    show = show[:show.rfind(channel_extension[channel])]

    features_per_speaker = feature_server.get_features_per_speaker(show, idmap, channel)

    for idx, (spk_id, data) in enumerate(features_per_speaker.items()):
        cep, vad = data
        cep = cep[vad]
        # Log every speaker instead of printing a hard-coded 'S0' entry,
        # which fails when no speaker carries that name
        logging.debug('%s: %d frames selected', spk_id, cep.shape[0])

        # Verify that frame dimension is equal to gmm dimension
        if not ubm.dim() == cep.shape[1]:
            raise Exception('dimension of ubm and features differ: {:d} / {:d}'.format(ubm.dim(), cep.shape[1]))

        if ubm.invcov.ndim == 2:
            lp = ubm.compute_log_posterior_probabilities(cep)
        else:
            lp = ubm.compute_log_posterior_probabilities_full(cep)
        pp, _ = sum_log_probabilities(lp)
        # Compute 0th-order statistics
        self.stat0[idx, :] = pp.sum(0)
        # Compute 1st-order statistics
        self.stat1[idx, :] = numpy.reshape(numpy.transpose(numpy.dot(cep.transpose(), pp)),
                                           ubm.sv_size()).astype(STAT_TYPE)
+
def get_mean_stat1(self):
    """Return the mean of first order statistics

    :return: the mean of the first order statistics, computed over the
        rows (axis 0), as a ndarray.
    """
    return numpy.mean(self.stat1, axis=0)
+
def norm_stat1(self):
    """Divide each row of the first-order statistics by its euclidean norm.

    Norms are clipped to a minimum of 1e-08 before the division.
    """
    row_norms = numpy.linalg.norm(self.stat1, axis=1)
    row_norms = numpy.clip(row_norms, 1e-08, numpy.inf)
    self.stat1 = self.stat1 / row_norms[:, numpy.newaxis]
+
def rotate_stat1(self, R):
    """Rotate first-order statistics by a right-product.

    :param R: ndarray, matrix to use for right product on the first order
        statistics.
    """
    rotated = self.stat1.dot(R)
    self.stat1 = rotated
+
def center_stat1(self, mu):
    """Center first order statistics.

    Each first-order statistic is reduced by ``mu`` weighted by the
    zero-order statistic of the distribution it belongs to.

    :param mu: array to center on.
    """
    # Integer division: numpy.repeat requires an integer repeat count
    dim = self.stat1.shape[1] // self.stat0.shape[1]
    # index_map[j] is the stat0 column matching column j of stat1
    index_map = numpy.repeat(numpy.arange(self.stat0.shape[1]), dim)
    self.stat1 = self.stat1 - (self.stat0[:, index_map] * mu.astype(STAT_TYPE))
+
def subtract_weighted_stat1(self, sts):
    """Subtract the stat1 from the sts StatServer to the stat1 of
    the current StatServer after multiplying by the zero-order statistics
    from the current statserver

    The two StatServers must hold the same models and segments (in any
    order) and statistics of identical shapes.

    :param sts: a StatServer

    :raises Exception: if the two StatServers are not compatible

    :return: a new StatServer
    """
    # check the compatibility of the two statservers
    # exact match of the sessions and dimensions of the stat0 and stat1
    compatible = numpy.array_equal(numpy.sort(sts.modelset), numpy.sort(self.modelset)) \
        and numpy.array_equal(numpy.sort(sts.segset), numpy.sort(self.segset)) \
        and (sts.stat0.shape == self.stat0.shape) \
        and (sts.stat1.shape == self.stat1.shape)
    if not compatible:
        raise Exception('Statserver are not compatible')

    new_sts = copy.deepcopy(self)

    # align sts according to self.segset
    idx = self.segset.argsort()
    idx_sts = sts.segset.argsort()
    new_sts.stat1[idx, :] = sts.stat1[idx_sts, :]

    # Subtract the stat1; integer division because numpy.repeat requires
    # an integer repeat count
    dim = self.stat1.shape[1] // self.stat0.shape[1]
    index_map = numpy.repeat(numpy.arange(self.stat0.shape[1]), dim)
    new_sts.stat1 = self.stat1 - (self.stat0[:, index_map] * new_sts.stat1)

    return new_sts
+
def whiten_stat1(self, mu, sigma, isSqrInvSigma=False):
    """Whiten first-order statistics
    If sigma.ndim == 1, case of a diagonal covariance
    If sigma.ndim == 2, case of a single Gaussian with full covariance
    If sigma.ndim == 3, case of a full covariance UBM

    :param mu: array, mean vector to be subtracted from the statistics
    :param sigma: ndarray, co-variance matrix or covariance super-vector
    :param isSqrInvSigma: boolean, True if the input Sigma matrix is the inverse of the square root of a covariance
        matrix

    :raises Exception: if sigma.ndim is not 1, 2 or 3
    """
    if sigma.ndim == 1:
        self.center_stat1(mu)
        self.stat1 = self.stat1 / numpy.sqrt(sigma.astype(STAT_TYPE))

    elif sigma.ndim == 2:
        # Compute the inverse square root of the co-variance matrix Sigma
        sqr_inv_sigma = sigma

        if not isSqrInvSigma:
            eigen_values, eigen_vectors = scipy.linalg.eigh(sigma)
            # Sort the eigen pairs by decreasing eigen value
            ind = eigen_values.real.argsort()[::-1]
            eigen_values = eigen_values.real[ind]
            eigen_vectors = eigen_vectors.real[:, ind]

            sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real)
            sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma))

        # Whitening of the first-order statistics
        self.center_stat1(mu)
        self.rotate_stat1(sqr_inv_sigma)

    elif sigma.ndim == 3:
        # we assume that sigma is a 3D ndarray of size D x n x n
        # where D is the number of distributions and n is the dimension of a single distribution
        n = self.stat1.shape[1] // self.stat0.shape[1]
        sess_nb = self.stat0.shape[0]
        self.center_stat1(mu)
        self.stat1 = numpy.einsum("ikj,ikl->ilj",
                                  self.stat1.T.reshape(-1, n, sess_nb), sigma).reshape(-1, sess_nb).T

    else:
        raise Exception('Wrong dimension of Sigma, must be 1, 2 or 3')
+
def whiten_cholesky_stat1(self, mu, sigma):
    """Center the first-order statistics and whiten them in place using a
    Cholesky factor of the inverse of Sigma.

    A 1-D ``sigma`` is treated as a diagonal covariance: the centered
    statistics are divided by ``sqrt(sigma)``. A 2-D ``sigma`` is inverted
    and the transposed Cholesky factor of that inverse is applied on the
    right of the centered statistics.

    :param mu: array, mean vector to be subtracted from the statistics
    :param sigma: ndarray, co-variance matrix or covariance super-vector
    """
    dim_nb = sigma.ndim
    if dim_nb not in (1, 2):
        raise Exception('Wrong dimension of Sigma, must be 1 or 2')

    if dim_nb == 1:
        self.center_stat1(mu)
        self.stat1 = self.stat1 / numpy.sqrt(sigma)
        return

    # Full covariance: factorize the precision matrix
    precision = scipy.linalg.inv(sigma)
    whitening = scipy.linalg.cholesky(precision).T

    # Whitening of the first-order statistics
    self.center_stat1(mu)
    self.stat1 = self.stat1.dot(whitening)
+
def get_total_covariance_stat1(self):
    """Return the total covariance matrix of the first-order statistics.

    The statistics are centered on their own mean (computed over the rows)
    and the scatter matrix is divided by the number of rows.

    :return: the total co-variance matrix of the first-order statistics
        as a ndarray.
    """
    session_nb = self.stat1.shape[0]
    centered = self.stat1 - self.stat1.mean(axis=0)
    scatter = numpy.dot(centered.transpose(), centered)
    return scatter / session_nb
+
def get_total_covariance_stat1_mix(self, mean):
    """Return the total covariance matrix of the first-order statistics
    computed around a mean supplied by the caller.

    :param mean: value subtracted from the first-order statistics before
        the scatter matrix is computed

    :return: the total co-variance matrix of the first-order statistics
        as a ndarray.
    """
    session_nb = self.stat1.shape[0]
    shifted = self.stat1 - mean
    scatter = numpy.dot(shifted.transpose(), shifted)
    return scatter / session_nb
+
def get_within_covariance_stat1(self):
    """Compute and return the within-class covariance matrix of the
    first-order statistics.

    Sessions are grouped by the labels found in ``self.modelset``. Each
    group is centered on its own mean, the scatter matrices of all groups
    are accumulated and the sum is divided by the total number of sessions.

    :return: the within-class co-variance matrix of the first-order statistics
        as a ndarray.
    """
    vect_size = self.stat1.shape[1]
    unique_speaker = numpy.unique(self.modelset)
    W = numpy.zeros((vect_size, vect_size))

    for speakerID in tqdm.tqdm(unique_speaker):
        # Fetch the sessions of this speaker once and reuse them
        spk_sessions = self.get_model_stat1(speakerID)
        spk_ctr_vec = spk_sessions - numpy.mean(spk_sessions, axis=0)
        W += numpy.dot(spk_ctr_vec.transpose(), spk_ctr_vec)
    W /= self.stat1.shape[0]
    return W
+
def get_between_covariance_stat1(self):
    """Return the between-class covariance matrix of the first-order
    statistics.

    For every label of ``self.modelset`` the class mean is compared with
    the overall mean; the outer product of that difference is weighted by
    the number of sessions of the class. The accumulated matrix is divided
    by the total number of sessions.

    :return: the between-class co-variance matrix of the first-order
        statistics as a ndarray.
    """
    dim = self.stat1.shape[1]
    speakers = numpy.unique(self.modelset)
    between = numpy.zeros((dim, dim))

    # Overall mean of the first-order statistics
    global_mean = self.get_mean_stat1()

    # Accumulate the weighted contribution of each class
    for spk in speakers:
        sessions = self.get_model_stat1(spk)
        offset = numpy.mean(sessions, axis=0) - global_mean
        between += (sessions.shape[0] * numpy.outer(offset, offset))

    between /= self.stat1.shape[0]
    return between
+
def get_lda_matrix_stat1(self, rank):
    """Compute and return the Linear Discriminant Analysis matrix
    on the first-order statistics. Columns of the LDA matrix are ordered
    according to the corresponding eigenvalues in descending order.

    The within-class scatter is the sum, over the labels of
    ``self.modelset``, of each class covariance (normalized by the number
    of sessions of the class). The between-class scatter is built from the
    class means centered on the overall mean.

    :param rank: integer, rank of the LDA matrix to return

    :return: the LDA matrix of rank "rank" as a ndarray
    """
    vect_size = self.stat1.shape[1]
    unique_speaker = numpy.unique(self.modelset)

    mu = self.get_mean_stat1()

    class_means = numpy.zeros((unique_speaker.shape[0], vect_size))
    Sw = numpy.zeros((vect_size, vect_size))

    for spk_idx, speaker_id in enumerate(unique_speaker):
        # Fetch the sessions once; they feed both the scatter and the mean
        spk_sessions = self.get_model_stat1(speaker_id)
        spk_mean = numpy.mean(spk_sessions, axis=0)
        spk_centered = spk_sessions - spk_mean
        Sw += numpy.dot(spk_centered.transpose(), spk_centered) / spk_sessions.shape[0]
        class_means[spk_idx, :] = spk_mean

    # Compute Between-class scatter matrix
    class_means = class_means - mu
    Sb = numpy.dot(class_means.transpose(), class_means)

    # Compute the Eigenvectors & eigenvalues of the discrimination matrix.
    # This matrix is not symmetric in general, so the general solver is
    # required: eigh would only read one triangle of it.
    DiscriminationMatrix = numpy.dot(Sb, scipy.linalg.inv(Sw)).transpose()
    eigen_values, eigen_vectors = scipy.linalg.eig(DiscriminationMatrix)
    eigen_values = eigen_values.real
    eigen_vectors = eigen_vectors.real

    # Rearrange the eigenvectors according to decreasing eigenvalues
    # get indexes of the rank top eigen values
    idx = eigen_values.argsort()[-rank:][::-1]
    L = eigen_vectors[:, idx]
    return L
+
def get_mahalanobis_matrix_stat1(self):
    """Return the Mahalanobis matrix of the first-order statistics, i.e.
    the inverse of the within-class covariance matrix.

    :return: the mahalanobis matrix computed on the first-order
        statistics as a ndarray
    """
    within_cov = self.get_within_covariance_stat1()
    return scipy.linalg.inv(within_cov)
+
def get_wccn_choleski_stat1(self):
    """Compute and return the lower Cholesky decomposition matrix of the
    Within Class Co-variance Normalization matrix on the first-order
    statistics.

    The covariance of each class (label of ``self.modelset``) is
    normalized by its own number of sessions, the class covariances are
    averaged over the classes, and the inverse of that average is factorized.

    :return: the lower Choleski decomposition of the WCCN matrix
        as a ndarray
    """
    vect_size = self.stat1.shape[1]
    unique_speaker = numpy.unique(self.modelset)
    WCCN = numpy.zeros((vect_size, vect_size))

    for speaker_id in unique_speaker:
        # Fetch the sessions of this speaker once and reuse them
        spk_sessions = self.get_model_stat1(speaker_id)
        spk_ctr_vec = spk_sessions - numpy.mean(spk_sessions, axis=0)
        WCCN += numpy.dot(spk_ctr_vec.transpose(), spk_ctr_vec) / spk_ctr_vec.shape[0]

    WCCN = WCCN / unique_speaker.shape[0]

    # Choleski decomposition of the inverse of the WCCN matrix;
    # scipy returns the upper factor, hence the transpose
    invW = scipy.linalg.inv(WCCN)
    W = scipy.linalg.cholesky(invW).T
    return W
+
def get_nap_matrix_stat1(self, co_rank):
    """Compute and return the Nuisance Attribute Projection matrix
    from the first-order statistics.

    The eigen decomposition is done on the session-by-session Gram matrix
    of the statistics (scaled by the vector size); the leading eigenvectors
    are then mapped back to the feature space and rescaled.

    :param co_rank: co-rank of the Nuisance Attribute Projection matrix

    :return: the NAP matrix of rank "coRank"
    """
    dim = self.stat1.shape[1]
    gram = numpy.dot(self.stat1, self.stat1.transpose()) / dim
    eig_vals, eig_vecs = scipy.linalg.eigh(gram)

    # Indexes of the co_rank largest eigenvalues, in decreasing order
    top = eig_vals.real.argsort()[-co_rank:][::-1]

    projected = numpy.dot(self.stat1.transpose(), eig_vecs[:, top])
    scaling = numpy.diag(1 / numpy.sqrt(dim * eig_vals.real[top]))
    return numpy.dot(projected, scaling)
+
def adapt_mean_map(self, ubm, r=16, norm=False):
    """Maximum A Posteriori adaptation of the mean super-vector of ubm,
    one adapted model per segment.

    :param ubm: a Mixture object to adapt
    :param r: float, the relevance factor for MAP adaptation
    :param norm: boolean, normalize by using the UBM co-variance
        (only applied when ``ubm.invcov`` is 2-dimensional).
        Default is False

    :return: a StatServer with 1 as stat0 and the MAP adapted super-vectors
        as stat1
    """
    # Map every super-vector coefficient to the index of its distribution
    index_map = numpy.repeat(numpy.arange(ubm.distrib_nb()), ubm.dim())

    # Adaptation coefficients; a small epsilon is added to the occupations
    eps = numpy.finfo(numpy.float32).eps
    occupation = self.stat0 + eps
    alpha = occupation / (occupation + r)

    # Per-segment mean estimates
    M = self.stat1 / self.stat0[:, index_map]
    M[numpy.isnan(M)] = 0  # Replace NaN due to divide by zeros

    # Interpolate between the segment means and the UBM means
    weight = alpha[:, index_map]
    ubm_means = numpy.tile(ubm.get_mean_super_vector(), (M.shape[0], 1))
    M = weight * M + (1 - weight) * ubm_means

    if norm and ubm.invcov.ndim == 2:
        # Normalization corresponds to KL divergence
        w = numpy.repeat(ubm.w, ubm.dim())
        KLD = numpy.sqrt(w * ubm.get_invcov_super_vector())
        M = M * KLD

    gsv_statserver = StatServer()
    gsv_statserver.modelset = self.modelset
    gsv_statserver.segset = self.segset
    gsv_statserver.start = self.start
    gsv_statserver.stop = self.stop
    gsv_statserver.stat0 = numpy.ones((self.segset.shape[0], 1), dtype=STAT_TYPE)
    gsv_statserver.stat1 = M.astype(dtype=STAT_TYPE)
    gsv_statserver.validate()
    return gsv_statserver
+
def adapt_mean_map_multisession(self, ubm, r=16, norm=False):
    """Maximum A Posteriori adaptation of the mean super-vector of ubm,
    one adapted model per entry of the modelset, obtained by summing the
    statistics of the segments of each model.

    :param ubm: a Mixture object to adapt
    :param r: float, the relevance factor for MAP adaptation
    :param norm: boolean, normalize by using the UBM co-variance
        (only applied when ``ubm.invcov`` is 2-dimensional).
        Default is False

    :return: a StatServer with 1 as stat0 and the MAP adapted super-vectors
        as stat1
    """
    gsv_statserver = StatServer()
    gsv_statserver.modelset = numpy.unique(self.modelset)
    gsv_statserver.segset = numpy.unique(self.modelset)
    gsv_statserver.start = numpy.empty(gsv_statserver.modelset.shape, dtype="|O")
    gsv_statserver.stop = numpy.empty(gsv_statserver.modelset.shape, dtype="|O")
    gsv_statserver.stat0 = numpy.ones((numpy.unique(self.modelset).shape[0], 1), dtype=STAT_TYPE)

    # Map every super-vector coefficient to the index of its distribution
    index_map = numpy.repeat(numpy.arange(ubm.distrib_nb()), ubm.dim())

    # Sum the statistics per model (the session counts are not needed here)
    summed = self.sum_stat_per_model()[0]

    # Adapt mean vectors
    alpha = summed.stat0 / (summed.stat0 + r)
    M = summed.stat1 / summed.stat0[:, index_map]
    M[numpy.isnan(M)] = 0  # Replace NaN due to divide by zeros

    weight = alpha[:, index_map]
    ubm_means = numpy.tile(ubm.get_mean_super_vector(), (M.shape[0], 1))
    M = weight * M + (1 - weight) * ubm_means

    if norm and ubm.invcov.ndim == 2:
        # Normalization corresponds to KL divergence
        w = numpy.repeat(ubm.w, ubm.dim())
        KLD = numpy.sqrt(w * ubm.get_invcov_super_vector())
        M = M * KLD

    gsv_statserver.stat1 = M.astype(dtype=STAT_TYPE)
    gsv_statserver.validate()
    return gsv_statserver
+
def precompute_svm_kernel_stat1(self):
    """Pre-compute the kernel for SVM training and testing.

    The returned matrix is the product of the first-order statistics with
    their transpose. It only contains the impostor part of the kernel and
    has to be completed by the target-dependent part during training and
    testing.

    :return: the impostor part of the SVM Gram matrix as a ndarray
    """
    return numpy.dot(self.stat1, self.stat1.transpose())
+
def ivector_extraction_weight(self, ubm, W, Tnorm, delta=numpy.array([])):
    """Compute i-vectors using the ubm weight approximation.
    For more information, refer to:

    Glembeck, O.; Burget, L.; Matejka, P.; Karafiat, M. & Kenny, P.
    "Simplification and optimization of I-Vector extraction,"
    in IEEE International Conference on Acoustics, Speech, and Signal
    Processing, ICASSP, 2011, 4516-4519

    Note that the first-order statistics of this StatServer are whitened
    in place as part of the extraction.

    :param ubm: a Mixture used as UBM for i-vector estimation
    :param W: fixed matrix pre-computed using the weights from
        the UBM and the total variability matrix
    :param Tnorm: total variability matrix pre-normalized using
        the co-variance of the UBM
    :param delta: mean vector if re-estimated using minimum divergence
        criteria; an empty array (default) stands for a zero vector

    :return: a StatServer whose zero-order statistics are 1
        and first-order statistics are approximated i-vectors.
    """
    invcov_sv = ubm.get_invcov_super_vector()

    # check consistency of dimensions for delta, Tnorm, W, ubm
    assert invcov_sv.shape[0] == Tnorm.shape[0], \
        'UBM and TV matrix dimension are not consistent'
    if delta.shape == (0, ):
        delta = numpy.zeros(invcov_sv.shape)
    assert invcov_sv.shape[0] == delta.shape[0],\
        'Minimum divergence mean and TV matrix dimension not consistent'
    assert W.shape[0] == Tnorm.shape[1], 'W and TV matrix dimension are not consistent'
    ivector_size = Tnorm.shape[1]

    # Total occupation of each session
    sumStat0 = self.stat0.sum(axis=1)

    # Center and normalize first-order statistics
    # for the case of diagonal covariance UBM
    self.whiten_stat1(delta, 1./invcov_sv)

    X = numpy.dot(self.stat1, Tnorm)

    enroll_iv = StatServer()
    enroll_iv.modelset = self.modelset
    enroll_iv.segset = self.segset
    enroll_iv.stat0 = numpy.ones((enroll_iv.segset.shape[0], 1))
    enroll_iv.stat1 = numpy.zeros((enroll_iv.segset.shape[0], ivector_size))

    session_nb = self.stat0.shape[0]
    identity = numpy.eye(ivector_size)
    for iv in range(session_nb):  # loop on i-vector
        logging.debug('Estimate i-vector [ %d / %d ]', iv + 1, session_nb)
        # Precision matrix of this session, then solve for the i-vector
        precision = identity + sumStat0[iv] * W
        enroll_iv.stat1[iv, :] = scipy.linalg.solve(precision, X[iv, :])

    return enroll_iv
+
def ivector_extraction_eigen_decomposition(self,
                                           ubm,
                                           Q,
                                           D_bar_c,
                                           Tnorm,
                                           delta=numpy.array([])):
    """Compute i-vectors using the eigen decomposition approximation.
    For more information, refer to [Glembeck09]_

    Note that the first-order statistics of this StatServer are whitened
    in place as part of the extraction.

    :param ubm: a Mixture used as UBM for i-vector estimation
    :param Q: Q matrix as described in [Glembeck11]
    :param D_bar_c: matrices as described in [Glembeck11]
    :param Tnorm: total variability matrix pre-normalized using
        the co-variance of the UBM
    :param delta: mean vector if re-estimated using minimum divergence
        criteria; an empty array (default) stands for a zero vector

    :return: a StatServer whose zero-order statistics are 1
        and first-order statistics are approximated i-vectors.
    """
    invcov_sv = ubm.get_invcov_super_vector()

    # check consistency of dimensions for delta, Tnorm, Q, D_bar_c, ubm
    assert invcov_sv.shape[0] == Tnorm.shape[0], \
        'UBM and TV matrix dimension not consistent'
    if delta.shape == (0, ):
        delta = numpy.zeros(invcov_sv.shape)
    assert invcov_sv.shape[0] == delta.shape[0], \
        'Minimum divergence mean and TV matrix dimension not consistent'
    assert D_bar_c.shape[1] == Tnorm.shape[1], \
        'D_bar_c and TV matrix dimension are not consistent'
    assert D_bar_c.shape[0] == ubm.w.shape[0], \
        'D_bar_c and UBM dimension are not consistent'

    ivector_size = Tnorm.shape[1]

    # Center and normalize first-order statistics
    # for the case of diagonal covariance UBM
    self.whiten_stat1(delta, 1./invcov_sv)

    X = numpy.dot(self.stat1, Tnorm)

    enroll_iv = StatServer()
    enroll_iv.modelset = self.modelset
    enroll_iv.segset = self.segset
    enroll_iv.stat0 = numpy.ones((enroll_iv.segset.shape[0], 1), dtype=STAT_TYPE)
    enroll_iv.stat1 = numpy.zeros((enroll_iv.segset.shape[0], ivector_size), dtype=STAT_TYPE)

    session_nb = self.stat0.shape[0]
    Q_t = Q.transpose()
    for iv in range(session_nb):  # loop on i-vector
        logging.debug('Estimate i-vector [ %d / %d ]', iv + 1, session_nb)

        # Diagonal of the precision matrix in the basis given by Q
        weighted = numpy.dot(numpy.diag(self.stat0[iv, :]), D_bar_c)
        diag_L = 1 + numpy.sum(weighted, axis=0)

        # Estimate i-vector
        inv_precision = numpy.diag(1/diag_L)
        enroll_iv.stat1[iv, :] = X[iv, :].dot(Q).dot(inv_precision).dot(Q_t)

    return enroll_iv
+
def estimate_spectral_norm_stat1(self, it=1, mode='efr'):
    """Compute meta-parameters for Spectral Normalization as described
    in [Bousquet11]_

    Can be used to perform Eigen Factor Radial or Spherical Nuisance
    Normalization. Default behavior is equivalent to Length Norm as
    described in [Garcia-Romero11]_

    The meta-parameters are estimated on a deep copy of this StatServer,
    which is whitened and normalized after each iteration; the statistics
    of ``self`` are left untouched.

    :param it: integer, number of iterations to perform
    :param mode: string, can be
        - efr for Eigen Factor Radial
        - sphNorm, for Spherical Nuisance Normalization

    :return: a tuple of two lists:
        - a list of mean vectors
        - a list of co-variance matrices as ndarrays

    :raises ValueError: if an iteration is run with an unknown ``mode``
    """
    spectral_norm_mean = []
    spectral_norm_cov = []
    tmp_iv = copy.deepcopy(self)

    for i in range(it):
        # estimate mean and covariance matrix
        spectral_norm_mean.append(tmp_iv.get_mean_stat1())

        if mode == 'efr':
            spectral_norm_cov.append(tmp_iv.get_total_covariance_stat1())
        elif mode == 'sphNorm':
            spectral_norm_cov.append(tmp_iv.get_within_covariance_stat1())
        else:
            # Without a covariance the whitening below cannot be applied
            raise ValueError("Unknown mode '{}', must be 'efr' or 'sphNorm'".format(mode))

        # Center and whiten the statistics
        tmp_iv.whiten_stat1(spectral_norm_mean[i], spectral_norm_cov[i])
        tmp_iv.norm_stat1()
    return spectral_norm_mean, spectral_norm_cov
+
def spectral_norm_stat1(self, spectral_norm_mean, spectral_norm_cov, is_sqr_inv_sigma=False):
    """Apply Spectral Normalization to all first order statistics.
    See more details in [Bousquet11]_

    One whitening followed by one normalization is performed per pair of
    (mean, covariance), so the number of iterations equals the length of
    the input lists.

    :param spectral_norm_mean: a list of mean vectors
    :param spectral_norm_cov: a list of co-variance matrices as ndarrays
    :param is_sqr_inv_sigma: boolean, forwarded to ``whiten_stat1`` as its
        third argument
    """
    assert len(spectral_norm_mean) == len(spectral_norm_cov), \
        'Number of mean vectors and covariance matrices is different'

    iterations = zip(spectral_norm_mean, spectral_norm_cov)
    for iter_mean, iter_cov in iterations:
        self.whiten_stat1(iter_mean, iter_cov, is_sqr_inv_sigma)
        self.norm_stat1()
+
def sum_stat_per_model(self):
    """Sum the zero- and first-order statistics per model and store them
    in a new StatServer.

    The models are the unique values of ``self.modelset``; in the new
    StatServer the segset is a copy of the modelset.

    :return: a tuple ``(sts_per_model, session_per_model)`` where
        ``sts_per_model`` is a StatServer with the statistics summed per
        model and ``session_per_model`` is an array holding the number of
        sessions summed for each model
    """
    sts_per_model = sidekit.StatServer()
    sts_per_model.modelset = numpy.unique(self.modelset)
    sts_per_model.segset = copy.deepcopy(sts_per_model.modelset)
    model_nb = sts_per_model.modelset.shape[0]
    sts_per_model.stat0 = numpy.zeros((model_nb, self.stat0.shape[1]), dtype=STAT_TYPE)
    sts_per_model.stat1 = numpy.zeros((model_nb, self.stat1.shape[1]), dtype=STAT_TYPE)
    sts_per_model.start = numpy.empty(sts_per_model.segset.shape, '|O')
    sts_per_model.stop = numpy.empty(sts_per_model.segset.shape, '|O')

    session_per_model = numpy.zeros(model_nb)

    for idx, model in enumerate(sts_per_model.modelset):
        # Fetch the first-order statistics once: they provide both the
        # sum and the session count
        model_stat1 = self.get_model_stat1(model)
        sts_per_model.stat0[idx, :] = self.get_model_stat0(model).sum(axis=0)
        sts_per_model.stat1[idx, :] = model_stat1.sum(axis=0)
        session_per_model[idx] = model_stat1.shape[0]
    return sts_per_model, session_per_model
+
def mean_stat_per_model(self):
    """Average the zero- and first-order statistics per model and store them
    in a new StatServer.

    The models are the unique values of ``self.modelset``; in the new
    StatServer the segset is a copy of the modelset.

    :return: a StatServer with the statistics averaged per model
    """
    averaged = sidekit.StatServer()
    averaged.modelset = numpy.unique(self.modelset)
    averaged.segset = copy.deepcopy(averaged.modelset)

    model_nb = averaged.modelset.shape[0]
    averaged.stat0 = numpy.zeros((model_nb, self.stat0.shape[1]), dtype=STAT_TYPE)
    averaged.stat1 = numpy.zeros((model_nb, self.stat1.shape[1]), dtype=STAT_TYPE)
    averaged.start = numpy.empty(averaged.segset.shape, '|O')
    averaged.stop = numpy.empty(averaged.segset.shape, '|O')

    # One row per model, filled with the mean over its sessions
    for row, model_id in enumerate(averaged.modelset):
        averaged.stat0[row, :] = self.get_model_stat0(model_id).mean(axis=0)
        averaged.stat1[row, :] = self.get_model_stat1(model_id).mean(axis=0)
    return averaged
+
def _expectation(self, phi, mean, sigma, session_per_model, batch_size=100, num_thread=1):
    """E-step of the factor analysis estimation (deprecated, a
    DeprecationWarning is emitted on every call).

    In this version the statistics are assumed NOT to be whitened
    beforehand: ``self.stat1`` is whitened in place with ``mean`` and
    ``sigma`` at the beginning of the call.

    :param phi: factor loading matrix, the size of its last dimension is
        the rank of the model
    :param mean: mean vector used to whiten the statistics
    :param sigma: residual covariance; only the 1-D (diagonal) and 2-D
        (full) cases are handled by the accumulation step
    :param session_per_model: array with the number of sessions behind
        each row of the statistics
    :param batch_size: number of rows processed in one batch, to reduce
        the memory requirement
    :param num_thread: value forwarded to ``fa_model_loop`` and
        ``fa_distribution_loop``

    :return: a tuple ``(_A, _C, _R)`` of accumulators of shapes
        (C, r, r), (r, d * C) and (r, r), where C is the number of
        distributions, d the dimension of one distribution and r the rank
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)

    r = phi.shape[-1]
    d = self.stat1.shape[1] // self.stat0.shape[1]
    C = self.stat0.shape[1]
    session_nb = self.segset.shape[0]

    # Whiten the statistics and multiply the covariance matrix by the
    # square root of the inverse of the residual covariance
    self.whiten_stat1(mean, sigma)
    phi_white = copy.deepcopy(phi)
    if sigma.ndim == 2:
        eigen_values, eigen_vectors = scipy.linalg.eigh(sigma)
        ind = eigen_values.real.argsort()[::-1]
        eigen_values = eigen_values.real[ind]
        eigen_vectors = eigen_vectors.real[:, ind]
        sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real)
        sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma))
        phi_white = sqr_inv_sigma.T.dot(phi)
    elif sigma.ndim == 1:
        sqr_inv_sigma = 1/numpy.sqrt(sigma)
        phi_white = phi * sqr_inv_sigma[:, None]

    # Replicate self.stat0
    index_map = numpy.repeat(numpy.arange(C), d)
    _stat0 = self.stat0[:, index_map]

    # Create accumulators for the list of models to process.
    # _A lives in a multiprocessing shared array.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        tmp_A = multiprocessing.Array(ct, C * r * r)
        _A = numpy.ctypeslib.as_array(tmp_A.get_obj())
        _A = _A.reshape(C, r, r)

    _C = numpy.zeros((r, d * C), dtype=STAT_TYPE)

    _R = numpy.zeros((r, r), dtype=STAT_TYPE)
    _r = numpy.zeros(r, dtype=STAT_TYPE)

    # Process in batches in order to reduce the memory requirement.
    # Use a true ceiling so that a small trailing batch is never dropped.
    batch_nb = int(numpy.ceil(session_nb / float(batch_size)))

    for batch in range(batch_nb):
        batch_start = batch * batch_size
        batch_stop = min((batch + 1) * batch_size, session_nb)
        batch_len = batch_stop - batch_start

        # Allocate the shared buffers that receive the batch results
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', RuntimeWarning)
            tmp_e_h = multiprocessing.Array(ct, batch_len * r)
            e_h = numpy.ctypeslib.as_array(tmp_e_h.get_obj())
            e_h = e_h.reshape(batch_len, r)

            tmp_e_hh = multiprocessing.Array(ct, batch_len * r * r)
            e_hh = numpy.ctypeslib.as_array(tmp_e_hh.get_obj())
            e_hh = e_hh.reshape(batch_len, r, r)

        # loop on model id's
        # NOTE(review): e_h and e_hh are read right after this call, so
        # fa_model_loop presumably fills them in place -- confirm
        fa_model_loop(batch_start=batch_start, mini_batch_indices=numpy.arange(batch_len),
                      r=r, phi_white=phi_white, phi=phi, sigma=sigma,
                      stat0=_stat0, stat1=self.stat1,
                      e_h=e_h, e_hh=e_hh, num_thread=num_thread)

        # Accumulate for minimum divergence step
        _r += numpy.sum(e_h * session_per_model[batch_start:batch_stop, None], axis=0)
        _R += numpy.sum(e_hh, axis=0)

        if sqr_inv_sigma.ndim == 2:
            _C += e_h.T.dot(self.stat1[batch_start:batch_stop, :]).dot(scipy.linalg.inv(sqr_inv_sigma))
        elif sqr_inv_sigma.ndim == 1:
            _C += e_h.T.dot(self.stat1[batch_start:batch_stop, :]) / sqr_inv_sigma

        # Parallelized loop on the model id's
        fa_distribution_loop(distrib_indices=numpy.arange(C),
                             _A=_A,
                             stat0=self.stat0,
                             batch_start=batch_start,
                             batch_stop=batch_stop,
                             e_hh=e_hh,
                             num_thread=num_thread)

    # _r is normalized like in the original implementation but is not
    # part of the returned values
    _r /= session_per_model.sum()
    _R /= session_per_model.shape[0]

    return _A, _C, _R
+
+ def _maximization(self, phi, _A, _C, _R=None, sigma_obs=None, session_number=None):
+ """
+ """
+ warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)
+ d = self.stat1.shape[1] // self.stat0.shape[1]
+ C = self.stat0.shape[1]
+
+ for c in range(C):
+ distrib_idx = range(c * d, (c+1) * d)
+ phi[distrib_idx, :] = scipy.linalg.solve(_A[c], _C[:, distrib_idx]).T
+
+ # Update the residual covariance if needed
+ # (only for full co-variance case of PLDA
+ sigma = None
+ if sigma_obs is not None:
+ sigma = sigma_obs - phi.dot(_C) / session_number
+
+ # MINIMUM DIVERGENCE STEP
+ if _R is not None:
+ print('applyminDiv reestimation')
+ ch = scipy.linalg.cholesky(_R)
+ phi = phi.dot(ch)
+
+ return phi, sigma
+
def estimate_between_class(self,
                           itNb,
                           V,
                           mean,
                           sigma_obs,
                           batch_size=100,
                           Ux=None,
                           Dz=None,
                           minDiv=True,
                           num_thread=1,
                           re_estimate_residual=False,
                           save_partial=False):
    """
    Estimate the factor loading matrix for the between class covariance
    (deprecated, a DeprecationWarning is emitted on every call).

    Each iteration works on a deep copy of this StatServer: the optional
    ``Ux`` contribution is subtracted, the statistics are summed per model,
    the optional ``Dz`` contribution is subtracted, then one E-step and
    one M-step are run.

    :param itNb: number of EM iterations to run
    :param V: initial between class covariance matrix
    :param mean: global mean vector
    :param sigma_obs: covariance matrix of the input data
    :param batch_size: size of the batches to process one by one to reduce the memory usage
    :param Ux: statserver of supervectors
    :param Dz: statserver of supervectors
    :param minDiv: boolean, if True run the minimum divergence step after maximization
    :param num_thread: number of parallel process to run
    :param re_estimate_residual: boolean, if True the residual covariance matrix is re-estimated (for PLDA)
    :param save_partial: boolean, if True, save FA model for each iteration
    :return: a tuple ``(V, sigma)`` with the estimated factor loading
        matrix and the residual covariance (``sigma_obs`` itself when it
        is not re-estimated)
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)

    # The residual covariance starts from the observed one
    sigma = sigma_obs

    # Estimate F by iterating the EM algorithm
    for it in range(itNb):
        logging.info('Estimate between class covariance, it %d / %d',
                     it + 1, itNb)

        # Work on a copy so that the statistics of self are preserved
        shifted = copy.deepcopy(self)

        # subtract channel effect, Ux, if already estimated
        if Ux is not None:
            shifted = shifted.subtract_weighted_stat1(Ux)

        # Sum statistics per speaker
        shifted, session_per_model = shifted.sum_stat_per_model()

        # subtract residual, Dz, if already estimated
        if Dz is not None:
            shifted = shifted.subtract(Dz)

        # E-step
        print("E_step")
        _A, _C, _R = shifted._expectation(V, mean, sigma, session_per_model, batch_size, num_thread)

        # The minimum divergence matrix is dropped when the step is disabled
        min_div_matrix = _R if minDiv else None

        # M-step
        print("M_step")
        if re_estimate_residual:
            V, sigma = shifted._maximization(V, _A, _C, min_div_matrix, sigma_obs, session_per_model.sum())
        else:
            V = shifted._maximization(V, _A, _C, min_div_matrix)[0]

        if sigma.ndim == 2:
            logging.info('Likelihood after iteration %d / %f', it + 1, compute_llk(self, V, sigma))

        del shifted

        if save_partial:
            sidekit.sidekit_io.write_fa_hdf5((mean, V, None, None, sigma),
                                             "Partial_plda_{}_between_class.h5".format(it))

    return V, sigma
+
def estimate_within_class(self,
                          it_nb,
                          U,
                          mean,
                          sigma_obs,
                          batch_size=100,
                          Vy=None,
                          Dz=None,
                          min_div=True,
                          num_thread=1,
                          save_partial=False):
    """
    Estimate the factor loading matrix for the within class covariance
    (deprecated, a DeprecationWarning is emitted on every call).

    Each iteration works on a deep copy of this StatServer, because the
    E-step whitens the statistics it is called on in place; the statistics
    of ``self`` are therefore left untouched.

    :param it_nb: number of iterations to estimate the within class covariance matrix
    :param U: initial within class covariance matrix
    :param mean: mean of the input data
    :param sigma_obs: co-variance matrix of the input data
    :param batch_size: number of sessions to process per batch to optimize memory usage
    :param Vy: statserver of supervectors
    :param Dz: statserver of supervectors
    :param min_div: boolean, if True run the minimum divergence step after maximization
    :param num_thread: number of parallel process to run
    :param save_partial: if evaluated as True, save the FA model after each
        iteration; a string is used as prefix of the file name, any other
        true value selects the prefix "Partial_plda"
    :return: the within class factor loading matrix
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)

    # Every session is processed on its own
    session_per_model = numpy.ones(self.modelset.shape[0])

    # Estimate F by iterating the EM algorithm
    for it in range(it_nb):
        logging.info('Estimate within class covariance, it %d / %d',
                     it + 1, it_nb)

        # Start every iteration from the untouched statistics
        session_shifted_stat = copy.deepcopy(self)

        # subtract speaker effect, Vy, if already estimated
        if Vy is not None:
            session_shifted_stat = session_shifted_stat.subtract_weighted_stat1(Vy)

        # subtract residual, Dz, if already estimated
        if Dz is not None:
            session_shifted_stat = session_shifted_stat.subtract_weighted_stat1(Dz)

        # E step
        A, C, R = session_shifted_stat._expectation(U, mean, sigma_obs,
                                                    session_per_model,
                                                    batch_size, num_thread)

        # M step
        if not min_div:
            R = None
        U = session_shifted_stat._maximization(U, A, C, R)[0]

        if save_partial:
            prefix = save_partial if isinstance(save_partial, str) else "Partial_plda"
            sidekit.sidekit_io.write_fa_hdf5((None, None, U, None, None),
                                             "{}_{}_within_class.h5".format(prefix, it))

    return U
+
def estimate_map(self, itNb, D, mean, Sigma, Vy=None, Ux=None, num_thread=1, save_partial=False):
    """
    Estimate the diagonal MAP matrix with an EM algorithm (deprecated, a
    DeprecationWarning is emitted on every call).

    The estimation runs on a deep copy of this StatServer, centered with
    ``mean``, from which the optional ``Vy`` and ``Ux`` contributions are
    subtracted before the statistics are summed per model.

    :param itNb: number of iterations to estimate the MAP covariance matrix
    :param D: Maximum a Posteriori matrix to estimate
    :param mean: mean of the input parameters
    :param Sigma: residual covariance matrix
    :param Vy: statserver of supervectors
    :param Ux: statserver of supervectors
    :param num_thread: number of parallel process to run (not used by
        this method)
    :param save_partial: if evaluated as True, save the MAP matrix after
        each iteration; a string is used as prefix of the file name, any
        other true value selects the prefix "Partial_plda"

    :return: the MAP covariance matrix into a vector as it is diagonal
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)
    model_shifted_stat = copy.deepcopy(self)

    logging.info('Estimate MAP matrix')
    # subtract speaker and channel if already estimated
    model_shifted_stat.center_stat1(mean)
    if Vy is not None:
        model_shifted_stat = model_shifted_stat.subtract_weighted_stat1(Vy)
    if Ux is not None:
        model_shifted_stat = model_shifted_stat.subtract_weighted_stat1(Ux)

    # Sum statistics per speaker
    model_shifted_stat = model_shifted_stat.sum_stat_per_model()[0]

    # Integer division: the result is used as a repeat count below
    d = model_shifted_stat.stat1.shape[1] // model_shifted_stat.stat0.shape[1]
    C = model_shifted_stat.stat0.shape[1]

    # Replicate the zero-order statistics to the size of the super-vector
    index_map = numpy.repeat(numpy.arange(C), d)
    _stat0 = model_shifted_stat.stat0[:, index_map]

    # Estimate D by iterating the EM algorithm
    for it in range(itNb):
        logging.info('Estimate MAP covariance, it %d / %d', it + 1, itNb)

        # E step
        e_h = numpy.zeros(model_shifted_stat.stat1.shape, dtype=STAT_TYPE)
        _A = numpy.zeros(D.shape, dtype=STAT_TYPE)
        _C = numpy.zeros(D.shape, dtype=STAT_TYPE)
        for idx in range(model_shifted_stat.modelset.shape[0]):
            Lambda = numpy.ones(D.shape) + (_stat0[idx, :] * D**2 / Sigma)
            e_h[idx] = model_shifted_stat.stat1[idx] * D / (Lambda * Sigma)
            _A = _A + (1/Lambda + e_h[idx]**2) * _stat0[idx, :]
            _C = _C + e_h[idx] * model_shifted_stat.stat1[idx]

        # M step
        D = _C / _A

        if save_partial:
            prefix = save_partial if isinstance(save_partial, str) else "Partial_plda"
            sidekit.sidekit_io.write_fa_hdf5((None, None, None, D, None),
                                             "{}_{}_map.h5".format(prefix, it))

    return D
+
def estimate_hidden(self, mean, sigma, V=None, U=None, D=None, batch_size=100, num_thread=1):
    """
    Estimate the hidden variables of the factor analysis model
    (deprecated, a DeprecationWarning is emitted on every call).

    Assume that the statistics have not been whitened: ``self.stat1`` is
    whitened in place with ``mean`` and ``sigma`` during the call.

    :param mean: global mean of the data to subtract
    :param sigma: residual covariance matrix of the Factor Analysis model
    :param V: between class covariance matrix
    :param U: within class covariance matrix
    :param D: MAP covariance matrix
    :param batch_size: size of the batches used to reduce memory footprint
    :param num_thread: number of parallel process to run

    :return: a tuple ``(y, x, z)`` of StatServers holding the factors
        associated with V, U and D respectively; ``z`` is left empty when
        ``D`` is None
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)
    if V is None:
        V = numpy.zeros((self.stat1.shape[1], 0), dtype=STAT_TYPE)
    if U is None:
        U = numpy.zeros((self.stat1.shape[1], 0), dtype=STAT_TYPE)
    # V and U are estimated jointly through their concatenation
    W = numpy.hstack((V, U))

    # Estimate yx
    r = W.shape[1]
    d = self.stat1.shape[1] // self.stat0.shape[1]
    C = self.stat0.shape[1]
    session_nb = self.segset.shape[0]

    self.whiten_stat1(mean, sigma)
    W_white = copy.deepcopy(W)
    if sigma.ndim == 2:
        eigenvalues, eigenvectors = scipy.linalg.eigh(sigma)
        ind = eigenvalues.real.argsort()[::-1]
        eigenvalues = eigenvalues.real[ind]
        eigenvectors = eigenvectors.real[:, ind]
        sqr_inv_eval_sigma = 1 / numpy.sqrt(eigenvalues.real)
        sqr_inv_sigma = numpy.dot(eigenvectors, numpy.diag(sqr_inv_eval_sigma))
        W_white = sqr_inv_sigma.T.dot(W)
    elif sigma.ndim == 1:
        sqr_inv_sigma = 1/numpy.sqrt(sigma)
        W_white = W * sqr_inv_sigma[:, None]

    # Replicate self.stat0
    index_map = numpy.repeat(numpy.arange(C), d)
    _stat0 = self.stat0[:, index_map]

    y = sidekit.StatServer()
    y.modelset = copy.deepcopy(self.modelset)
    y.segset = copy.deepcopy(self.segset)
    y.start = copy.deepcopy(self.start)
    y.stop = copy.deepcopy(self.stop)
    y.stat0 = numpy.ones((self.modelset.shape[0], 1))
    y.stat1 = numpy.ones((self.modelset.shape[0], V.shape[1]))

    x = sidekit.StatServer()
    x.modelset = copy.deepcopy(self.modelset)
    x.segset = copy.deepcopy(self.segset)
    x.start = copy.deepcopy(self.start)
    x.stop = copy.deepcopy(self.stop)
    x.stat0 = numpy.ones((self.modelset.shape[0], 1))
    x.stat1 = numpy.ones((self.modelset.shape[0], U.shape[1]))

    z = sidekit.StatServer()
    if D is not None:
        z.modelset = copy.deepcopy(self.modelset)
        z.segset = copy.deepcopy(self.segset)
        z.stat0 = numpy.ones((self.modelset.shape[0], 1), dtype=STAT_TYPE)
        z.stat1 = numpy.ones((self.modelset.shape[0], D.shape[0]), dtype=STAT_TYPE)

    # Receives the Vy + Ux contribution of every session
    VUyx = copy.deepcopy(self)

    # Process in batches in order to reduce the memory requirement.
    # Use a true ceiling so that a small trailing batch is never dropped.
    batch_nb = int(numpy.ceil(session_nb / float(batch_size)))

    for batch in range(batch_nb):
        batch_start = batch * batch_size
        batch_stop = min((batch + 1) * batch_size, session_nb)
        batch_len = batch_stop - batch_start

        # Allocate the shared buffers that receive the batch results
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', RuntimeWarning)
            tmp_e_h = multiprocessing.Array(ct, batch_len * r)
            e_h = numpy.ctypeslib.as_array(tmp_e_h.get_obj())
            e_h = e_h.reshape(batch_len, r)

            tmp_e_hh = multiprocessing.Array(ct, batch_len * r * r)
            e_hh = numpy.ctypeslib.as_array(tmp_e_hh.get_obj())
            e_hh = e_hh.reshape(batch_len, r, r)

        # Parallelized loop on the model id's
        # NOTE(review): e_h is read right after this call, so
        # fa_model_loop presumably fills it in place -- confirm
        fa_model_loop(batch_start=batch_start, mini_batch_indices=numpy.arange(batch_len),
                      r=r, phi_white=W_white, phi=W, sigma=sigma,
                      stat0=_stat0, stat1=self.stat1,
                      e_h=e_h, e_hh=e_hh, num_thread=num_thread)

        # The first columns belong to V, the remaining ones to U
        y.stat1[batch_start:batch_start + batch_len, :] = e_h[:, :V.shape[1]]
        x.stat1[batch_start:batch_start + batch_len, :] = e_h[:, V.shape[1]:]

        if D is not None:
            # store Vy + Ux, subtracted from the first-order statistics below
            VUyx.stat1[batch_start:batch_start + batch_len, :] = e_h.dot(W.T)

    if D is not None:
        # subtract Vy + Ux from the first-order statistics
        residual_stat = self.subtract_weighted_stat1(VUyx)

        # estimate z
        for idx in range(residual_stat.modelset.shape[0]):
            Lambda = numpy.ones(D.shape, dtype=STAT_TYPE) + (_stat0[idx, :] * D**2)
            z.stat1[idx] = residual_stat.stat1[idx] * D / Lambda

    return y, x, z
+
+ def factor_analysis(self, rank_f, rank_g=0, rank_h=None, re_estimate_residual=False,
+                     it_nb=(10, 10, 10), min_div=True, ubm=None,
+                     batch_size=100, num_thread=1, save_partial=False, init_matrices=(None, None, None)):
+     """
+     Estimate the matrices of a factor analysis model (deprecated entry point,
+     a DeprecationWarning pointing to the FactorAnalyser module is emitted).
+
+     :param rank_f: rank of the between class variability matrix
+     :param rank_g: rank of the within class variability matrix
+     :param rank_h: if not None, the MAP term H has the size of the
+         super-vectors (full rank); if None (default), H is empty
+     :param re_estimate_residual: boolean, if True, the residual covariance matrix is re-estimated (use for PLDA)
+     :param it_nb: tuple of three integers; number of iterations to run
+         for F, G, H estimation
+     :param min_div: boolean, if True, re-estimate the covariance matrices
+         according to the minimum divergence criteria
+     :param ubm: origin of the space; should be None for PLDA and be a
+         Mixture object for JFA or TV
+     :param batch_size: number of sessions to process in one batch for memory optimization
+     :param num_thread: number of thread to run in parallel
+     :param save_partial: name of the file to save intermediate models,
+         if True, save before each split of the distributions
+     :param init_matrices: tuple of three optional matrices (F, G, H) to initialize the model,
+         default is (None, None, None)
+
+     :return: a tuple (mean, F, G, H, sigma): the origin of the space, the
+         between class factor loading matrix, the within class factor loading
+         matrix, the diagonal MAP matrix (as a vector) and the residual
+         covariance
+     """
+     warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)
+
+     (F_init, G_init, H_init) = init_matrices
+     # NOTE: the statistics are no longer whitened here; only the origin
+     # (UBM mean or effective mean) and the observed covariance are computed.
+
+     vect_size = self.stat1.shape[1]
+     if ubm is None:
+         mean = self.stat1.mean(axis=0)
+         Sigma_obs = self.get_total_covariance_stat1()
+         if F_init is None:
+             # Initialise F with the leading eigenvectors of the total covariance
+             evals, evecs = scipy.linalg.eigh(Sigma_obs)
+             idx = numpy.argsort(evals)[::-1]
+             evecs = evecs[:, idx]
+             F_init = evecs[:, :rank_f]
+     else:
+         mean = ubm.get_mean_super_vector()
+         Sigma_obs = 1. / ubm.get_invcov_super_vector()
+         if F_init is None:
+             F_init = numpy.random.randn(vect_size, rank_f).astype(dtype=STAT_TYPE)
+
+     if G_init is None:
+         G_init = numpy.random.randn(vect_size, rank_g)
+     # H is either empty or full-rank
+     if rank_h is not None:
+         rank_h = vect_size
+     else:
+         rank_h = 0
+     if H_init is None:
+         H_init = numpy.random.randn(rank_h).astype(dtype=STAT_TYPE) * Sigma_obs.mean()
+
+     # Estimate the between class variability matrix
+     if rank_f == 0 or it_nb[0] == 0:
+         F = F_init
+         sigma = Sigma_obs
+     else:
+         # Modify the StatServer for the Total Variability estimation
+         # each session is considered a class.
+         session_as_class = rank_g == rank_h == 0 and not re_estimate_residual
+         if session_as_class:
+             modelset_backup = copy.deepcopy(self.modelset)
+             self.modelset = self.segset
+
+         try:
+             F, sigma = self.estimate_between_class(it_nb[0],
+                                                    F_init,
+                                                    mean,
+                                                    Sigma_obs,
+                                                    batch_size,
+                                                    None,
+                                                    None,
+                                                    min_div,
+                                                    num_thread,
+                                                    re_estimate_residual,
+                                                    save_partial)
+         finally:
+             # Always give the caller its original modelset back, even on failure
+             if session_as_class:
+                 self.modelset = modelset_backup
+
+     # Estimate the within class variability matrix.
+     # The number of iterations for G is it_nb[1] (it_nb[2] belongs to H).
+     if rank_g == 0 or it_nb[1] == 0:
+         G = G_init
+     else:
+         # Estimate Vy per model (not per session)
+         # NOTE(review): arguments are passed positionally; confirm that the
+         # last one matches the num_thread parameter of estimate_hidden.
+         Gtmp = numpy.random.randn(vect_size, 0)
+         model_shifted_stat = self.sum_stat_per_model()[0]
+         y, x, z = model_shifted_stat.estimate_hidden(mean, Sigma_obs,
+                                                      F, Gtmp, None,
+                                                      num_thread)
+
+         # Here we compute Vy for each session so we duplicate first
+         # the Y computed per model for each session corresponding to
+         # this model and then multiply by V.
+         # We subtract then a weighted version of Vy from the statistics.
+         duplicate_y = numpy.zeros((self.modelset.shape[0], rank_f), dtype=STAT_TYPE)
+         for idx, mod in enumerate(y.modelset):
+             duplicate_y[self.modelset == mod] = y.stat1[idx]
+         Vy = copy.deepcopy(self)
+         Vy.stat1 = duplicate_y.dot(F.T)
+
+         # Estimate G
+         G = self.estimate_within_class(it_nb[1],
+                                        G_init,
+                                        mean,
+                                        Sigma_obs,
+                                        batch_size,
+                                        Vy,
+                                        None,
+                                        min_div,
+                                        num_thread,
+                                        save_partial)
+
+     # Estimate the MAP covariance matrix
+     if rank_h == 0 or it_nb[2] == 0:
+         H = H_init
+     else:
+         # Estimate Vy per model (not per session)
+         empty = numpy.random.randn(vect_size, 0)
+         tmp_stat = self.sum_stat_per_model()[0]
+         y, x, z = tmp_stat.estimate_hidden(mean, Sigma_obs, F, empty, None, num_thread)
+
+         # Here we compute Vy for each session so we duplicate first
+         # the Y computed per model for each session corresponding to
+         # this model and then multiply by V.
+         # We subtract then a weighted version of Vy from the statistics.
+         duplicate_y = numpy.zeros((self.modelset.shape[0], rank_f), dtype=STAT_TYPE)
+         for idx, mod in enumerate(y.modelset):
+             duplicate_y[self.modelset == mod] = y.stat1[idx]
+         Vy = copy.deepcopy(self)
+         Vy.stat1 = duplicate_y.dot(F.T)
+
+         # Estimate Ux per session
+         tmp_stat = copy.deepcopy(self)
+         tmp_stat = tmp_stat.subtract_weighted_stat1(Vy)
+         y, x, z = tmp_stat.estimate_hidden(mean, Sigma_obs, empty, G, None, num_thread)
+
+         Ux = copy.deepcopy(self)
+         Ux.stat1 = x.stat1.dot(G.T)
+
+         # Estimate H
+         H = self.estimate_map(it_nb[2], H_init,
+                               mean,
+                               Sigma_obs,
+                               Vy,
+                               Ux,
+                               save_partial)
+
+     return mean, F, G, H, sigma
+
+ # @staticmethod
+ # def read_subset(statserver_filename, idmap, prefix=''):
+ # """
+ # Given a statserver in HDF5 format stored on disk and an IdMap,
+ # create a StatServer object filled with sessions corresponding to the IdMap.
+ #
+ # :param statserver_filename: name of the statserver in hdf5 format to read from
+ # :param idmap: the IdMap of sessions to load
+ # :param prefix: prefix of the group in HDF5 file
+ # :return: a StatServer
+ # """
+ # with h5py.File(statserver_filename, 'r') as h5f:
+ #
+ # # create tuples of (model,seg) for both HDF5 and IdMap for quick comparaison
+ # sst = [(mod, seg) for mod, seg in zip(h5f[prefix+"modelset"].value.astype('U', copy=False),
+ # h5f[prefix+"segset"].value.astype('U', copy=False))]
+ # imt = [(mod, seg) for mod, seg in zip(idmap.leftids, idmap.rightids)]
+ #
+ # # Get indices of existing sessions
+ # existing_sessions = set(sst).intersection(set(imt))
+ # idx = numpy.sort(numpy.array([sst.index(session) for session in existing_sessions]))
+ #
+ # # Create the new StatServer by loading the correct sessions
+ # statserver = sidekit.StatServer()
+ # statserver.modelset = h5f[prefix+"modelset"].value[idx].astype('U', copy=False)
+ # statserver.segset = h5f[prefix+"segset"].value[idx].astype('U', copy=False)
+ #
+ # tmpstart = h5f.get(prefix+"start").value[idx]
+ # tmpstop = h5f.get(prefix+"stop").value[idx]
+ # statserver.start = numpy.empty(idx.shape, '|O')
+ # statserver.stop = numpy.empty(idx.shape, '|O')
+ # statserver.start[tmpstart != -1] = tmpstart[tmpstart != -1]
+ # statserver.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
+ #
+ # statserver.stat0 = h5f[prefix+"stat0"].value[idx, :]
+ # statserver.stat1 = h5f[prefix+"stat1"].value[idx, :]
+ #
+ # return statserver
+
+ def generator(self):
+     """
+     Iterate over the sessions, one at a time.
+
+     :return: a generator yielding, for each row index in increasing order,
+         the tuple ``(stat0[i, :], stat1[i, :])``
+     """
+     session = 0
+     while True:
+         # The number of rows is re-read on every pass before yielding
+         if session >= self.stat0.shape[0]:
+             return
+         yield self.stat0[session, :], self.stat1[session, :]
+         session += 1
+
+ @staticmethod
+ def read_subset(statserver_filename, index, prefix=''):
+     """
+     Load a subset of the sessions of a StatServer stored on disk in HDF5 format.
+
+     :param statserver_filename: name of the statserver in hdf5 format to read from
+     :param index: either an IdMap of the sessions to load, in which case only
+         the (model, segment) couples found in the file and whose first-order
+         statistics do not sum to zero are kept, or an array of row indices
+         (indices beyond the last row are replaced by the last index)
+     :param prefix: prefix of the group in HDF5 file
+     :return: a StatServer holding the selected sessions, in file order when
+         an IdMap is given
+     """
+     with h5py.File(statserver_filename, 'r') as h5f:
+
+         if isinstance(index, sidekit.IdMap):
+             # (model, segment) couples stored in the file, in row order
+             stored = list(zip(h5f[prefix + "modelset"][()].astype('U', copy=False),
+                               h5f[prefix + "segset"][()].astype('U', copy=False)))
+             # Couples requested by the IdMap, as a set for O(1) lookup
+             requested = set(zip(index.leftids, index.rightids))
+
+             # Rows whose first-order statistics sum to zero are discarded;
+             # the dataset is read under the same prefix as all the others
+             empty_rows = h5f[prefix + "stat1"][()].sum(1) == 0
+
+             # dtype=int keeps the array usable as an index even when empty
+             idx = numpy.array([ii for ii, couple in enumerate(stored)
+                                if couple in requested and not empty_rows[ii]],
+                               dtype=int)
+
+         else:
+             # If some indices are higher than the size of the StatServer,
+             # they are replaced by the last index
+             last = len(h5f[prefix + "modelset"]) - 1
+             idx = numpy.minimum(numpy.array(index), last)
+
+         # Create the new StatServer by loading the correct sessions
+         statserver = sidekit.StatServer()
+         statserver.modelset = h5f[prefix + "modelset"][()][idx].astype('U', copy=False)
+         statserver.segset = h5f[prefix + "segset"][()][idx].astype('U', copy=False)
+
+         # start/stop equal to -1 in the file are left unset in the object arrays
+         tmpstart = h5f.get(prefix + "start")[()][idx]
+         tmpstop = h5f.get(prefix + "stop")[()][idx]
+         statserver.start = numpy.empty(idx.shape, '|O')
+         statserver.stop = numpy.empty(idx.shape, '|O')
+         statserver.start[tmpstart != -1] = tmpstart[tmpstart != -1]
+         statserver.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
+
+         statserver.stat0 = h5f[prefix + "stat0"][()][idx, :]
+         statserver.stat1 = h5f[prefix + "stat1"][()][idx, :]
+
+         return statserver
diff --git a/sidekit/sidekit/sv_utils.py b/sidekit/sidekit/sv_utils.py
new file mode 100755
index 0000000..37305e1
--- /dev/null
+++ b/sidekit/sidekit/sv_utils.py
@@ -0,0 +1,464 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`sv_utils` provides utilities to facilitate the work with SIDEKIT.
+"""
+import ctypes
+import copy
+import gzip
+import multiprocessing
+import numpy
+import os
+import pickle
+import re
+import scipy
+import sys
+if sys.version_info.major > 2:
+ from functools import reduce
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def save_svm(svm_file_name, w, b):
+    """Save SVM weights and bias in PICKLE format (gzip compressed)
+
+    The parent directory of the file is created when it does not exist.
+
+    :param svm_file_name: name of the file to write
+    :param w: weight coefficients of the SVM to store
+    :param b: bias of the SVM to store
+    """
+    directory = os.path.dirname(svm_file_name)
+    # A bare file name has no directory part: nothing to create in that case
+    # (os.makedirs('') would raise). exist_ok avoids a check-then-create race.
+    if directory:
+        os.makedirs(directory, exist_ok=True)
+    with gzip.open(svm_file_name, "wb") as f:
+        pickle.dump((w, b), f)
+
+
+def read_svm(svm_file_name):
+    """Load a SVM model stored as a gzip compressed PICKLE file
+
+    The file is un-pickled as is, so it must come from a trusted source.
+
+    :param svm_file_name: name of the file to read from
+
+    :return: a tuple made of the weights, with single-dimensional axes
+        removed, and the bias
+    """
+    with gzip.open(svm_file_name, "rb") as handle:
+        weights, bias = pickle.load(handle)
+    return numpy.squeeze(weights), bias
+
+
+def check_file_list(input_file_list, file_name_structure):
+    """Check the existence of a list of files in a specific directory
+    Return a new list with the existing segments and a list of indices
+    of those files in the original list. Return outputFileList and
+    idx such that inputFileList[idx] = outputFileList
+
+    :param input_file_list: list (or array) of file names
+    :param file_name_structure: structure of the filename to search for,
+        each name is inserted with ``file_name_structure.format(name)``
+
+    :return: an array of existing files and the array of indices
+        of the existing files in the input list
+    """
+    # Accept plain Python sequences as well as numpy arrays
+    input_file_list = numpy.asarray(input_file_list)
+    # dtype=bool keeps the mask usable for indexing when the input is empty
+    exist_files = numpy.array([os.path.isfile(file_name_structure.format(f)) for f in input_file_list],
+                              dtype=bool)
+    return input_file_list[exist_files], numpy.flatnonzero(exist_files)
+
+
+def initialize_iv_extraction_weight(ubm, T):
+    """
+    Pre-compute the matrices W and Tnorm used to approximate the i-vectors.
+    For more information, refers to [Glembeck09]_
+
+    :param ubm: Mixture object, Universal Background Model
+    :param T: Raw TotalVariability matrix as a ndarray
+
+    :return:
+      W: fixed matrix, sum over the distributions of the UBM weight times
+          the Gram matrix of the matching rows of Tnorm
+      Tnorm: total variability matrix whose rows are scaled by the square
+          root of the UBM inverse covariance super-vector
+    """
+    rank = T.shape[1]
+
+    # Scale every row of T by the square root of the matching inverse covariance
+    Tnorm = T * numpy.sqrt(ubm.get_invcov_super_vector()[:, numpy.newaxis])
+
+    # One sub-matrix of Tnorm per distribution of the UBM
+    blocks = numpy.array_split(Tnorm, ubm.distrib_nb())
+
+    # Accumulate the weighted Gram matrices
+    W = numpy.zeros((rank, rank))
+    for c, block in enumerate(blocks):
+        W = W + ubm.w[c] * numpy.dot(block.transpose(), block)
+
+    return W, Tnorm
+
+
+def initialize_iv_extraction_eigen_decomposition(ubm, T):
+    """Estimate matrices Q, D_bar_c and Tnorm, for approximation
+    of the i-vectors.
+    For more information, refers to [Glembeck09]_
+
+    :param ubm: Mixture object, Universal Background Model
+    :param T: Raw TotalVariability matrix
+
+    :return:
+      Q: eigenvectors (as columns) of the weighted sum of the per-distribution
+          Gram matrices of Tnorm
+      D_bar_c: one row per distribution, the diagonal of Q' * Tc' * Tc * Q
+      Tnorm: total variability matrix pre-normalized using the co-variance of the UBM
+    """
+    # Make sure the sub-module is loaded: a bare "import scipy" does not
+    # guarantee that scipy.linalg is available on every SciPy version
+    import scipy.linalg
+
+    distrib_nb = ubm.distrib_nb()
+
+    # Normalize the total variability matrix by using UBM co-variance
+    sqrt_invcov = numpy.sqrt(ubm.get_invcov_super_vector()[:, numpy.newaxis])
+    Tnorm = T * sqrt_invcov
+
+    # Split the Total Variability matrix into sub-matrices per distribution
+    Tnorm_c = numpy.array_split(Tnorm, distrib_nb)
+
+    # Compute the fixed matrix whose eigenvectors form Q
+    W = numpy.zeros((T.shape[1], T.shape[1]))
+    for c in range(distrib_nb):
+        W = W + ubm.w[c] * numpy.dot(Tnorm_c[c].transpose(), Tnorm_c[c])
+
+    # Only the eigenvectors are needed
+    _, Q = scipy.linalg.eig(W)
+
+    # Compute D_bar_c matrix which is the diagonal approximation of Tc' * Tc.
+    # diag(Q' Tc' Tc Q) is the column-wise sum of squares of Tc * Q, which
+    # avoids building the full r x r product for each distribution.
+    D_bar_c = numpy.zeros((distrib_nb, T.shape[1]))
+    for c in range(distrib_nb):
+        TQ = numpy.dot(Tnorm_c[c], Q)
+        D_bar_c[c, :] = numpy.sum(TQ * TQ, axis=0)
+    return Q, D_bar_c, Tnorm
+
+
+def initialize_iv_extraction_fse(ubm, T):
+ """Placeholder for the estimation of the matrices used by the
+ factorized sub-space approximation of the i-vectors.
+ For more information, refers to [Cumani13]_
+
+ The Python body is not implemented: it only contains ``pass``, so the
+ function returns None whatever its arguments. The commented-out
+ listing below is kept as a reference for a future implementation.
+
+ :param ubm: Mixture object, Universal Background Model (currently unused)
+ :param T: Raw TotalVariability matrix (currently unused)
+
+ :return: None
+ """
+ # NOTE(review): the listing below is not Python; it looks like the MATLAB
+ # reference implementation (function/fprintf/bsxfun/cellfun) - confirm.
+ # % Initialize the process
+ # %init = 1;
+ #
+ #
+ # Extract i-vectors by using different methods
+ #
+ # %rank_T = 10;
+ # %featureSize = 50;
+ # %distribNb = 32;
+ # %dictSize = 5;
+ # %dictSizePerDis=rank_T; % a modifier par la suite
+ #
+ # %ubm_file = 'gmm/world32.gmm';
+ # %t_file = 'mat/TV_32.matx';
+ #
+ #
+ #
+ # Load data
+ #
+ #
+ # % Load UBM for weight parameters that are used in the optimization
+ # % function
+ # %UBM = ALize_LoadModel(ubm_file);
+ #
+ # % Load meand from Minimum Divergence re-estimation
+ # %sv_mindiv = ALize_LoadVect(minDiv_file)';
+ #
+ # % Load T matrix
+ # %T = ALize_LoadMatrix(t_file)';
+ #
+ #
+ # function [O,PI,Q] = factorized_subspace_estimation(UBM,T,dictSize,outIterNb,inIterNb)
+ #
+ # rank_T = size(T,2);
+ # distribNb = size(UBM.W,2);
+ # featureSize = size(UBM.mu,1);
+ #
+ # % Normalize matrix T
+ # sv_invcov = reshape(UBM.invcov,[],1);
+ # T_norm = T .* repmat(sqrt(sv_invcov),1,rank_T);
+ #
+ # Tc{1,distribNb} = [];
+ # for ii=0:distribNb-1
+ # % Split the matrix in sub-matrices
+ # Tc{ii+1} = T_norm((ii*featureSize)+1:(ii+1)*featureSize,:);
+ # end
+ #
+ # % Initialize O and Qc by Singular Value Decomposition
+ # init_FSE.Qc{distribNb} = [];
+ # init_FSE.O{distribNb} = [];
+ # PI{distribNb} = [];
+ #
+ # for cc=1:distribNb
+ # init_FSE.Qc{cc} = zeros(featureSize,featureSize);
+ # init_FSE.O{cc} = zeros(featureSize,featureSize);
+ # PI{cc} = sparse(zeros(featureSize,dictSize*distribNb)); % a remplacer par une matrice sparse
+ # end
+ #
+ # % For each distribution
+ # for cc=1:distribNb
+ # fprintf('Initilize matrice for distribution %d / %d\n',cc,distribNb);
+ # % Initialized O with Singular vectors from SVD
+ # [init_FSE.O{cc},~,V] = svd(UBM.W(1,cc)*Tc{cc});
+ # init_FSE.Qc{cc} = V';
+ # end
+ #
+ #
+ # % Concatenate Qc to create the matrix Q: A MODIFIER POUR DISSOCIER
+ # % dictSize DU NOMBRE DE DISTRIBUTIONS
+ # Q = [];
+ # for cc=1:distribNb
+ # Q = [Q;init_FSE.Qc{cc}(1:dictSize,:)];
+ # end
+ # O = init_FSE.O;
+ # clear 'init_FSE'
+ #
+ #
+ # % OUTER iteration process : update Q iteratively
+ # for it = 1:outIterNb
+ #
+ # fprintf('Start iteration %d / %d for Q re-estimation\n',it,5);
+ #
+ # % INNER iteration process: update PI and O iteratively
+ # for pioIT = 1:inIterNb
+ # fprintf(' Start iteration %d / %d for PI and O re-estimation\n',pioIT,10);
+ #
+ # % Update PI
+ # %Compute diagonal terms of QQ'
+ # % diagQ = diag(Q*Q');
+ #
+ # for cc=1:distribNb
+ #
+ # % Compute optimal k and optimal v
+ # % for each line f of PI{cc}
+ # A = O{cc}'*Tc{cc}*Q';
+ # f = 1;
+ # while (f < size(A,1)+1)
+ #
+ # if(f == 1)
+ # A = O{cc}'*Tc{cc}*Q'; % equation (26)
+ # PI{cc} = sparse(zeros(featureSize,dictSize*distribNb));
+ # end
+ #
+ # % Find the optimal index k
+ # [~,k] = max(A(f,:).^2);
+ # k_opt = k;
+ #
+ # % Find the optimal value v
+ # v_opt = A(f,k_opt);
+ #
+ # % Update the line of PI{cc} with the value v_opt in the
+ # % k_opt-th column
+ # PI{cc}(f,k_opt) = v_opt;
+ #
+ # % if the column already has a non-zero element,
+ # % update O and PI
+ # I = find(PI{cc}(:,k_opt)~=0);
+ # if size(I,1)>1
+ # % get indices of the two lines of PI{cc} which
+ # % have a non-zero element on the same column
+ # a = I(1);
+ # b = I(2);
+ #
+ # % Replace column O{cc}(:,a) and O{cc}(:,b)
+ # Oa = (PI{cc}(a,k_opt)*O{cc}(:,a)+PI{cc}(b,k_opt)*O{cc}(:,b))/
+ # (sqrt(PI{cc}(a,k_opt)^2+PI{cc}(b,k_opt)^2));
+ # Ob = (PI{cc}(a,k_opt)*O{cc}(:,b)-PI{cc}(b,k_opt)*O{cc}(:,a))/
+ # (sqrt(PI{cc}(a,k_opt)^2+PI{cc}(b,k_opt)^2));
+ # O{cc}(:,a) = Oa;
+ # O{cc}(:,b) = Ob;
+ #
+ # PI{cc}(a,k_opt) = sqrt(PI{cc}(a,k_opt)^2+PI{cc}(b,k_opt)^2);
+ # PI{cc}(b,k_opt) = 0;
+ #
+ # f = 0;
+ # end
+ # f =f +1;
+ # end
+ # end
+ #
+ # obj = computeObjFunc(UBM.W,Tc,O,PI,Q);
+ # fprintf('Objective Function after estimation of PI = %2.10f\n',obj);
+ #
+ # % Update O
+ # for cc=1:distribNb
+ #
+ # % Compute
+ # Z = PI{cc}*Q*Tc{cc}';
+ #
+ # % Compute Singular value decomposition of Z
+ # [Uz,~,Vz] = svd(Z);
+ #
+ # % Compute the new O{cc}
+ # O{cc} = Vz*Uz';
+ # end
+ # obj = computeObjFunc(UBM.W,Tc,O,PI,Q);
+ # fprintf('Objective Function after estimation of O = %2.10f\n',obj);
+ # end % END OF INNER ITERATION PROCESS
+ #
+ #
+ # % Update Q
+ # D = sparse(zeros(size(PI{cc},2),size(PI{cc},2)));
+ # E = zeros(size(PI{cc},2),rank_T);
+ # for cc=1:distribNb
+ # % Accumulate D
+ # D = D + UBM.W(1,cc) * PI{cc}'*PI{cc};
+ #
+ # % Accumulate the second term
+ # E = E + UBM.W(1,cc) * PI{cc}'*O{cc}'*Tc{cc};
+ # end
+ # Q = D\E;
+ #
+ # % Normalize rows of Q and update PI accordingly
+ #
+ # % Compute norm of each row
+ # c1 = bsxfun(@times,Q,Q);
+ # c2 = sum(c1,2);
+ # c3 = sqrt(c2);
+ # Q = bsxfun(@rdivide,Q,c3);
+ #
+ # % Update PI accordingly
+ # PI = cellfun(@(x)x*sparse(diag(c3)), PI, 'uni',false);
+ #
+ # obj = computeObjFunc(UBM.W,Tc,O,PI,Q);
+ # fprintf('Objective Function after re-estimation of Q = %2.10f\n',obj);
+ # end
+ # end
+ pass
+
+
+def clean_stat_server(statserver):
+    """
+    Remove, in place, the sessions whose zero-order statistics sum to zero.
+
+    The attributes modelset, segset, start, stop, stat0 and stat1 are all
+    filtered with the same mask, the result is validated and the number of
+    removed sessions is printed.
+
+    :param statserver: the StatServer to clean, modified in place
+    :return: None
+    """
+    # True for the sessions to keep, i.e. those with a non-zero stat0 sum
+    keep = ~(statserver.stat0.sum(axis=1) == 0.)
+
+    # One-dimensional, per-session attributes
+    for attribute in ("modelset", "segset", "start", "stop"):
+        setattr(statserver, attribute, getattr(statserver, attribute)[keep])
+
+    # Two-dimensional statistics: filter the rows
+    statserver.stat0 = statserver.stat0[keep, :]
+    statserver.stat1 = statserver.stat1[keep, :]
+
+    assert statserver.validate(), "Error after cleaning StatServer"
+    print("Removed {} empty sessions in StatServer".format((~keep).sum()))
+
+
+def parse_mask(mask):
+    """
+    Convert a textual mask such as ``"[0-12, 14, 20-22]"`` into a list of indices.
+
+    All whitespace is ignored and the first and last characters (the
+    enclosing brackets) are dropped. The remainder is a comma separated list
+    of single indices or inclusive ``first-last`` ranges.
+
+    :param mask: the mask as a string
+    :return: the list of selected indices, in the order they are given
+    :raise Exception: if the mask contains an unexpected character or a
+        range made of more than two bounds
+    """
+    # Raw string: "\s" in a plain literal is an invalid escape sequence
+    content = re.sub(r"\s", "", mask)[1:-1]
+    if not set(content).issubset(set("0123456789-,")):
+        raise Exception("Wrong mask format")
+
+    indices = []
+    for item in content.split(','):
+        bounds = item.split('-')
+        if len(bounds) == 1:
+            # A single index is the range going from itself to itself
+            bounds = bounds * 2
+        if len(bounds) != 2:
+            raise Exception("Wrong mask format")
+        indices += list(range(int(bounds[0]), int(bounds[1]) + 1))
+    return indices
+
+
+def segment_mean_std_hdf5(input_segment, in_context=False):
+    """
+    Compute the number of frames, the sum and the sum of squares of the
+    features of one segment loaded through a features server.
+
+    :param input_segment: tuple of 5 values: the features server, the name of
+        the show, the index of the first frame, the index of the last frame
+        and the in_context flag
+    :param in_context: overwritten by the last element of input_segment
+    :return: a tuple of three values, the number of frames, the sum of frames and the sum of squares
+    """
+    features_server, show, start, stop, in_context = input_segment
+
+    if start is not None and stop is not None and in_context:
+        left, right = features_server.context[0], features_server.context[1]
+        # Load the segment of frames enlarged by the left and right context
+        feat, _ = features_server.load(show,
+                                       start=start - left,
+                                       stop=stop + right)
+        # Build the features in context, leaving out the added border frames
+        feat, _ = features_server.get_context(feat=feat,
+                                              label=None,
+                                              start=left,
+                                              stop=feat.shape[0] - right)
+    else:
+        feat, _ = features_server.load(show,
+                                       start=start,
+                                       stop=stop)
+
+    frame_nb = feat.shape[0]
+    return frame_nb, feat.sum(axis=0), numpy.sum(feat**2, axis=0)
+
+
+def mean_std_many(features_server, seg_list, in_context=False, num_thread=1):
+    """
+    Accumulate first and second order moments over a list of segments,
+    the segments being processed in parallel by a pool of processes.
+
+    :param features_server: server used to load the features; a deep copy is
+        handed to each job
+    :param seg_list: list of segments, either show names (whole file) or
+        tuples whose first three elements are (show, start, stop)
+    :param in_context: flag forwarded to segment_mean_std_hdf5
+    :param num_thread: number of processes in the pool
+    :return: a tuple of three values, the number of frames, the mean of the
+        frames and the mean of the squared frames (second order moment, the
+        squared mean is NOT subtracted)
+    :raise TypeError: if the elements of seg_list are neither tuples nor strings
+    """
+    if isinstance(seg_list[0], tuple):
+        inputs = [(copy.deepcopy(features_server), seg[0], seg[1], seg[2], in_context) for seg in seg_list]
+    elif isinstance(seg_list[0], str):
+        inputs = [(copy.deepcopy(features_server), seg, None, None, in_context) for seg in seg_list]
+    else:
+        # Fail with an explicit message instead of an unbound local variable
+        raise TypeError("seg_list must contain tuples (show, start, stop) or show names")
+
+    pool = multiprocessing.Pool(processes=num_thread)
+    try:
+        res = pool.map(segment_mean_std_hdf5, inputs)
+    finally:
+        # Release the worker processes even when a job fails
+        pool.terminate()
+
+    total_N = 0
+    total_F = 0
+    total_S = 0
+    for N, F, S in res:
+        total_N += N
+        total_F += F
+        total_S += S
+    return total_N, total_F / total_N, total_S / total_N
+
+
+def serialize(M):
+    """
+    Copy an array into a buffer allocated with multiprocessing.Array.
+
+    :param M: ndarray to copy; float32 arrays are stored as C floats, any
+        other dtype is stored as C doubles
+    :return: a ndarray with the shape and content of M, viewing the newly
+        allocated multiprocessing buffer
+    """
+    ct = ctypes.c_float if M.dtype == numpy.float32 else ctypes.c_double
+    tmp_M = multiprocessing.Array(ct, M.size)
+    shared = numpy.ctypeslib.as_array(tmp_M.get_obj()).reshape(M.shape)
+    # The buffer is created zero-filled: copy the data so that the content
+    # of M is not silently dropped
+    shared[...] = M
+    return shared
diff --git a/sidekit/sidekit/svm_scoring.py b/sidekit/sidekit/svm_scoring.py
new file mode 100755
index 0000000..1b62e27
--- /dev/null
+++ b/sidekit/sidekit/svm_scoring.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`svm_scoring` provides functions to perform speaker verification
+by using Support Vector Machines.
+"""
+import ctypes
+import numpy
+import multiprocessing
+import logging
+import sidekit.sv_utils
+from sidekit.bosaris import Ndx
+from sidekit.bosaris import Scores
+from sidekit.statserver import StatServer
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def svm_scoring_singleThread(svm_filename_structure, test_sv, ndx, score, seg_idx=None):
+    """Compute scores for SVM verification on a single thread
+    (two classes only as implemented at the moment)
+
+    The scores are written in place in ``score.scoremat``; nothing is returned.
+
+    :param svm_filename_structure: structure of the filename where to load the SVM models
+    :param test_sv: StatServer object of super-vectors. stat0 are set to 1 and stat1 are the super-vector to classify
+    :param ndx: Ndx object of the trials to perform
+    :param score: Scores object to fill
+    :param seg_idx: list of segments to classify. Classify all if None.
+    """
+    assert isinstance(test_sv, StatServer), 'Second parameter should be a StatServer'
+    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
+
+    if seg_idx is None:
+        seg_idx = range(ndx.segset.shape[0])
+
+    # Load SVM models, one row of weights and one bias per model of the Ndx
+    Msvm = numpy.zeros((ndx.modelset.shape[0], test_sv.stat1.shape[1]))
+    bsvm = numpy.zeros(ndx.modelset.shape[0])
+    for m in range(ndx.modelset.shape[0]):
+        svm_file_name = svm_filename_structure.format(ndx.modelset[m])
+        w, b = sidekit.sv_utils.read_svm(svm_file_name)
+        Msvm[m, :] = w
+        bsvm[m] = b
+
+    # Compute scores against all test segments
+    # NOTE(review): row ts of test_sv.stat1 is assumed to be the segment
+    # ndx.segset[ts]; confirm that the caller aligns both.
+    for ts in seg_idx:
+        logging.info('Compute trials involving test segment %d/%d', ts + 1, ndx.segset.shape[0])
+
+        # Rows of the models to test with the current segment, read directly
+        # from the trial mask (assumes model names are unique in ndx.modelset)
+        idx_ndx = numpy.flatnonzero(ndx.trialmask[:, ts])
+        if idx_ndx.size == 0:
+            # No trial for this segment: nothing to score
+            continue
+
+        scores = numpy.dot(Msvm[idx_ndx, :], test_sv.stat1[ts, :]) + bsvm[idx_ndx]
+
+        # Fill the score matrix
+        score.scoremat[idx_ndx, ts] = scores
+
+
+def svm_scoring(svm_filename_structure, test_sv, ndx, num_thread=1):
+    """Score SVM verification trials with several processes
+    (two classes only as implemented at the moment)
+
+    The models without a SVM file and the segments missing from test_sv are
+    removed from the trials, then the segments are split in num_thread
+    chunks, each chunk being scored by its own process in a score matrix
+    allocated with multiprocessing.Array.
+
+    :param svm_filename_structure: structure of the filename where to load the SVM models
+    :param test_sv: StatServer object of super-vectors. stat0 are set to 1 and stat1
+          are the super-vector to classify
+    :param ndx: Ndx object of the trials to perform
+    :param num_thread: number of processes to launch in parallel
+
+    :return: a Score object.
+    """
+    # Keep only the models having a SVM file and the segments of test_sv
+    existing_models, _ = sidekit.sv_utils.check_file_list(ndx.modelset, svm_filename_structure)
+    clean_ndx = ndx.filter(existing_models, test_sv.segset, True)
+
+    score = Scores()
+    score.scoremat = numpy.zeros(clean_ndx.trialmask.shape)
+    score.modelset = clean_ndx.modelset
+    score.segset = clean_ndx.segset
+    score.scoremask = clean_ndx.trialmask
+
+    # Replace the score matrix by a view on a multiprocessing buffer
+    shared_buffer = multiprocessing.Array(ctypes.c_double, score.scoremat.size)
+    score.scoremat = numpy.ctypeslib.as_array(shared_buffer.get_obj())
+    score.scoremat = score.scoremat.reshape(score.modelset.shape[0], score.segset.shape[0])
+
+    # One process per chunk of segments
+    workers = []
+    for chunk in numpy.array_split(numpy.arange(clean_ndx.segset.shape[0]), num_thread):
+        worker = multiprocessing.Process(target=svm_scoring_singleThread,
+                                         args=(svm_filename_structure, test_sv, clean_ndx, score, chunk))
+        workers.append(worker)
+        worker.start()
+
+    # Wait for all the chunks to be scored
+    for worker in workers:
+        worker.join()
+
+    return score
diff --git a/sidekit/sidekit/svm_training.py b/sidekit/sidekit/svm_training.py
new file mode 100755
index 0000000..ceedbfb
--- /dev/null
+++ b/sidekit/sidekit/svm_training.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of SIDEKIT.
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is a python package for speaker verification.
+# Home page: http://www-lium.univ-lemans.fr/sidekit/
+#
+# SIDEKIT is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the License,
+# or (at your option) any later version.
+#
+# SIDEKIT is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Copyright 2014-2021 Anthony Larcher
+
+:mod:`svm_training` provides utilities to train Support Vector Machines
+to perform speaker verification.
+"""
+import numpy
+import os
+import logging
+from sidekit.libsvm.svmutil import svm_problem, svm_parameter, svm_train
+from sidekit.statserver import StatServer
+import multiprocessing
+import sidekit.sv_utils
+
+
+__license__ = "LGPL"
+__author__ = "Anthony Larcher"
+__copyright__ = "Copyright 2014-2021 Anthony Larcher"
+__maintainer__ = "Anthony Larcher"
+__email__ = "anthony.larcher@univ-lemans.fr"
+__status__ = "Production"
+__docformat__ = 'reStructuredText'
+
+
+def svm_training_singleThread(K, msn, bsn, svm_dir, background_sv, models, enroll_sv):
+ """Train Support Vector Machine classifiers for a two-class task
+ (as implemented for now, but might change in the future to include multi-class
+ classification). One '<model>.svm' file is saved in svm_dir per model.
+
+ :param K: pre-computed part of the Gram matrix (copied into its top-left bsn x bsn block)
+ :param msn: maximum number of sessions to train a SVM
+ :param bsn: number of sessions used as background impostors
+ :param svm_dir: directory where to store the SVM models
+ :param background_sv: StatServer of super-vectors for background impostors. All
+ super-vectors are used without selection
+ :param models: list of models to train. The models must be included in the
+ enroll_sv StatServer
+ :param enroll_sv: StatServer of super-vectors used for the target models
+ """
+ gram = numpy.zeros((bsn + msn, bsn + msn))
+ gram[:bsn, :bsn] = K
+ # labels of the target examples are set to 1
+ # labels of the impostor vectors are set to 2 (the bsn impostors come first)
+ K_label = (2 * numpy.ones(bsn, 'int')).tolist() + numpy.ones(msn, 'int').tolist()
+
+ for model in models:
+ logging.info('Train SVM model for %s', model)
+ # Compute the part of the kernel which depends on the enrollment data
+ csn = enroll_sv.get_model_segments(model).shape[0]
+ X = numpy.vstack((background_sv.stat1, enroll_sv.get_model_stat1(model)))
+ gram[:bsn + csn, bsn:bsn + csn] = numpy.dot(X, enroll_sv.get_model_stat1(model).transpose())
+ gram[bsn:bsn + csn, :bsn] = gram[:bsn, bsn:bsn + csn].transpose()
+# NOTE(review): gram appears to be reused across models; entries past bsn + csn are not reset - confirm
+ # train the SVM for the current model (where libsvm is used)
+ Kernel = numpy.zeros((gram.shape[0], gram.shape[1] + 1)).tolist()
+ for i in range(gram.shape[0]):
+ Kernel[i][0] = int(i + 1)
+ Kernel[i][1:] = gram[i, ]
+# Each Kernel row is a 1-based row number followed by the matching row of gram
+ # isKernel=True must be set for precomputed kernel
+ # Precomputed kernel data (-t 4)
+ prob = svm_problem(K_label, Kernel, isKernel=True)
+ c = 1 / numpy.mean(numpy.diag(gram))
+ param = svm_parameter('-t 4 -c {}'.format(c))
+ svm = svm_train(prob, param)
+ # Compute the weights (the - 1 presumably maps libsvm's 1-based SV indices to rows of X)
+ w = -numpy.dot(X[numpy.array(svm.get_sv_indices()) - 1, ].transpose(), numpy.array(svm.get_sv_coef()))
+ bsvm = svm.rho[0]
+ svmFileName = os.path.join(svm_dir, model + '.svm')
+ sidekit.sv_utils.save_svm(svmFileName, w, bsvm)
+
+
+def svm_training(svmDir, background_sv, enroll_sv, num_thread=1):
+ """Train Support Vector Machine classifiers for a two-class task
+ (as implemented for now, but might change in the future to include multi-class
+ classification).
+ Training is parallelized over several processes (multiprocessing.Process).
+
+ :param svmDir: directory where to store the SVM models
+ :param background_sv: StatServer of super-vectors for background impostors. All
+ super-vectors are used without selection
+ :param enroll_sv: StatServer of super-vectors used for the target models
+ :param num_thread: number of sub-lists of models, each trained in its own process
+ """
+ assert isinstance(background_sv, StatServer), 'Second parameter has to be a StatServer'
+ assert isinstance(enroll_sv, StatServer), 'Third parameter has to be a StatServer'
+
+ # The effective kernel is initialized for the multi-session case
+ # by considering the maximum number of sessions per speaker (msn).
+ # For the SVM training, only a sub-part of the kernel is used, according
+ # to the number of sessions of the current speaker
+ K = background_sv.precompute_svm_kernel_stat1()
+ msn = max([enroll_sv.modelset.tolist().count(a) for a in enroll_sv.modelset.tolist()])
+ bsn = K.shape[0]
+
+ # Split the list of unique model names into num_thread sub-lists
+ listOfModels = numpy.array_split(numpy.unique(enroll_sv.modelset), num_thread)
+
+ # Process each sub-list of models in a separate process (idx is not used)
+ jobs = []
+ for idx, models in enumerate(listOfModels):
+ p = multiprocessing.Process(target=svm_training_singleThread,
+ args=(K, msn, bsn, svmDir, background_sv, models, enroll_sv))
+ jobs.append(p)
+ p.start()
+ for p in jobs:
+ p.join()
+
+