diff --git a/.gitignore b/.gitignore index df38727..9ab6c7b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,142 @@ kaldi corpora kaldifeat *.egg-info/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/install.sh b/install.sh index 6845fec..d33b5f6 100755 --- a/install.sh +++ b/install.sh @@ -6,19 +6,28 @@ nj=$(nproc) home=$PWD -# python/CONDA +# CUDA version +CUDAROOT=/usr/local/cuda +if [ "$(id -g --name)" == "lium" ]; then + CUDAROOT=/opt/cuda/10.2 # LIUM Cluster + echo "Using local \$CUDAROOT: $CUDAROOT" +fi + +[ ! -d $CUDAROOT ] && echo "CUDAROOT: '$FILE' does not exist." && exit 1 + +cuda_version=$($CUDAROOT/bin/nvcc --version | grep "Cuda compilation tools" | cut -d" " -f5 | sed s/,//) +cuda_version_witout_dot=$(echo $cuda_version | xargs | sed 's/\.//') + +# CONDA conda_url=https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh conda_url=https://repo.anaconda.com/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh -# PYTORCH version +# PYTORCH torch_version=1.8.2 torchvision_version=0.9.2 torchaudio_version=0.8.2 -torch_wheels="https://download.pytorch.org/whl/lts/1.8/torch_lts.html" - -# CUDA version -cuda_version=10.2 +torch_wheels="https://download.pytorch.org/whl/lts/1.8/torch_lts.html" venv_dir=$PWD/venv @@ -35,38 +44,24 @@ if [ ! -f $mark ]; then . 
$venv_dir/bin/activate echo "Installing conda dependencies" - yes | conda install -c conda-forge sox || exit 1 - yes | conda install -c conda-forge libflac || exit 1 - yes | conda install -c conda-forge cudatoolkit=$cuda_version || exit 1 + yes | conda install -c conda-forge sox + yes | conda install -c conda-forge libflac touch $mark fi source $venv_dir/bin/activate -exit 0 - -# CUDA version -CUDAROOT=/usr/local/cuda -if [ "$(id -g --name)" == "lium" ]; then - CUDAROOT=/opt/cuda/10.2 # LIUM Cluster - echo "Using local \$CUDAROOT: $CUDAROOT" -fi -_cuda_version=$($CUDAROOT/bin/nvcc --version | grep "Cuda compilation tools" | cut -d" " -f5 | sed s/,//) -if [[ $cuda_version != $_cuda_version ]]; then - echo "CUDA env not properly setup! (installed cuda v$cuda_version != in path cuda v$_cuda_version)" - exit 1 -fi -cuda_version_witout_dot=$(echo $cuda_version | xargs | sed 's/\.//') - export PATH=$CUDAROOT/bin:$PATH export LD_LIBRARY_PATH=$CUDAROOT/lib64:$LD_LIBRARY_PATH export CFLAGS="-I$CUDAROOT/include $CFLAGS" export CUDA_HOME=$CUDAROOT export CUDA_PATH=$CUDAROOT + echo "if [ \$(which python) != $venv_dir/bin/python ]; then source $venv_dir/bin/activate; fi; export CUDAROOT=$CUDAROOT; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH;" > env.sh mark=.done-pytorch if [ ! 
-f $mark ]; then echo " == Installing pytorch $torch_version for cuda $cuda_version ==" + # pip3 install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html pip3 install torch==$torch_version+cu$cuda_version_witout_dot torchvision==$torchvision_version+cu$cuda_version_witout_dot torchaudio==$torchaudio_version -f $torch_wheels cd $home touch $mark diff --git a/sidekit/.gitignore b/sidekit/.gitignore new file mode 100755 index 0000000..b9e6703 --- /dev/null +++ b/sidekit/.gitignore @@ -0,0 +1,3 @@ +*.pyc +*.DS_Store +docs diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/.gitignore b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/.gitignore new file mode 100644 index 0000000..403c167 --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/.gitignore @@ -0,0 +1,3 @@ +log +model +.ipynb_checkpoints diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb12_ssd.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb12_ssd.yaml new file mode 100644 index 0000000..dcf9d8e --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb12_ssd.yaml @@ -0,0 +1,70 @@ +# Dataset description + + +# General options +data_path: /lium/scratch/larcher/ +data_file_extension: .wav +dataset_csv: list/vox12.csv + +# DEV4S smaller dataset: +# awk -F, 'x[$1]<10;{x[$1]++}' vox12.csv > vox12_DEV.csv +# dataset_csv: list/vox12_DEV.csv + +# DEV4S smaller dataset (with only long utterances): +# (head -n 2 list/vox12_DEV.csv ; cat list/vox12_DEV.csv | sort -t , -k 5,5 -rn | head -n 7204) > list/vox12_DEV_long.csv +# tail -n +2 list/vox12_DEV.csv | cut -d , -f 1 | sort | uniq > /tmp/len +# tail -n +2 list/vox12_DEV_long.csv | cut -d, -f2- > /tmp/vox12_DEV_long_side.csv +# paste -d',' /tmp/len 
/tmp/vox12_DEV_long_side.csv > /tmp/vox12_DEV_long_wrongspkid.csv +# (head -n 2 list/vox12_DEV.csv; cat /tmp/vox12_DEV_long_wrongspkid.csv) > list/vox12_DEV_long.csv +# dataset_csv: list/vox12_DEV_long_sorted.csv + +sample_rate: 16000 + +validation_ratio: 0.02 +batch_size: 512 + +# Training set +train: + duration: 3. + chunk_per_segment: -1 + overlap: 3. + + sampler: + examples_per_speaker: 1 + samples_per_speaker: 100 + augmentation_replica: 1 + + transform_number: 1 + + transformation: + pipeline: add_reverb,add_noise,filtering,phone_filtering,codec + spec_aug: 0.5 + temp_aug: 0.5 + + add_noise: + noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv + data_path: /lium/scratch/larcher/musan_split/ + + add_reverb: + rir_db_csv: list/reverb.csv + data_path: /lium/scratch/amehrish/Database/REVERB_UPDATED/REVERB/RIRS_NOISES/ + +# Validation set +valid: + duration: 3. + + transformation: + pipeline: + spec_aug: 0.5 + temp_aug: 0.5 + + add_noise: + noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv + data_path: /lium/scratch/larcher/musan_split/ + +# Test set +test: + idmap: /lium/raid01_c/larcher/data/vox1_test_cleaned_idmap.h5 + ndx: /lium/raid01_c/larcher/data/vox1_test_cleaned_ndx.h5 + key: /lium/raid01_c/larcher/data/vox1_test_cleaned_key.h5 + data_path: /lium/scratch/larcher/voxceleb1/test/wav diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb2_ssd.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb2_ssd.yaml new file mode 100644 index 0000000..accfb3e --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/VoxCeleb2_ssd.yaml @@ -0,0 +1,59 @@ +# Dataset description + + +# General options +data_path: /lium/scratch/larcher/ +data_file_extension: .wav +dataset_csv: list/voxceleb2_dev.csv +sample_rate: 16000 + + +validation_ratio: 0.02 +batch_size: 256 + + +# Training set +train: + duration: 4 + chunk_per_segment: -1 + 
overlap: 3. + + sampler: + examples_per_speaker: 1 + samples_per_speaker: 100 + augmentation_replica: 1 + + transform_number: 1 + + transformation: + pipeline: add_reverb,add_noise,filtering,codec + spec_aug: 0.5 + temp_aug: 0.5 + + add_noise: + noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv + data_path: /lium/scratch/larcher/musan_split/ + + add_reverb: + rir_db_csv: /lium/home/amehrish/Database/REVERB_UPDATED/REVERB/Reverb.csv + data_path: /lium/home/amehrish/Database/REVERB_UPDATED/REVERB/RIRS_NOISES/ + +# Validation set +valid: + duration: 4. + + transformation: + pipeline: + spec_aug: 0.5 + temp_aug: 0.5 + + add_noise: + noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv + data_path: /lium/scratch/larcher/musan_split/ + +# Test set +test: + idmap: /lium/raid01_c/larcher/data/vox1_test_cleaned_idmap.h5 + ndx: /lium/raid01_c/larcher/data/vox1_test_cleaned_ndx.h5 + key: /lium/raid01_c/larcher/data/vox1_test_cleaned_key.h5 + data_path: /lium/scratch/larcher/voxceleb1/test/wav diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/allies.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/allies.yaml new file mode 100644 index 0000000..ce08a16 --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/allies.yaml @@ -0,0 +1,55 @@ +# Dataset description + + +# General options +data_path: /lium/scratch/larcher/ALLIES_train_split/wav/ +data_file_extension: .wav +dataset_csv: /lium/raid01_c/larcher/data/allies_train_split.csv +sample_rate: 16000 + + +validation_ratio: 0.1 +batch_size: 64 + + +# Training set +train: + duration: 1.5 + chunk_per_segment: -1 + overlap: 1.4 + + transformation: + pipeline: add_noise + spec_aug: 0.5 + temp_aug: 0.5 + + add_noise: + noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv + data_path: /lium/scratch/larcher/musan_split/ + + add_reverb: + + +# Validation set +valid: + duration: 1.5 + + transformation: + 
pipeline: + spec_aug: 0.5 + temp_aug: 0.5 + + add_noise: + noise_db_csv: /lium/raid01_c/larcher/data/musan_split.csv + data_path: /lium/scratch/larcher/musan_split/ + + add_reverb: + +# Test set +test: + idmap: /lium/raid01_c/larcher/data/allies_dev_verif_idmap.h5 + ndx: /lium/raid01_c/larcher/data/allies_dev_verif_ndx.h5 + key: /lium/raid01_c/larcher/data/allies_dev_verif_key.h5 + data_path: /lium/corpus/base/ALLIES/wav + + diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/model.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/model.yaml new file mode 100644 index 0000000..43e9fd3 --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/model.yaml @@ -0,0 +1,20 @@ +# Model description + +speaker_number: 7205 + +loss: + type: aam + aam_margin: 0.2 + aam_s: 30 + +# Initialize model from file, reset and freeze parts of it +# initial_model_name: ./model/tmp_half_clr_adam_aam0.2_30_b256_vox12.pt +initial_model_name: +reset_parts: [] +freeze_parts: [] + +# Model can be fastresnet34, resnet34, xvector or custom +# if custom, then need more options +model_type: halfresnet34 + +feature_size: 80 diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/training.yaml b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/training.yaml new file mode 100644 index 0000000..e2af66b --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/cfg/training.yaml @@ -0,0 +1,42 @@ +# Training description + + +# General options +log_file: log/aam_CLR2-40000-1e-10_adam.log +torch_seed: 42 +numpy_seed: 42 +random_seed: 42 +deterministic: false + +epochs: 1000 +lr: 0.000001 +patience: 100 + +multi_gpu: true +num_cpu: 8 + +mixed_precision: true +clipping: false + +# Optimizer and scheduler options +optimizer: + type: adam + options: + +scheduler: + type: CyclicLR + mode: triangular2
+ base_lr: 0.0000001 + step_size_up: 40000 + + + +# Evaluation options +compute_test_eer: true +log_interval: 10 +validation_frequency: 1 + +# Save options +tmp_model_name: model/tmp_half_clr_adam_aam0.2_30_b256_vox12.pt +best_model_name: model/best_halp_clr_adam_aam0.2_30_b256_vox12.pt +checkpoint_frequency: diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/h5f/.gitkeep b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/h5f/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/list/.gitkeep b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/list/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train.sh b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train.sh new file mode 100755 index 0000000..3e2a650 --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +export NUM_NODES=1 +export NUM_GPUS_PER_NODE=2 +export NODE_RANK=0 +export WORLD_SIZE=$(($NUM_NODES * $NUM_GPUS_PER_NODE)) + +# env DEV4S=True WORLD_SIZE=1 ipython3 train_xtractor.py # DEV + +cd ../../.. +. 
./env.sh +cd - + +python3 -m torch.distributed.launch \ + --nproc_per_node=$NUM_GPUS_PER_NODE \ + --nnodes=$NUM_NODES \ + --node_rank $NODE_RANK \ + train_xtractor.py diff --git a/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train_xtractor.py b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train_xtractor.py new file mode 100644 index 0000000..1862138 --- /dev/null +++ b/sidekit/egs/2021_04_26_Half_CLR40000_1e-10_adam_aam0.2_vox12_3sec_2aug_BN/train_xtractor.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import torch +import sidekit +import yaml + +from argparse import ArgumentParser + +def main(): + parser = ArgumentParser('DDP usage example') + parser.add_argument('--local_rank', type=int, default=-1, metavar='N', help='Local process rank.') + args = parser.parse_args() + + # keep track of whether the current process is the `master` process (totally optional, but I find it useful for data laoding, logging, etc.) + args.is_master = args.local_rank == 0 + + sidekit.nnet.xvector.xtrain(dataset_description="cfg/VoxCeleb12_ssd.yaml", + model_description="cfg/model.yaml", + training_description="cfg/training.yaml") + + +if __name__ == '__main__': + main() + diff --git a/sidekit/setup.py b/sidekit/setup.py new file mode 100644 index 0000000..25f5c4b --- /dev/null +++ b/sidekit/setup.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +from distutils.core import setup + +setup(name='sidekit', + version='1.0', + ) diff --git a/sidekit/sidekit/__init__.py b/sidekit/sidekit/__init__.py new file mode 100755 index 0000000..ef36a52 --- /dev/null +++ b/sidekit/sidekit/__init__.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. 
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>. + +""" +Copyright 2014-2021 Anthony Larcher and Sylvain Meignier +""" + +from ctypes import * +from ctypes.util import find_library +import importlib +import logging +import numpy +import os +import sys + + +# Read environment variable if it exists +SIDEKIT_CONFIG={"libsvm":False, + "mpi":False, + "cuda":True + } + +if 'SIDEKIT' in os.environ: + for cfg in os.environ['SIDEKIT'].split(","): + k, val = cfg.split("=") + if k == "libsvm": + if val == "false": + SIDEKIT_CONFIG["libsvm"] = False + elif k == "mpi": + if val == "true": + SIDEKIT_CONFIG["mpi"] = True + if k == "cuda": + if val == "true": + SIDEKIT_CONFIG["cuda"] = True + + +PARALLEL_MODULE = 'multiprocessing' # can be threading or multiprocessing; MPI is planned in the future +PARAM_TYPE = numpy.float32 +STAT_TYPE = numpy.float64 # can be numpy.float32 to speed up the computation but can lead to numerical issues + +# Import bosaris-like classes +from .bosaris import IdMap +from .bosaris import Ndx +from .bosaris import Key +from .bosaris import Scores +from .bosaris import DetPlot +from .bosaris import effective_prior +from .bosaris import logit_effective_prior +from .bosaris import fast_minDCF + +# Import classes +from .features_extractor import FeaturesExtractor +from .features_server import FeaturesServer +from .mixture import Mixture,
vad_energy +from .statserver import StatServer +from .factor_analyser import FactorAnalyser + +from .frontend.io import write_pcm +from .frontend.io import read_pcm +from .frontend.io import pcmu2lin +from .frontend.io import read_sph +from .frontend.io import write_label +from .frontend.io import read_label +from .frontend.io import read_spro4 +from .frontend.io import read_audio +from .frontend.io import write_spro4 +from .frontend.io import read_htk +from .frontend.io import write_htk + +from .frontend.vad import vad_energy +from .frontend.vad import vad_snr +from .frontend.vad import label_fusion +from .frontend.vad import speech_enhancement + +from .frontend.normfeat import cms +from .frontend.normfeat import cmvn +from .frontend.normfeat import stg +from .frontend.normfeat import rasta_filt + +from .frontend.features import compute_delta +from .frontend.features import framing +from .frontend.features import pre_emphasis +from .frontend.features import trfbank +from .frontend.features import mel_filter_bank +from .frontend.features import mfcc +from .frontend.features import pca_dct +from .frontend.features import shifted_delta_cepstral + +from .iv_scoring import cosine_scoring +from .iv_scoring import mahalanobis_scoring +from .iv_scoring import two_covariance_scoring +from .iv_scoring import PLDA_scoring +from .iv_scoring import fast_PLDA_scoring + +from .gmm_scoring import gmm_scoring + +from .jfa_scoring import jfa_scoring + +from .score_normalization import znorm +from .score_normalization import tnorm +from .score_normalization import ztnorm +from .score_normalization import snorm + +from .sidekit_io import write_norm_hdf5 +from .sidekit_io import write_matrix_hdf5 + + +from .sv_utils import clean_stat_server + +libsvm_loaded = False +if SIDEKIT_CONFIG["libsvm"]: + try: + dirname = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'libsvm') + if sys.platform == 'win32': + libsvm = CDLL(os.path.join(dirname, r'libsvm.dll')) + libsvm_loaded = True 
+ else: + libsvm = CDLL(os.path.join(dirname, 'libsvm.so.2')) + libsvm_loaded = True + except: + # For unix the prefix 'lib' is not considered. + if find_library('svm'): + libsvm = CDLL(find_library('svm')) + libsvm_loaded = True + elif find_library('libsvm'): + libsvm = CDLL(find_library('libsvm')) + libsvm_loaded = True + else: + libsvm_loaded = False + logging.warning('WARNNG: libsvm is not installed, please refer to the' + + ' documentation if you intend to use SVM classifiers') + +if libsvm_loaded: + from sidekit.libsvm import * + from sidekit.svm_scoring import * + from sidekit.svm_training import * + + +CUDA = False +if SIDEKIT_CONFIG["cuda"]: + CUDA = True + + +if CUDA: + from .nnet import Xtractor + from .nnet import xtrain + from .nnet import extract_embeddings + from .nnet import ResBlock + from .nnet import SincNet + +else: + print("Don't import Torch") + +if SIDEKIT_CONFIG["mpi"]: + found_mpi4py = importlib.find_loader('mpi4py') is not None + if found_mpi4py: + from .sidekit_mpi import EM_split + from .sidekit_mpi import total_variability + from .sidekit_mpi import extract_ivector + print("Import MPI") + + +__author__ = "Anthony Larcher and Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' +__version__="1.4" + diff --git a/sidekit/sidekit/bosaris/Licence b/sidekit/sidekit/bosaris/Licence new file mode 100755 index 0000000..41af980 --- /dev/null +++ b/sidekit/sidekit/bosaris/Licence @@ -0,0 +1,124 @@ +License agreement + +Agnitio Labs + +Non-Commercial Use Only + + + +_____________________________________________________________________ + + + +This AGNITIO Labs License Agreement, including all exhibits ("AGN-LA") is a +legal agreement between you and AGNITIO S. L. 
(“AGNITIO” or “we”) for the +software or data identified above, which may include source code, and any +associated materials, text or speech files, associated media and "online" or +electronic documentation and any updates we provide in our discretion +(together, the "Software"). + + + +By installing, copying, or otherwise using this Software, you agree to be bound +by the terms of this AGN-LA. If you do not agree, do not install copy or use +the Software. The Software is protected by copyright and other intellectual +property laws and is licensed, not sold. + + + +SCOPE OF RIGHTS: + +You may use, copy, reproduce, and distribute this Software for any +non-commercial purpose, subject to the restrictions in this AGN-LA. Some +purposes which can be non-commercial are teaching, academic research, public +demonstrations and personal experimentation. You may also distribute this +Software with books or other teaching materials, or publish the Software on +websites, that are intended to teach the use of the Software for academic or +other non-commercial purposes. + +You may not use or distribute this Software or any derivative works in any form +for commercial purposes. Examples of commercial purposes would be running +business operations, licensing, leasing, or selling the Software, distributing +the Software for use with commercial products, using the Software in the +creation or use of commercial products or any other activity which purpose is +to procure a commercial gain to you or others. + +If the Software includes source code or data, you may create derivative works +of such portions of the Software and distribute the modified Software for +non-commercial purposes, as provided herein. + +If you distribute the Software or any derivative works of the Software, you +will distribute them under the same terms and conditions as in this license, +and you will not grant other rights to the Software or derivative works that +are different from those provided by this AGN-LA. 
+ +If you have created derivative works of the Software, and distribute such +derivative works, you will cause the modified files to carry prominent notices +so that recipients know that they are not receiving the original Software. Such +notices must state: (i) that you have changed the Software; and (ii) the date +of any changes. + + + +In return, we simply require that you agree: + +1. That you will not remove any copyright or other notices from the Software. + +2. That if any of the Software is in binary format, you will not attempt to +modify such portions of the Software, or to reverse engineer or decompile them, +except and only to the extent authorized by applicable law. + +3. That AGNITIO is granted back, without any restrictions or limitations, a +non-exclusive, perpetual, irrevocable, royalty-free, assignable and +sub-licensable license, to reproduce, publicly perform or display, install, +use, modify, post, distribute, make and have made, sell and transfer your +modifications to and/or derivative works of the Software source code or data, +for any purpose. + +4. That any feedback about the Software provided by you to us is voluntarily +given, and AGNITIO shall be free to use the feedback as it sees fit without +obligation or restriction of any kind, even if the feedback is designated by +you as confidential. + +5. THAT THE SOFTWARE COMES "AS IS", WITH NO WARRANTIES. THIS MEANS NO EXPRESS, +IMPLIED OR STATUTORY WARRANTY, INCLUDING WITHOUT LIMITATION, WARRANTIES OF +MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, ANY WARRANTY AGAINST +INTERFERENCE WITH YOUR ENJOYMENT OF THE SOFTWARE OR ANY WARRANTY OF TITLE OR +NON-INFRINGEMENT. THERE IS NO WARRANTY THAT THIS SOFTWARE WILL FULFILL ANY OF +YOUR PARTICULAR PURPOSES OR NEEDS. ALSO, YOU MUST PASS THIS DISCLAIMER ON +WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS. + +6. 
THAT NEITHER AGNITIO NOR ANY CONTRIBUTOR TO THE SOFTWARE WILL BE LIABLE FOR +ANY DAMAGES RELATED TO THE SOFTWARE OR THIS AGN-LA, INCLUDING DIRECT, INDIRECT, +SPECIAL, CONSEQUENTIAL OR INCIDENTAL DAMAGES, TO THE MAXIMUM EXTENT THE LAW +PERMITS, NO MATTER WHAT LEGAL THEORY IT IS BASED ON. ALSO, YOU MUST PASS THIS +LIMITATION OF LIABILITY ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE +WORKS. + +7. That we have no duty of reasonable care or lack of negligence, and we are +not obligated to (and will not) provide technical support for the Software. + +8. That if you breach this AGN-LA or if you sue anyone over patents that you +think may apply to or read on the Software or anyone's use of the Software, +this AGN-LA (and your license and rights obtained herein) terminate +automatically. Upon any such termination, you shall destroy all of your copies +of the Software immediately. Sections 3, 4, 5, 6, 7, 8, 11 and 12 of this +AGN-LA shall survive any termination of this AGN-LA. + +9. That the patent rights, if any, granted to you in this AGN-LA only apply to +the Software, not to any derivative works you make. + +10. That the Software may be subject to Europe export jurisdiction at the time +it is licensed to you, and it may be subject to additional export or import +laws in other places. You agree to comply with all such laws and regulations +that may apply to the Software after delivery of the software to you. + +11. That all rights not expressly granted to you in this AGN-LA are reserved. + +12. That this AGN-LA shall be construed and controlled by the laws of the +Kingdom of Spain, without regard to conflicts of law. If any provision of this +AGN-LA shall be deemed unenforceable or contrary to law, the rest of this +AGN-LA shall remain in full effect and interpreted in an enforceable manner +that most nearly captures the intent of the original language. + +Copyright (c) AGNITIO. All rights reserved. 
diff --git a/sidekit/sidekit/bosaris/__init__.py b/sidekit/sidekit/bosaris/__init__.py new file mode 100755 index 0000000..bdc3556 --- /dev/null +++ b/sidekit/sidekit/bosaris/__init__.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +""" +This package is a translation of a part of the BOSARIS toolkit. +The authors thank Niko Brummer and Agnitio for allowing them to +translate this code and provide the community with efficient structures +and tools. + +The BOSARIS Toolkit is a collection of functions and classes in Matlab +that can be used to calibrate, fuse and plot scores from speaker recognition +(or other fields in which scores are used to test the hypothesis that two +samples are from the same source) trials involving a model and a test segment. +The toolkit was written at the BOSARIS2010 workshop which took place at the +University of Technology in Brno, Czech Republic from 5 July to 6 August 2010. +See the User Guide (available on the toolkit website) for a discussion of the +theory behind the toolkit and descriptions of some of the algorithms used. + +The BOSARIS toolkit in MATLAB can be downloaded from `the website +<https://sites.google.com/site/bosaristoolkit/>`_. +""" + +from .idmap import IdMap +from .ndx import Ndx +from .plotwindow import PlotWindow +from .key import Key +from .scores import Scores +from .detplot import DetPlot +from .detplot import effective_prior +from .detplot import logit_effective_prior +from .detplot import fast_minDCF + + +__author__ = "Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' +__credits__ = ["Niko Brummer", "Edward de Villiers"] + diff --git a/sidekit/sidekit/bosaris/detplot.py b/sidekit/sidekit/bosaris/detplot.py new file mode 100755 index 0000000..02b3f91 --- /dev/null +++ b/sidekit/sidekit/bosaris/detplot.py @@ -0,0 +1,937 @@ +# -*- coding: utf-8 -*- + +# This package is a translation of a part of the BOSARIS toolkit.
+# The authors thank Niko Brummer and Agnitio for allowing them to +# translate this code and provide the community with efficient structures +# and tools. +# +# The BOSARIS Toolkit is a collection of functions and classes in Matlab +# that can be used to calibrate, fuse and plot scores from speaker recognition +# (or other fields in which scores are used to test the hypothesis that two +# samples are from the same source) trials involving a model and a test segment. +# The toolkit was written at the BOSARIS2010 workshop which took place at the +# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010. +# See the User Guide (available on the toolkit website) for a discussion of the +# theory behind the toolkit and descriptions of some of the algorithms used. +# +# The BOSARIS toolkit in MATLAB can be downloaded from `the website +# <https://sites.google.com/site/bosaristoolkit/>`_. + +""" +This is the 'detplot' module + + This module supplies tools for plotting DET curves. + It includes a class for creating a plot for displaying detection performance + with the axes scaled and labelled so that a normal Gaussian + distribution will plot as a straight line. + + The y axis represents the miss probability. + The x axis represents the false alarm probability. + + This file is a translation of the BOSARIS toolkit. + For more information, refer to the license provided with this package. +""" +import copy +import matplotlib +import numpy +import os + +if "DISPLAY" not in os.environ: + matplotlib.use('PDF') +import matplotlib.pyplot as mpl +import scipy +from collections import namedtuple +import logging + +from . import PlotWindow +from . import Scores +from .
def effective_prior(Ptar, cmiss, cfa):
    """Fold the detection costs into the target prior.

    The returned value is the prior that, used with unit costs, yields the
    same decisions as ``Ptar`` used with the costs ``cmiss`` and ``cfa``.
    In particular:
        effective_prior(effective_prior(p, cmiss, cfa), 1, 1)
            = effective_prior(p, cmiss, cfa)

    Example (NIST SRE detection cost function):
        effective_prior(0.01, 10, 1) = 0.0917

    :param Ptar: is the probability of a target trial
    :param cmiss: is the cost of a miss
    :param cfa: is the cost of a false alarm

    :return: the effective prior
    """
    weighted_target = Ptar * cmiss
    weighted_nontarget = (1 - Ptar) * cfa
    return weighted_target / (weighted_target + weighted_nontarget)
def __DETsort__(x, col=''):
    """Sort the rows of a two-column (score, label) matrix for DET/ROC use.

    Rows are ordered by the first column ascending; rows with equal first
    column are ordered by the second column descending, which postpones the
    false alarms on like scores. Based on MATLAB's SORTROWS.

    :param x: the 2D array to sort; column 0 and column 1 are the sort keys
    :param col: unused, kept only for backward compatibility of the signature

    :return: a new array holding the rows of ``x`` in sorted order
    """
    assert x.ndim > 1, 'x must be a 2D matrix'

    # Secondary key first: stable ascending sort on column 1, then reversed
    # to obtain a descending order.
    by_label = numpy.argsort(x[:, 1], kind='mergesort')[::-1]

    # Primary key: a stable sort on column 0 preserves the descending
    # column-1 order among rows that share the same column-0 value.
    by_score = numpy.argsort(x[by_label, 0], kind='mergesort')

    return x[by_label[by_score], :]
+ + :param true_scores: a 1D array of target scores + :param false_scores: a 1D array of non-target scores + + :return: a tuple of two vectors, Pmiss,Pfa + """ + num_true = true_scores.shape[0] + num_false = false_scores.shape[0] + assert num_true > 0, "Vector of target scores is empty" + assert num_false > 0, "Vector of nontarget scores is empty" + + total = num_true + num_false + + Pmiss = numpy.zeros((total + 1)) + Pfa = numpy.zeros((total + 1)) + + scores = numpy.zeros((total, 2)) + scores[:num_false, 0] = false_scores + scores[:num_false, 1] = 0 + scores[num_false:, 0] = true_scores + scores[num_false:, 1] = 1 + + scores = __DETsort__(scores) + + sumtrue = numpy.cumsum(scores[:, 1], axis=0) + sumfalse = num_false - (numpy.arange(1, total + 1) - sumtrue) + + Pmiss[0] = 0 + Pfa[0] = 1 + Pmiss[1:] = sumtrue / num_true + Pfa[1:] = sumfalse / num_false + return Pmiss, Pfa + + +def __filter_roc__(pm, pfa): + """Removes redundant points from the sequence of points (pfa,pm) so + that plotting an ROC or DET curve will be faster. The output ROC + curve will be identical to the one plotted from the input + vectors. All points internal to straight (horizontal or + vertical) sections on the ROC curve are removed i.e. only the + points at the start and end of line segments in the curve are + retained. Since the plotting code draws straight lines between + points, the resulting plot will be the same as the original. + + :param pm: the vector of miss probabilities of the ROC Convex + :param pfa: the vector of false-alarm probabilities of the ROC Convex + + :return: a tuple of two vectors, Pmiss, Pfa + """ + out = 0 + new_pm = [pm[0]] + new_pfa = [pfa[0]] + + for i in range(1, pm.shape[0]): + if (pm[i] == new_pm[out]) | (pfa[i] == new_pfa[out]): + pass + else: + # save previous point, because it is the last point before the + # change. On the next iteration, the current point will be saved. 
def pavx(y):
    """PAV: Pool Adjacent Violators algorithm.

    Non-parametric optimization subject to monotonicity: fits a vector
    ``ghat`` with nondecreasing components to the data vector ``y`` such
    that ``sum((y - ghat) ** 2)`` is minimal.

    This code is a simplified version of the 'IsoMeans.m' code made
    available by Lutz Duembgen.

    :param y: 1-D, non-empty array of values to fit

    :return: a tuple ``(ghat, width, height)`` where ``ghat`` is the fitted
        vector (same length as ``y``), ``width`` holds the number of samples
        pooled in each bin from left to right, and ``height`` holds the
        corresponding bin values
    """
    assert y.ndim == 1, 'Argument should be a 1-D array'
    assert y.shape[0] > 0, 'Input array is empty'
    n = y.shape[0]

    # Each interval is described by its length and the mean of its y-values.
    length = numpy.zeros(n, dtype=int)
    ghat = numpy.zeros(n)

    # ci is the position of the interval currently considered.
    ci = 0
    length[ci] = 1
    ghat[ci] = y[0]

    for j in range(1, n):
        # a new interval, {j}, is created:
        ci += 1
        length[ci] = 1
        ghat[ci] = y[j]
        while ci >= 1 and ghat[ci - 1] >= ghat[ci]:
            # pool adjacent violators: merge the two right-most intervals
            # into one whose value is their weighted mean.
            nw = length[ci - 1] + length[ci]
            ghat[ci - 1] = ghat[ci - 1] + (length[ci] / nw) * (ghat[ci] - ghat[ci - 1])
            length[ci - 1] = nw
            ci -= 1

    height = ghat[:ci + 1].copy()
    width = length[:ci + 1].copy()

    # Expand the bins back to one fitted value per input sample. Walking the
    # bins left to right avoids mixing 0-based and 1-based interval starts.
    ghat = numpy.zeros(n)
    start = 0
    for bin_width, bin_height in zip(width, height):
        ghat[start:start + bin_width] = bin_height
        start += bin_width

    return ghat, width, height
def sigmoid(log_odds):
    """SIGMOID: Inverse of the logit function.

    This is a one-to-one mapping from log odds to probability,
    i.e. it maps the real line to the interval (0,1).

        p = sigmoid(log_odds) = 1 / (1 + exp(-log_odds))

    :param log_odds: the input value (scalar or array-like)

    :return: sigmoid(input)
    """
    # Imported here because a bare ``import scipy`` does not guarantee that
    # the ``scipy.special`` submodule is loaded.
    from scipy.special import expit

    # expit evaluates the logistic function without overflowing for large
    # negative inputs, where exp(-log_odds) would exceed the float range.
    return expit(log_odds)
def plotseg(xx, yy, box, dps):
    """Prepare the plotting of one straight segment of a ROC convex hull.

    The segment is clipped to ``box``, sampled at ``dps + 1`` points and the
    samples are mapped through the probit function.

    :param xx: the two false-alarm probabilities of the segment end points,
        in non-increasing order
    :param yy: the two miss probabilities of the segment end points,
        in non-decreasing order
    :param box: object exposing ``left``, ``right``, ``top`` and ``bottom``
        limits of the plotting rectangle (in probability units)
    :param dps: number of intervals used to sample the segment

    :return: a tuple ``(x, y, eer)``; ``x`` and ``y`` are the probit-mapped
        samples (empty arrays when the segment lies completely outside the
        box) and ``eer`` is the equal-error-rate candidate of this segment
        (0 for a horizontal or vertical segment)
    """
    # Work on private float copies: callers pass slices of their pfa/pmiss
    # vectors, and clipping a view in place would corrupt the end point that
    # the next segment starts from.
    xx = numpy.array(xx, dtype=float)
    yy = numpy.array(yy, dtype=float)
    assert (xx[1] <= xx[0]) and (yy[0] <= yy[1]), 'xx and yy should be sorted'

    XY = numpy.column_stack((xx, yy))
    dd = numpy.dot(numpy.array([1, -1]), XY)
    degenerate = numpy.min(numpy.abs(dd)) == 0
    if degenerate:
        # horizontal or vertical segment: it cannot cross pfa == pmiss
        # anywhere but at an end point, so it provides no EER candidate.
        eer = 0
    else:
        # find line coefficients (a, b) s.t. a * x + b * y = 1
        # for every point (x, y) on the line through the segment.
        a, b = numpy.linalg.solve(XY, numpy.ones(2))
        # candidate for EER, eer is highest candidate
        eer = 1.0 / (a + b)

    # segment completely outside of box
    if (xx[0] < box.left) or (xx[1] > box.right) or (yy[1] < box.bottom) or (yy[0] > box.top):
        return numpy.array([]), numpy.array([]), eer

    if degenerate:
        # No line coefficients exist for an axis-aligned segment; since one
        # coordinate is constant, clipping each coordinate independently
        # and interpolating linearly is exact.
        xx = numpy.clip(xx, box.left, box.right)
        yy = numpy.clip(yy, box.bottom, box.top)
        xdots = xx[0] + (xx[1] - xx[0]) * numpy.arange(dps + 1) / dps
        ydots = yy[0] + (yy[1] - yy[0]) * numpy.arange(dps + 1) / dps
    else:
        # Clip each end point to the box, moving it along the line.
        if xx[1] < box.left:
            xx[1] = box.left
            yy[1] = (1 - a * box.left) / b

        if xx[0] > box.right:
            xx[0] = box.right
            yy[0] = (1 - a * box.right) / b

        if yy[0] < box.bottom:
            yy[0] = box.bottom
            xx[0] = (1 - b * box.bottom) / a

        if yy[1] > box.top:
            yy[1] = box.top
            xx[1] = (1 - b * box.top) / a

        dx = xx[1] - xx[0]
        xdots = xx[0] + dx * numpy.arange(dps + 1) / dps
        ydots = (1 - a * xdots) / b

    x = __probit__(xdots)
    y = __probit__(ydots)

    return x, y, eer
class DetPlot:
    """A class for creating a plot for displaying detection performance
    with the axes scaled and labelled so that a normal Gaussian
    distribution will plot as a straight line.

        - The y axis represents the miss probability.
        - The x axis represents the false alarm probability.

    :attr __plotwindow__: PlotWindow object to plot into
    :attr __title__: title of the plot
    :attr __sys_name__: list of IDs of the systems
    :attr __tar__: list of arrays of target scores for each system
    :attr __non__: list of arrays of the non-target scores for each system
    :attr __figure__: figure to plot into
    :attr title: last title passed to set_title ('' until it is called)
    """

    def __init__(self, window_style='old', plot_title=''):
        """Initialize an empty DetPlot object

        :param window_style: style name forwarded to PlotWindow
        :param plot_title: title displayed by create_figure and used as
            the stem of the PDF file name when a figure is saved
        """
        self.__plotwindow__ = PlotWindow(window_style)
        self.__title__ = plot_title
        self.__sys_name__ = []
        self.__tar__ = []
        self.__non__ = []
        self.__figure__ = ''
        self.title = ''

    @staticmethod
    def _resolve_plot_args(plot_args, style, idx):
        """Return a (color, linestyle, linewidth) triple for curve ``idx``.

        A caller-supplied 3-tuple is returned as is; anything else selects a
        default from grayStyle (``style == 'gray'``) or colorStyle. Defaults
        are cycled so that an index beyond the table length stays valid.
        """
        if isinstance(plot_args, tuple) and len(plot_args) == 3:
            return plot_args
        palette = grayStyle if style == 'gray' else colorStyle
        return palette[idx % len(palette)]

    def set_title(self, title):
        """Modify the title of a DetPlot object

        The title is read when create_figure is called and when a figure
        is saved, so it must be set before those calls to take effect.

        :param title: title of the plot to display
        """
        self.title = title
        # __title__ is the attribute actually read by the plotting methods.
        self.__title__ = title

    def create_figure(self, idx=0):
        """Create a figure to plot the DET-curve.
        Default plot everything on one single figure

        :param idx: Index of the figure to create. Default is 0.
        """
        self.__figure__ = mpl.figure(idx)
        ax = self.__figure__.add_subplot(111)
        ax.set_aspect('equal')
        mpl.axis([__probit__(self.__plotwindow__.__pfa_limits__[0]),
                  __probit__(self.__plotwindow__.__pfa_limits__[1]),
                  __probit__(self.__plotwindow__.__pmiss_limits__[0]),
                  __probit__(self.__plotwindow__.__pmiss_limits__[1])])
        xticks = __probit__(self.__plotwindow__.__xticks__)
        yticks = __probit__(self.__plotwindow__.__yticks__)
        ax.set_xticks(xticks)
        ax.set_xticklabels(self.__plotwindow__.__xticklabels__,
                           size='x-small')
        ax.set_yticks(yticks)
        ax.set_yticklabels(self.__plotwindow__.__yticklabels__, size='x-small')
        if not self.__title__ == '':
            mpl.title(self.__title__)
        mpl.grid(True)
        mpl.xlabel('False Acceptance Rate [in %]')
        mpl.ylabel('False Rejection Rate [in %]')

        # assuring, limits are kept by matplotlib after probit transform of axes
        mpl.gca().set_xlim(
            left=__probit__(self.__plotwindow__.__pfa_limits__[0]),
            right=__probit__(self.__plotwindow__.__pfa_limits__[1])
        )
        mpl.gca().set_ylim(
            bottom=__probit__(self.__plotwindow__.__pmiss_limits__[0]),
            top=__probit__(self.__plotwindow__.__pmiss_limits__[1])
        )

    def set_system(self, tar, non, sys_name=''):
        """Sets the scores to be plotted. This function must be called
        before plots are made for a system, but it can be called several
        times with different systems (with calls to plotting functions in
        between) so that curves for different systems appear on the same plot.

        :param tar: A vector of target scores.
        :param non: A vector of non-target scores.
        :param sys_name: A string describing the system.  This string will
            be prepended to the plot names in the legend.
            You can pass an empty string to this argument or omit it.

        """
        assert tar.ndim == 1, 'Vector of target scores should be 1-dimensional'
        assert non.ndim == 1, 'Vector of nontarget scores should be 1-dimensional'
        assert tar.shape[0] > 0, 'Vector of target scores is empty'
        assert non.shape[0] > 0, 'Vector of nontarget scores is empty'
        self.__sys_name__.append(sys_name)
        self.__tar__.append(tar)
        self.__non__.append(non)

    def set_system_from_scores(self, scores, key, sys_name=''):
        """Sets the scores to be plotted. This function must be called
        before plots are made for a system, but it can be called several
        times with different systems (with calls to plotting functions in
        between) so that curves for different systems appear on the same plot.

        :param scores: A Scores object containing system scores.
        :param key: A Key object for distinguishing target and non-target scores.
        :param sys_name: A string describing the system.  This string will be
            prepended to the plot names in the legend.  You can pass an
            empty string to this argument or omit it.

        """
        assert isinstance(scores, Scores), 'First argument should be a Score object'
        assert isinstance(key, Key), 'Second argument should be a Key object'
        assert scores.validate(), 'Wrong format of Scores'
        assert key.validate(), 'Wrong format of Key'
        tar, non = scores.get_tar_non(key)
        self.set_system(tar, non, sys_name)

    def plot_steppy_det(self, idx=0, style='color', plot_args=''):
        """Plots a DET curve.

        :param idx: index of the system (as registered with set_system)
            whose curve is plotted
        :param style: style of the curve, can be gray or color
        :param plot_args: a (color, linestyle, linewidth) tuple that controls
            the appearance of the curve; any other value selects a default.
        """
        Pmiss, Pfa = __compute_roc__(self.__tar__[idx], self.__non__[idx])
        Pmiss, Pfa = __filter_roc__(Pmiss, Pfa)

        x = __probit__(Pfa)
        y = __probit__(Pmiss)

        # In case the plotting arguments are not specified, they are initialized
        # by using default values
        plot_args = self._resolve_plot_args(plot_args, style, idx)

        mpl.plot(x, y,
                 label=self.__sys_name__[idx],
                 color=plot_args[0],
                 linestyle=plot_args[1],
                 linewidth=plot_args[2])
        mpl.legend()
        if matplotlib.get_backend() == 'agg':
            mpl.savefig(self.__title__ + '.pdf')

    def plot_rocch_det(self, idx=0, style='color', target_prior=0.001, plot_args=''):
        """Plots a DET curve using the ROCCH.

        The legend entry reports the EER and the normalized minDCF
        (both multiplied by 100) computed at ``target_prior``.

        :param idx: index of the system (as registered with set_system)
            whose curve is plotted
        :param style: style of the DET-curve, can be gray or color
        :param target_prior: prior of the target trials
        :param plot_args: a (color, linestyle, linewidth) tuple that controls
            the appearance of the curve; any other value selects a default.
        """
        # In case the plotting arguments are not specified, they are initialized
        # by using default values
        plot_args = self._resolve_plot_args(plot_args, style, idx)

        pfa_min = self.__plotwindow__.__pfa_limits__[0]
        pfa_max = self.__plotwindow__.__pfa_limits__[1]
        pmiss_min = self.__plotwindow__.__pmiss_limits__[0]
        pmiss_max = self.__plotwindow__.__pmiss_limits__[1]

        dps = 100  # dots per segment

        tmp = numpy.array([sigmoid(__logit__(target_prior)), sigmoid(-__logit__(target_prior))])
        x, y, eer, mindcf = rocchdet(self.__tar__[idx], self.__non__[idx],
                                     tmp, pfa_min, pfa_max,
                                     pmiss_min, pmiss_max, dps, normalize=True)

        mpl.plot(x, y,
                 label='{}; (eer; minDCF) = ({:.03}; {:.04})'.format(
                     self.__sys_name__[idx],
                     100. * eer,
                     100. * mindcf),
                 color=plot_args[0],
                 linestyle=plot_args[1],
                 linewidth=plot_args[2])
        mpl.legend()
        if matplotlib.get_backend() == 'agg':
            mpl.savefig(self.__title__ + '.pdf')

    def plot_mindcf_point(self, target_prior, idx=0, plot_args='ok', legend_string=''):
        """Places the mindcf point for the current system.

        A warning is logged, and nothing is drawn, when the point falls
        outside the limits of the plot window.

        :param target_prior: The effective target prior.
        :param idx: index of the system (as registered with set_system)
        :param plot_args: a matplotlib format string that controls the
            appearance of the point.
        :param legend_string: Optional. Currently unused.
        """
        mindcf, pmiss, pfa, prbep, eer = fast_minDCF(self.__tar__[idx],
                                                     self.__non__[idx], __logit__(target_prior), True)
        if (pfa < self.__plotwindow__.__pfa_limits__[0]) | (pfa > self.__plotwindow__.__pfa_limits__[1]):
            # pfa is passed as a number: '%f' cannot format a string.
            logging.warning('pfa of %f is not between %f and %f mindcf point will not be plotted.', pfa,
                            self.__plotwindow__.__pfa_limits__[0],
                            self.__plotwindow__.__pfa_limits__[1])
        elif (pmiss < self.__plotwindow__.__pmiss_limits__[0]) | (pmiss > self.__plotwindow__.__pmiss_limits__[1]):
            logging.warning('pmiss of %f is not between %f and %f. The mindcf point will not be plotted.',
                            pmiss, self.__plotwindow__.__pmiss_limits__[0],
                            self.__plotwindow__.__pmiss_limits__[1])
        else:
            mpl.plot(__probit__(pfa), __probit__(pmiss), plot_args)
            if matplotlib.get_backend() == 'agg':
                mpl.savefig(self.__title__ + '.pdf')

    def plot_DR30_fa(self,
                     idx=0,
                     plot_args=((0, 0, 0), '--', 1),
                     legend_string=''):
        """Plots a vertical line indicating the Doddington 30 point for
        false alarms. This is the point left of which the number of false
        alarms is below 30, so that the estimate of the false alarm rate
        is no longer good enough to satisfy Doddington's Rule of 30.

        :param idx: index of the system (as registered with set_system)
        :param plot_args: a (color, linestyle, linewidth) tuple that controls
            the appearance of the line.
        :param legend_string: Optional. A string to describe this curve
            in the legend.
        """
        assert isinstance(plot_args, tuple) and (len(plot_args) == 3), 'Invalid plot_args'

        pfa_min = self.__plotwindow__.__pfa_limits__[0]
        pfa_max = self.__plotwindow__.__pfa_limits__[1]
        pfaval = 30.0 / self.__non__[idx].shape[0]

        if (pfaval < pfa_min) | (pfaval > pfa_max):
            logging.warning('Pfa DR30 of %f is not between %f and %f Pfa DR30 line will not be plotted.',
                            pfaval, pfa_min, pfa_max)
        else:
            drx = __probit__(pfaval)
            # ymin/ymax of axvline are fractions of the axes height, not data
            # values; the defaults (0 and 1) span the whole plot window.
            mpl.axvline(drx,
                        color=plot_args[0],
                        linestyle=plot_args[1],
                        linewidth=plot_args[2],
                        label=legend_string)

    def plot_DR30_miss(self,
                       idx=0,
                       plot_args=((0, 0, 0), '--', 1),
                       legend_string=''):
        """Plots a horizontal line indicating the Doddington 30 point for
        misses.  This is the point above which the number of misses is
        below 30, so that the estimate of the miss rate is no longer good
        enough to satisfy Doddington's Rule of 30.

        :param idx: index of the system (as registered with set_system)
        :param plot_args: a (color, linestyle, linewidth) tuple that controls
            the appearance of the line.
        :param legend_string: Optional. A string to describe this curve
            in the legend.
        """
        assert isinstance(plot_args, tuple) and (len(plot_args) == 3), 'Invalid plot_args'

        pmiss_min = self.__plotwindow__.__pmiss_limits__[0]
        pmiss_max = self.__plotwindow__.__pmiss_limits__[1]
        pmissval = 30.0 / self.__tar__[idx].shape[0]

        if (pmissval < pmiss_min) | (pmissval > pmiss_max):
            # One placeholder per argument, otherwise logging fails to
            # format the record and the warning is lost.
            logging.warning('Pmiss DR30 of %f is not between %f and %f Pmiss DR30 line will not be plotted.',
                            pmissval, pmiss_min, pmiss_max)
        else:
            dry = __probit__(pmissval)
            # xmin/xmax of axhline are fractions of the axes width, not data
            # values; the defaults (0 and 1) span the whole plot window.
            mpl.axhline(y=dry,
                        color=plot_args[0],
                        linestyle=plot_args[1],
                        linewidth=plot_args[2],
                        label=legend_string)

    def plot_DR30_both(self,
                       idx=0,
                       plot_args_fa=((0, 0, 0), '--', 1),
                       plot_args_miss=((0, 0, 0), '--', 1),
                       legend_string=''):
        """Plots two lines indicating Doddington's Rule of 30 points: one
        for false alarms and one for misses.  See the documentation of
        plot_DR30_fa and plot_DR30_miss for details.

        :param idx: index of the system (as registered with set_system)
        :param plot_args_fa: A tuple of arguments to be passed to 'plot' that control
            the appearance of the DR30_fa point.
        :param plot_args_miss: A tuple of arguments to be passed to 'plot' that control
            the appearance of the DR30_miss point.
        :param legend_string: Optional. Currently unused: the two lines are
            labelled 'pfa DR30' and 'pmiss DR30'.
        """
        self.plot_DR30_fa(idx, plot_args_fa, 'pfa DR30')
        self.plot_DR30_miss(idx, plot_args_miss, 'pmiss DR30')

    def display_legend(self):
        """Not implemented yet; does nothing."""
        # to complete
        pass

    def save_as_pdf(self, outfilename):
        """Not implemented yet; does nothing.

        :param outfilename: currently ignored
        """
        # to complete
        pass

    def add_legend_entry(self, lh, legend_string, append_name):
        """Not implemented yet; does nothing.

        :param lh: currently ignored
        :param legend_string: currently ignored
        :param append_name: currently ignored
        """
        # to complete
        pass
One + list is called 'leftids' and the other 'rightids'. The class + provides methods that convert a sequence of left ids to a + sequence of right ids and vice versa. If `leftids` or `rightids` + contains duplicates then all occurrences are used as the index + when mapping. + + :attr leftids: a list of classes in a ndarray + :attr rightids: a list of segments in a ndarray + :attr start: index of the first frame of the segment + :attr stop: index of the last frame of the segment + """ + + def __init__(self, idmap_filename=''): + """Initialize an IdMap object + + :param idmap_filename: name of a file to load. Default is ''. + In case the idmap_filename is empty, initialize an empty IdMap object. + """ + self.leftids = numpy.empty(0, dtype="|O") + self.rightids = numpy.empty(0, dtype="|O") + self.start = numpy.empty(0, dtype="|O") + self.stop = numpy.empty(0, dtype="|O") + + if idmap_filename == '': + pass + else: + tmp = IdMap.read(idmap_filename) + self.leftids = tmp.leftids + self.rightids = tmp.rightids + self.start = tmp.start + self.stop = tmp.stop + + def __repr__(self): + ch = '-' * 30 + '\n' + ch += 'left ids:' + self.leftids.__repr__() + '\n' + ch += 'right ids:' + self.rightids.__repr__() + '\n' + ch += 'seg start:' + self.start.__repr__() + '\n' + ch += 'seg stop:' + self.stop.__repr__() + '\n' + ch += '-' * 30 + '\n' + return ch + + @check_path_existance + def write(self, output_file_name): + """ Save IdMap in HDF5 format + + :param output_file_name: name of the file to write to + """ + assert self.validate(), "Error: wrong IdMap format" + with h5py.File(output_file_name, "w") as f: + f.create_dataset("leftids", data=self.leftids.astype('S'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + f.create_dataset("rightids", data=self.rightids.astype('S'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + # WRITE START and STOP + start = copy.deepcopy(self.start) + start[numpy.isnan(self.start.astype('float'))] = -1 + start = 
start.astype('int32', copy=False) + + stop = copy.deepcopy(self.stop) + stop[numpy.isnan(self.stop.astype('float'))] = -1 + stop = stop.astype('int32', copy=False) + + f.create_dataset("start", data=start, + maxshape=(None,), + compression="gzip", + fletcher32=True) + f.create_dataset("stop", data=stop, + maxshape=(None,), + compression="gzip", + fletcher32=True) + + @check_path_existance + def write_txt(self, output_file_name): + """Saves the Id_Map to a text file. + + :param output_file_name: name of the output text file + """ + with open(output_file_name, 'w') as outputFile: + for left, right, start, stop in zip(self.leftids, self.rightids, self.start, self.stop): + line = ' '.join(filter(None, (left, right, str(start), str(stop)))) + '\n' + outputFile.write(line) + + def map_left_to_right(self, leftidlist): + """Maps an array of ids to a new array of ids using the given map. + The input ids are matched against the leftids of the map and the + output ids are taken from the corresponding rightids of the map. + + Beware: if leftids are not unique in the IdMap, only the last value + corresponding is kept + + :param leftidlist: an array of strings to be matched against the + leftids of the idmap. The rightids corresponding to these + leftids will be returned. + + :return: an array of strings that are the mappings of the + strings in leftidlist. + """ + tmp_dict = dict(zip(self.leftids, self.rightids)) + inter = numpy.intersect1d(self.leftids, leftidlist) + rightids = numpy.empty(inter.shape[0], '|O') + + idx = 0 + for left in leftidlist: + if left in inter: + rightids[idx] = tmp_dict[left] + idx += 1 + + lost_ids = numpy.unique(leftidlist).shape[0] - inter.shape[0] + if lost_ids: + logging.warning('{} ids could not be mapped'.format(lost_ids)) + + return rightids + + def map_right_to_left(self, rightidlist): + """Maps an array of ids to a new array of ids using the given map. 
+ The input ids are matched against the rightids of the map and the + output ids are taken from the corresponding leftids of the map. + + Beware: if rightids are not unique in the IdMap, only the last value + corresponding is kept + + :param rightidlist: An array of strings to be matched against the + rightids of the idmap. The leftids corresponding to these + rightids will be returned. + + :return: an array of strings that are the mappings of the + strings in rightidlist. + """ + tmp_dict = dict(zip(self.rightids, self.leftids)) + inter = numpy.intersect1d(self.rightids, rightidlist) + leftids = numpy.empty(inter.shape[0], '|O') + + idx = 0 + for right in rightidlist: + if right in inter: + leftids[idx] = tmp_dict[right] + idx += 1 + + lost_ids = numpy.unique(rightidlist).shape[0] - inter.shape[0] + if lost_ids: + logging.warning('{} ids could not be mapped'.format(lost_ids)) + + return leftids + + def filter_on_left(self, idlist, keep): + """Removes some of the information in an idmap. Depending on the + value of 'keep', the idlist indicates the strings to retain or + the strings to discard. + + :param idlist: an array of strings which will be compared with + the leftids of the current. + :param keep: A boolean indicating whether idlist contains the ids to + keep or to discard. + + :return: a filtered version of the current IdMap. + """ + # get the list of ids to keep + if keep: + keepids = numpy.unique(idlist) + else: + keepids = numpy.setdiff1d(self.leftids, idlist) + + keep_idx = numpy.in1d(self.leftids, keepids) + out_idmap = IdMap() + out_idmap.leftids = self.leftids[keep_idx] + out_idmap.rightids = self.rightids[keep_idx] + out_idmap.start = self.start[keep_idx] + out_idmap.stop = self.stop[keep_idx] + + return out_idmap + + def filter_on_right(self, idlist, keep): + """Removes some of the information in an idmap. Depending on the + value of 'keep', the idlist indicates the strings to retain or + the strings to discard. 
+ + :param idlist: an array of strings which will be compared with + the rightids of the current IdMap. + :param keep: a boolean indicating whether idlist contains the ids to + keep or to discard. + + :return: a filtered version of the current IdMap. + """ + # get the list of ids to keep + if keep: + keepids = numpy.unique(idlist) + else: + keepids = numpy.setdiff1d(self.rightids, idlist) + + keep_idx = numpy.in1d(self.rightids, keepids) + out_idmap = IdMap() + out_idmap.leftids = self.leftids[keep_idx] + out_idmap.rightids = self.rightids[keep_idx] + out_idmap.start = self.start[keep_idx] + out_idmap.stop = self.stop[keep_idx] + return out_idmap + + def validate(self, warn=False): + """Checks that an object of type Id_Map obeys certain rules that + must alows be true. + + :param warn: boolean. If True, print a warning if strings are + duplicated in either left or right array + + :return: a boolean value indicating whether the object is valid. + + """ + ok = (self.leftids.shape == self.rightids.shape == self.start.shape == self.stop.shape) & self.leftids.ndim == 1 + + if warn & (self.leftids.shape != numpy.unique(self.leftids).shape): + logging.warning('The left id list contains duplicate identifiers') + if warn & (self.rightids.shape != numpy.unique(self.rightids).shape): + logging.warning('The right id list contains duplicate identifiers') + return ok + + def set(self, left, right, start=None, stop=None): + self.leftids = copy.deepcopy(left) + self.rightids = copy.deepcopy(right) + + if start is not None: + self.start = copy.deepcopy(start) + else: + self.start = numpy.empty(self.rightids.shape, '|O') + + if stop is not None: + self.stop = copy.deepcopy(stop) + else: + self.stop = numpy.empty(self.rightids.shape, '|O') + + @staticmethod + def read(input_file_name): + """Read IdMap in hdf5 format. 
+ + :param input_file_name: name of the file to read from + """ + with h5py.File(input_file_name, "r") as f: + idmap = IdMap() + + idmap.leftids = f.get("leftids")[()] + idmap.rightids = f.get("rightids")[()] + + # if running python 3, need a conversion to unicode + if sys.version_info[0] == 3: + idmap.leftids = idmap.leftids.astype('U255', copy=False) + idmap.rightids = idmap.rightids.astype('U255', copy=False) + + tmpstart = f.get("start")[()] + tmpstop = f.get("stop")[()] + idmap.start = numpy.empty(f["start"].shape, '|O') + idmap.stop = numpy.empty(f["stop"].shape, '|O') + idmap.start[tmpstart != -1] = tmpstart[tmpstart != -1] + idmap.stop[tmpstop != -1] = tmpstop[tmpstop != -1] + + assert idmap.validate(), "Error: wrong IdMap format" + return idmap + + @classmethod + @check_path_existance + def read_txt(cls, input_file_name): + """Read IdMap in text format. + + :param input_file_name: name of the file to read from + """ + idmap = IdMap() + + with open(input_file_name, "r") as f: + columns = len(f.readline().split(' ')) + + if columns == 2: + idmap.leftids, idmap.rightids = numpy.loadtxt(input_file_name, + dtype={'names': ('left', 'right'), 'formats': ('|O', '|O')}, + usecols=(0, 1), unpack=True) + idmap.start = numpy.empty(idmap.rightids.shape, '|O') + idmap.stop = numpy.empty(idmap.rightids.shape, '|O') + + # If four columns + elif columns == 4: + idmap.leftids, idmap.rightids, idmap.start, idmap.stop = numpy.loadtxt( + input_file_name, + dtype={'names': ('left', 'right', 'start', 'stop'), + 'formats': ('|O', '|O', 'int', 'int')}, unpack=True) + + if not idmap.validate(): + raise Exception('Wrong format of IdMap') + assert idmap.validate(), "Error: wrong IdMap format" + return idmap + + def merge(self, idmap2): + """ Merges the current IdMap with another IdMap or a list of IdMap objects.. + + :param idmap2: Another Id_Map object. + + :return: an Id_Map object that contains the information from the two + input Id_Maps. 
+ """ + idmap = IdMap() + if self.validate() & idmap2.validate(): + # create tuples of (model,seg) for both IdMaps for quick comparaison + tup1 = [(mod, seg) for mod, seg in zip(self.leftids, self.rightids)] + tup2 = [(mod, seg) for mod, seg in zip(idmap2.leftids, idmap2.rightids)] + + # Get indices of common sessions + existing_sessions = set(tup1).intersection(set(tup2)) + # Get indices of sessions which are not common in idmap2 + idx_new = numpy.sort(numpy.array([idx for idx, sess in enumerate(tup2) if sess not in tup1])) + if len(idx_new) == 0: + idx_new = numpy.zeros(idmap2.leftids.shape[0], dtype='bool') + + idmap.leftids = numpy.concatenate((self.leftids, idmap2.leftids[idx_new]), axis=0) + idmap.rightids = numpy.concatenate((self.rightids, idmap2.rightids[idx_new]), axis=0) + idmap.start = numpy.concatenate((self.start, idmap2.start[idx_new]), axis=0) + idmap.stop = numpy.concatenate((self.stop, idmap2.stop[idx_new]), axis=0) + + else: + raise Exception('Cannot merge IdMaps, wrong type') + + if not idmap.validate(): + raise Exception('Wrong format of IdMap') + + return idmap + + def split(self, N): + """ + Split an IdMap object into N IdMap of same size (if possible) + + :param N: the number of IdMap to generate + :return: a list of IdMap + """ + session_nb = self.leftids.shape[0] + sub_indices = numpy.array_split(numpy.arange(session_nb), N) + + im_list = [] + for ii in range(N): + im_list.append(IdMap()) + im_list[ii].leftids = self.leftids[sub_indices[ii]] + im_list[ii].rightids = self.rightids[sub_indices[ii]] + im_list[ii].start = self.start[sub_indices[ii]] + im_list[ii].stop = self.stop[sub_indices[ii]] + assert im_list[ii].validate(), "Error: wrong IdMap format" + + return im_list diff --git a/sidekit/sidekit/bosaris/key.py b/sidekit/sidekit/bosaris/key.py new file mode 100755 index 0000000..cf0cb1f --- /dev/null +++ b/sidekit/sidekit/bosaris/key.py @@ -0,0 +1,360 @@ +# -*- coding: utf-8 -*- + +# This package is a translation of a part of the 
# -*- coding: utf-8 -*-

# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from the toolkit website.

"""
This is the 'key' module
"""
import h5py
import logging
import numpy
import sys
from .ndx import Ndx
from ..sidekit_wrappers import check_path_existance

__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


def _membership_lookup(candidates):
    """Return a container giving fast `in` tests equivalent to `candidates`.

    Strings are returned unchanged (so `in` keeps its substring meaning) and
    so is anything that cannot be turned into a set.
    """
    if isinstance(candidates, (str, bytes)):
        return candidates
    try:
        return set(candidates)
    except TypeError:
        return candidates


def diff(list1, list2):
    """Return the sorted list of the items of list1 that are not in list2."""
    lookup = _membership_lookup(list2)
    c = [item for item in list1 if item not in lookup]
    c.sort()
    return c


def ismember(list1, list2):
    """Return a list of booleans telling, for each item of list1, if it is in list2."""
    lookup = _membership_lookup(list2)
    return [item in lookup for item in list1]


class Key:
    """A class for representing a Key i.e. it classifies trials as
    target or non-target trials.

    :attr modelset: list of the models into a ndarray of strings
    :attr segset: list of the test segments into a ndarray of strings
    :attr tar: 2D ndarray of booleans which rows correspond to the models
        and columns to the test segments. True if target trial.
    :attr non: 2D ndarray of booleans which rows correspond to the models
        and columns to the test segments. True if non-target trial.
    """

    def __init__(self, key_file_name=None,
                 models=numpy.array([]),
                 testsegs=numpy.array([]),
                 trials=numpy.array([])):
        """Initialize a Key object.

        :param key_file_name: name of the HDF5 file to load. Default is None.
        :param models: the model of each trial
        :param testsegs: the test segment of each trial
        :param trials: the label of each trial, 'target' or 'nontarget'

        When key_file_name is None the Key is built from the three trial
        lists (an empty Key with the default empty lists).
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.tar = numpy.array([], dtype="bool")
        self.non = numpy.array([], dtype="bool")

        if key_file_name is None and models is None and testsegs is None and trials is None:
            return

        if key_file_name is None:
            self.modelset, self.segset, self.tar, self.non = self._build_masks(models, testsegs, trials)
            assert self.validate(), "Wrong Key format"
        else:
            tmp = self.read(key_file_name)
            self.modelset = tmp.modelset
            self.segset = tmp.segset
            self.tar = tmp.tar
            self.non = tmp.non

    @staticmethod
    def _build_masks(models, testsegs, trials):
        """Build (modelset, segset, tar, non) from three parallel trial lists.

        When a (model, segment) pair occurs several times the last label wins.
        """
        models, testsegs, trials = (numpy.array([]) if column is None else numpy.asarray(column)
                                    for column in (models, testsegs, trials))
        if not (models.shape[0] == testsegs.shape[0] == trials.shape[0]):
            raise ValueError('models, testsegs and trials must have the same length')

        # the inverse indices give, for every trial, its row and column
        modelset, model_idx = numpy.unique(models, return_inverse=True)
        segset, seg_idx = numpy.unique(testsegs, return_inverse=True)

        tar = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        non = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        for row, col, label in zip(model_idx.ravel(), seg_idx.ravel(), trials):
            tar[row, col] = (label == 'target')
            non[row, col] = (label == 'nontarget')

        return modelset, segset, tar, non

    @classmethod
    def create(cls, modelset, segset, tar, non):
        """Build a Key directly from its four attributes.

        :param modelset: ndarray of model names
        :param segset: ndarray of test segment names
        :param tar: 2D boolean ndarray of target trials
        :param non: 2D boolean ndarray of non-target trials

        :return: the new Key
        """
        key = Key()
        key.modelset = modelset
        key.segset = segset
        key.tar = tar
        key.non = non
        assert key.validate(), "Wrong Key format"
        return key

    @check_path_existance
    def write(self, output_file_name):
        """Save Key in HDF5 format.

        Trials are stored in a single int8 mask: 1 for target, -1 for
        non-target and 0 otherwise.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong Key format"

        with h5py.File(output_file_name, "w") as f:
            f.create_dataset("modelset", data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("segset", data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            trialmask = numpy.array(self.tar, dtype='int8') - numpy.array(self.non, dtype='int8')
            f.create_dataset("trial_mask", data=trialmask,
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def write_txt(self, output_file_name):
        """Save a Key object to a text file, one "model segment label" per line.

        :param output_file_name: name of the output text file
        """
        with open(output_file_name, 'w') as fid:
            for model, tar_row, non_row in zip(self.modelset, self.tar, self.non):
                for seg in self.segset[tar_row]:
                    fid.write('{} {} {}\n'.format(model, seg, 'target'))
                for seg in self.segset[non_row]:
                    fid.write('{} {} {}\n'.format(model, seg, 'nontarget'))

    def filter(self, modlist, seglist, keep):
        """Remove some of the information in a key. Useful for creating a
        gender specific key from a pooled gender key. Depending on the
        value of 'keep', the two input lists indicate the strings to
        retain or the strings to discard.

        :param modlist: a list of strings which will be compared with
            the modelset of the current Key.
        :param seglist: a list of strings which will be compared with
            the segset of the current Key.
        :param keep: a boolean indicating whether modlist and seglist are the
            models to keep or discard.

        :return: a filtered version of the current Key.
        """
        if keep:
            keepmods = modlist
            keepsegs = seglist
        else:
            keepmods = diff(self.modelset, modlist)
            keepsegs = diff(self.segset, seglist)

        # explicit dtype so that empty sets still give boolean masks
        keepmodidx = numpy.array(ismember(self.modelset, keepmods), dtype='bool')
        keepsegidx = numpy.array(ismember(self.segset, keepsegs), dtype='bool')

        outkey = Key()
        outkey.modelset = self.modelset[keepmodidx]
        outkey.segset = self.segset[keepsegidx]
        outkey.tar = self.tar[keepmodidx, :][:, keepsegidx]
        outkey.non = self.non[keepmodidx, :][:, keepsegidx]

        assert outkey.validate()

        if self.modelset.shape[0] > outkey.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outkey.modelset.shape[0])
        if self.segset.shape[0] > outkey.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outkey.segset.shape[0])
        return outkey

    def to_ndx(self):
        """Create a Ndx object based on the Key object.

        :return: a Ndx object whose trials are the target and non-target
            trials of the Key
        """
        ndx = Ndx()
        ndx.modelset = self.modelset
        ndx.segset = self.segset
        ndx.trialmask = self.tar | self.non
        return ndx

    def validate(self):
        """Check that the Key obeys the rules that must always be true.

        Returns False (instead of raising) when an attribute has the wrong
        type or number of dimensions.

        :return: a boolean value indicating whether the object is valid.
        """
        arrays = (self.modelset, self.segset, self.tar, self.non)
        if not all(isinstance(array, numpy.ndarray) for array in arrays):
            return False
        if self.modelset.ndim != 1 or self.segset.ndim != 1:
            return False
        if self.tar.ndim != 2 or self.non.ndim != 2:
            return False
        expected_shape = (self.modelset.shape[0], self.segset.shape[0])
        return self.tar.shape == self.non.shape and self.tar.shape == expected_shape

    @staticmethod
    def read(input_file_fame):
        """Read a Key object from an hdf5 file.

        :param input_file_fame: name of the file to read from

        :return: the Key read from the file
        """
        with h5py.File(input_file_fame, "r") as f:

            key = Key()
            key.modelset = f["modelset"][()]
            key.segset = f["segset"][()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                key.modelset = key.modelset.astype('U100', copy=False)
                key.segset = key.segset.astype('U100', copy=False)

            trialmask = f.get("trial_mask")[()]
            key.tar = (trialmask == 1)
            key.non = (trialmask == -1)

            assert key.validate(), "Error: wrong Key format"
            return key

    @staticmethod
    def read_txt(input_file_name):
        """Create a Key object from information stored in a text file.

        Each line holds "model segment label" separated by single spaces,
        where label is 'target' or 'nontarget'.

        :param input_file_name: name of the file to read from

        :return: the Key read from the file
        """
        key = Key()

        models, testsegs, trial = numpy.loadtxt(input_file_name,
                                                delimiter=' ',
                                                dtype={'names': ('mod', 'seg', 'key'),
                                                       'formats': ('S1000', 'S1000', 'S10')},
                                                unpack=True)

        # going through object dtype shrinks the fixed-width byte strings
        models = models.astype('|O', copy=False).astype('S', copy=False)
        testsegs = testsegs.astype('|O', copy=False).astype('S', copy=False)
        trial = trial.astype('|O', copy=False).astype('S', copy=False)

        if sys.version_info[0] == 3:
            models = models.astype('U', copy=False)
            testsegs = testsegs.astype('U', copy=False)
            trial = trial.astype('U', copy=False)

        key.modelset, key.segset, key.tar, key.non = Key._build_masks(models, testsegs, trial)
        assert key.validate(), "Wrong Key format"
        return key

    @staticmethod
    def _expand_mask(mask, modelset, segset, new_modelset, new_segset):
        """Copy `mask` into a larger all-False mask indexed by the new sets.

        new_modelset and new_segset must be sorted and contain every element
        of modelset and segset (which is what numpy.union1d provides).
        """
        expanded = numpy.zeros((new_modelset.shape[0], new_segset.shape[0]), dtype="bool")
        rows = numpy.searchsorted(new_modelset, modelset)
        cols = numpy.searchsorted(new_segset, segset)
        expanded[rows[:, None], cols] = mask
        return expanded

    def merge(self, key_list):
        """Merge Key objects into the current one.

        The merged key has all models and segments of the input keys (only
        once) and the same target and non-target trials. It is an error if a
        trial is a target in one key and a non-target in another, but a
        target or non-target marker overrides a 'non-trial' marker.

        :param key_list: the list of Keys to merge
        """
        assert isinstance(key_list, list), "Input is not a list"
        for key in key_list:
            assert isinstance(key, Key), \
                '{} {} {}'.format("Element ", key, " is not a Key")

        for key2 in key_list:
            new_modelset = numpy.union1d(self.modelset, key2.modelset)
            new_segset = numpy.union1d(self.segset, key2.segset)

            # expand both keys to the merged sets, then combine the masks
            tar = (self._expand_mask(self.tar, self.modelset, self.segset, new_modelset, new_segset)
                   | self._expand_mask(key2.tar, key2.modelset, key2.segset, new_modelset, new_segset))
            non = (self._expand_mask(self.non, self.modelset, self.segset, new_modelset, new_segset)
                   | self._expand_mask(key2.non, key2.modelset, key2.segset, new_modelset, new_segset))

            # check for clashes
            assert numpy.sum(tar & non) == 0, "Conflict in the new Key"

            self.modelset = new_modelset
            self.segset = new_segset
            self.tar = tar
            self.non = non

        assert self.validate(), "Wrong Key format"
# -*- coding: utf-8 -*-

# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from the toolkit website.

"""
This is the 'ndx' module
"""
import h5py
import logging
import numpy
import sys
from ..sidekit_wrappers import check_path_existance, deprecated

__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


def _membership_lookup(candidates):
    """Return a container giving fast `in` tests equivalent to `candidates`.

    Strings are returned unchanged (so `in` keeps its substring meaning) and
    so is anything that cannot be turned into a set.
    """
    if isinstance(candidates, (str, bytes)):
        return candidates
    try:
        return set(candidates)
    except TypeError:
        return candidates


def diff(list1, list2):
    """Return the sorted list of the items of list1 that are not in list2."""
    lookup = _membership_lookup(list2)
    c = [item for item in list1 if item not in lookup]
    c.sort()
    return c


def ismember(list1, list2):
    """Return a list of booleans telling, for each item of list1, if it is in list2."""
    lookup = _membership_lookup(list2)
    return [item in lookup for item in list1]


class Ndx:
    """A class that encodes trial index information. It has a list of
    model names and a list of test segment names and a matrix
    indicating which combinations of model and test segment are
    trials of interest.

    :attr modelset: list of unique models in a ndarray
    :attr segset: list of unique test segments in a ndarray
    :attr trialmask: 2D ndarray of boolean. Rows correspond to the models
        and columns to the test segments. True if the trial is of interest.
    """

    def __init__(self, ndx_file_name='',
                 models=numpy.array([]),
                 testsegs=numpy.array([])):
        """Initialize a Ndx object, from a file or from two trial lists.

        :param ndx_file_name: name of the HDF5 file to load. When empty the
            Ndx is built from models and testsegs.
        :param models: the model of each trial
        :param testsegs: the test segment of each trial
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.trialmask = numpy.array([], dtype="bool")

        if ndx_file_name == '':
            self.modelset, self.segset, self.trialmask = self._build_trialmask(models, testsegs)
            assert self.validate(), "Wrong Ndx format"
        else:
            ndx = Ndx.read(ndx_file_name)
            self.modelset = ndx.modelset
            self.segset = ndx.segset
            self.trialmask = ndx.trialmask

    @staticmethod
    def _build_trialmask(models, testsegs):
        """Build (modelset, segset, trialmask) from two parallel trial lists.

        Models are compared for equality: a model name that is a substring of
        another one does not inherit its trials.
        """
        models = numpy.array([]) if models is None else numpy.asarray(models)
        testsegs = numpy.array([]) if testsegs is None else numpy.asarray(testsegs)
        if models.shape[0] != testsegs.shape[0]:
            raise ValueError('models and testsegs must have the same length')

        # the inverse indices give, for every trial, its row and column
        modelset, model_idx = numpy.unique(models, return_inverse=True)
        segset, seg_idx = numpy.unique(testsegs, return_inverse=True)

        trialmask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        trialmask[model_idx.ravel(), seg_idx.ravel()] = True
        return modelset, segset, trialmask

    @check_path_existance
    def write(self, output_file_name):
        """Save Ndx object in HDF5 format.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong Ndx format"

        with h5py.File(output_file_name, "w") as f:
            f.create_dataset("modelset", data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("segset", data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("trial_mask", data=self.trialmask.astype('int8'),
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def save_txt(self, output_file_name):
        """Save a Ndx object in a text file, one "model segment" per line.

        :param output_file_name: name of the file to write to
        """
        with open(output_file_name, 'w') as fid:
            for model, trial_row in zip(self.modelset, self.trialmask):
                for seg in self.segset[trial_row]:
                    fid.write('{} {}\n'.format(model, seg))

    def filter(self, modlist, seglist, keep):
        """Remove some of the information in an Ndx. Useful for creating a
        gender specific Ndx from a pooled gender Ndx. Depending on the
        value of 'keep', the two input lists indicate the strings to
        retain or the strings to discard.

        :param modlist: a list of strings which will be compared with
            the modelset of the current Ndx.
        :param seglist: a list of strings which will be compared with
            the segset of the current Ndx.
        :param keep: a boolean indicating whether modlist and seglist are the
            models to keep or discard.

        :return: a filtered version of the current Ndx object.
        """
        if keep:
            keepmods = modlist
            keepsegs = seglist
        else:
            keepmods = diff(self.modelset, modlist)
            keepsegs = diff(self.segset, seglist)

        # explicit dtype so that empty sets still give boolean masks
        keepmodidx = numpy.array(ismember(self.modelset, keepmods), dtype='bool')
        keepsegidx = numpy.array(ismember(self.segset, keepsegs), dtype='bool')

        outndx = Ndx()
        outndx.modelset = self.modelset[keepmodidx]
        outndx.segset = self.segset[keepsegidx]
        outndx.trialmask = self.trialmask[keepmodidx, :][:, keepsegidx]

        assert outndx.validate(), "Wrong Ndx format"

        if self.modelset.shape[0] > outndx.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outndx.modelset.shape[0])
        if self.segset.shape[0] > outndx.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outndx.segset.shape[0])
        return outndx

    def validate(self):
        """Check that the Ndx obeys the rules that must always be true.

        Returns False (instead of raising) when an attribute is not a ndarray.

        :return: a boolean value indicating whether the object is valid
        """
        arrays = (self.modelset, self.segset, self.trialmask)
        if not all(isinstance(array, numpy.ndarray) for array in arrays):
            return False
        if self.modelset.ndim != 1 or self.segset.ndim != 1 or self.trialmask.ndim != 2:
            return False
        return self.trialmask.shape == (self.modelset.shape[0], self.segset.shape[0])

    @staticmethod
    def read(input_file_name):
        """Create an Ndx object from the information in an hdf5 file.

        :param input_file_name: name of the file to read from

        :return: the Ndx read from the file
        """
        with h5py.File(input_file_name, "r") as f:
            ndx = Ndx()
            ndx.modelset = f["modelset"][()]
            ndx.segset = f["segset"][()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                ndx.modelset = ndx.modelset.astype('U100')
                ndx.segset = ndx.segset.astype('U100')

            # builtin bool: the numpy.bool alias was removed in NumPy 1.24
            ndx.trialmask = numpy.zeros((ndx.modelset.shape[0], ndx.segset.shape[0]), dtype=bool)
            f["trial_mask"].read_direct(ndx.trialmask)

            assert ndx.validate(), "Error: wrong Ndx format"
            return ndx

    @classmethod
    @check_path_existance
    def read_txt(cls, input_filename):
        """Create an Ndx object from information stored in a text file.

        Each non-blank line starts with "model segment"; any further column
        is ignored.

        :param input_filename: name of the file to read from

        :return: the Ndx read from the file
        """
        ndx = Ndx()

        with open(input_filename, 'r') as fid:
            rows = [line.split() for line in fid]
        rows = [row for row in rows if row]  # ignore blank lines

        models = numpy.empty(len(rows), '|O')
        testsegs = numpy.empty(len(rows), '|O')
        for ii, row in enumerate(rows):
            models[ii] = row[0]
            testsegs[ii] = row[1]

        ndx.modelset, ndx.segset, ndx.trialmask = Ndx._build_trialmask(models, testsegs)

        assert ndx.validate(), "Wrong Ndx format"
        return ndx

    @staticmethod
    def _expand_mask(mask, modelset, segset, new_modelset, new_segset):
        """Copy `mask` into a larger all-False mask indexed by the new sets.

        new_modelset and new_segset must be sorted and contain every element
        of modelset and segset (which is what numpy.union1d provides).
        """
        expanded = numpy.zeros((new_modelset.shape[0], new_segset.shape[0]), dtype="bool")
        rows = numpy.searchsorted(new_modelset, modelset)
        cols = numpy.searchsorted(new_segset, segset)
        expanded[rows[:, None], cols] = mask
        return expanded

    def merge(self, ndx_list):
        """Merge a list of Ndx objects into the current one.

        The resulting ndx has all models and segments of the input
        ndxs (only once). A trial in any ndx becomes a trial in the
        output ndx.

        :param ndx_list: list of Ndx objects to merge
        """
        assert isinstance(ndx_list, list), "Input is not a list"
        for ndx in ndx_list:
            assert isinstance(ndx, Ndx), \
                '{} {} {}'.format("Element ", ndx, " is not an Ndx")

        assert self.validate(), "Wrong Ndx format"
        for ndx2 in ndx_list:
            new_modelset = numpy.union1d(self.modelset, ndx2.modelset)
            new_segset = numpy.union1d(self.segset, ndx2.segset)

            # expand both masks to the merged sets, then combine them
            trials = (self._expand_mask(self.trialmask, self.modelset, self.segset,
                                        new_modelset, new_segset)
                      | self._expand_mask(ndx2.trialmask, ndx2.modelset, ndx2.segset,
                                          new_modelset, new_segset))

            self.modelset = new_modelset
            self.segset = new_segset
            self.trialmask = trials
class PlotWindow:
    """Axis limits, tick positions and tick labels of a DET plotting window.

    :attr __pfa_limits__: ndarray of two values that determine the limits of the pfa axis.
        Default is [0.0005, 0.5]
    :attr __pmiss_limits__: ndarray of two values that determine the limits of the pmiss axis.
        Default is [0.0005, 0.5]
    :attr __xticks__: coordinates of the ticks on the horizontal axis
    :attr __xticklabels__: labels of the ticks on the horizontal axis in a ndarray of strings
    :attr __yticks__: coordinates of the ticks on the vertical axis
    :attr __yticklabels__: labels of the ticks on the vertical axis in a ndarray of strings
    """

    def __init__(self, input_type=''):
        """Initialize PlotWindow object to one of the pre-defined plotting types.

            - 'new'
            - 'old'
            - 'big'
            - 'sre10'

        :param input_type: the type of DET plot to display. The default, an empty
            string, selects the same values as 'old'.

        :raises ValueError: if ``input_type`` is not one of the known types
        """
        # One table instead of an if/elif chain; the empty default used to carry
        # its own copy of the 'old' values, which could silently drift apart.
        presets = {
            '': self.axis_old,
            'new': self.axis_new,
            'old': self.axis_old,
            'big': self.axis_big,
            'sre10': self.axis_sre10,
        }
        try:
            apply_preset = presets[input_type]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which are just as invalid.
            raise ValueError('Error, wrong type of PlotWindow') from None
        apply_preset()

    def make_plot_window_from_values(self, pfa_limits, pmiss_limits, xticks, xticklabels, yticks, yticklabels):
        """Initialize PlotWindow from provided values

        :param pfa_limits: ndarray of two values that determine the limits of the pfa axis.
        :param pmiss_limits: ndarray of two values that determine the limits of the pmiss axis.
        :param xticks: coordinates of the ticks on the horizontal axis.
        :param xticklabels: labels of the ticks on the horizontal axis in a ndarray of strings.
        :param yticks: coordinates of the ticks on the vertical axis.
        :param yticklabels: labels of the ticks on the vertical axis in a ndarray of strings.
        """
        self.__pfa_limits__ = pfa_limits
        self.__pmiss_limits__ = pmiss_limits
        self.__xticks__ = xticks
        self.__xticklabels__ = xticklabels
        self.__yticks__ = yticks
        self.__yticklabels__ = yticklabels

    def axis_new(self):
        """Set axis value to new ones

            - pfa ranges from 0.000005 to 0.005
            - pmiss ranges from 0.01 to 0.99
        """
        self.__pfa_limits__ = numpy.array([5e-6, 5e-3])
        self.__pmiss_limits__ = numpy.array([1e-2, 0.99])
        self.__xticks__ = numpy.array([1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3])
        self.__xticklabels__ = numpy.array(['1e-3', '2e-3', '5e-3', '0.01', '0.02', '0.05', '0.1 ', '0.2 '])
        self.__yticks__ = numpy.array([0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98])
        self.__yticklabels__ = numpy.array([' 2 ', ' 5 ', '10 ', '20 ', '30 ',
                                            '40 ', '50 ', '60 ', '70 ', '80 ', '90 ', '95 ', '98 '])

    def axis_old(self):
        """Set axis value to old ones (NIST-SRE08 style)

            - pfa ranges from 0.0005 to 0.5
            - pmiss ranges from 0.0005 to 0.5
        """
        self.__pfa_limits__ = numpy.array([5e-4, 5e-1])
        self.__pmiss_limits__ = numpy.array([5e-4, 5e-1])
        self.__xticks__ = numpy.array([0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4])
        self.__xticklabels__ = numpy.array(['0.1', '0.2', '0.5', ' 1 ', ' 2 ', ' 5 ', '10 ', '20 ', '30 ', '40 '])
        self.__yticks__ = numpy.array([0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4])
        self.__yticklabels__ = numpy.array(['0.1', '0.2', '0.5', ' 1 ', ' 2 ', ' 5 ', '10 ', '20 ', '30 ', '40 '])

    def axis_big(self):
        """Set axis value to big ones

            - pfa ranges from 0.000005 to 0.99
            - pmiss ranges from 0.000005 to 0.99
        """
        self.__pfa_limits__ = numpy.array([5e-6, 0.99])
        self.__pmiss_limits__ = numpy.array([5e-6, 0.99])
        self.__yticks__ = numpy.array([5e-6, 5e-5, 5e-4, 0.5e-2, 2.5e-2, 10e-2,
                                       25e-2, 50e-2, 72e-2, 88e-2, 96e-2, 99e-2])
        self.__yticklabels__ = numpy.array(['5e-4', '5e-3', '0.05', '0.5 ',
                                            '2.5 ', ' 10 ', ' 25 ', ' 50 ', ' 72 ', ' 88 ', ' 96 ', ' 99 '])
        # The horizontal axis carries one tick fewer: it has no 5e-6 tick.
        self.__xticks__ = numpy.array([5e-5, 5e-4, 0.5e-2, 2.5e-2, 10e-2, 25e-2, 50e-2, 72e-2, 88e-2, 96e-2, 99e-2])
        self.__xticklabels__ = numpy.array(['5e-3', '0.05', '0.5 ', '2.5 ',
                                            ' 10 ', ' 25 ', ' 50 ', ' 72 ', ' 88 ', ' 96 ', ' 99 '])

    def axis_sre10(self):
        """Set axis value to NIST-SRE10 style

            - pfa ranges from 0.000003 to 0.5
            - pmiss ranges from 0.0003 to 0.9
        """
        self.__pfa_limits__ = numpy.array([3e-6, 5e-1])
        self.__pmiss_limits__ = numpy.array([3e-4, 9e-1])
        self.__xticks__ = numpy.array([1e-5, 1e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 4e-1])
        self.__xticklabels__ = numpy.array(['0.001', ' 0.01', ' 0.1', ' 0.2',
                                            ' 0.5', ' 1', ' 2', ' 5', ' 10', ' 20', ' 40'])
        self.__yticks__ = numpy.array([1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 4e-1, 8e-1])
        self.__yticklabels__ = numpy.array(['0.1', '0.2', '0.5', ' 1 ', ' 2 ', ' 5 ', ' 10', ' 20', ' 40', ' 80'])
def diff(list1, list2):
    """Return the sorted items of ``list1`` that do not appear in ``list2``.

    :param list1: iterable of mutually comparable items
    :param list2: container tested with ``in``

    :return: a new sorted list
    """
    return sorted(item for item in list1 if item not in list2)


def ismember(list1, list2):
    """Tell, for each item of ``list1``, whether it is contained in ``list2``.

    :param list1: iterable of items to look up
    :param list2: container tested with ``in``; pass a list or an array, because
        a plain string would be searched for substrings

    :return: a list of booleans, one per item of ``list1``
    """
    return [item in list2 for item in list1]


def _first_positions(names):
    """Map every name to the index of its first occurrence in ``names``."""
    positions = {}
    for pos, name in enumerate(names):
        positions.setdefault(name, pos)
    return positions


class Scores:
    """A class for storing scores for trials. The modelset and segset
    fields are lists of model and test segment names respectively.
    The element i,j of scoremat and scoremask corresponds to the
    trial involving model i and test segment j.

    :attr modelset: list of unique models in a ndarray
    :attr segset: list of unique test segments in a ndarray
    :attr scoremask: 2D ndarray of boolean which indicates the trials of interest
            i.e. the entry i,j in scoremat should be ignored if scoremask[i,j] is False
    :attr scoremat: 2D ndarray of scores
    """

    def __init__(self, scores_file_name=''):
        """Initialize a Scores object, optionally loading it from a HDF5 file.

        :param scores_file_name: name of the file to load; an empty string
            (the default) creates an empty object
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.scoremask = numpy.array([], dtype="bool")
        self.scoremat = numpy.array([])

        if scores_file_name != '':
            tmp = Scores.read(scores_file_name)
            self.modelset = tmp.modelset
            self.segset = tmp.segset
            self.scoremask = tmp.scoremask
            self.scoremat = tmp.scoremat

    def __repr__(self):
        """Return a multi-line dump of the four members."""
        return ('modelset:\n{!r}\n'
                'segset:\n{!r}\n'
                'scoremask:\n{!r}\n'
                'scoremat:\n{!r}\n').format(self.modelset, self.segset, self.scoremask, self.scoremat)

    def _write_hdf5(self, output_file_name, model_key, seg_key):
        """Write the four members to a HDF5 file.

        :param output_file_name: name of the file to write to
        :param model_key: dataset name used for the model names
        :param seg_key: dataset name used for the segment names
        """
        with h5py.File(output_file_name, "w") as f:
            f.create_dataset(model_key, data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset(seg_key, data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            # The mask is stored as int8 and converted back to bool on reading.
            f.create_dataset("score_mask", data=self.scoremask.astype('int8'),
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("scores", data=self.scoremat,
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def write(self, output_file_name):
        """ Save Scores in HDF5 format

        :param output_file_name: name of the file to write to
        """
        self._write_hdf5(output_file_name, "modelset", "segset")

    @check_path_existance
    def write_txt(self, output_file_name):
        """Save a Scores object in a text file, one "model segment score" line per trial

        :param output_file_name: name of the file to write to
        """
        # dirname is '' for a bare file name and os.makedirs('') raises.
        out_dir = os.path.dirname(output_file_name)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(output_file_name, 'w') as fid:
            for m in range(self.modelset.shape[0]):
                row_mask = self.scoremask[m, ]
                segs = self.segset[row_mask]
                scores = self.scoremat[m, row_mask]
                for seg, score in zip(segs, scores):
                    fid.write('{} {} {}\n'.format(self.modelset[m], seg, score))

    @check_path_existance
    def write_matlab(self, output_file_name):
        """Save a Scores object in Bosaris compatible HDF5 format

        :param output_file_name: name of the file to write to
        """
        self._write_hdf5(output_file_name, "/ID/row_ids", "/ID/column_ids")

    def get_tar_non(self, key):
        """Divides scores into target and non-target scores using
        information in a key.

        :param key: a Key object.

        :return: a vector of target scores.
        :return: a vector of non-target scores.
        """
        # array_equal copes with differing lengths, where "==" does not
        # return an array that ".all()" can be called on.
        same_layout = (self.scoremask.shape == key.tar.shape
                       and numpy.array_equal(key.modelset, self.modelset)
                       and numpy.array_equal(key.segset, self.segset))
        if same_layout:
            return self.scoremat[key.tar & self.scoremask], self.scoremat[key.non & self.scoremask]

        new_score = self.align_with_ndx(key)
        tarndx = key.tar & new_score.scoremask
        nonndx = key.non & new_score.scoremask
        tar = new_score.scoremat[tarndx]
        non = new_score.scoremat[nonndx]
        return tar, non

    def align_with_ndx(self, ndx):
        """The ordering in the output Scores object corresponds to ndx, so
        aligning several Scores objects with the same ndx will result in
        them being comparable with each other.

        :param ndx: a Key or Ndx object

        :return: resized version of the current Scores object to size of \'ndx\'
                and reordered according to the ordering of modelset and segset in \'ndx\'.
        """
        aligned_scr = Scores()
        aligned_scr.modelset = ndx.modelset
        aligned_scr.segset = ndx.segset

        # One dictionary lookup per name instead of one linear scan per name.
        model_pos = _first_positions(self.modelset)
        seg_pos = _first_positions(self.segset)

        hasmodel = numpy.array([name in model_pos for name in ndx.modelset], dtype=bool)
        rindx = numpy.array([model_pos[name] for name in ndx.modelset[hasmodel]], dtype=int)
        hasseg = numpy.array([name in seg_pos for name in ndx.segset], dtype=bool)
        cindx = numpy.array([seg_pos[name] for name in ndx.segset[hasseg]], dtype=int)

        rows = numpy.flatnonzero(hasmodel)
        cols = numpy.flatnonzero(hasseg)
        out_shape = (ndx.modelset.shape[0], ndx.segset.shape[0])

        aligned_scr.scoremat = numpy.zeros(out_shape)
        aligned_scr.scoremat[rows[:, None], cols] = self.scoremat[rindx[:, None], cindx]

        aligned_scr.scoremask = numpy.zeros(out_shape, dtype='bool')
        aligned_scr.scoremask[rows[:, None], cols] = self.scoremask[rindx[:, None], cindx]

        assert numpy.sum(aligned_scr.scoremask) <= (numpy.sum(hasmodel) * numpy.sum(hasseg)), 'Error in new scoremask'

        # Keep only the trials requested by ndx.
        if isinstance(ndx, Ndx):
            aligned_scr.scoremask = aligned_scr.scoremask & ndx.trialmask
        else:
            aligned_scr.scoremask = aligned_scr.scoremask & (ndx.tar | ndx.non)

        if numpy.sum(hasmodel) < ndx.modelset.shape[0]:
            logging.info('models reduced from %d to %d', ndx.modelset.shape[0], numpy.sum(hasmodel))
        if numpy.sum(hasseg) < ndx.segset.shape[0]:
            logging.info('testsegs reduced from %d to %d', ndx.segset.shape[0], numpy.sum(hasseg))

        if isinstance(ndx, Key):
            tar = ndx.tar & aligned_scr.scoremask
            non = ndx.non & aligned_scr.scoremask

            missing = numpy.sum(ndx.tar) - numpy.sum(tar)
            if missing > 0:
                logging.info('%d of %d targets missing', missing, numpy.sum(ndx.tar))
            missing = numpy.sum(ndx.non) - numpy.sum(non)
            if missing > 0:
                logging.info('%d of %d non targets missing', missing, numpy.sum(ndx.non))
        else:
            mask = ndx.trialmask & aligned_scr.scoremask
            missing = numpy.sum(ndx.trialmask) - numpy.sum(mask)
            if missing > 0:
                logging.info('%d of %d trials missing', missing, numpy.sum(ndx.trialmask))

        assert numpy.all(numpy.isfinite(aligned_scr.scoremat[aligned_scr.scoremask])), \
            'Inifinite or Nan value in the scoremat'
        assert aligned_scr.validate(), 'Wrong Score format'
        return aligned_scr

    def set_missing_to_value(self, ndx, value):
        """Sets all scores for which the trialmask is true but the scoremask
        is false to the same value, supplied by the user.

        :param ndx: a Key or Ndx object.
        :param value: a value for the missing scores.

        :return: a Scores object (with the missing scores added and set
                    to value).
        """
        if isinstance(ndx, Key):
            ndx = ndx.to_ndx()

        new_scr = self.align_with_ndx(ndx)
        # "~" is the boolean complement; NumPy rejects unary "-" on bool arrays.
        missing = ndx.trialmask & ~new_scr.scoremask
        new_scr.scoremat[missing] = value
        new_scr.scoremask[missing] = True
        assert new_scr.validate(), "Wrong format of Scores"
        return new_scr

    def filter(self, modlist, seglist, keep):
        """Removes some of the information in a Scores object. Useful for
        creating a gender specific score set from a pooled gender score
        set. Depending on the value of \'keep\', the two input lists
        indicate the models and test segments (and their associated
        scores) to retain or discard.

        :param modlist: a list of strings which will be compared with
                the modelset of the current Scores object.
        :param seglist: a list of strings which will be compared with
                    the segset of the current Scores object.
        :param keep: a boolean indicating whether modlist and seglist are the
                models to keep or discard.

        :return: a filtered copy of the current Scores object.
        """
        if keep:
            keepmods = modlist
            keepsegs = seglist
        else:
            keepmods = diff(self.modelset, modlist)
            keepsegs = diff(self.segset, seglist)

        keepmodidx = numpy.array(ismember(self.modelset, keepmods), dtype=bool)
        keepsegidx = numpy.array(ismember(self.segset, keepsegs), dtype=bool)

        outscr = Scores()
        outscr.modelset = self.modelset[keepmodidx]
        outscr.segset = self.segset[keepsegidx]
        outscr.scoremat = self.scoremat[keepmodidx, :][:, keepsegidx]
        outscr.scoremask = self.scoremask[keepmodidx, :][:, keepsegidx]

        assert outscr.validate(), 'Wrong Scores format'

        if self.modelset.shape[0] > outscr.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outscr.modelset.shape[0])
        if self.segset.shape[0] > outscr.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outscr.segset.shape[0])
        return outscr

    def validate(self):
        """Checks that an object of type Scores obeys certain rules that
        must always be true.

        :return: a boolean value indicating whether the object is valid.
        """
        # A freshly created empty object holds 1-D arrays: report it as
        # invalid instead of failing on shape[1].
        if self.scoremat.ndim != 2 or self.scoremask.ndim != 2:
            return False
        ok = self.scoremat.shape == self.scoremask.shape
        ok &= (self.scoremat.shape[0] == self.modelset.shape[0])
        ok &= (self.scoremat.shape[1] == self.segset.shape[0])
        return bool(ok)

    @staticmethod
    def _read_hdf5(input_file_name, model_key, seg_key):
        """Load a Scores object from a HDF5 file.

        :param input_file_name: name of the file to read from
        :param model_key: dataset name holding the model names
        :param seg_key: dataset name holding the segment names

        :return: a validated Scores object
        """
        with h5py.File(input_file_name, "r") as f:

            def load(name):
                dataset = f[name]
                buffer = numpy.empty(dataset.shape, dtype=dataset.dtype)
                dataset.read_direct(buffer)
                return buffer

            scores = Scores()
            scores.modelset = load(model_key).astype('U100', copy=False)
            scores.segset = load(seg_key).astype('U100', copy=False)
            scores.scoremask = load("score_mask").astype('bool', copy=False)
            scores.scoremat = load("scores")

        assert scores.validate(), "Error: wrong Scores format"
        return scores

    @staticmethod
    def read(input_file_name):
        """Read a Scores object from information in a hdf5 file.

        :param input_file_name: name of the file to read from
        """
        return Scores._read_hdf5(input_file_name, "modelset", "segset")

    @staticmethod
    def read_matlab(input_file_name):
        """Read a Scores object from information in a hdf5 file in Matlab BOSARIS format.

        :param input_file_name: name of the file to read from
        """
        return Scores._read_hdf5(input_file_name, "ID/row_ids", "ID/column_ids")

    @classmethod
    @check_path_existance
    def read_txt(cls, input_file_name):
        """Creates a Scores object from information stored in a text file.

        Each non-blank line is expected to hold "model segment score",
        separated by whitespace.

        :param input_file_name: name of the file to read from
        """
        with open(input_file_name, 'r') as fid:
            lines = [line.split() for line in fid if line.strip()]

        models = numpy.array([fields[0] for fields in lines], dtype='|O')
        testsegs = numpy.array([fields[1] for fields in lines], dtype='|O')
        scores = numpy.array([float(fields[2]) for fields in lines], dtype=float)

        modelset = numpy.unique(models)
        segset = numpy.unique(testsegs)

        scoremask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        scoremat = numpy.zeros((modelset.shape[0], segset.shape[0]))
        for m, model in enumerate(modelset):
            # Exact comparison: "in" against a single name would match
            # substrings and mix up models such as "spk1" and "spk10".
            rows = models == model
            segs = testsegs[rows]
            scrs = scores[rows]
            # segset is sorted, so the scores must follow the same order.
            order = segs.argsort()
            segs = segs[order]
            scrs = scrs[order]
            present = numpy.array(ismember(segset, segs), dtype=bool)
            scoremask[m, ] = present
            scoremat[m, present] = scrs

        s = cls()
        s.modelset = modelset
        s.segset = segset
        s.scoremask = scoremask
        s.scoremat = scoremat
        assert s.validate(), "Wrong Scores format"
        s.sort()
        return s

    @staticmethod
    def _expand(scr, modelset, segset):
        """Copy the scores and mask of ``scr`` into matrices indexed by ``modelset`` x ``segset``."""
        shape = (modelset.shape[0], segset.shape[0])
        scoremat = numpy.zeros(shape)
        scoremask = numpy.zeros(shape, dtype='bool')
        target = numpy.ix_(numpy.flatnonzero(numpy.isin(modelset, scr.modelset)),
                           numpy.flatnonzero(numpy.isin(segset, scr.segset)))
        source = numpy.ix_(numpy.flatnonzero(numpy.isin(scr.modelset, modelset)),
                           numpy.flatnonzero(numpy.isin(scr.segset, segset)))
        scoremat[target] = scr.scoremat[source]
        scoremask[target] = scr.scoremask[source]
        return scoremat, scoremask

    def merge(self, score_list):
        """Merges a list of Scores objects into the current one.
        The resulting must have all models and segment in the input
        Scores (only once) and the union of all the scoremasks.
        It is an error if two of the input Scores objects have a
        score for the same trial.

        Note that every object of ``score_list`` is sorted in place.

        :param score_list: the list of Scores object to merge
        """
        assert isinstance(score_list, list), "Input is not a list"
        for scr in score_list:
            assert isinstance(scr, Scores), \
                '{} {} {}'.format("Element ", scr, " is not a Score")

        assert self.validate(), 'Wrong Scores format'
        for scr2 in score_list:
            self.sort()
            scr2.sort()

            # The unions are sorted, like the two operands.
            modelset = numpy.union1d(self.modelset, scr2.modelset)
            segset = numpy.union1d(self.segset, scr2.segset)

            scoremat_1, scoremask_1 = Scores._expand(self, modelset, segset)
            scoremat_2, scoremask_2 = Scores._expand(scr2, modelset, segset)

            # check for clashes
            assert numpy.sum(scoremask_1 & scoremask_2) == 0, "Conflict in the new scoremask"

            # merge masks
            self.scoremat = scoremat_1 + scoremat_2
            self.scoremask = scoremask_1 | scoremask_2
            self.modelset = modelset
            self.segset = segset
            assert self.validate(), 'Wrong Scores format'

    def sort(self):
        """Sort models and segments"""
        sort_model_idx = numpy.argsort(self.modelset)
        sort_seg_idx = numpy.argsort(self.segset)
        sort_mask = self.scoremask[sort_model_idx[:, None], sort_seg_idx]
        sort_mat = self.scoremat[sort_model_idx[:, None], sort_seg_idx]
        # Build new name arrays rather than sorting in place: the current
        # ones may be shared with the Ndx/Key they were aligned with.
        self.modelset = self.modelset[sort_model_idx]
        self.segset = self.segset[sort_seg_idx]
        self.scoremat = sort_mat
        self.scoremask = sort_mask

    def get_score(self, modelID, segID):
        """return a score given a model and segment identifiers
        raise an error if the trial does not exist

        :param modelID: id of the model
        :param segID: id of the test segment

        :return: the matching entries of scoremat as a 2D ndarray
        :raises ValueError: if the model or the segment is unknown
        """
        model_idx = numpy.argwhere(self.modelset == modelID)
        seg_idx = numpy.argwhere(self.segset == segID)
        if model_idx.shape[0] == 0:
            raise ValueError('No such model as: {}'.format(modelID))
        if seg_idx.shape[0] == 0:
            raise ValueError('No such segment as: {}'.format(segID))
        return self.scoremat[model_idx, seg_idx]
def e_on_batch(stat0, stat1, ubm, F):
    """
    Run the Expectation step on one batch of sessions.

    ``stat1`` is centred in place and, for a diagonal-covariance mixture,
    also scaled in place, so the caller's array is modified.

    :param stat0: matrix of zero-order statistics (1 session per line)
    :param stat1: matrix of first-order statistics (1 session per line)
    :param ubm: Mixture object
    :param F: factor loading matrix
    :return: the first-order statistics and the second-order statistics,
        the latter stored as the upper triangle of each matrix
    """
    session_nb, nb_distrib = stat0.shape[0], stat0.shape[1]
    tv_rank = F.shape[1]
    feature_size = ubm.mu.shape[1]

    # index_map[k] is the distribution owning coefficient k of a super-vector
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
    triu = numpy.triu_indices(tv_rank)
    diagonal_gmm = ubm.invcov.ndim == 2

    # Output buffers
    e_h = numpy.zeros((session_nb, tv_rank), dtype=STAT_TYPE)
    e_hh = numpy.zeros((session_nb, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE)

    # Centre the statistics, then whiten them according to the covariance type
    stat1 -= stat0[:, index_map] * ubm.get_mean_super_vector()
    if diagonal_gmm:
        stat1 *= numpy.sqrt(ubm.get_invcov_super_vector())
    else:
        per_distrib = stat1.T.reshape(-1, nb_distrib, session_nb)
        stat1 = numpy.einsum("ikj,ikl->ilj", per_distrib, ubm.invchol).reshape(-1, session_nb).T

    loading_t = F.T
    for idx in range(session_nb):
        precision = numpy.eye(tv_rank) + (loading_t * stat0[idx, index_map]).dot(F)
        inv_lambda = scipy.linalg.inv(precision)
        projected = loading_t.dot(stat1[idx, :])
        e_h[idx] = numpy.dot(projected, inv_lambda)
        second_moment = inv_lambda + numpy.outer(e_h[idx], e_h[idx])
        e_hh[idx] = second_moment[triu]

    return e_h, e_hh


def e_worker(arg, q):
    """
    Producer: run the Expectation step on a batch and queue the result.

    :param arg: a tuple that should include
        a matrix of zero-order statistics (1 session per line)
        a matrix of first-order statistics (1 session per line)
        a Mixture object
        a factor loading matrix
    :param q: output queue (a multiprocessing.Queue object)
    """
    expectations = e_on_batch(*arg)
    # The two statistics matrices travel along with the expectations
    q.put(arg[:2] + expectations)


def e_gather(arg, q):
    """
    Consumer: add up the accumulators delivered through the queue.

    The loop ends when an item whose third element is None is received.
    The accumulators are updated in place.

    :param arg: a tuple of three accumulators for the estimation of Factor Analysis matrix
    :param q: input queue that is filled by the producers and emptied in this function
        (a multiprocessing.Queue object)
    :return: the three accumulators
    """
    acc_a, acc_c, acc_r = arg

    while True:
        stat0, stat1, e_h, e_hh = q.get()
        if e_h is None:
            return acc_a, acc_c, acc_r
        acc_a += numpy.dot(stat0.T, e_hh)
        acc_c += numpy.dot(e_h.T, stat1)
        acc_r += e_hh.sum(axis=0)
def iv_extract_on_batch(arg, q):
    """
    Extract i-vectors for a batch of sessions (shows)

    The queued item holds the batch indices, the i-vectors and the diagonal
    of each second-order matrix.

    :param arg: a tuple of inputs that includes a list of batch_indices, a matrix of zero-order statistics
        a matrix of first order statistics, a Mixture model and loading factor matrix
    :param q: the output queue to fill (a multiprocessing.Queue object)
    """
    batch_indices, stat0, stat1, ubm, F = arg
    E_h, E_hh = e_on_batch(stat0, stat1, ubm, F)
    tv_rank = E_h.shape[1]
    # Position of the diagonal term (row, row) in the packed upper triangle
    diag_positions = numpy.array([row * tv_rank - ((row * (row - 1)) // 2) for row in range(tv_rank)])
    q.put((batch_indices, E_h, E_hh[:, diag_positions]))


def iv_collect(arg, q):
    """
    Consumer: store the i-vectors and uncertainty matrices (diagonal version only)
    delivered through the queue.

    The loop ends when an item whose second element is None is received.

    :param arg: a tuple of inputs including a matrix to store i-vectors and a matrix
        to store uncertainty matrices
    :param q: the input queue (a multiprocessing.Queue object)
    :return: the matrices of i-vectors and uncertainty matrices
    """
    iv, iv_sigma = arg

    while True:
        batch_idx, e_h, e_hh = q.get()
        if e_h is None:
            return iv, iv_sigma
        iv[batch_idx, :] = e_h
        iv_sigma[batch_idx, :] = e_hh


@process_parallel_lists
def fa_model_loop(batch_start,
                  mini_batch_indices,
                  factor_analyser,
                  stat0,
                  stat1,
                  e_h,
                  e_hh,
                  num_thread=1):
    """
    Expectation step of the PLDA estimation, run on a mini-batch of classes.

    The results are written into ``e_h`` and ``e_hh``; nothing is returned.

    :param batch_start: index to start at in the list
    :param mini_batch_indices: indices of the elements in the list (should start at zero)
    :param factor_analyser: FactorAnalyser object
    :param stat0: matrix of zero order statistics
    :param stat1: matrix of first order statistics
    :param e_h: accumulator
    :param e_hh: accumulator
    :param num_thread: number of parallel process to run
    """
    rank = factor_analyser.F.shape[1]
    if factor_analyser.Sigma.ndim == 2:
        # The inverse depends only on the first column of stat0:
        # compute it once per distinct value.
        A = factor_analyser.F.T.dot(factor_analyser.F)
        inv_lambda_unique = {
            sess: scipy.linalg.inv(sess * A + numpy.eye(A.shape[0]))
            for sess in numpy.unique(stat0[:, 0])
        }

    # Scratch buffer receiving the outer products
    tmp = numpy.zeros((factor_analyser.F.shape[1], factor_analyser.F.shape[1]), dtype=STAT_TYPE)

    for idx in mini_batch_indices:
        row = idx + batch_start
        if factor_analyser.Sigma.ndim == 1:
            precision = numpy.eye(rank) + (factor_analyser.F.T * stat0[row, :]).dot(factor_analyser.F)
            inv_lambda = scipy.linalg.inv(precision)
        else:
            inv_lambda = inv_lambda_unique[stat0[row, 0]]

        aux = factor_analyser.F.T.dot(stat1[row, :])
        numpy.dot(aux, inv_lambda, out=e_h[idx])
        e_hh[idx] = inv_lambda + numpy.outer(e_h[idx], e_h[idx], tmp)
None: + kind[0] = 1 + fh.create_dataset("fa/mean", data=self.mean, + compression="gzip", + fletcher32=True) + if self.F is not None: + kind[1] = 1 + fh.create_dataset("fa/f", data=self.F, + compression="gzip", + fletcher32=True) + if self.G is not None: + kind[2] = 1 + fh.create_dataset("fa/g", data=self.G, + compression="gzip", + fletcher32=True) + if self.H is not None: + kind[3] = 1 + fh.create_dataset("fa/h", data=self.H, + compression="gzip", + fletcher32=True) + if self.Sigma is not None: + kind[4] = 1 + fh.create_dataset("fa/sigma", data=self.Sigma, + compression="gzip", + fletcher32=True) + fh.create_dataset("fa/kind", data=kind, + compression="gzip", + fletcher32=True) + + @staticmethod + def read(input_filename): + """ + Read a generic FactorAnalyser model from a HDF5 file + + :param input_filename: the name of the file to read from + + :return: a FactorAnalyser object + """ + fa = FactorAnalyser() + with h5py.File(input_filename, "r") as fh: + kind = fh.get("fa/kind")[()] + if kind[0] != 0: + fa.mean = fh.get("fa/mean")[()] + if kind[1] != 0: + fa.F = fh.get("fa/f")[()] + if kind[2] != 0: + fa.G = fh.get("fa/g")[()] + if kind[3] != 0: + fa.H = fh.get("fa/h")[()] + if kind[4] != 0: + fa.Sigma = fh.get("fa/sigma")[()] + return fa + + def total_variability_raw(self, + stat_server, + ubm, + tv_rank, + nb_iter=20, + min_div=True, + tv_init=None, + save_init=False, + output_file_name=None): + """ + Train a total variability model using a single process on a single node. + This method is provided for didactic purpose and should not be used as it uses + to much memory and is to slow. 
If you want to use a single process + run: "total_variability_single" + + :param stat_server: the StatServer containing data to train the model + :param ubm: a Mixture object + :param tv_rank: rank of the total variability model + :param nb_iter: number of EM iteration + :param min_div: boolean, if True, apply minimum divergence re-estimation + :param tv_init: initial matrix to start the EM iterations with + :param save_init: boolean, if True, save the initial matrix + :param output_file_name: name of the file where to save the matrix + """ + assert(isinstance(stat_server, StatServer) and stat_server.validate()), \ + "First argument must be a proper StatServer" + assert(isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture" + assert(isinstance(tv_rank, int) and (0 < tv_rank <= min(stat_server.stat1.shape))), \ + "tv_rank must be a positive integer less than the dimension of the statistics" + assert(isinstance(nb_iter, int) and (0 < nb_iter)), "nb_iter must be a positive integer" + + gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full" + + # Set useful variables + nb_sessions, sv_size = stat_server.stat1.shape + feature_size = ubm.mu.shape[1] + nb_distrib = ubm.w.shape[0] + + # Whiten the statistics for diagonal or full models + if gmm_covariance == "diag": + stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. 
/ ubm.get_invcov_super_vector()) + elif gmm_covariance == "full": + stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol) + + # mean and Sigma are initialized at ZEROS as statistics are centered + self.mean = numpy.zeros(ubm.get_mean_super_vector().shape) + self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape) + + # Initialize TV from given data or randomly + # mean and Sigma are initialized at ZEROS as statistics are centered + self.mean = numpy.zeros(ubm.get_mean_super_vector().shape) + self.F = numpy.random.randn(sv_size, tv_rank) if tv_init is None else tv_init + self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape) + + # Save init if required + if save_init: + if output_file_name is None: + self.write(output_file_name + "temporary_factor_analyser_init.h5") + else: + self.write(output_file_name + "_init.h5") + + # Estimate TV iteratively + for it in range(nb_iter): + print(f"Start it: {it}") + # Create accumulators for the list of models to process + _A = numpy.zeros((nb_distrib, tv_rank, tv_rank), dtype=STAT_TYPE) + _C = numpy.zeros((tv_rank, feature_size * nb_distrib), dtype=STAT_TYPE) + + _R = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE) + + # E-step: + index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size) + + for sess in range(stat_server.segset.shape[0]): + + inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + + (self.F.T * stat_server.stat0[sess, index_map]).dot(self.F)) + Aux = self.F.T.dot(stat_server.stat1[sess, :]) + e_h = Aux.dot(inv_lambda) + e_hh = inv_lambda + numpy.outer(e_h, e_h) + + # Accumulate for minimum divergence step + _R += e_hh + + # Accumulate for M-step + _C += numpy.outer(e_h, stat_server.stat1[sess, :]) + _A += e_hh * stat_server.stat0[sess][:, numpy.newaxis, numpy.newaxis] + + _R /= nb_sessions + + # M-step ( + MinDiv si _R n'est pas None) + for g in range(nb_distrib): + distrib_idx = range(g * feature_size, (g + 1) * feature_size) + self.F[distrib_idx, :] = scipy.linalg.solve(_A[g], _C[:, 
distrib_idx]).T + + # MINIMUM DIVERGENCE STEP + if min_div: + ch = scipy.linalg.cholesky(_R) + self.F = self.F.dot(ch) + + # Save the FactorAnalyser if output_file_name is not None + if output_file_name is not None: + if it < nb_iter - 1: + self.write(output_file_name + "_it-{}.h5".format(it)) + else: + self.write(output_file_name + ".h5") + + def total_variability_single(self, + stat_server_filename, + ubm, + tv_rank, + nb_iter=20, + min_div=True, + tv_init=None, + batch_size=300, + save_init=False, + output_file_name=None): + """ + Train a total variability model using a single process on a single node. + Use this method to run a single process on a single node with optimized code. + + Optimization: + Only half of symmetric matrices are stored here + process sessions per batch in order to control the memory footprint + + :param stat_server_filename: the name of the file for StatServer, containing data to train the model + :param ubm: a Mixture object + :param tv_rank: rank of the total variability model + :param nb_iter: number of EM iteration + :param min_div: boolean, if True, apply minimum divergence re-estimation + :param tv_init: initial matrix to start the EM iterations with + :param batch_size: number of sessions to process at once to reduce memory footprint + :param save_init: boolean, if True, save the initial matrix + :param output_file_name: name of the file where to save the matrix + """ + assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture" + assert (isinstance(nb_iter, int) and (0 < nb_iter)), "nb_iter must be a positive integer" + + gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full" + + # Set useful variables + with h5py.File(stat_server_filename, 'r') as fh: + nb_sessions, sv_size = fh["stat1"].shape + feature_size = ubm.mu.shape[1] + nb_distrib = ubm.w.shape[0] + sv_size = nb_distrib * feature_size + + # Initialize TV from given data or randomly + # mean and Sigma are initialized at ZEROS as 
statistics are centered + self.mean = numpy.zeros(ubm.get_mean_super_vector().shape) + self.F = numpy.random.randn(sv_size, tv_rank) if tv_init is None else tv_init + self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape) + + # Save init if required + if save_init: + if output_file_name is None: + self.write(output_file_name + "temporary_factor_analyser_init.h5") + else: + self.write(output_file_name + "_init.h5") + + # Create index to replicate self.stat0 and save only upper triangular coefficients of symmetric matrices + index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size) + upper_triangle_indices = numpy.triu_indices(tv_rank) + + # Open the StatServer file + with h5py.File(stat_server_filename, 'r') as fh: + nb_sessions, sv_size = fh['stat1'].shape + batch_nb = int(numpy.floor(fh['segset'].shape[0] / float(batch_size) + 0.999)) + batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb) + + # Estimate TV iteratively + for it in range(nb_iter): + + # Create accumulators for the list of models to process + _A = numpy.zeros((nb_distrib, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE) + _C = numpy.zeros((tv_rank, feature_size * nb_distrib), dtype=STAT_TYPE) + _R = numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE) + + # Load data per batch to reduce the memory footprint + for batch_idx in batch_indices: + + stat0 = fh['stat0'][batch_idx, :] + stat1 = fh['stat1'][batch_idx, :] + + e_h, e_hh = e_on_batch(stat0, stat1, ubm, self.F) + + _R += numpy.sum(e_hh, axis=0) + _C += e_h.T.dot(stat1) + _A += stat0.T.dot(e_hh) + + _R /= nb_sessions + + # M-step + _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE) + for c in range(nb_distrib): + distrib_idx = range(c * feature_size, (c + 1) * feature_size) + _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = _A[c, :] + self.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, _C[:, distrib_idx]).T + + # Minimum divergence + if min_div: + _R_tmp = numpy.zeros((tv_rank, 
tv_rank), dtype=STAT_TYPE) + _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = _R + ch = scipy.linalg.cholesky(_R_tmp) + self.F = self.F.dot(ch) + + # Save the FactorAnalyser if output_file_name is not None + if output_file_name is not None: + if it < nb_iter - 1: + self.write(output_file_name + "_it-{}.h5".format(it)) + else: + self.write(output_file_name + ".h5") + + def total_variability(self, + stat_server_filename, + ubm, + tv_rank, + nb_iter=20, + min_div=True, + tv_init=None, + batch_size=300, + save_init=False, + output_file_name=None, + num_thread=1): + """ + Train a total variability model using multiple process on a single node. + this method is the recommended one to train a Total Variability matrix. + + Optimization: + Only half of symmetric matrices are stored here + process sessions per batch in order to control the memory footprint + Batches are processed by a pool of workers running in different process + The implementation is based on a multiple producers / single consumer approach + + :param stat_server_filename: a list of StatServer file names to process + :param ubm: a Mixture object + :param tv_rank: rank of the total variability model + :param nb_iter: number of EM iteration + :param min_div: boolean, if True, apply minimum divergence re-estimation + :param tv_init: initial matrix to start the EM iterations with + :param batch_size: size of batch to load in memory for each worker + :param save_init: boolean, if True, save the initial matrix + :param output_file_name: name of the file where to save the matrix + :param num_thread: number of process to run in parallel + """ + if not isinstance(stat_server_filename, list): + stat_server_filename = [stat_server_filename] + + assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture" + assert (isinstance(nb_iter, int) and (0 < nb_iter)), "nb_iter must be a positive integer" + + gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full" + + # 
Set useful variables + if not isinstance(stat_server_filename[0], h5py._hl.files.File): + with h5py.File(stat_server_filename[0], 'r') as fh: # open the first StatServer to get size + _, sv_size = fh['stat1'].shape + feature_size = fh['stat1'].shape[1] // fh['stat0'].shape[1] + distrib_nb = fh['stat0'].shape[1] + else: + _, sv_size = stat_server_filename[0]['stat1'].shape + feature_size = stat_server_filename[0]['stat1'].shape[1] // stat_server_filename[0]['stat0'].shape[1] + distrib_nb = stat_server_filename[0]['stat0'].shape[1] + + upper_triangle_indices = numpy.triu_indices(tv_rank) + + # mean and Sigma are initialized at ZEROS as statistics are centered + self.mean = numpy.zeros(ubm.get_mean_super_vector().shape, dtype=STAT_TYPE) + self.F = serialize(numpy.zeros((sv_size, tv_rank)).astype(STAT_TYPE)) + if tv_init is None: + self.F = numpy.random.randn(sv_size, tv_rank).astype(STAT_TYPE) + else: + self.F = tv_init + self.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape, dtype=STAT_TYPE) + + # Save init if required + if save_init: + if output_file_name is None: + self.write(output_file_name + "temporary_factor_analyser_init.h5") + else: + self.write(output_file_name + "_init.h5") + + # Estimate TV iteratively + for it in range(nb_iter): + print(f"Iteration {it}") + # Create serialized accumulators for the list of models to process + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + _A = serialize(numpy.zeros((distrib_nb, tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE)) + _C = serialize(numpy.zeros((tv_rank, sv_size), dtype=STAT_TYPE)) + _R = serialize(numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=STAT_TYPE)) + + total_session_nb = 0 + + # E-step + # Accumulate statistics for each StatServer from the list + for stat_server_file in stat_server_filename: + + # get info from the current StatServer + if not isinstance(stat_server_file, h5py._hl.files.File): + fh = h5py.File(stat_server_file, 'r') + else: + fh = 
stat_server_file + + nb_sessions = fh["modelset"].shape[0] + total_session_nb += nb_sessions + batch_nb = int(numpy.floor(nb_sessions / float(batch_size) + 0.999)) + batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb) + + manager = multiprocessing.Manager() + q = manager.Queue() + pool = multiprocessing.Pool(num_thread + 2) + + # put Consumer to work first + watcher = pool.apply_async(e_gather, ((_A, _C, _R), q)) + # fire off workers + jobs = [] + + # Load data per batch to reduce the memory footprint + for batch_idx in batch_indices: + + # Create list of argument for a process + arg = fh["stat0"][batch_idx, :], fh["stat1"][batch_idx, :], ubm, self.F + job = pool.apply_async(e_worker, (arg, q)) + jobs.append(job) + + # collect results from the workers through the pool result queue + for job in jobs: + job.get() + + # now we are done, kill the consumer + q.put((None, None, None, None)) + pool.close() + + _A, _C, _R = watcher.get() + + if not isinstance(stat_server_file, h5py._hl.files.File): + fh.close() + + _R /= total_session_nb + + # M-step + _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE) + for c in range(distrib_nb): + distrib_idx = range(c * feature_size, (c + 1) * feature_size) + _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = _A[c, :] + self.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, _C[:, distrib_idx]).T + + # Minimum divergence + if min_div: + _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=STAT_TYPE) + _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = _R + ch = scipy.linalg.cholesky(_R_tmp) + self.F = self.F.dot(ch) + + # Save the FactorAnalyser if output_file_name is not None + if output_file_name is not None: + if it < nb_iter - 1: + self.write(output_file_name + "_it-{}.h5".format(it)) + else: + self.write(output_file_name + ".h5") + + def extract_ivectors_single(self, + ubm, + stat_server, + uncertainty=False): + """ + Estimate i-vectors for a given StatServer using single process 
on a single node. + + :param stat_server: sufficient statistics stored in a StatServer + :param ubm: Mixture object (the UBM) + :param uncertainty: boolean, if True, return an additional matrix with uncertainty matrices (diagonal of the matrices) + + :return: a StatServer with i-vectors in the stat1 attribute and a matrix of uncertainty matrices (optional) + """ + assert(isinstance(stat_server, StatServer) and stat_server.validate()), \ + "First argument must be a proper StatServer" + assert(isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture" + + gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full" + + # Set useful variables + tv_rank = self.F.shape[1] + feature_size = ubm.mu.shape[1] + nb_distrib = ubm.w.shape[0] + + # Whiten the statistics for diagonal or full models + if gmm_covariance == "diag": + stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. / ubm.get_invcov_super_vector()) + elif gmm_covariance == "full": + stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol) + + # Extract i-vectors + iv_stat_server = StatServer() + iv_stat_server.modelset = copy.deepcopy(stat_server.modelset) + iv_stat_server.segset = copy.deepcopy(stat_server.segset) + iv_stat_server.start = copy.deepcopy(stat_server.start) + iv_stat_server.stop = copy.deepcopy(stat_server.stop) + iv_stat_server.stat0 = numpy.ones((stat_server.modelset.shape[0], 1)) + iv_stat_server.stat1 = numpy.ones((stat_server.modelset.shape[0], tv_rank)) + + iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank)) + + # Replicate self.stat0 + index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size) + + for sess in range(stat_server.segset.shape[0]): + + inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + (self.F.T * + stat_server.stat0[sess, index_map]).dot(self.F)) + Aux = self.F.T.dot(stat_server.stat1[sess, :]) + iv_stat_server.stat1[sess, :] = Aux.dot(inv_lambda) + iv_sigma[sess, :] = numpy.diag(inv_lambda + 
numpy.outer(iv_stat_server.stat1[sess, :], + iv_stat_server.stat1[sess, :])) + + if uncertainty: + return iv_stat_server, iv_sigma + else: + return iv_stat_server + + def extract_ivectors(self, + ubm, + stat_server_filename, + prefix='', + batch_size=300, + uncertainty=False, + num_thread=1): + """ + Parallel extraction of i-vectors using multiprocessing module + + :param ubm: Mixture object (the UBM) + :param stat_server_filename: name of the file from which the input StatServer is read + :param prefix: prefix used to store the StatServer in its file + :param batch_size: number of sessions to process in a batch + :param uncertainty: a boolean, if True, return the diagonal of the uncertainty matrices + :param num_thread: number of process to run in parallel + :return: a StatServer with i-vectors in the stat1 attribute and a matrix of uncertainty matrices (optional) + """ + assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture" + + tv_rank = self.F.shape[1] + + fh = stat_server_filename + if isinstance(stat_server_filename, str): + fh = h5py.File(stat_server_filename, 'r') + + _, sv_size = fh[prefix + 'stat1'].shape + nb_sessions = fh[prefix + "modelset"].shape[0] + + iv_server = StatServer() + iv_server.modelset = fh.get(prefix + 'modelset')[()] + iv_server.segset = fh.get(prefix + 'segset')[()] + + tmpstart = fh.get(prefix+"start")[()] + tmpstop = fh.get(prefix+"stop")[()] + iv_server.start = numpy.empty(fh[prefix+"start"].shape, '|O') + iv_server.stop = numpy.empty(fh[prefix+"stop"].shape, '|O') + iv_server.start[tmpstart != -1] = tmpstart[tmpstart != -1] + iv_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1] + + iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE) + with warnings.catch_warnings(): + iv_server.stat1 = serialize(numpy.zeros((nb_sessions, tv_rank))) + iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank))) + + nb_sessions = iv_server.modelset.shape[0] + batch_nb = int(numpy.floor(nb_sessions 
/ float(batch_size) + 0.999)) + batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb) + + manager = multiprocessing.Manager() + q = manager.Queue() + pool = multiprocessing.Pool(num_thread + 2) + + # put listener to work first + watcher = pool.apply_async(iv_collect, ((iv_server.stat1, iv_sigma), q)) + # fire off workers + jobs = [] + + # Load data per batch to reduce the memory footprint + for batch_idx in batch_indices: + + # Create list of argument for a process + arg = batch_idx, fh["stat0"][batch_idx, :], fh["stat1"][batch_idx, :], ubm, self.F + job = pool.apply_async(iv_extract_on_batch, (arg, q)) + jobs.append(job) + + # collect results from the workers through the pool result queue + for job in jobs: + job.get() + + # now we are done, kill the listener + q.put((None, None, None)) + pool.close() + + iv_server.stat1, iv_sigma = watcher.get() + + if isinstance(stat_server_filename, str): + fh.close() + + if uncertainty: + return iv_server, iv_sigma + else: + return iv_server + + def plda(self, + stat_server, + rank_f, + nb_iter=10, + scaling_factor=1., + output_file_name=None, + save_partial=False, + save_final=True, + num_thread=1): + """ + Train a simplified Probabilistic Linear Discriminant Analysis model (no within class covariance matrix + but full residual covariance matrix) + + :param stat_server: StatServer object with training statistics + :param rank_f: rank of the between class covariance matrix + :param nb_iter: number of iterations to run + :param scaling_factor: scaling factor to downscale statistics (value bewteen 0 and 1) + :param output_file_name: name of the output file where to store PLDA model + :param save_partial: boolean, if True, save PLDA model after each iteration + """ + vect_size = stat_server.stat1.shape[1] + + # Initialize mean and residual covariance from the training data + self.mean = stat_server.get_mean_stat1() + self.Sigma = stat_server.get_total_covariance_stat1() + + # Sum stat per model + 
model_shifted_stat, session_per_model = stat_server.sum_stat_per_model() + class_nb = model_shifted_stat.modelset.shape[0] + + # Multiply statistics by scaling_factor + model_shifted_stat.stat0 *= scaling_factor + model_shifted_stat.stat1 *= scaling_factor + session_per_model *= scaling_factor + + # Compute Eigen Decomposition of Sigma in order to initialize the EigenVoice matrix + sigma_obs = stat_server.get_total_covariance_stat1() + evals, evecs = scipy.linalg.eigh(sigma_obs) + idx = numpy.argsort(evals)[::-1] + evecs = evecs.real[:, idx[:rank_f]] + self.F = evecs[:, :rank_f] + + # Estimate PLDA model by iterating the EM algorithm + for it in range(nb_iter): + logging.info('Estimate between class covariance, it %d / %d', it + 1, nb_iter) + + # E-step + print("E_step") + + # Copy stats as they will be whitened with a different Sigma for each iteration + local_stat = copy.deepcopy(model_shifted_stat) + + # Whiten statistics (with the new mean and Sigma) + local_stat.whiten_stat1(self.mean, self.Sigma) + + # Whiten the EigenVoice matrix + eigen_values, eigen_vectors = scipy.linalg.eigh(self.Sigma) + ind = eigen_values.real.argsort()[::-1] + eigen_values = eigen_values.real[ind] + eigen_vectors = eigen_vectors.real[:, ind] + sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real) + sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma)) + self.F = sqr_inv_sigma.T.dot(self.F) + + # Replicate self.stat0 + index_map = numpy.zeros(vect_size, dtype=int) + _stat0 = local_stat.stat0[:, index_map] + + e_h = numpy.zeros((class_nb, rank_f)) + e_hh = numpy.zeros((class_nb, rank_f, rank_f)) + + # loop on model id's + fa_model_loop(batch_start=0, + mini_batch_indices=numpy.arange(class_nb), + factor_analyser=self, + stat0=_stat0, + stat1=local_stat.stat1, + e_h=e_h, + e_hh=e_hh, + num_thread=num_thread) + + # Accumulate for minimum divergence step + _R = numpy.sum(e_hh, axis=0) / session_per_model.shape[0] + + _C = 
e_h.T.dot(local_stat.stat1).dot(scipy.linalg.inv(sqr_inv_sigma)) + _A = numpy.einsum('ijk,i->jk', e_hh, local_stat.stat0.squeeze()) + + # M-step + self.F = scipy.linalg.solve(_A, _C).T + + # Update the residual covariance + self.Sigma = sigma_obs - self.F.dot(_C) / session_per_model.sum() + + # Minimum Divergence step + self.F = self.F.dot(scipy.linalg.cholesky(_R)) + + if output_file_name is None: + output_file_name = "plda" + + if save_partial and it < nb_iter - 1: + self.write(output_file_name + "_it-{}.h5".format(it)) + elif it == nb_iter - 1 and save_final: + self.write(output_file_name + ".h5") + + + def weighted_likelihood_plda(self, + out_stat_server, + in_stat_server, + alpha, + rank_f, + nb_iter=10, + scaling_factor=1., + output_file_name=None, + save_partial=False): + """ + Train a simplified Probabilistic Linear Discriminant Analysis model (no within class covariance matrix + but full residual covariance matrix) + + :param out_stat_server: out-domain StatServer object with training statistics + :param rank_f: rank of the between class covariance matrix + :param nb_iter: number of iterations to run + :param scaling_factor: scaling factor to downscale statistics (value bewteen 0 and 1) + :param output_file_name: name of the output file where to store PLDA model + :param save_partial: boolean, if True, save PLDA model after each iteration + """ + vect_size = out_stat_server.stat1.shape[1] + + + # Initialize mean and residual covariance from the training data + # First version + self.mean = alpha * in_stat_server.get_mean_stat1() + (1-alpha) * out_stat_server.get_mean_stat1() + self.Sigma = alpha * in_stat_server.get_total_covariance_stat1_mix(self.mean) \ + + (1-alpha) * out_stat_server.get_total_covariance_stat1_mix(self.mean) + #self.mean = in_stat_server.get_mean_stat1() + #self.Sigma = in_stat_server.get_total_covariance_stat1() + + # # Second version + # in_stat_server.center_stat1(in_stat_server.get_mean_stat1()) + # 
out_stat_server.center_stat1(out_stat_server.get_mean_stat1()) + # self.mean = 0.5 * in_stat_server.get_mean_stat1() + 0.5 * out_stat_server.get_mean_stat1() + # self.Sigma = alpha * in_stat_server.get_total_covariance_stat1() \ + # + (1-alpha) * out_stat_server.get_total_covariance_stat1() + + + # Sum stat per model + out_model_shifted_stat, out_session_per_model = out_stat_server.sum_stat_per_model() + in_model_shifted_stat, in_session_per_model = in_stat_server.sum_stat_per_model() + out_class_nb = out_model_shifted_stat.modelset.shape[0] + in_class_nb = in_model_shifted_stat.modelset.shape[0] + + # Multiply statistics by scaling_factor + out_model_shifted_stat.stat0 *= scaling_factor + out_model_shifted_stat.stat1 *= scaling_factor + out_session_per_model *= scaling_factor + + in_model_shifted_stat.stat0 *= scaling_factor + in_model_shifted_stat.stat1 *= scaling_factor + in_session_per_model *= scaling_factor + + # Compute Eigen Decomposition of Sigma in order to initialize the EigenVoice matrix + sigma_obs = copy.deepcopy(self.Sigma) + evals, evecs = scipy.linalg.eigh(sigma_obs) + idx = numpy.argsort(evals)[::-1] + evecs = evecs.real[:, idx[:rank_f]] + self.F = evecs[:, :rank_f] + + # Estimate PLDA model by iterating the EM algorithm + for it in range(nb_iter): + logging.info('Estimate between class covariance, it %d / %d', it + 1, nb_iter) + + # E-step + print("E_step") + + # Copy stats as they will be whitened with a different Sigma for each iteration + out_local_stat = copy.deepcopy(out_model_shifted_stat) + in_local_stat = copy.deepcopy(in_model_shifted_stat) + + # Whiten statistics (with the new mean and Sigma) + out_local_stat.whiten_stat1(self.mean, self.Sigma) + in_local_stat.whiten_stat1(self.mean, self.Sigma) + + + # Whiten the EigenVoice matrix + eigen_values, eigen_vectors = scipy.linalg.eigh(self.Sigma) + ind = eigen_values.real.argsort()[::-1] + eigen_values = eigen_values.real[ind] + eigen_vectors = eigen_vectors.real[:, ind] + 
sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real) + sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma)) + self.F = sqr_inv_sigma.T.dot(self.F) + + # Replicate self.stat0 + index_map = numpy.zeros(vect_size, dtype=int) + _stat0 = out_local_stat.stat0[:, index_map] + + e_h = numpy.zeros((out_class_nb, rank_f)) + e_hh = numpy.zeros((out_class_nb, rank_f, rank_f)) + + # loop on model id's + + fa_model_loop(batch_start=0, + mini_batch_indices=numpy.arange(out_class_nb), + factor_analyser=self, + stat0=_stat0, + stat1=out_local_stat.stat1, + e_h=e_h, + e_hh=e_hh, + num_thread=1) + # Accumulate for minimum divergence step + out_R = numpy.sum(e_hh, axis=0) / out_session_per_model.shape[0] + + out_C = e_h.T.dot(out_local_stat.stat1).dot(scipy.linalg.inv(sqr_inv_sigma)) + out_A = numpy.einsum('ijk,i->jk', e_hh, out_local_stat.stat0.squeeze()) + + # Replicate self.stat0 + index_map = numpy.zeros(vect_size, dtype=int) + _stat0 = in_local_stat.stat0[:, index_map] + + e_h = numpy.zeros((in_class_nb, rank_f)) + e_hh = numpy.zeros((in_class_nb, rank_f, rank_f)) + + # loop on model id's + fa_model_loop(batch_start=0, + mini_batch_indices=numpy.arange(in_class_nb), + factor_analyser=self, + stat0=_stat0, + stat1=in_local_stat.stat1, + e_h=e_h, + e_hh=e_hh, + num_thread=1) + # Accumulate for minimum divergence step + in_R = numpy.sum(e_hh, axis=0) / in_session_per_model.shape[0] + + in_C = e_h.T.dot(in_local_stat.stat1).dot(scipy.linalg.inv(sqr_inv_sigma)) + in_A = numpy.einsum('ijk,i->jk', e_hh, in_local_stat.stat0.squeeze()) + + in_alpha = alpha / in_session_per_model.sum() + out_alpha = (1 - alpha) / out_session_per_model.sum() + + #import ipdb + #ipdb.set_trace() + + _A = in_alpha * in_A + out_alpha * out_A + _C = in_alpha * in_C + out_alpha * out_C + _R = alpha * in_R + (1 - alpha) * out_R + + # M-step + self.F = scipy.linalg.solve(_A, _C).T + + # Update the residual covariance + self.Sigma = sigma_obs - self.F.dot(_C) + #self.Sigma = sigma_obs - 
self.F.dot(_C) / (in_alpha * in_session_per_model.sum() + out_alpha * out_session_per_model.sum()) + + """ + dans la ligne du dessus on ne divise pas par le nombre total de sessions ??? + """ + + # Minimum Divergence step + self.F = self.F.dot(scipy.linalg.cholesky(_R)) + + if output_file_name is None: + output_file_name = "temporary_plda" + + if save_partial and it < nb_iter - 1: + self.write(output_file_name + "_it-{}.h5".format(it)) + elif it == nb_iter - 1: + self.write(output_file_name + ".h5") + + + + + diff --git a/sidekit/sidekit/features_extractor.py b/sidekit/sidekit/features_extractor.py new file mode 100755 index 0000000..0722952 --- /dev/null +++ b/sidekit/sidekit/features_extractor.py @@ -0,0 +1,750 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . +""" +Copyright 2014-2021 Sylvain Meignier and Anthony Larcher + + :mod:`features_server` provides methods to manage features + +""" +import copy +import h5py +import logging +import numpy +import os + + +from . 
import PARAM_TYPE +from .frontend.features import mfcc, plp +from .frontend.io import read_audio, read_label, write_hdf5, _add_reverb, _add_noise +from .frontend.vad import vad_snr, vad_percentil +from .mixture import vad_energy +from .sidekit_wrappers import process_parallel_lists +from .bosaris import IdMap + + +__license__ = "LGPL" +__author__ = "Anthony Larcher & Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +class FeaturesExtractor(object): + """ + A FeaturesExtractor process an audio file in SPHERE, WAVE or RAW PCM format and extract filter-banks, + cepstral coefficients, bottle-neck features (in the future), log-energy and perform a speech activity detection. + """ + + def __init__(self, + audio_filename_structure=None, + feature_filename_structure=None, + sampling_frequency=None, + lower_frequency=None, + higher_frequency=None, + filter_bank=None, + filter_bank_size=None, + window_size=None, + shift=None, + ceps_number=None, + vad=None, + snr=None, + pre_emphasis=None, + save_param=None, + keep_all_features=None, + feature_type=None, + rasta_plp=None, + compressed='percentile'): + """ + :param audio_filename_structure: a string that gives the structure of the input file to process + :param feature_filename_structure: a string that gives the structure of the output file to write + :param sampling_frequency: optional, only required if processing RAW PCM. 
For other formats, this information + is read from the file + :param lower_frequency: lower frequency (in Hertz) of the filter bank + :param higher_frequency: higher frequency of the filter bank + :param filter_bank: type of filter scale to use, can be "lin" or "log" (for linear or log-scale) + :param filter_bank_size: number of filter banks + :param window_size: size of the sliding window to process (in seconds) + :param shift: time shift of the sliding window (in seconds) + :param ceps_number: number of cepstral coefficients to extract + :param vad: type of voice activity detection algorithm to use. Can be "energy", "snr", "percentil" or "lbl" + to read from a file + :param snr: signal to noise ratio used for "snr" vad algorithm + :param pre_emphasis: value given for the pre-emphasis filter (default is 0.97) + :param save_param: list of strings that indicate which parameters to save. The strings can be: + "cep" for cepstral coefficients, "fb" for filter-banks, "energy" for the log-energy, "bnf" + for bottle-neck features and "vad" for the frame selection labels. In the resulting files, parameters are + always concatenated in the following order: (energy, fb, cep, bnf, vad_label). Default keeps all. 
+ :param keep_all_features: boolean, if True, all frames are writen; if False, keep only frames according to + the vad label + """ + + # Set the default values + self.audio_filename_structure = None + self.feature_filename_structure = '{}' + self.sampling_frequency = 8000 + self.lower_frequency = None + self.higher_frequency = None + self.filter_bank = None + self.filter_bank_size = None + self.window_size = None + self.shift = None + self.ceps_number = None + self.vad = None + self.snr = None + self.pre_emphasis = 0.97 + self.save_param = ["energy", "cep", "fb", "bnf", "vad"] + self.keep_all_features = None + self.feature_type = 'mfcc' + self.rasta_plp = True + + if audio_filename_structure is not None: + self.audio_filename_structure = audio_filename_structure + if feature_filename_structure is not None: + self.feature_filename_structure = feature_filename_structure + if sampling_frequency is not None: + self.sampling_frequency = sampling_frequency + if lower_frequency is not None: + self.lower_frequency = lower_frequency + if higher_frequency is not None: + self.higher_frequency = higher_frequency + if filter_bank is not None: + self.filter_bank = filter_bank + if filter_bank_size is not None: + self.filter_bank_size = filter_bank_size + if window_size is not None: + self.window_size = window_size + if shift is not None: + self.shift = shift + if ceps_number is not None: + self.ceps_number = ceps_number + if vad is not None: + self.vad = vad + if snr is not None: + self.snr = snr + if pre_emphasis is not None: + self.pre_emphasis = pre_emphasis + if save_param is not None: + self.save_param = save_param + if keep_all_features is not None: + self.keep_all_features = keep_all_features + if feature_type is not None: + self.feature_type = feature_type + if rasta_plp is not None: + self.rasta_plp = rasta_plp + if compressed is not None: + self.compressed = compressed + + self.window_sample = None + if not (self.window_size is None or self.sampling_frequency is None): 
+ self.window_sample = int(self.window_size * self.sampling_frequency) + + self.shift_sample = None + if not (self.shift is None or self.sampling_frequency is None): + self.shift_sample = int(self.shift * self.sampling_frequency) + + self.show = 'empty' + + def __repr__(self): + ch = '\t show: {} keep_all_features: {}\n'.format( + self.show, self.keep_all_features) + ch += '\t audio_filename_structure: {} \n'.format(self.audio_filename_structure) + ch += '\t feature_filename_structure: {} \n'.format(self.feature_filename_structure) + ch += '\t pre-emphasis: {} \n'.format(self.pre_emphasis) + ch += '\t lower_frequency: {} higher_frequency: {} \n'.format( + self.lower_frequency, self.higher_frequency) + ch += '\t sampling_frequency: {} \n'.format(self.sampling_frequency) + ch += '\t filter bank: {} filters of type {}\n'.format( + self.filter_bank_size, self.filter_bank) + ch += '\t ceps_number: {} \n\t window_size: {} shift: {} \n'.format( + self.ceps_number, self.window_size, self.shift) + ch += '\t vad: {} snr: {} \n'.format(self.vad, self.snr) + return ch + + def extract(self, show, channel, + input_audio_filename=None, + output_feature_filename=None, + backing_store=False, + noise_file_name=None, + snr=10, + reverb_file_name=None, + reverb_level=-26.): + """ + Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features + for a single channel from a given audio file. 
+ + :param show: ID if the show + :param channel: channel number (0 if mono file) + :param input_audio_filename: name of the input audio file to consider if the name of the audio file is independent from the ID of the show + :param output_feature_filename: name of the output feature file to consider if the name of the feature file is independent from the ID of the show + :param backing_store: boolean, if False, nothing is writen to disk, if True, the file is writen to disk when closed + :param rasta: boolean, only for PLP parameters, if True, perform RASTA filtering + + :return: an hdf5 file handler + """ + # Create the filename to load + + # If the input audio file name does not include the ID of the show + # (i.e., if the audio_filename_structure does not include {}) + # the audio_filename_structure is updated to use the input_audio_filename + if input_audio_filename is not None: + self.audio_filename_structure = input_audio_filename + audio_filename = self.audio_filename_structure.format(show) + + # If the output file name does not include the ID of the show, + # (i.e., if the feature_filename_structure does not include {}) + # the feature_filename_structure is updated to use the output_feature_filename + if output_feature_filename is not None: + self.feature_filename_structure = output_feature_filename + feature_filename = self.feature_filename_structure.format(show) + + # Open audio file, get the signal and possibly the sampling frequency + signal, sample_rate = read_audio(audio_filename, self.sampling_frequency) + if signal.ndim == 1: + signal = signal[:, numpy.newaxis] + # AJOUTER LE BRUITAGE ET REVERB DU SIGNAL SI NECESSAIRE + if noise_file_name is not None: + signal[:, channel] = _add_noise(signal[:, channel], noise_file_name, snr, sample_rate) + + if reverb_file_name is not None: + signal[:, channel] = _add_reverb(signal[:, channel], reverb_file_name, sample_rate, reverb_level) + + # Process the target channel to return Filter-Banks, Cepstral coefficients 
and BNF if required + length, chan = signal.shape + + # If the size of the signal is not enough for one frame, return zero features + if length < self.window_sample: + cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE) + energy = numpy.empty((0, 1), dtype=PARAM_TYPE) + fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE) + label = numpy.empty((0, 1), dtype='int8') + + else: + # Random noise is added to the input signal to avoid zero frames. + numpy.random.seed(0) + signal[:, channel] += 0.0001 * numpy.random.randn(signal.shape[0]) + + dec = self.shift_sample * 250 * 25000 + self.window_sample + dec2 = self.window_sample - self.shift_sample + start = 0 + end = min(dec, length) + + # Process the signal by batch to avoid problems for very long signals + while start < (length - dec2): + logging.info('process part : %f %f %f', + start / self.sampling_frequency, + end / self.sampling_frequency, + length / self.sampling_frequency) + + if self.feature_type == 'mfcc': + # Extract cepstral coefficients, energy and filter banks + cep, energy, _, fb = mfcc(signal[start:end, channel], + fs=self.sampling_frequency, + lowfreq=self.lower_frequency, + maxfreq=self.higher_frequency, + nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0, + nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0, + nwin=self.window_size, + shift=self.shift, + nceps=self.ceps_number, + get_spec=False, + get_mspec=True, + prefac=self.pre_emphasis) + elif self.feature_type == 'plp': + cep, energy, _, fb = plp(signal[start:end, channel], + nwin=self.window_size, + fs=self.sampling_frequency, + plp_order=self.ceps_number, + shift=self.shift, + get_spec=False, + get_mspec=True, + prefac=self.pre_emphasis, + rasta=self.rasta_plp) + + # Perform feature selection + label, threshold = self._vad(cep, energy, fb, signal[start:end, channel]) + + if len(label) < len(energy): + label = numpy.hstack((label, numpy.zeros(len(energy)-len(label), dtype='bool'))) + + start = end 
- dec2 + end = min(end + dec, length) + if cep.shape[0] > 0: + logging.info('!! size of signal cep: %f len %d type size %d', cep[-1].nbytes/1024/1024, + len(cep[-1]), + cep[-1].nbytes/len(cep[-1])) + + # Compute the mean and std of fb and cepstral coefficient computed for all selected frames + energy_mean = energy[label].mean(axis=0) + energy_std = energy[label].std(axis=0) + fb_mean = fb[label, :].mean(axis=0) + fb_std = fb[label, :].std(axis=0) + cep_mean = cep[label, :].mean(axis=0) + cep_std = cep[label, :].std(axis=0) + # bnf_mean = bnf[label, :].mean(axis=0) + # bnf_std = bnf[label, :].std(axis=0) + + # Create the HDF5 file + # Create the directory if it dosn't exist + dir_name = os.path.dirname(feature_filename) # get the path + if not os.path.exists(dir_name) and not (dir_name == ''): + os.makedirs(dir_name) + h5f = h5py.File(feature_filename, 'w', backing_store=backing_store, driver='core') + if "cep" not in self.save_param: + cep = None + cep_mean = None + cep_std = None + if "energy" not in self.save_param: + energy = None + energy_mean = None + energy_std = None + if "fb" not in self.save_param: + fb = None + fb_mean = None + fb_std = None + if "bnf" not in self.save_param: + bnf = None + bnf_mean = None + bnf_std = None + if "vad" not in self.save_param: + label = None + logging.info(label) + write_hdf5(show, h5f, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label, + self.compressed) + + return h5f + + def extract_from_signal(self, signal, + sample_rate, + noise_file_name=None, + snr=10, + reverb_file_name=None, + reverb_level=-26.): + """ + Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features + for a single channel from a given audio file. 
+ + :param show: ID if the show + :param channel: channel number (0 if mono file) + :param input_audio_filename: name of the input audio file to consider if the name of the audio file is independent from the ID of the show + :param output_feature_filename: name of the output feature file to consider if the name of the feature file is independent from the ID of the show + :param backing_store: boolean, if False, nothing is writen to disk, if True, the file is writen to disk when closed + :param rasta: boolean, only for PLP parameters, if True, perform RASTA filtering + + :return: an hdf5 file handler + """ + if signal.ndim == 1: + signal = signal[:, numpy.newaxis] + + # AJOUTER LE BRUITAGE ET REVERB DU SIGNAL SI NECESSAIRE + if noise_file_name is not None: + signal[:, 0] = _add_noise(signal[:, 0], noise_file_name, snr, sample_rate) + + if reverb_file_name is not None: + signal[:, 0] = _add_reverb(signal[:, 0], reverb_file_name, sample_rate, reverb_level) + + # Process the target channel to return Filter-Banks, Cepstral coefficients and BNF if required + length, chan = signal.shape + + # If the size of the signal is not enough for one frame, return zero features + if length < self.window_sample: + cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE) + energy = numpy.empty((0, 1), dtype=PARAM_TYPE) + fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE) + label = numpy.empty((0, 1), dtype='int8') + + else: + # Random noise is added to the input signal to avoid zero frames. 
+ numpy.random.seed(0) + signal[:, 0] += 0.0001 * numpy.random.randn(signal.shape[0]) + + dec = self.shift_sample * 250 * 25000 + self.window_sample + dec2 = self.window_sample - self.shift_sample + start = 0 + end = min(dec, length) + + # Process the signal by batch to avoid problems for very long signals + while start < (length - dec2): + logging.info('process part : %f %f %f', + start / self.sampling_frequency, + end / self.sampling_frequency, + length / self.sampling_frequency) + + if self.feature_type == 'mfcc': + # Extract cepstral coefficients, energy and filter banks + cep, energy, _, fb = mfcc(signal[start:end, 0], + fs=self.sampling_frequency, + lowfreq=self.lower_frequency, + maxfreq=self.higher_frequency, + nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0, + nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0, + nwin=self.window_size, + shift=self.shift, + nceps=self.ceps_number, + get_spec=False, + get_mspec=True, + prefac=self.pre_emphasis) + elif self.feature_type == 'plp': + cep, energy, _, fb = plp(signal[start:end, 0], + nwin=self.window_size, + fs=self.sampling_frequency, + plp_order=self.ceps_number, + shift=self.shift, + get_spec=False, + get_mspec=True, + prefac=self.pre_emphasis, + rasta=self.rasta_plp) + + # Perform feature selection + label, threshold = self._vad(cep, energy, fb, signal[start:end, 0]) + + if len(label) < len(energy): + label = numpy.hstack((label, numpy.zeros(len(energy) - len(label), dtype='bool'))) + + start = end - dec2 + end = min(end + dec, length) + if cep.shape[0] > 0: + logging.info('!! 
size of signal cep: %f len %d type size %d', cep[-1].nbytes / 1024 / 1024, + len(cep[-1]), + cep[-1].nbytes / len(cep[-1])) + + return label, energy, cep, fb + + def save(self, + show, + channel=0, + input_audio_filename=None, + output_feature_filename=None, + noise_file_name=None, + snr=10, + reverb_file_name=None, + reverb_level=-26.): + """ + Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features + for a single channel from a given audio file and save them to disk in a HDF5 format + + :param show: + :param channel: + :param input_audio_filename: + :param output_feature_filename: + :return: + """ + # Load the cepstral coefficients, energy, filter-banks, bnf and vad labels + h5f = self.extract(show, + channel, + input_audio_filename, + output_feature_filename, + backing_store=True, + noise_file_name=noise_file_name, + snr=snr, + reverb_file_name=reverb_file_name, + reverb_level=reverb_level) + logging.info(h5f.filename) + + # Write the hdf5 file to disk + h5f.close() + + @staticmethod + def _save(show, + feature_filename_structure, + save_param, + cep, + energy, + fb, + bnf, + label, + compressed='percentile'): + """ + + :param show: + :param feature_filename_structure: + :param save_param: + :param cep: + :param energy: + :param fb: + :param bnf: + :param label: + :return: + """ + feature_filename = feature_filename_structure.format(show) + logging.info('output finename: '+feature_filename) + dir_name = os.path.dirname(feature_filename) # get the path + if not os.path.exists(dir_name) and not (dir_name == ''): + os.makedirs(dir_name) + + h5f = h5py.File(feature_filename, 'a', backing_store=True, driver='core') + + if "cep" not in save_param: + cep = None + cep_mean = None + cep_std = None + else: + cep_mean = cep[label, :].mean(axis=0) + cep_std = cep[label, :].std(axis=0) + if "energy" not in save_param: + energy = None + energy_mean = None + energy_std = None + else: + energy_mean = energy[label].mean(axis=0) + 
energy_std = energy[label].std(axis=0) + if "fb" not in save_param: + fb = None + fb_mean = None + fb_std = None + else: + fb_mean = fb[label, :].mean(axis=0) + fb_std = fb[label, :].std(axis=0) + if "bnf" not in save_param: + bnf = None + bnf_mean = None + bnf_std = None + if "vad" not in save_param: + label = None + logging.info(label) + + write_hdf5(show, h5f, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label, compressed) + h5f.close() + + def save_multispeakers(self, + idmap, + channel=0, + input_audio_filename=None, + output_feature_filename=None, + keep_all=True, + skip_existing_file=False, + compressed='percentile'): + """ + :param idmap: + :param channel: + :param input_audio_filename: + :param output_feature_filename: + :param keep_all: + :param skip_existing_file: + :return: + """ + param_vad = self.vad + save_param = copy.deepcopy(self.save_param) + self.save_param = ["energy", "cep", "fb", "vad"] + + self.vad = None + if output_feature_filename is None: + output_feature_filename = self.feature_filename_structure + + tmp_dict = dict() + nb = 0 + for show, _id, start, stop in zip(idmap.rightids, idmap.leftids, idmap.start, idmap.stop): + + if skip_existing_file: + if keep_all: + file_name = output_feature_filename.format(show) + else: + file_name = output_feature_filename.format(show+'/' + _id) + if os.path.isfile(file_name): + logging.info('existing file: SKIP '+file_name) + continue + + if show not in tmp_dict: + tmp_dict[show] = dict() + if _id not in tmp_dict[show]: + tmp_dict[show][_id] = numpy.arange(start, stop-1) + nb += 1 + else: + tmp_dict[show][_id] = numpy.concatenate((tmp_dict[show][_id], numpy.arange(start, stop-1)), axis=0) + + + output_show = list() + output_id = list() + output_start = list() + output_stop = list() + global_compression = 'none' + if self.compressed == 'percentile': + global_compression = 'percentile' + self.compressed = 'none' + + for show in tmp_dict: + 
# temp_file_name = tempfile.NamedTemporaryFile().name + # logging.info('tmp file name: '+temp_file_name) + self.vad = None + h5f = self.extract(show, channel, input_audio_filename, backing_store=False) + energy = h5f.get(show + '/energy')[()] + label = h5f.get(show + '/vad')[()] + fb = h5f.get(show + '/fb')[()] + cep = h5f.get(show + '/cep')[()] + h5f.close() + self.vad = param_vad + l = energy.shape[0] + for _id in tmp_dict[show]: + idx = tmp_dict[show][_id] + idx = idx[idx < l] + _, threshold_id = self._vad(None, energy[idx], None, None) + logging.info('show: ' + show + ' cluster: ' + _id + ' thr:' + str(threshold_id)) + label_id = energy > threshold_id + label[idx] = label_id[idx].flatten() + + if not keep_all: + output_show.append(show + '/' + _id) + output_id.append(_id) + output_start.append(0) + output_stop.append(idx.shape[0]) + logging.info('keep_all id: ' + show + ' show: ' + show + '/' + _id + ' start: 0 stop: ' + + str(idx.shape[0])) + self._save(show+'/' + _id, + output_feature_filename, + save_param, cep[idx], + energy[idx], + fb[idx], + None, + label[idx], + global_compression) + + if keep_all: + self._save(show, output_feature_filename, save_param, cep, energy, fb, None, label, global_compression) + + self.vad = param_vad + self.save_param = save_param + + self.compressed = global_compression + + if keep_all: + return copy.deepcopy(idmap) + out_idmap = IdMap() + out_idmap.set(numpy.array(output_id), + numpy.array(output_show), + start=numpy.array(output_start, dtype='int32'), + stop=numpy.array(output_stop, dtype='int32')) + return out_idmap + + def _vad(self, cep, log_energy, fb, x, label_file_name=None): + """ + Apply Voice Activity Detection. 
+ + :param cep: cepstral coefficient (for future VAD) + :param log_energy: logarithm of the energy + :param fb: filter bank coefficients (for future VAD) + :param x: signal + :return: + """ + threshold = -numpy.inf + label = None + if self.vad is None: + logging.info('no vad') + label = numpy.array([True] * log_energy.shape[0]) + elif self.vad == 'snr': + logging.info('vad : snr') + window_sample = int(self.window_size * self.sampling_frequency) + label = vad_snr(x, self.snr, fs=self.sampling_frequency, + shift=self.shift, nwin=window_sample) + elif self.vad == 'energy': + logging.info('vad : energy') + label, threshold = vad_energy(log_energy, distrib_nb=3, + nb_train_it=8, flooring=0.0001, + ceiling=1.5, alpha=0.2) + elif self.vad == 'percentil': + label, threshold = vad_percentil(log_energy, 10) + logging.info('percentil '+str(threshold)) + elif self.vad == 'dnn': + pass # TO DO + elif self.vad == 'lbl': # load existing labels as reference + logging.info('vad : lbl') + label = read_label(label_file_name) + else: + logging.warning('Wrong VAD type') + return label, threshold + + @process_parallel_lists + def save_list(self, + show_list, + channel_list, + audio_file_list=None, + feature_file_list=None, + noise_file_list=None, + snr_list=None, + reverb_file_list=None, + reverb_levels=None, + num_thread=1): + """ + Compute the acoustic parameters (filter banks, cepstral coefficients, log-energy and bottleneck features + for a list of audio files and save them to disk in a HDF5 format + The process is parallelized if num_thread is higher than 1 + + + :param show_list: list of IDs of the show to process + :param channel_list: list of channel indices corresponding to each show + :param audio_file_list: list of input audio files if the name is independent from the ID of the show + :param feature_file_list: list of output audio files if the name is independent from the ID of the show + :param num_thread: number of parallel process to run + :return: + """ + + """ + TO DO: 
manage multi-processing when writting in a single file + (use a queue but can we still use @process_parallel_lists ???) + + """ + + logging.info(self) + + # get the length of the longest list + max_length = max([len(l) for l in [show_list, channel_list, audio_file_list, feature_file_list] + if l is not None]) + + if show_list is None: + show_list = numpy.empty(int(max_length), dtype='|O') + if audio_file_list is None: + audio_file_list = numpy.empty(int(max_length), dtype='|O') + if feature_file_list is None: + feature_file_list = numpy.empty(int(max_length), dtype='|O') + if noise_file_list is None: + noise_file_list = numpy.empty(int(max_length), dtype='|O') + snr_list = numpy.empty(int(max_length), dtype='|O') + elif snr_list is None: + snr_list = numpy.full(int(max_length), 5.) + if reverb_file_list is None: + reverb_file_list = numpy.empty(int(max_length), dtype='|O') + reverb_levels = numpy.empty(int(max_length), dtype='|O') + elif reverb_levels is None: + reverb_levels = numpy.full(int(max_length), -26.) + + + for show, channel, audio_file, feature_file, noise_file, snr, reverb_file, reverb_level in zip(show_list, + channel_list, + audio_file_list, + feature_file_list, + noise_file_list, + snr_list, + reverb_file_list, + reverb_levels): + + self.save(show, channel, audio_file, feature_file, noise_file, snr, reverb_file, reverb_level) diff --git a/sidekit/sidekit/features_server.py b/sidekit/sidekit/features_server.py new file mode 100755 index 0000000..977742f --- /dev/null +++ b/sidekit/sidekit/features_server.py @@ -0,0 +1,726 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. 
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Sylvain Meignier and Anthony Larcher + + :mod:`features_server` provides methods to manage features + +""" +import copy +import multiprocessing +import numpy +import logging +import h5py + +from sidekit.frontend.features import pca_dct, shifted_delta_cepstral, compute_delta, framing, dct_basis +from sidekit.frontend.io import read_hdf5_segment +from sidekit.frontend.vad import label_fusion +from sidekit.frontend.normfeat import cms, cmvn, stg, cep_sliding_norm, rasta_filt +from sidekit.sv_utils import parse_mask +from sidekit.features_extractor import FeaturesExtractor + + +__license__ = "LGPL" +__author__ = "Anthony Larcher & Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' +#comment + +class FeaturesServer(object): + """ + Management of features. 
FeaturesServer instances load datasets from a HDF5 files + (that can be read from disk or produced by a FeaturesExtractor object) + Datasets read from one or many files are concatenated and processed + """ + + def __init__(self, + features_extractor=None, + feature_filename_structure=None, + sources=None, + dataset_list=None, + mask=None, + feat_norm=None, + global_cmvn=None, + dct_pca=False, + dct_pca_config=None, + sdc=False, + sdc_config=None, + delta=None, + double_delta=None, + delta_filter=None, + context=None, + traps_dct_nb=None, + rasta=None, + keep_all_features=True): + """ + Initialize a FeaturesServer for two cases: + 1. each call to load will load datasets from a single file. This mode requires to provide a dataset_list + (lists of datasets to load from each file. + 2. each call to load will load datasets from several files (possibly several datasets from each file) + and concatenate them. In this mode, you should provide a FeaturesServer for each source, thus, datasets + read from each source can be post-processed independently before being concatenated with others. The dataset + resulting from the concatenation from all sources is then post-processed. + + :param features_extractor: a FeaturesExtractor if required to extract features from audio file + if None, data are loaded from an existing HDF5 file + :param feature_filename_structure: structure of the filename to use to load HDF5 files + :param sources: tuple of sources to load features different files (optional: for the case where datasets + are loaded from several files and concatenated. + :param dataset_list: string of the form '["cep", "fb", vad", energy", "bnf"]' (only when loading datasets + from a single file) list of datasets to load. + :param mask: string of the form '[1-3,10,15-20]' mask to apply on the concatenated dataset + to select specific components. 
In this example, coefficients 1,2,3,10,15,16,17,18,19,20 are kept + In this example, + :param feat_norm: tpye of normalization to apply as post-processing + :param global_cmvn: boolean, if True, use a global mean and std when normalizing the frames + :param dct_pca: if True, add temporal context by using a PCA-DCT approach + :param dct_pca_config: configuration of the PCA-DCT, default is (12, 12, none) + :param sdc: if True, compute shifted delta cepstra coefficients + :param sdc_config: configuration to compute sdc coefficients, default is (1,3,7) + :param delta: if True, append the first order derivative + :param double_delta: if True, append the second order derivative + :param delta_filter: coefficients of the filter used to compute delta coefficients + :param context: add a left and right context, default is (0,0) + :param traps_dct_nb: number of DCT coefficients to keep when computing TRAP coefficients + :param rasta: if True, perform RASTA filtering + :param keep_all_features: boolean, if True, keep all features, if False, keep frames according to the vad labels + :return: + """ + self.features_extractor = None + self.feature_filename_structure = '{}' + self.sources = () + self.dataset_list = None + + # Post processing options + self.mask = None + self.feat_norm = None + self.global_cmvn = None + self.dct_pca = False + self.dct_pca_config = (12, 12, None) + self.sdc = False + self.sdc_config = (1, 3, 7) + self.delta = False + self.double_delta = False + self.delta_filter = numpy.array([.25, .5, .25, 0, -.25, -.5, -.25]) + self.context = (0, 0) + self.traps_dct_nb = 0 + self.rasta = False + self.keep_all_features = True + + if features_extractor is not None: + self.features_extractor = features_extractor + if feature_filename_structure is not None: + self.feature_filename_structure = feature_filename_structure + if sources is not None: + self.sources = sources + if dataset_list is not None: + self.dataset_list = dataset_list + if mask is not None: + self.mask 
= parse_mask(mask) + if feat_norm is not None: + self.feat_norm = feat_norm + if global_cmvn is not None: + self.global_cmvn = global_cmvn + if dct_pca is not None: + self.dct_pca = dct_pca + if dct_pca_config is not None: + self.dct_pca_config = dct_pca_config + if sdc is not None: + self.sdc = sdc + if sdc_config is not None: + self.sdc_config = sdc_config + if delta is not None: + self.delta = delta + if double_delta is not None: + self.double_delta = double_delta + if delta_filter is not None: + self.delta_filter = delta_filter + if context is not None: + self.context = context + if traps_dct_nb is not None: + self.traps_dct_nb = traps_dct_nb + if rasta is not None: + self.rasta = rasta + if keep_all_features is not None: + self.keep_all_features = keep_all_features + + self.show = 'empty' + self.input_feature_filename = 'empty' + self.start_stop = (None, None) + self.previous_load = None + + def __repr__(self): + """ + + :return: a string to display the object + """ + ch = '\t show: {} \n\n'.format(self.show) + ch += '\t input_feature_filename: {} \n\n'.format(self.input_feature_filename) + ch += '\t feature_filename_structure: {} \n'.format(self.feature_filename_structure) + ch += '\t \n' + ch += '\t \n\n' + ch += '\t Post processing options: \n' + ch += '\t\t mask: {} \n'.format(self.mask) + ch += '\t\t feat_norm: {} \n'.format(self.feat_norm) + ch += '\t\t dct_pca: {}, dct_pca_config: {} \n'.format(self.dct_pca, + self.dct_pca_config) + ch += '\t\t sdc: {}, sdc_config: {} \n'.format(self.sdc, + self.sdc_config) + ch += '\t\t delta: {}, double_delta: {}, delta_filter: {} \n'.format(self.delta, + self.double_delta, + self.delta_filter) + ch += '\t\t rasta: {} \n'.format(self.rasta) + ch += '\t\t keep_all_features: {} \n'.format(self.keep_all_features) + + return ch + + def post_processing(self, feat, label, global_mean=None, global_std=None): + """ + After cepstral coefficients, filter banks or bottleneck parameters are computed or read from file + post 
processing is applied. + + :param feat: the matrix of acoustic parameters to post-process + :param label: the VAD labels for the acoustic parameters + :param global_mean: vector or mean to use for normalization + :param global_std: vector of standard deviation to use for normalization + + :return: the matrix of acoustic parameters ingand their VAD labels after post-process + """ + # Perform RASTA filtering if required + if self.rasta: + feat, label = self._rasta(feat, label) + + # Add temporal context + if self.delta or self.double_delta: + feat = self._delta_and_2delta(feat) + elif self.dct_pca: + feat = pca_dct(feat, self.dct_pca_config[0], self.dct_pca_config[1], self.dct_pca_config[2]) + elif self.sdc: + feat = shifted_delta_cepstral(feat, d=self.sdc_config[0], p=self.sdc_config[1], k=self.sdc_config[2]) + + # Apply a mask on the features + if self.mask is not None: + feat = self._mask(feat) + + # Smooth the labels and fuse the channels if more than one. + logging.debug('Smooth the labels and fuse the channels if more than one') + label = label_fusion(label) + + # Normalize the data + if self.feat_norm is None: + logging.debug('no norm') + else: + self._normalize(label, feat, global_mean, global_std) + + # if not self.keep_all_features, only selected features and labels are kept + if not self.keep_all_features: + logging.debug('no keep all') + feat = feat[label] + label = label[label] + + return feat, label + + def _mask(self, cep): + """ + Keep only the MFCC index present in the filter list + :param cep: acoustic parameters to filter + + :return: return the list of MFCC given by filter list + """ + if len(self.mask) == 0: + raise Exception('filter list is empty') + logging.debug('applied mask') + return cep[:, self.mask] + + def _normalize(self, label, cep, global_mean=None, global_std=None): + """ + Normalize acoustic parameters in place + + :param label: vad labels to use for normalization + :param cep: acoustic parameters to normalize + :param global_mean: 
mean vector to use if provided + :param global_std: standard deviation vector to use if provided + """ + # Perform feature normalization on the entire session. + if self.feat_norm is None: + logging.debug('no norm') + pass + elif self.feat_norm == 'cms': + logging.debug('cms norm') + cms(cep, label, global_mean) + elif self.feat_norm == 'cmvn': + logging.debug('cmvn norm') + cmvn(cep, label, global_mean, global_std) + elif self.feat_norm == 'stg': + logging.debug('stg norm') + stg(cep, label=label) + elif self.feat_norm == 'cmvn_sliding': + logging.debug('sliding cmvn norm') + cep_sliding_norm(cep, label=label, win=301, center=True, reduce=True) + elif self.feat_norm == 'cms_sliding': + logging.debug('sliding cms norm') + cep_sliding_norm(cep, label=label, win=301, center=True, reduce=False) + else: + logging.warning('Wrong feature normalisation type') + + def _delta_and_2delta(self, cep): + """ + Add deltas and double deltas. + :param cep: a matrix of cepstral cefficients + + :return: the cepstral coefficient stacked with deltas and double deltas + """ + if self.delta: + logging.debug('add delta') + delta = compute_delta(cep, filt=self.delta_filter) + cep = numpy.column_stack((cep, delta)) + if self.double_delta: + logging.debug('add delta delta') + double_delta = compute_delta(delta, filt=self.delta_filter) + cep = numpy.column_stack((cep, double_delta)) + return cep + + def _rasta(self, cep, label): + """ + Performs RASTA filtering if required. + The two first frames are copied from the third to keep + the length consistent + !!! if vad is None: label[] is empty + + :param cep: the acoustic features to filter + :param label: the VAD label + :return: + """ + if self.rasta: + logging.debug('perform RASTA %s', self.rasta) + cep = rasta_filt(cep) + cep[:2, :] = cep[2, :] + label[:2] = label[2] + return cep, label + + def get_context(self, feat, start=None, stop=None, label=None): + """ + Add a left and right context to each frame. 
+ First and last frames are duplicated to provide context at the begining and at the end + + :param feat: sequence of feature frames (one fame per line) + :param start: index of the first frame of the selected segment + :param stop: index of the last frame of the selected segment + :param label: vad label if available + + :return: a sequence of frames with their left and right context + """ + if start is None: + start = 0 + if stop is None: + stop = feat.shape[0] + context_feat = framing( + numpy.pad(feat, + ((max(self.context[0] - start, 0), max(stop - feat.shape[0] + self.context[1] + 1, 0)), + (0, 0)), + mode='edge')[start - self.context[0] + max(self.context[0] - start, 0): + stop + self.context[1] + max(self.context[0] - start, 0), :], win_size=1+sum(self.context) + ).reshape(-1, (1+sum(self.context)) * feat.shape[1]) + + if label is not None: + context_label = label[start:stop] + else: + context_label = None + + return context_feat, context_label + + def get_traps(self, feat, start=None, stop=None, label=None): + """ + Compute TRAP parameters. The input frames are concatenated to add their left and right context, + a Hamming window is applied and a DCT reduces the dimensionality of the resulting vector. 
+ + :param feat: input acoustic parameters to process + :param start: index of the first frame of the selected segment + :param stop: index of the last frame of the selected segment + :param label: vad label if available + + :return: a sequence of TRAP parameters + """ + + if start is None: + start = 0 + if stop is None: + stop = feat.shape[0] + + context_feat = framing( + numpy.pad( + feat, + ((self.context[0]-start, stop - feat.shape[0] + self.context[1] + 1), (0, 0)), + mode='edge' + )[start-self.context[0] + + max(self.context[0]-start, 0):stop + self.context[1] + max(self.context[0]-start, 0), :], + win_size=1+sum(self.context) + ).transpose(0, 2, 1) + hamming_dct = (dct_basis(self.traps_dct_nb, sum(self.context) + 1) * + numpy.hamming(sum(self.context) + 1)).T + + if label is not None: + context_label = label[start:stop] + else: + context_label = None + + return numpy.dot( + context_feat.reshape(-1, hamming_dct.shape[0]), + hamming_dct + ).reshape(context_feat.shape[0], -1), context_label + + def load(self, show, channel=0, input_feature_filename=None, label=None, start=None, stop=None): + """ + Depending of the setting of the FeaturesServer, can either: + + 1. Get the datasets from a single HDF5 file + The HDF5 file is loaded from disk or processed on the fly + via the FeaturesExtractor of the current FeaturesServer + + 2. Load datasets from multiple input HDF5 files. 
    def get_features(self, show, channel=0, input_feature_filename=None, label=None, start=None, stop=None):
        """
        Get the datasets from a single HDF5 file
        The HDF5 file is loaded from disk or processed on the fly
        via the FeaturesExtractor of the current FeaturesServer

        :param show: ID of the show
        :param channel: index of the channel to read
        :param input_feature_filename: name of the input file in case it does not include the ID of the show
        :param label: vad labels
        :param start: index of the first frame of the selected segment
        :param stop: index of the last frame of the selected segment

        :return: acoustic parameters and their vad labels, after post-processing
        """
        # English translation of the note below: if the name of the input file
        # is completely independent of the show (i.e. feature_filename_structure
        # does not contain "{}"), the filename structure can be updated to hold
        # the name of the feature file directly.
        """
        Si le nom du fichier d'entrée est totalement indépendant du show
        -> si feature_filename_structure ne contient pas "{}"
        on peut mettre à jour: self.audio_filename_structure pour entrer directement le nom du fichier de feature
        """
        # NOTE(review): this overwrites the attribute of the server, so the
        # override is still in place for later calls - confirm this is intended.
        if input_feature_filename is not None:
            self.feature_filename_structure = input_feature_filename

        # If no extractor for this source, open hdf5 file and return handler
        # NOTE(review): the handle is not closed anywhere in this method -
        # confirm who owns it.
        if self.features_extractor is None:
            h5f = h5py.File(self.feature_filename_structure.format(show), "r")

        # If an extractor is provided for this source, extract features and return an hdf5 handler
        # NOTE(review): the feature filename is forwarded as input_audio_filename - verify against the extractor.
        else:
            h5f = self.features_extractor.extract(show, channel, input_audio_filename=input_feature_filename)

        feat, label, global_mean, global_std, global_cmvn = read_hdf5_segment(h5f,
                                                                               show,
                                                                               dataset_list=self.dataset_list,
                                                                               label=label,
                                                                               start=start, stop=stop,
                                                                               global_cmvn=self.global_cmvn)

        # Post-process the features and return the features and vad label
        # The global statistics are only forwarded when read_hdf5_segment reports global_cmvn as true
        if global_cmvn:
            feat, label = self.post_processing(feat, label, global_mean, global_std)
        else:
            feat, label = self.post_processing(feat, label)

        return feat, label
input_feature_filename + + # If no extractor for this source, open hdf5 file and return handler + if self.features_extractor is None: + h5f = h5py.File(self.feature_filename_structure.format(show), "r") + + # If an extractor is provided for this source, extract features and return an hdf5 handler + else: + h5f = self.features_extractor.extract(show, channel, input_audio_filename=input_feature_filename) + + tmp_dict = dict() + for spk_id, start, stop in zip(idmap.leftids, idmap.start, idmap.stop): + if spk_id not in tmp_dict: + tmp_dict[spk_id] = numpy.arange(start, stop - 1) + else: + tmp_dict[spk_id] = numpy.concatenate((tmp_dict[spk_id], numpy.arange(start, stop - 1)), axis=0) + + feat, lbl, global_mean, global_std, global_cmvn = read_hdf5_segment(h5f, + show, + dataset_list=self.dataset_list, + label=label, + start=None, stop=None, + global_cmvn=self.global_cmvn) + + fe = FeaturesExtractor(audio_filename_structure="", + feature_filename_structure=None, + sampling_frequency=16000, + lower_frequency=133.3333, + higher_frequency=6855.4976, + filter_bank="log", + filter_bank_size=40, + window_size=0.025, + shift=0.01, + ceps_number=13, + pre_emphasis=0.97, + keep_all_features=True, + vad='percentil', + save_param=["energy", "cep", "vad"] + ) + + feat_per_spk = dict() + for spk_id in tmp_dict.keys(): + lbl1 = copy.deepcopy(lbl) + feat1 = copy.deepcopy(feat) + + _, threshold_id = fe._vad(None, feat1[tmp_dict[spk_id], 0], None, None) + label_id = feat1[:, 0] > threshold_id + lbl1[tmp_dict[spk_id]] = label_id[tmp_dict[spk_id]].flatten() + + # Post-process the features and return the features and vad label + if global_cmvn: + tmp_feat, tmp_lbl = self.post_processing(feat1[tmp_dict[spk_id], :], lbl1[tmp_dict[spk_id]], global_mean, global_std) + else: + tmp_feat, tmp_lbl = self.post_processing(feat1[tmp_dict[spk_id], :], lbl1[tmp_dict[spk_id]]) + + feat_per_spk[spk_id] = (tmp_feat, tmp_lbl) + + return feat_per_spk + + def get_tandem_features(self, show, channel=0, 
    def mean_std(self, show, channel=0, start=None, stop=None):
        """
        Compute the statistics needed to derive the mean and standard deviation
        vectors for a segment of acoustic features.

        Despite its name, this method does not return the mean and the standard
        deviation themselves but the accumulators they can be computed from.

        :param show: the ID of the show
        :param channel: the index of the channel
        :param start: index of the first frame of the selected segment
        :param stop: index of the last frame of the selected segment

        :return: the number of frames, the sum of the frames and the sum of
            the squared frames (both sums taken over the frame axis)
        """
        # The labels returned by load() are not used here
        feat, _ = self.load(show, channel=channel, start=start, stop=stop)
        return feat.shape[0], feat.sum(axis=0), numpy.sum(feat**2, axis=0)
+ + :param show_list: + :param channel_list: + :param label_list: + :param start_list: + :param stop_list: + :return: + """ + if channel_list is None: + channel_list = numpy.zeros(len(show_list)) + if feature_filename_list is None: + feature_filename_list = numpy.empty(len(show_list), dtype='|O') + if label_list is None: + label_list = numpy.empty(len(show_list), dtype='|O') + if start_list is None: + start_list = numpy.empty(len(show_list), dtype='|O') + if stop_list is None: + stop_list = numpy.empty(len(show_list), dtype='|O') + + features_list = [] + for idx, load_arg in enumerate(zip(show_list, channel_list, feature_filename_list, label_list, start_list, stop_list)): + logging.info("load file {} / {}".format(idx + 1, len(show_list))) + features_list.append(self.load(*load_arg)[0]) + + return numpy.vstack(features_list) + + + def _stack_features_worker(self, + input_queue, + output_queue): + """Load a list of feature files into a Queue object + + :param input: a Queue object + :param output: a list of Queue objects to fill + """ + while True: + next_task = input_queue.get() + if next_task is None: + # Poison pill means shutdown + output_queue.put(None) + input_queue.task_done() + break + output_queue.put(self.load(*next_task)[0]) + input_queue.task_done() + + + #@profile + def stack_features_parallel(self, # fileList, numThread=1): + show_list, + channel_list=None, + feature_filename_list=None, + label_list=None, + start_list=None, + stop_list=None, + num_thread=1): + """Load a list of feature files and stack them in a unique ndarray. 
+ The list of files to load is splited in sublists processed in parallel + + :param fileList: a list of files to load + :param numThread: numbe of thead (optional, default is 1) + """ + if channel_list is None: + channel_list = numpy.zeros(len(show_list)) + if feature_filename_list is None: + feature_filename_list = numpy.empty(len(show_list), dtype='|O') + if label_list is None: + label_list = numpy.empty(len(show_list), dtype='|O') + if start_list is None: + start_list = numpy.empty(len(show_list), dtype='|O') + if stop_list is None: + stop_list = numpy.empty(len(show_list), dtype='|O') + + #queue_in = Queue.Queue(maxsize=len(fileList)+numThread) + queue_in = multiprocessing.JoinableQueue(maxsize=len(show_list)+num_thread) + queue_out = [] + + # Start worker processes + jobs = [] + for i in range(num_thread): + queue_out.append(multiprocessing.Queue()) + p = multiprocessing.Process(target=self._stack_features_worker, + args=(queue_in, queue_out[i])) + jobs.append(p) + p.start() + + # Submit tasks + for task in zip(show_list, channel_list, feature_filename_list, + label_list, start_list, stop_list): + queue_in.put(task) + + # Add None to the queue to kill the workers + for task in range(num_thread): + queue_in.put(None) + + # Wait for all the tasks to finish + queue_in.join() + output = [] + for q in queue_out: + while True: + data = q.get() + if data is None: + break + output.append(data) + + for p in jobs: + p.join() + return numpy.concatenate(output, axis=0) + + diff --git a/sidekit/sidekit/frontend/__init__.py b/sidekit/sidekit/frontend/__init__.py new file mode 100755 index 0000000..821b1d3 --- /dev/null +++ b/sidekit/sidekit/frontend/__init__.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. 
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher and Sylvain Meignier + +:mod:`frontend` provides methods to process an audio signal in order to extract +useful parameters for speaker verification. +""" + +from .io import write_pcm +from .io import read_pcm +from .io import pcmu2lin +from .io import read_sph +from .io import write_label +from .io import read_label +from .io import read_spro4 +from .io import read_audio +from .io import write_spro4 +from .io import read_htk +from .io import write_htk +from .io import read_hdf5_segment +from .io import write_hdf5 +from .io import read_hdf5 + +from .vad import vad_snr +from .vad import label_fusion +from .vad import speech_enhancement + + +from .normfeat import cms +from .normfeat import cmvn +from .normfeat import stg +from .normfeat import rasta_filt +from .normfeat import cep_sliding_norm + + +from .features import compute_delta +from .features import framing +from .features import pre_emphasis +from .features import trfbank +from .features import mel_filter_bank +from .features import mfcc +from .features import pca_dct +from .features import shifted_delta_cepstral + + +__author__ = "Anthony Larcher and Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" 
def hz2mel(f, htk=True):
    """Convert an array of frequency in Hz into mel.

    :param f: frequency to convert (scalar or array, in Hz)
    :param htk: if True, use the HTK formula ``2595 * log10(1 + f / 700)``;
        if False, use the piecewise scale of Slaney's Auditory Toolbox
        (linear below 1000 Hz, logarithmic above)

    :return: the equivalence on the mel scale.
    """
    if htk:
        return 2595 * numpy.log10(1 + f / 700.)

    f = numpy.array(f)
    # The output is allocated with zeros_like(f): with an integer input it
    # would be an integer array and the mel values would be silently truncated.
    # Floating point inputs keep their own precision.
    if not numpy.issubdtype(f.dtype, numpy.floating):
        f = f.astype(float)

    # Mel fn to match Slaney's Auditory Toolbox mfcc.m
    f_0 = 0.
    f_sp = 200. / 3.
    brkfrq = 1000.
    brkpt = (brkfrq - f_0) / f_sp
    logstep = numpy.exp(numpy.log(6.4) / 27)

    linpts = f < brkfrq

    z = numpy.zeros_like(f)
    # fill in parts separately
    z[linpts] = (f[linpts] - f_0) / f_sp
    z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)

    # A single-element array is returned as a scalar
    if z.shape == (1,):
        return z[0]
    return z
+ The size of the window is (win x 2) + 1 + :param method: method used to compute the delta coefficients + can be diff or filter + :param filt: definition of the filter to use in "filter" mode, default one + is similar to SPRO4: filt=numpy.array([.2, .1, 0, -.1, -.2]) + + :return: the delta coefficients computed on the original features. + """ + # First and last features are appended to the begining and the end of the + # stream to avoid border effect + x = numpy.zeros((features.shape[0] + 2 * win, features.shape[1]), dtype=PARAM_TYPE) + x[:win, :] = features[0, :] + x[win:-win, :] = features + x[-win:, :] = features[-1, :] + + delta = numpy.zeros(x.shape, dtype=PARAM_TYPE) + + if method == 'diff': + filt = numpy.zeros(2 * win + 1, dtype=PARAM_TYPE) + filt[0] = -1 + filt[-1] = 1 + + for i in range(features.shape[1]): + delta[:, i] = numpy.convolve(features[:, i], filt) + + return delta[win:-win, :] + + +def pca_dct(cep, left_ctx=12, right_ctx=12, p=None): + """Apply DCT PCA as in [McLaren 2015] paper: + Mitchell McLaren and Yun Lei, 'Improved Speaker Recognition + Using DCT coefficients as features' in ICASSP, 2015 + + A 1D-dct is applied to the cepstral coefficients on a temporal + sliding window. + The resulting matrix is then flatten and reduced by using a Principal + Component Analysis. + + :param cep: a matrix of cepstral cefficients, 1 line per feature vector + :param left_ctx: number of frames to consider for left context + :param right_ctx: number of frames to consider for right context + :param p: a PCA matrix trained on a developpment set to reduce the + dimension of the features. 
def shifted_delta_cepstral(cep, d=1, p=3, k=7):
    """
    Compute the Shifted-Delta-Cepstral features for language identification

    :param cep: matrix of feature, 1 vector per line
    :param d: represents the time advance and delay for the delta computation
    :param p: time shift between consecutive blocks.
    :param k: number of delta-cepstral blocks whose delta-cepstral
        coefficients are stacked to form the final feature vector

    :return: cepstral coefficient concatenated with shifted deltas
    """
    # The first frame is repeated d times in front and the last frame is
    # repeated after the end so that the k blocks, shifted by p frames each,
    # always fall inside the padded sequence. The tail length depends on p:
    # the previous hard-coded "k * 3" was only long enough for p <= 3 and let
    # the selection mask wrap around to the beginning for larger shifts.
    y = numpy.r_[numpy.resize(cep[0, :], (d, cep.shape[1])),
                 cep,
                 numpy.resize(cep[-1, :], (k * p + d, cep.shape[1]))]

    delta = compute_delta(y, win=d, method='diff')
    sdc = numpy.empty((cep.shape[0], cep.shape[1] * k))

    # Boolean mask selecting the k delta frames of the current block set;
    # it is shifted by one frame for each output vector
    idx = numpy.zeros(delta.shape[0], dtype='bool')
    for ii in range(k):
        idx[d + ii * p] = True
    for ff in range(len(cep)):
        sdc[ff, :] = delta[idx, :].reshape(1, -1)
        idx = numpy.roll(idx, 1)
    return numpy.hstack((cep, sdc))
+ :param nfft: number of points for the Fourier Transform + :param lowfreq: lower limit of the frequency band filtered + :param maxfreq: higher limit of the frequency band filtered + :param nlinfilt: number of linear filters to use in low frequencies + :param nlogfilt: number of log-linear filters to use in high frequencies + :param midfreq: frequency boundary between linear and log-linear filters + + :return: the filter bank and the central frequencies of each filter + """ + # Total number of filters + nfilt = nlinfilt + nlogfilt + + # ------------------------ + # Compute the filter bank + # ------------------------ + # Compute start/middle/end points of the triangular filters in spectral + # domain + frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE) + if nlogfilt == 0: + linsc = (maxfreq - lowfreq) / (nlinfilt + 1) + frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc + elif nlinfilt == 0: + low_mel = hz2mel(lowfreq) + max_mel = hz2mel(maxfreq) + mels = numpy.zeros(nlogfilt + 2) + # mels[nlinfilt:] + melsc = (max_mel - low_mel) / (nfilt + 1) + mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc + # Back to the frequency domain + frequences = mel2hz(mels) + else: + # Compute linear filters on [0;1000Hz] + linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1) + frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc + # Compute log-linear filters on [1000;maxfreq] + low_mel = hz2mel(min([1000, maxfreq])) + max_mel = hz2mel(maxfreq) + mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE) + melsc = (max_mel - low_mel) / (nlogfilt + 1) + + # Verify that mel2hz(melsc)>linsc + while mel2hz(melsc) < linsc: + # in this case, we add a linear filter + nlinfilt += 1 + nlogfilt -= 1 + frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc + low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc) + max_mel = hz2mel(maxfreq) + mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE) + melsc = (max_mel - low_mel) / 
def mel_filter_bank(fs, nfft, lowfreq, maxfreq, widest_nlogfilt, widest_lowfreq, widest_maxfreq,):
    """Compute triangular filterbank for cepstral coefficient computation.

    The filters are laid out on the mel scale over the widest band
    [widest_lowfreq; widest_maxfreq] and only the ones whose edge frequencies
    fall inside [lowfreq; maxfreq] are kept.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param widest_nlogfilt: number of log filters
    :param widest_lowfreq: lower frequency of the filter bank
    :param widest_maxfreq: higher frequency of the filter bank

    :return: the filter bank and the edge frequencies of the selected filters
    :raises ValueError: if less than 3 edge frequencies fall in [lowfreq; maxfreq],
        i.e. no complete triangular filter can be built
    """
    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Compute start/middle/end points of the triangular filters in spectral
    # domain
    low_mel = hz2mel(widest_lowfreq)
    max_mel = hz2mel(widest_maxfreq)
    melsc = (max_mel - low_mel) / (widest_nlogfilt + 1)
    mels = low_mel + numpy.arange(widest_nlogfilt + 2) * melsc
    # Back to the frequency domain
    widest_freqs = mel2hz(mels)

    # Select filters in the narrow band
    sub_band_freqs = numpy.array([fr for fr in widest_freqs if lowfreq <= fr <= maxfreq], dtype=PARAM_TYPE)
    nfilt = sub_band_freqs.shape[0] - 2
    if nfilt < 1:
        raise ValueError('no filter of the widest bank fits in [{}; {}] Hz'.format(lowfreq, maxfreq))

    heights = 2. / (sub_band_freqs[2:] - sub_band_freqs[0:-2])

    # Compute filterbank coeff (in fft domain, in bins)
    # The shape must be a tuple of python integers
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    nfreqs = numpy.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = sub_band_freqs[i]
        cen = sub_band_freqs[i + 1]
        hi = sub_band_freqs[i + 2]

        # Bins on the rising (lid) and falling (rid) slopes of the triangle.
        # The builtin int replaces the numpy.int alias removed from NumPy.
        lid = numpy.arange(numpy.floor(low * nfft / fs) + 1,
                           numpy.floor(cen * nfft / fs) + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,
                           min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)
        fbank[i][lid] = left_slope * (nfreqs[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - nfreqs[rid[:-1]])

    return fbank, sub_band_freqs
+ + :param input_sig: input signal from which the coefficients are computed. + Input audio is supposed to be RAW PCM 16bits + :param lowfreq: lower limit of the frequency band filtered. + Default is 100Hz. + :param maxfreq: higher limit of the frequency band filtered. + Default is 8000Hz. + :param nlinfilt: number of linear filters to use in low frequencies. + Default is 0. + :param nlogfilt: number of log-linear filters to use in high frequencies. + Default is 24. + :param nwin: length of the sliding window in seconds + Default is 0.025. + :param fs: sampling frequency of the original signal. Default is 16000Hz. + :param nceps: number of cepstral coefficients to extract. + Default is 13. + :param shift: shift between two analyses. Default is 0.01 (10ms). + :param get_spec: boolean, if true returns the spectrogram + :param get_mspec: boolean, if true returns the output of the filter banks + :param prefac: pre-emphasis filter value + + :return: the cepstral coefficients in a ndaray as well as + the Log-spectrum in the mel-domain in a ndarray. + + .. note:: MFCC are computed as follows: + + - Pre-processing in time-domain (pre-emphasizing) + - Compute the spectrum amplitude by windowing with a Hamming window + - Filter the signal in the spectral domain with a triangular filter-bank, whose filters are approximatively + linearly spaced on the mel scale, and have equal bandwith in the mel scale + - Compute the DCT of the log-spectrom + - Log-energy is returned as first coefficient of the feature vector. + + For more details, refer to [Davis80]_. 
+ """ + # Compute power spectrum + spec, log_energy = power_spectrum(input_sig, + fs, + win_time=nwin, + shift=shift, + prefac=prefac) + # Filter the spectrum through the triangle filter-bank + n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs))))) + fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0] + + mspec = numpy.log(numpy.dot(spec, fbank.T)) # A tester avec log10 et log + # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain) + # The C0 term is removed as it is the constant term + ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1] + lst = list() + lst.append(ceps) + lst.append(log_energy) + if get_spec: + lst.append(spec) + else: + lst.append(None) + del spec + if get_mspec: + lst.append(mspec) + else: + lst.append(None) + del mspec + + return lst + + +def fft2barkmx(n_fft, fs, nfilts=0, width=1., minfreq=0., maxfreq=8000): + """ + Generate a matrix of weights to combine FFT bins into Bark + bins. n_fft defines the source FFT size at sampling rate fs. + Optional nfilts specifies the number of output bands required + (else one per bark), and width is the constant width of each + band in Bark (default 1). + While wts has n_fft columns, the second half are all zero. + Hence, Bark spectrum is fft2barkmx(n_fft,fs) * abs(fft(xincols, n_fft)); + 2004-09-05 dpwe@ee.columbia.edu based on rastamat/audspec.m + + :param n_fft: the source FFT size at sampling rate fs + :param fs: sampling rate + :param nfilts: number of output bands required + :param width: constant width of each band in Bark (default 1) + :param minfreq: + :param maxfreq: + :return: a matrix of weights to combine FFT bins into Bark bins + """ + maxfreq = min(maxfreq, fs / 2.) 
+ + min_bark = hz2bark(minfreq) + nyqbark = hz2bark(maxfreq) - min_bark + + if nfilts == 0: + nfilts = numpy.ceil(nyqbark) + 1 + + wts = numpy.zeros((nfilts, n_fft)) + + # bark per filt + step_barks = nyqbark / (nfilts - 1) + + # Frequency of each FFT bin in Bark + binbarks = hz2bark(numpy.arange(n_fft / 2 + 1) * fs / n_fft) + + for i in range(nfilts): + f_bark_mid = min_bark + i * step_barks + # Linear slopes in log-space (i.e. dB) intersect to trapezoidal window + lof = (binbarks - f_bark_mid - 0.5) + hif = (binbarks - f_bark_mid + 0.5) + wts[i, :n_fft // 2 + 1] = 10 ** (numpy.minimum(numpy.zeros_like(hif), numpy.minimum(hif, -2.5 * lof) / width)) + + return wts + + +def fft2melmx(n_fft, + fs=8000, + nfilts=0, + width=1., + minfreq=0, + maxfreq=4000, + htkmel=False, + constamp=False): + """ + Generate a matrix of weights to combine FFT bins into Mel + bins. n_fft defines the source FFT size at sampling rate fs. + Optional nfilts specifies the number of output bands required + (else one per "mel/width"), and width is the constant width of each + band relative to standard Mel (default 1). + While wts has n_fft columns, the second half are all zero. + Hence, Mel spectrum is fft2melmx(n_fft,fs)*abs(fft(xincols,n_fft)); + minfreq is the frequency (in Hz) of the lowest band edge; + default is 0, but 133.33 is a common standard (to skip LF). + maxfreq is frequency in Hz of upper edge; default fs/2. + You can exactly duplicate the mel matrix in Slaney's mfcc.m + as fft2melmx(512, 8000, 40, 1, 133.33, 6855.5, 0); + htkmel=1 means use HTK's version of the mel curve, not Slaney's. + constamp=1 means make integration windows peak at 1, not sum to 1. + frqs returns bin center frqs. + + % 2004-09-05 dpwe@ee.columbia.edu based on fft2barkmx + + :param n_fft: + :param fs: + :param nfilts: + :param width: + :param minfreq: + :param maxfreq: + :param htkmel: + :param constamp: + :return: + """ + maxfreq = min(maxfreq, fs / 2.) 
+ + if nfilts == 0: + nfilts = numpy.ceil(hz2mel(maxfreq, htkmel) / 2.) + + wts = numpy.zeros((nfilts, n_fft)) + + # Center freqs of each FFT bin + fftfrqs = numpy.arange(n_fft / 2 + 1) / n_fft * fs + + # 'Center freqs' of mel bands - uniformly spaced between limits + minmel = hz2mel(minfreq, htkmel) + maxmel = hz2mel(maxfreq, htkmel) + binfrqs = mel2hz(minmel + numpy.arange(nfilts + 2) / (nfilts + 1) * (maxmel - minmel), htkmel) + + for i in range(nfilts): + _fs = binfrqs[i + numpy.arange(3, dtype=int)] + # scale by width + _fs = _fs[1] + width * (_fs - _fs[1]) + # lower and upper slopes for all bins + loslope = (fftfrqs - _fs[0]) / (_fs[1] - __fs[0]) + hislope = (_fs[2] - fftfrqs)/(_fs[2] - _fs[1]) + + wts[i, 1 + numpy.arange(n_fft//2 + 1)] =numpy.maximum(numpy.zeros_like(loslope),numpy.minimum(loslope, hislope)) + + if not constamp: + # Slaney-style mel is scaled to be approx constant E per channel + wts = numpy.dot(numpy.diag(2. / (binfrqs[2 + numpy.arange(nfilts)] - binfrqs[numpy.arange(nfilts)])) , wts) + + # Make sure 2nd half of FFT is zero + wts[:, n_fft // 2 + 1: n_fft] = 0 + + return wts, binfrqs + + +def audspec(power_spectrum, + fs=16000, + nfilts=None, + fbtype='bark', + minfreq=0, + maxfreq=8000, + sumpower=True, + bwidth=1.): + """ + + :param power_spectrum: + :param fs: + :param nfilts: + :param fbtype: + :param minfreq: + :param maxfreq: + :param sumpower: + :param bwidth: + :return: + """ + if nfilts is None: + nfilts = int(numpy.ceil(hz2bark(fs / 2)) + 1) + + if not fs == 16000: + maxfreq = min(fs / 2, maxfreq) + + nframes, nfreqs = power_spectrum.shape + n_fft = (nfreqs -1 ) * 2 + + if fbtype == 'bark': + wts = fft2barkmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq) + elif fbtype == 'mel': + wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq) + elif fbtype == 'htkmel': + wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq, True, True) + elif fbtype == 'fcmel': + wts = fft2melmx(n_fft, fs, nfilts, bwidth, minfreq, maxfreq, True, 
False) + else: + print('fbtype {} not recognized'.format(fbtype)) + + wts = wts[:, :nfreqs] + + if sumpower: + audio_spectrum = power_spectrum.dot(wts.T) + else: + audio_spectrum = numpy.dot(numpy.sqrt(power_spectrum), wts.T)**2 + + return audio_spectrum, wts + + +def postaud(x, fmax, fbtype='bark', broaden=0): + """ + do loudness equalization and cube root compression + + :param x: + :param fmax: + :param fbtype: + :param broaden: + :return: + """ + nframes, nbands = x.shape + + # Include frequency points at extremes, discard later + nfpts = nbands + 2 * broaden + + if fbtype == 'bark': + bandcfhz = bark2hz(numpy.linspace(0, hz2bark(fmax), num=nfpts)) + elif fbtype == 'mel': + bandcfhz = mel2hz(numpy.linspace(0, hz2bark(fmax), num=nfpts)) + elif fbtype == 'htkmel' or fbtype == 'fcmel': + bandcfhz = mel2hz(numpy.linspace(0, hz2mel(fmax,1), num=nfpts),1) + else: + print('unknown fbtype {}'.format(fbtype)) + + # Remove extremal bands (the ones that will be duplicated) + bandcfhz = bandcfhz[broaden:(nfpts - broaden)] + + # Hynek's magic equal-loudness-curve formula + fsq = bandcfhz ** 2 + ftmp = fsq + 1.6e5 + eql = ((fsq / ftmp) ** 2) * ((fsq + 1.44e6) / (fsq + 9.61e6)) + + # weight the critical bands + z = numpy.matlib.repmat(eql.T,nframes,1) * x + + # cube root compress + z = z ** .33 + + # replicate first and last band (because they are unreliable as calculated) + if broaden == 1: + y = z[:, numpy.hstack((0,numpy.arange(nbands), nbands - 1))] + else: + y = z[:, numpy.hstack((1,numpy.arange(1, nbands - 1), nbands - 2))] + + return y, eql + + +def dolpc(x, model_order=8): + """ + compute autoregressive model from spectral magnitude samples + + :param x: + :param model_order: + :return: + """ + nframes, nbands = x.shape + + r = numpy.real(numpy.fft.ifft(numpy.hstack((x,x[:,numpy.arange(nbands-2,0,-1)])))) + + # First half only + r = r[:, :nbands] + + # Find LPC coeffs by Levinson-Durbin recursion + y_lpc = numpy.ones((r.shape[0], model_order + 1)) + + for ff in 
range(r.shape[0]): + y_lpc[ff, 1:], e, _ = levinson(r[ff, :-1].T, order=model_order, allow_singularity=True) + # Normalize each poly by gain + y_lpc[ff, :] /= e + + return y_lpc + + +def lpc2cep(a, nout): + """ + Convert the LPC 'a' coefficients in each column of lpcas + into frames of cepstra. + nout is number of cepstra to produce, defaults to size(lpcas,1) + 2003-04-11 dpwe@ee.columbia.edu + + :param a: + :param nout: + :return: + """ + ncol , nin = a.shape + + order = nin - 1 + + if nout is None: + nout = order + 1 + + c = numpy.zeros((ncol, nout)) + + # First cep is log(Error) from Durbin + c[:, 0] = -numpy.log(a[:, 0]) + + # Renormalize lpc A coeffs + a /= numpy.tile(a[:, 0][:, None], (1, nin)) + + for n in range(1, nout): + sum = 0 + for m in range(1, n): + sum += (n - m) * a[:, m] * c[:, n - m] + c[:, n] = -(a[:, n] + sum / n) + + return c + + +def lpc2spec(lpcas, nout=17): + """ + Convert LPC coeffs back into spectra + nout is number of freq channels, default 17 (i.e. for 8 kHz) + + :param lpcas: + :param nout: + :return: + """ + [cols, rows] = lpcas.shape + order = rows - 1 + + gg = lpcas[:, 0] + aa = lpcas / numpy.tile(gg, (rows,1)).T + + # Calculate the actual z-plane polyvals: nout points around unit circle + zz = numpy.exp((-1j * numpy.pi / (nout - 1)) * numpy.outer(numpy.arange(nout).T, numpy.arange(order + 1))) + + # Actual polyvals, in power (mag^2) + features = ( 1./numpy.abs(aa.dot(zz.T))**2) / numpy.tile(gg, (nout, 1)).T + + F = numpy.zeros((cols, rows-1)) + M = numpy.zeros((cols, rows-1)) + + for c in range(cols): + aaa = aa[c, :] + rr = numpy.roots(aaa) + ff = numpy.angle(rr.T) + zz = numpy.exp(1j * numpy.outer(ff, numpy.arange(len(aaa)))) + mags = numpy.sqrt(((1./numpy.abs(zz.dot(aaa)))**2)/gg[c]) + ix = numpy.argsort(ff) + keep = ff[ix] > 0 + ix = ix[keep] + F[c, numpy.arange(len(ix))] = ff[ix] + M[c, numpy.arange(len(ix))] = mags[ix] + + F = F[:, F.sum(axis=0) != 0] + M = M[:, M.sum(axis=0) != 0] + + return features, F, M + + +def 
spec2cep(spec, ncep=13, type=2): + """ + Calculate cepstra from spectral samples (in columns of spec) + Return ncep cepstral rows (defaults to 9) + This one does type II dct, or type I if type is specified as 1 + dctm returns the DCT matrix that spec was multiplied by to give cep. + + :param spec: + :param ncep: + :param type: + :return: + """ + nrow, ncol = spec.shape + + # Make the DCT matrix + dctm = numpy.zeros(ncep, nrow); + #if type == 2 || type == 3 + # # this is the orthogonal one, the one you want + # for i = 1:ncep + # dctm(i,:) = cos((i-1)*[1:2:(2*nrow-1)]/(2*nrow)*pi) * sqrt(2/nrow); + + # if type == 2 + # # make it unitary! (but not for HTK type 3) + # dctm(1,:) = dctm(1,:)/sqrt(2); + + #elif type == 4: # type 1 with implicit repeating of first, last bins + # """ + # Deep in the heart of the rasta/feacalc code, there is the logic + # that the first and last auditory bands extend beyond the edge of + # the actual spectra, and they are thus copied from their neighbors. + # Normally, we just ignore those bands and take the 19 in the middle, + # but when feacalc calculates mfccs, it actually takes the cepstrum + # over the spectrum *including* the repeated bins at each end. + # Here, we simulate 'repeating' the bins and an nrow+2-length + # spectrum by adding in extra DCT weight to the first and last + # bins. 
+ # """ + # for i = 1:ncep + # dctm(i,:) = cos((i-1)*[1:nrow]/(nrow+1)*pi) * 2; + # # Add in edge points at ends (includes fixup scale) + # dctm(i,1) = dctm(i,1) + 1; + # dctm(i,nrow) = dctm(i,nrow) + ((-1)^(i-1)); + + # dctm = dctm / (2*(nrow+1)); + #else % dpwe type 1 - same as old spec2cep that expanded & used fft + # for i = 1:ncep + # dctm(i,:) = cos((i-1)*[0:(nrow-1)]/(nrow-1)*pi) * 2 / (2*(nrow-1)); + # dctm(:,[1 nrow]) = dctm(:, [1 nrow])/2; + + #cep = dctm*log(spec); + return None, None, None + + +def lifter(x, lift=0.6, invs=False): + """ + Apply lifter to matrix of cepstra (one per column) + lift = exponent of x i^n liftering + or, as a negative integer, the length of HTK-style sin-curve liftering. + If inverse == 1 (default 0), undo the liftering. + + :param x: + :param lift: + :param invs: + :return: + """ + nfrm , ncep = x.shape + + if lift == 0: + y = x + else: + if lift > 0: + if lift > 10: + print('Unlikely lift exponent of {} did you mean -ve?'.format(lift)) + liftwts = numpy.hstack((1, numpy.arange(1, ncep)**lift)) + + elif lift < 0: + # Hack to support HTK liftering + L = float(-lift) + if (L != numpy.round(L)): + print('HTK liftering value {} must be integer'.format(L)) + + liftwts = numpy.hstack((1, 1 + L/2*numpy.sin(numpy.arange(1, ncep) * numpy.pi / L))) + + if invs: + liftwts = 1 / liftwts + + y = x.dot(numpy.diag(liftwts)) + + return y + + +def plp(input_sig, + nwin=0.025, + fs=16000, + plp_order=13, + shift=0.01, + get_spec=False, + get_mspec=False, + prefac=0.97, + rasta=True): + """ + output is matrix of features, row = feature, col = frame + + % fs is sampling rate of samples, defaults to 8000 + % dorasta defaults to 1; if 0, just calculate PLP + % modelorder is order of PLP model, defaults to 8. 
0 -> no PLP + + :param input_sig: + :param fs: sampling rate of samples default is 8000 + :param rasta: default is True, if False, juste compute PLP + :param model_order: order of the PLP model, default is 8, 0 means no PLP + + :return: matrix of features, row = features, column are frames + """ + plp_order -= 1 + + # first compute power spectrum + powspec, log_energy = power_spectrum(input_sig, fs, nwin, shift, prefac) + + # next group to critical bands + audio_spectrum = audspec(powspec, fs)[0] + nbands = audio_spectrum.shape[0] + + if rasta: + # put in log domain + nl_aspectrum = numpy.log(audio_spectrum) + + # next do rasta filtering + ras_nl_aspectrum = rasta_filt(nl_aspectrum) + + # do inverse log + audio_spectrum = numpy.exp(ras_nl_aspectrum) + + # do final auditory compressions + post_spectrum = postaud(audio_spectrum, fs / 2.)[0] + + if plp_order > 0: + + # LPC analysis + lpcas = dolpc(post_spectrum, plp_order) + + # convert lpc to cepstra + cepstra = lpc2cep(lpcas, plp_order + 1) + + # .. 
or to spectra + spectra, F, M = lpc2spec(lpcas, nbands) + + else: + + # No LPC smoothing of spectrum + spectra = post_spectrum + cepstra = spec2cep(spectra) + + cepstra = lifter(cepstra, 0.6) + + lst = list() + lst.append(cepstra) + lst.append(log_energy) + if get_spec: + lst.append(powspec) + else: + lst.append(None) + del powspec + if get_mspec: + lst.append(post_spectrum) + else: + lst.append(None) + del post_spectrum + + return lst + + +def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'): + """ + :param sig: input signal, can be mono or multi dimensional + :param win_size: size of the window in term of samples + :param win_shift: shift of the sliding window in terme of samples + :param context: tuple of left and right context + :param pad: can be zeros or edge + """ + dsize = sig.dtype.itemsize + if sig.ndim == 1: + sig = sig[:, numpy.newaxis] + # Manage padding + c = (context, ) + (sig.ndim - 1) * ((0, 0), ) + _win_size = win_size + sum(context) + shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1]) + strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1])) + if pad == 'zeros': + return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'constant', constant_values=(0,)), + shape=shape, + strides=strides).squeeze() + elif pad == 'edge': + return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'edge'), + shape=shape, + strides=strides).squeeze() + + +def dct_basis(nbasis, length): + """ + :param nbasis: number of CT coefficients to keep + :param length: length of the matrix to process + :return: a basis of DCT coefficients + """ + return scipy.fftpack.idct(numpy.eye(nbasis, length), norm='ortho') + + +def levinson(r, order=None, allow_singularity=False): + r"""Levinson-Durbin recursion. 
+ + Find the coefficients of a length(r)-1 order autoregressive linear process + + :param r: autocorrelation sequence of length N + 1 (first element being the zero-lag autocorrelation) + :param order: requested order of the autoregressive coefficients. default is N. + :param allow_singularity: false by default. Other implementations may be True (e.g., octave) + + :return: + * the `N+1` autoregressive coefficients :math:`A=(1, a_1...a_N)` + * the prediction errors + * the `N` reflections coefficients values + + This algorithm solves the set of complex linear simultaneous equations + using Levinson algorithm. + + .. math:: + + \bold{T}_M \left( \begin{array}{c} 1 \\ \bold{a}_M \end{array} \right) = + \left( \begin{array}{c} \rho_M \\ \bold{0}_M \end{array} \right) + + where :math:`\bold{T}_M` is a Hermitian Toeplitz matrix with elements + :math:`T_0, T_1, \dots ,T_M`. + + .. note:: Solving this equations by Gaussian elimination would + require :math:`M^3` operations whereas the levinson algorithm + requires :math:`M^2+M` additions and :math:`M^2+M` multiplications. + + This is equivalent to solve the following symmetric Toeplitz system of + linear equations + + .. math:: + + \left( \begin{array}{cccc} + r_1 & r_2^* & \dots & r_{n}^*\\ + r_2 & r_1^* & \dots & r_{n-1}^*\\ + \dots & \dots & \dots & \dots\\ + r_n & \dots & r_2 & r_1 \end{array} \right) + \left( \begin{array}{cccc} + a_2\\ + a_3 \\ + \dots \\ + a_{N+1} \end{array} \right) + = + \left( \begin{array}{cccc} + -r_2\\ + -r_3 \\ + \dots \\ + -r_{N+1} \end{array} \right) + + where :math:`r = (r_1 ... r_{N+1})` is the input autocorrelation vector, and + :math:`r_i^*` denotes the complex conjugate of :math:`r_i`. The input r is typically + a vector of autocorrelation coefficients where lag 0 is the first + element :math:`r_1`. + + + .. 
doctest:: + + >>> import numpy; from spectrum import LEVINSON + >>> T = numpy.array([3., -2+0.5j, .7-1j]) + >>> a, e, k = LEVINSON(T) + + """ + #from numpy import isrealobj + T0 = numpy.real(r[0]) + T = r[1:] + M = len(T) + + if order is None: + M = len(T) + else: + assert order <= M, 'order must be less than size of the input data' + M = order + + realdata = numpy.isrealobj(r) + if realdata is True: + A = numpy.zeros(M, dtype=float) + ref = numpy.zeros(M, dtype=float) + else: + A = numpy.zeros(M, dtype=complex) + ref = numpy.zeros(M, dtype=complex) + + P = T0 + + for k in range(M): + save = T[k] + if k == 0: + temp = -save / P + else: + #save += sum([A[j]*T[k-j-1] for j in range(0,k)]) + for j in range(0, k): + save = save + A[j] * T[k-j-1] + temp = -save / P + if realdata: + P = P * (1. - temp**2.) + else: + P = P * (1. - (temp.real**2+temp.imag**2)) + + if (P <= 0).any() and allow_singularity==False: + raise ValueError("singular matrix") + A[k] = temp + ref[k] = temp # save reflection coeff at each step + if k == 0: + continue + + khalf = (k+1)//2 + if realdata is True: + for j in range(0, khalf): + kj = k-j-1 + save = A[j] + A[j] = save + temp * A[kj] + if j != kj: + A[kj] += temp*save + else: + for j in range(0, khalf): + kj = k-j-1 + save = A[j] + A[j] = save + temp * A[kj].conjugate() + if j != kj: + A[kj] = A[kj] + temp * save.conjugate() + + return A, P, ref diff --git a/sidekit/sidekit/frontend/io.py b/sidekit/sidekit/frontend/io.py new file mode 100755 index 0000000..1ce2fc4 --- /dev/null +++ b/sidekit/sidekit/frontend/io.py @@ -0,0 +1,1706 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. 
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher + +:mod:`frontend` provides methods to process an audio signal in order to extract +useful parameters for speaker verification. +""" +import audioop +import decimal +import h5py +import logging +import math +import numpy +import os +import soundfile +import struct +import warnings +import wave +import scipy.signal +import scipy.io.wavfile +from scipy.signal import lfilter +from scipy.signal import decimate +from ..sidekit_wrappers import check_path_existance, process_parallel_lists + + +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +wav_flag = "float32" # Could be "int16" + +# HTK parameters +WAVEFORM = 0 +LPC = 1 +LPCREFC = 2 +LPCEPSTRA = 3 +LPCDELCEP = 4 +IREFC = 5 +MFCC = 6 +FBANK = 7 +MELSPEC = 8 +USER = 9 +DISCRETE = 10 +PLP = 11 +ANON = 12 + +_E = 0o000100 # has energy +_N = 0o000200 # absolute energy supressed +_D = 0o000400 # has delta coefficients +_A = 0o001000 # has acceleration coefficients +_C = 0o002000 # is compressed +_Z = 0o004000 # has zero mean static coef. +_K = 0o010000 # has CRC checksum +_0 = 0o020000 # has 0th cepstral coef. 
+_V = 0o040000 # has VQ data +_T = 0o100000 # has third differential coef. + +parms16bit = [WAVEFORM, IREFC, DISCRETE] + + +@check_path_existance +def write_pcm(data, output_file_name): + """Write signal to single channel PCM 16 bits + + :param data: audio signal to write in a RAW PCM file. + :param output_file_name: name of the file to write + """ + with open(output_file_name, 'wb') as of: + if numpy.abs(data).max() < 1.: + data = numpy.around(numpy.array(data) * 16384, decimals=0).astype('int16') + of.write(struct.pack('<' + 'h' * data.shape[0], *data)) + +@check_path_existance +def write_wav(data, output_file_name, fs): + if data.dtype != numpy.int16: + if data.dtype == numpy.float32: + data /= numpy.abs(data).max() + data *= 0.9 + data = numpy.array(data * 2 ** 15, dtype=numpy.int16) + if numpy.any(data > numpy.iinfo(numpy.int16).max) or numpy.any(data < numpy.iinfo(numpy.int16).min): + warnings.warn('Warning: clipping detected when writing {}'.format(output_file_name)) + scipy.io.wavfile.write(output_file_name, fs, data) + + +def read_pcm(input_file_name): + """Read signal from single channel PCM 16 bits + + :param input_file_name: name of the PCM file to read. 
+ + :return: the audio signal read from the file in a ndarray encoded on 16 bits, None and 2 (depth of the encoding in bytes) + """ + with open(input_file_name, 'rb') as f: + f.seek(0, 2) # Go to te end of the file + # get the sample count + sample_count = int(f.tell() / 2) + f.seek(0, 0) # got to the begining of the file + data = numpy.asarray(struct.unpack('<' + 'h' * sample_count, f.read())) + return data.astype(numpy.float32), None, 2 + + +def read_wav(input_file_name): + """ + :param input_file_name: + :return: + """ + #with wave.open(input_file_name, "r") as wfh: + # (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams() + # raw = wfh.readframes(nframes * nchannels) + # out = struct.unpack_from("%dh" % nframes * nchannels, raw) + # sig = numpy.reshape(numpy.array(out), (-1, nchannels)).squeeze() + # return sig.astype(numpy.float32), framerate, sampwidth + nfo = soundfile.info(input_file_name) + sig, sample_rate = soundfile.read(input_file_name, dtype=wav_flag) + sig = numpy.reshape(numpy.array(sig), (-1, nfo.channels)).squeeze() + sig = sig.astype(numpy.float32) + return sig, sample_rate, 4 + + +def pcmu2lin(p, s=4004.189931): + """Convert Mu-law PCM to linear X=(P,S) + lin = pcmu2lin(pcmu) where pcmu contains a vector + of mu-law values in the range 0 to 255. + No checking is performed to see that numbers are in this range. + + Output values are divided by the scale factor s: + + s Output Range + 1 +-8031 (integer values) + 4004.2 +-2.005649 (default) + 8031 +-1 + 8159 +-0.9843118 (+-1 nominal full scale) + + The default scaling factor 4004.189931 is equal to + sqrt((2207^2 + 5215^2)/2) this follows ITU standard G.711. + The sine wave with PCM-Mu values [158 139 139 158 30 11 11 30] + has a mean square value of unity corresponding to 0 dBm0. 
+ :param p: input signal encoded in PCM mu-law to convert + :param s: conversion value from mu-scale oto linear scale + """ + t = 4 / s + m = 15 - (p % 16) + q = numpy.floor(p // 128) + e = (127 - p - m + 128 * q) / 16 + x = (m + 16.5) * numpy.power(2, e) - 16.5 + z = (q - 0.5) * x * t + return z + + +def read_sph(input_file_name, mode='p'): + """ + Read a SPHERE audio file + + :param input_file_name: name of the file to read + :param mode: specifies the following (\* =default) + + .. note:: + + - Scaling: + + - 's' Auto scale to make data peak = +-1 (use with caution if reading in chunks) + - 'r' Raw unscaled data (integer values) + - 'p' Scaled to make +-1 equal full scale + - 'o' Scale to bin centre rather than bin edge (e.g. 127 rather than 127.5 for 8 bit values, + can be combined with n+p,r,s modes) + - 'n' Scale to negative peak rather than positive peak (e.g. 128.5 rather than 127.5 for 8 bit values, + can be combined with o+p,r,s modes) + + - Format + + - 'l' Little endian data (Intel,DEC) (overrides indication in file) + - 'b' Big endian data (non Intel/DEC) (overrides indication in file) + + - File I/O + + - 'f' Do not close file on exit + - 'd' Look in data directory: voicebox('dir_data') + - 'w' Also read the annotation file \*.wrd if present (as in TIMIT) + - 't' Also read the phonetic transcription file \*.phn if present (as in TIMIT) + + - NMAX maximum number of samples to read (or -1 for unlimited [default]) + - NSKIP number of samples to skip from start of file (or -1 to continue from previous read when FFX + is given instead of FILENAME [default]) + + :return: a tupple such that (Y, FS) + + .. 
note:: + + - Y data matrix of dimension (samples,channels) + - FS sample frequency in Hz + - WRD{\*,2} cell array with word annotations: WRD{\*,:)={[t_start t_end],'text'} where times are in seconds + only present if 'w' option is given + - PHN{\*,2} cell array with phoneme annotations: PHN{\*,:)={[t_start t_end],'phoneme'} where times + are in seconds only present if 't' option is present + - FFX Cell array containing + + 1. filename + 2. header information + + 1. first header field name + 2. first header field value + 3. format string (e.g. NIST_1A) + 4. + 1. file id + 2. current position in file + 3. dataoff byte offset in file to start of data + 4. order byte order (l or b) + 5. nsamp number of samples + 6. number of channels + 7. nbytes bytes per data value + 8. bits number of bits of precision + 9. fs sample frequency + 10. min value + 11. max value + 12. coding 0=PCM,1=uLAW + 0=no compression, 0=shorten,20=wavpack,30=shortpack + 13. file not yet decompressed + + 5. temporary filename + + If no output parameters are specified, + header information will be printed. + The code to decode shorten-encoded files, is + not yet released with this toolkit. 
+ """ + codings = dict([('pcm', 1), ('ulaw', 2)]) + compressions = dict([(',embedded-shorten-', 1), + (',embedded-wavpack-', 2), + (',embedded-shortpack-', 3)]) + byteorder = 'l' + endianess = dict([('l', '<'), ('b', '>')]) + + if not mode == 'p': + mode = [mode, 'p'] + k = list((m >= 'p') & (m <= 's') for m in mode) + # scale to input limits not output limits + mno = all([m != 'o' for m in mode]) + sc = '' + if k[0]: + sc = mode[0] + # Get byte order (little/big endian) + if any([m == 'l' for m in mode]): + byteorder = 'l' + elif any([m == 'b' for m in mode]): + byteorder = 'b' + ffx = ['', '', '', '', ''] + + if isinstance(input_file_name, str): + if os.path.exists(input_file_name): + fid = open(input_file_name, 'rb') + elif os.path.exists("".join((input_file_name, '.sph'))): + input_file_name = "".join((input_file_name, '.sph')) + fid = open(input_file_name, 'rb') + else: + raise Exception('Cannot find file {}'.format(input_file_name)) + ffx[0] = input_file_name + elif not isinstance(input_file_name, str): + ffx = input_file_name + else: + fid = input_file_name + + # Read the header + if ffx[3] == '': + fid.seek(0, 0) # go to the begining of the file + l1 = fid.readline().decode("utf-8") + l2 = fid.readline().decode("utf-8") + if not (l1 == 'NIST_1A\n') & (l2 == ' 1024\n'): + logging.warning('File does not begin with a SPHERE header') + ffx[2] = l1.rstrip() + hlen = int(l2[3:7]) + hdr = {} + while True: # Read the header and fill a dictionary + st = fid.readline().decode("utf-8").rstrip() + if st[0] != ';': + elt = st.split(' ') + if elt[0] == 'end_head': + break + if elt[1][0] != '-': + logging.warning('Missing ''-'' in SPHERE header') + break + if elt[1][1] == 's': + hdr[elt[0]] = elt[2] + elif elt[1][1] == 'i': + hdr[elt[0]] = int(elt[2]) + else: + hdr[elt[0]] = float(elt[2]) + + if 'sample_byte_format' in list(hdr.keys()): + if hdr['sample_byte_format'][0] == '0': + bord = 'l' + else: + bord = 'b' + if (bord != byteorder) & all([m != 'b' for m in mode]) \ + 
& all([m != 'l' for m in mode]): + byteorder = bord + + icode = 0 # Get encoding, default is PCM + if 'sample_coding' in list(hdr.keys()): + icode = -1 # unknown code + for coding in list(codings.keys()): + if hdr['sample_coding'].startswith(coding): + # is the signal compressed + # if len(hdr['sample_coding']) > codings[coding]: + if len(hdr['sample_coding']) > len(coding): + for compression in list(compressions.keys()): + if hdr['sample_coding'].endswith(compression): + icode = 10 * compressions[compression] \ + + codings[coding] - 1 + break + else: # if the signal is not compressed + icode = codings[coding] - 1 + break + # initialize info of the files with default values + info = [fid, 0, hlen, ord(byteorder), 0, 1, 2, 16, 1, 1, -1, icode] + # Get existing info from the header + if 'sample_count' in list(hdr.keys()): + info[4] = hdr['sample_count'] + if not info[4]: # if no info sample_count or zero + # go to the end of the file + fid.seek(0, 2) # Go to te end of the file + # get the sample count + info[4] = int(math.floor((fid.tell() - info[2]) / (info[5] * info[6]))) # get the sample_count + if 'channel_count' in list(hdr.keys()): + info[5] = hdr['channel_count'] + if 'sample_n_bytes' in list(hdr.keys()): + info[6] = hdr['sample_n_bytes'] + if 'sample_sig_bits' in list(hdr.keys()): + info[7] = hdr['sample_sig_bits'] + if 'sample_rate' in list(hdr.keys()): + info[8] = hdr['sample_rate'] + if 'sample_min' in list(hdr.keys()): + info[9] = hdr['sample_min'] + if 'sample_max' in list(hdr.keys()): + info[10] = hdr['sample_max'] + + ffx[1] = hdr + ffx[3] = info + info = ffx[3] + ksamples = info[4] + if ksamples > 0: + fid = info[0] + if (icode >= 10) & (ffx[4] == ''): # read compressed signal + # need to use a script with SHORTEN + raise Exception('compressed signal, need to unpack in a script with SHORTEN') + info[1] = ksamples + # use modes o and n to determine effective peak + pk = 2 ** (8 * info[6] - 1) * (1 + (float(mno) / 2 - int(all([m != 'b' + for m in + 
mode]))) / 2 ** + info[7]) + fid.seek(1024) # jump after the header + nsamples = info[5] * ksamples + if info[6] < 3: + if info[6] < 2: + logging.debug('Sphere i1 PCM') + y = numpy.fromfile(fid, endianess[byteorder]+"i1", -1) + if info[11] % 10 == 1: + if y.shape[0] % 2: + y = numpy.frombuffer(audioop.ulaw2lin( + numpy.concatenate((y, numpy.zeros(1, 'int8'))), 2), + numpy.int16)[:-1]/32768. + else: + y = numpy.frombuffer(audioop.ulaw2lin(y, 2), numpy.int16)/32768. + pk = 1. + else: + y = y - 128 + else: + logging.debug('Sphere i2') + y = numpy.fromfile(fid, endianess[byteorder]+"i2", -1) + else: # non verifie + if info[6] < 4: + y = numpy.fromfile(fid, endianess[byteorder]+"i1", -1) + y = y.reshape(nsamples, 3).transpose() + y = (numpy.dot(numpy.array([1, 256, 65536]), y) - (numpy.dot(y[2, :], 2 ** (-7)).astype(int) * 2 ** 24)) + else: + y = numpy.fromfile(fid, endianess[byteorder]+"i4", -1) + + if sc != 'r': + if sc == 's': + if info[9] > info[10]: + info[9] = numpy.min(y) + info[10] = numpy.max(y) + sf = 1 / numpy.max(list(list(map(abs, info[9:11]))), axis=0) + else: + sf = 1 / pk + y = sf * y + + if info[5] > 1: + y = y.reshape(ksamples, info[5]) + else: + y = numpy.array([]) + if mode != 'f': + fid.close() + info[0] = -1 + if not ffx[4] == '': + pass # VERIFY SCRIPT, WHICH CASE IS HANDLED HERE + return y.astype(numpy.float32), int(info[8]), int(info[6]) + + +def read_audio(input_file_name, framerate=None): + """ Read a 1 or 2-channel audio file in SPHERE, WAVE or RAW PCM format. + The format is determined from the file extension. + If the sample rate read from the file is a multiple of the one given + as parameter, we apply a decimation function to subsample the signal. 
+ + :param input_file_name: name of the file to read from + :param framerate: frame rate, optional, if lower than the one read from the file, subsampling is applied + :return: the signal as a numpy array and the sampling frequency + """ + if framerate is None: + raise TypeError("Expected sampling frequency required in sidekit.frontend.io.read_audio") + ext = os.path.splitext(input_file_name)[-1] + if ext.lower() == '.sph': + sig, read_framerate, sampwidth = read_sph(input_file_name, 'p') + elif ext.lower() == '.wav' or ext.lower() == '.wave': + try: + sig, read_framerate, sampwidth = read_wav(input_file_name) + except: + import pydub + audio = pydub.AudioSegment.from_wav(input_file_name) + read_framerate = audio.frame_rate + sampwidth = audio.sample_width + sig = numpy.array(audio.split_to_mono()[0].get_array_of_samples()) + elif ext.lower() == '.pcm' or ext.lower() == '.raw': + sig, read_framerate, sampwidth = read_pcm(input_file_name) + read_framerate = framerate + else: + raise TypeError("Unknown extension of audio file") + + # Convert to 16 bit encoding if needed + #if not sampwidth == 2: + # sig *= (2**(15-sampwidth)) + + if framerate > read_framerate: + print("Warning in read_audio, up-sampling function is not implemented yet!") + elif read_framerate % float(framerate) == 0 and not framerate == read_framerate: + print("downsample {}".format(input_file_name)) + sig = scipy.signal.decimate(sig, int(read_framerate / float(framerate)), n=None, ftype='iir', axis=0) + return sig.astype(numpy.float32), framerate + + +@check_path_existance +def write_label(label, + output_file_name, + selected_label='speech', + frame_per_second=100, + show=None, + format="mdtm"): + """Save labels in ALIZE format + + :param output_file_name: name of the file to write to + :param label: label to write in the file given as a ndarray of boolean + :param selected_label: label to write to the file. Default is 'speech'. + :param frame_per_second: number of frame per seconds. 
Used to convert + the frame number into time. Default is 100. + """ + if label.shape[0] > 0: + bits = label[:-1] ^ label[1:] + # convert true value into a list of feature indexes + # append 0 at the beginning of the list, append the last index to the list + idx = [0] + (numpy.arange(len(bits))[bits] + 1).tolist() + [len(label)] + framerate = decimal.Decimal(1) / decimal.Decimal(frame_per_second) + + if format == "lab": + # for each pair of indexes (idx[i] and idx[i+1]), create a segment + with open(output_file_name, 'w') as fid: + for i in range(~label[0], len(idx) - 1, 2): + fid.write('{} {} {}\n'.format(str(idx[i]*framerate), + str(idx[i + 1]*framerate), selected_label)) + else: + # write in MDTM format + lst = [] + for i in range(~label[0], len(idx) - 1, 2): + gender = 'U' + env = 'U' + channel = 'U' + start = idx[i]*framerate + stop = idx[i + 1]*framerate + lst.append('{:s} 1 {:.2f} {:.2f} {:s} {:s} {:s} {:s}\n'.format( + show, start, stop - start, gender, + channel, env, "speech")) + + with open(output_file_name, 'w', encoding="utf8") as fid: + for line in lst: + fid.write(line) + + +def read_label(input_file_name, selected_label='speech', frame_per_second=100): + """Read label file in ALIZE format + + :param input_file_name: the label file name + :param selected_label: the label to return. Default is 'speech'. + :param frame_per_second: number of frame per seconds. Used to convert + the frame number into time. Default is 100. 
+ + :return: a logical array + """ + with open(input_file_name) as f: + segments = f.readlines() + + if len(segments) == 0: + lbl = numpy.zeros(0).astype(bool) + else: + # initialize the length from the last segment's end + foo1, stop, foo2 = segments[-1].rstrip().split() + lbl = numpy.zeros(int(float(stop) * 100)).astype(bool) + + begin = numpy.zeros(len(segments)) + end = numpy.zeros(len(segments)) + + for s in range(len(segments)): + start, stop, label = segments[s].rstrip().split() + if label == selected_label: + begin[s] = int(round(float(start) * frame_per_second)) + end[s] = int(round(float(stop) * frame_per_second)) + lbl[begin[s]:end[s]] = True + return lbl + + +def read_spro4(input_file_name, + label_file_name="", + selected_label="", + frame_per_second=100): + """Read a feature stream in SPRO4 format + + :param input_file_name: name of the feature file to read from + :param label_file_name: name of the label file to read if required. + By Default, the method assumes no label to read from. + :param selected_label: label to select in the label file. Default is none. + :param frame_per_second: number of frame per seconds. Used to convert + the frame number into time. Default is 0. + + :return: a sequence of features in a numpy array + """ + with open(input_file_name, 'rb') as f: + + tmp_s = struct.unpack("8c", f.read(8)) + s = () + for i in range(len(tmp_s)): + s += (tmp_s[i].decode("utf-8"),) + f.seek(0, 2) # Go to te end of the file + size = f.tell() # get the position + f.seek(0, 0) # go back to the begining of the file + head_size = 0 + + if "".join(s) == '
': + # swap empty header for general header the code need changing + struct.unpack("19b", f.read(19)) + head_size = 19 + + dim = struct.unpack("H", f.read(2))[0] + struct.unpack("4b", f.read(4)) + struct.unpack("f", f.read(4)) + n_frames = int(math.floor((size - 10 - head_size) / (4 * dim))) + + features = numpy.asarray(struct.unpack('f' * n_frames * dim, + f.read(4 * n_frames * dim))) + features.resize((n_frames, dim)) + + lbl = numpy.ones(numpy.shape(features)[0]).astype(bool) + if not label_file_name == "": + lbl = read_label(label_file_name, selected_label, frame_per_second) + + features = features[lbl, :] + return features.astype(numpy.float32) + + +def read_hdf5_segment(file_handler, + show, + dataset_list, + label, + start=None, stop=None, + global_cmvn=False): + """Read a segment from a stream in HDF5 format. Return the features in the + range start:end + In case the start and end cannot be reached, the first or last feature are copied + so that the length of the returned segment is always end-start + + :param file_name: name of the file to open + :param dataset: identifier of the dataset in the HDF5 file + :param mask: + :param start: + :param end: + + :return:read_hdf5_segment + """ + h5f = file_handler + + compression_type = {0: 'none', 1: 'htk', 2: 'percentile'} + if "compression" not in h5f: + compression = 'none' + print("Warning, default feature storage mode is now using compression") + else: + if isinstance(h5f["compression"], h5py._hl.dataset.Dataset): + compression = compression_type[h5f["compression"][()]] + else: + compression = compression_type[h5f["compression"]] + + if show not in h5f: + raise Exception('show {} is not in the HDF5 file'.format(show)) + + # Get the selected segment + dataset_length = h5f[show + "/" + next(h5f[show].__iter__())].shape[0] + + # Deal with the case where start < 0 or stop > feat.shape[0] + if start is None: + start = 0 + pad_begining = -start if start < 0 else 0 + start = max(start, 0) + + if stop is None: + stop 
= dataset_length + pad_end = stop - dataset_length if stop > dataset_length else 0 + stop = min(stop, dataset_length) + global_cmvn = global_cmvn and not (start is None or stop is None) + + # Get the data between start and stop + # Concatenate all required datasets + feat = [] + global_mean = [] + global_std = [] + + feat = [] + for data_id in ['energy', 'cep', 'fb', 'bnf']: + if data_id in dataset_list: + if "/".join((show, data_id)) in h5f: + dataset_id = show + '/{}'.format(data_id) + if compression == 'none': + data = _read_segment(h5f, dataset_id, start, stop) + if data.ndim ==1: + data = data[:, numpy.newaxis] + feat.append(data) + elif compression == 'htk': + feat.append(_read_segment_htk(h5f, dataset_id, start, stop)) + else: + feat.append(_read_segment_percentile(h5f, dataset_id, start, stop)) + global_mean.append(h5f["/".join((show, "{}_mean".format(data_id)))][()]) + global_std.append(h5f["/".join((show, "{}_std".format(data_id)))][()]) + + else: + raise Exception('{} is not in the HDF5 file'.format(data_id)) + + feat = numpy.hstack(feat) + global_mean = numpy.hstack(global_mean) + global_std = numpy.hstack(global_std) + + if label is None: + if "/".join((show, "vad")) in h5f: + label = h5f.get("/".join((show, "vad")))[()].astype('bool').squeeze()[start:stop] + else: + label = numpy.ones(feat.shape[0], dtype='bool') + # Pad the segment if needed + feat = numpy.pad(feat, ((pad_begining, pad_end), (0, 0)), mode='edge') + label = numpy.pad(label, (pad_begining, pad_end), mode='edge') + #stop += pad_begining + pad_end + + return feat, label, global_mean, global_std, global_cmvn + + +def read_spro4_segment(input_file_name, start=0, end=None): + """Read a segment from a stream in SPRO4 format. 
Return the features in the + range start:end + In case the start and end cannot be reached, the first or last feature are copied + so that the length of the returned segment is always end-start + + :param input_file_name: name of the feature file to read from + :param start: index of the first frame to read (start at zero) + :param end: index of the last frame following the segment to read. + end < 0 means that end is the value of the right_context to add + at the end of the file + + :return: a sequence of features in a ndarray of length end-start + """ + with open(input_file_name, 'rb') as f: + + tmpS = struct.unpack("8c", f.read(8)) + s = () + for i in range(len(tmpS)): + s += (tmpS[i].decode("utf-8"),) + f.seek(0, 2) # Go to te end of the file + size = f.tell() # get the position + f.seek(0, 0) # go back to the begining of the file + head_size = 0 + + if "".join(s) == '
': + # swap empty header for general header the code need changing + struct.unpack("19b", f.read(19)) + head_size = 19 + + dim = struct.unpack("H", f.read(2))[0] + struct.unpack("4b", f.read(4)) + struct.unpack("f", f.read(4)) + n_frames = int(math.floor((size - 10 - head_size) / (4 * dim))) + if end is None: + end = n_frames + elif end < 0: + end = n_frames - end + + s, e = max(0, start), min(n_frames, end) + f.seek(2 + 4 + 4 + dim * 4 * s, 0) + features = numpy.fromfile(f, 'IIHH", len(features)+(4 if dt & _C else 0), sampling_period*1e7, + features.shape[1] * (2 if (pk in parms16bit or dt & _C) else 4), dt)) + if pk == 5: + features *= 32767.0 + if pk in parms16bit: + features = features.astype('>h') + elif dt & _C: + mmax, mmin = features.max(axis=0), features.min(axis=0) + mmax[mmax == mmin] += 32767 + mmin[mmax == mmin] -= 32767 # to avoid division by zero for constant coefficients + scale = 2 * 32767. / (mmax - mmin) + bias = 0.5 * scale * (mmax + mmin) + features = features * scale - bias + numpy.array([scale]).astype('>f').tofile(fh) + numpy.array([bias]).astype('>f').tofile(fh) + features = features.astype('>h') + else: + features = features.astype('>f') + features.tofile(fh) + +def read_htk(input_file_name, + label_file_name="", + selected_label="", + frame_per_second=100): + """Read a sequence of features in HTK format + + :param input_file_name: name of the file to read from + :param label_file_name: name of the label file to read from + :param selected_label: label to select + :param frame_per_second: number of frames per second + + :return: a tupple (d, fp, dt, tc, t) described below + + .. note:: + + - d = data: column vector for waveforms, 1 row per frame for other types + - fp = frame period in seconds + - dt = data type (also includes Voicebox code for generating data) + + 0. WAVEFORM Acoustic waveform + 1. LPC Linear prediction coefficients + 2. LPREFC LPC Reflection coefficients: -lpcar2rf([1 LPC]);LPREFC(1)=[]; + 3. 
LPCEPSTRA LPC Cepstral coefficients + 4. LPDELCEP LPC cepstral+delta coefficients (obsolete) + 5. IREFC LPC Reflection coefficients (16 bit fixed point) + 6. MFCC Mel frequency cepstral coefficients + 7. FBANK Log Fliter bank energies + 8. MELSPEC linear Mel-scaled spectrum + 9. USER User defined features + 10. DISCRETE Vector quantised codebook + 11. PLP Perceptual Linear prediction + 12. ANON + + - tc = full type code = dt plus (optionally) + one or more of the following modifiers + + - 64 _E Includes energy terms + - 128 _N Suppress absolute energy + - 256 _D Include delta coefs + - 512 _A Include acceleration coefs + - 1024 _C Compressed + - 2048 _Z Zero mean static coefs + - 4096 _K CRC checksum (not implemented yet) + - 8192 _0 Include 0'th cepstral coef + - 16384 _V Attach VQ index + - 32768 _T Attach delta-delta-delta index + + - t = text version of type code e.g. LPC_C_K + + This function is a translation of the Matlab code from + VOICEBOX is a MATLAB toolbox for speech processing. 
+ by Mike Brookes + Home page: `VOICEBOX ` + """ + kinds = ['WAVEFORM', 'LPC', 'LPREFC', 'LPCEPSTRA', 'LPDELCEP', 'IREFC', + 'MFCC', 'FBANK', 'MELSPEC', 'USER', 'DISCRETE', 'PLP', 'ANON', + '???'] + with open(input_file_name, 'rb') as fid: + nf = struct.unpack(">l", fid.read(4))[0] # number of frames + # frame interval (in seconds) + fp = struct.unpack(">l", fid.read(4))[0] * 1.e-7 + by = struct.unpack(">h", fid.read(2))[0] # bytes per frame + tc = struct.unpack(">h", fid.read(2))[0] # type code + tc += 65536 * (tc < 0) + cc = 'ENDACZK0VT' # list of suffix codes + nhb = len(cc) # number of suffix codes + ndt = 6 # number of bits for base type + hb = list(int(math.floor(tc * 2 ** x)) + for x in range(- (ndt + nhb), -ndt + 1)) + # extract bits from type code + hd = list(hb[x] - 2 * hb[x - 1] for x in range(nhb, 0, -1)) + # low six bits of tc represent data type + dt = tc - hb[-1] * 2 ** ndt + + # hd(7)=1 CRC check + # hd(5)=1 compressed data + if dt == 5: + fid.seek(0, 2) # Go to te end of the file + flen = fid.tell() # get the position + fid.seek(0, 0) # go back to the begining of the file + if flen > 14 + by * nf: # if file too long + dt = 2 # change type to LPRFEC + hd[4] = 1 # set compressed flag + nf += 4 # frame count doesn't include + # compression constants in this case + + # 16 bit data for waveforms, IREFC and DISCRETE + if any([dt == x for x in [0, 5, 10]]): + n_dim = int(by * nf / 2) + data = numpy.asarray(struct.unpack(">" + "h" * n_dim, fid.read(2 * n_dim))) + d = data.reshape(nf, by / 2) + if dt == 5: + d /= 32767 # scale IREFC + else: + if hd[4]: # compressed data - first read scales + nf -= 4 # frame count includes compression constants + n_col = int(by / 2) + scales = numpy.asarray(struct.unpack(">" + "f" * n_col, fid.read(4 * n_col))) + biases = numpy.asarray(struct.unpack(">" + "f" * n_col, fid.read(4 * n_col))) + data = numpy.asarray(struct.unpack(">" + "h" * n_col * nf, fid.read(2 * n_col * nf))) + d = data.reshape(nf, n_col) + d = d + biases + 
d = d / scales + else: + data = numpy.asarray(struct.unpack(">" + "f" * int(by / 4) * nf, fid.read(by * nf))) + d = data.reshape(nf, by / 4) + + t = kinds[min(dt, len(kinds) - 1)] + + lbl = numpy.ones(numpy.shape(d)[0]).astype(bool) + if not label_file_name == "": + lbl = read_label(label_file_name, selected_label, frame_per_second) + + d = d[lbl, :] + + return d.astype(numpy.float32), fp, dt, tc, t + + +def read_htk_segment(input_file_name, + start=0, + stop=None): + """Read a segment from a stream in SPRO4 format. Return the features in the + range start:end + In case the start and end cannot be reached, the first or last feature are copied + so that the length of the returned segment is always end-start + + :param input_file_name: name of the feature file to read from or file-like + object alowing to seek in the file + :param start: index of the first frame to read (start at zero) + :param stop: index of the last frame following the segment to read. + end < 0 means that end is the value of the right_context to add + at the end of the file + + :return: a sequence of features in a ndarray of length end-start + """ + try: + fh = open(input_file_name, 'rb') + except TypeError: + fh = input_file_name + try: + fh.seek(0) + n_samples, _, sample_size, parm_kind = struct.unpack(">IIHH", fh.read(12)) + pk = parm_kind & 0x3f + if parm_kind & _C: + scale, bias = numpy.fromfile(fh, '>f', sample_size).reshape(2, sample_size/2) + n_samples -= 4 + s, e = max(0, start), min(n_samples, stop) + fh.seek(s*sample_size, 1) + dtype, _bytes = ('>h', 2) if parm_kind & _C or pk in parms16bit else ('>f', 4) + m = numpy.fromfile(fh, dtype, (e - s) * sample_size / _bytes).reshape(e - s, sample_size / _bytes) + if parm_kind & _C: + m = (m + bias) / scale + if pk == IREFC: + m /= 32767.0 + if pk == WAVEFORM: + m = m.ravel() + finally: + if fh is not input_file_name: + fh.close() + if start != s or stop != e: # repeat first or/and last frame as required + m = numpy.r_[numpy.repeat(m[[0]], 
s-start, axis=0), m, numpy.repeat(m[[-1]], stop-e, axis=0)] + return m.astype(numpy.float32) + +def _add_dataset_header(fh, + dataset_id, + _min_val, + _range, + _header): + """ + Create a dataset in the HDF5 file and write the data + after compressing float to int + """ + _c_header = (_header - _min_val) / _range + numpy.clip(_c_header, 0., 1.) + _c_header = (_c_header * 65535 + 0.499).astype(int) + + fh.create_dataset(dataset_id + '_header', + data=_c_header, + maxshape=(None, None), + compression="gzip", + fletcher32=True) + fh.create_dataset(dataset_id + '_min_range', + data=numpy.array([_min_val, _range]).astype('float32'), + maxshape=(2,), + compression="gzip", + fletcher32=True) + +def _add_percentile_dataset(fh, + dataset_id, + data): + """ + Create the dataset in the HDF5 file, write the data + compressed in int8 format and the header compressed in + int format + """ + _min_val = data.min() + _range = data.ptp() + + if data.ndim == 1: + data = data[:, numpy.newaxis] + + # First write the compression information in the dataset header + _header = numpy.zeros((data.shape[1], 4)) + + for j, p in enumerate([0, 25, 75, 100]): + _header[:, j] = numpy.percentile(data, p, axis=0, interpolation='lower') + _add_dataset_header(fh, dataset_id, _min_val, _range, _header) + + # now write the compressed data + c_data = numpy.zeros(data.shape, dtype=numpy.uint8) + for i in range(data.shape[1]): + p0, p25, p75, p100 = _header[i] + mat1 = numpy.uint8((((data[:, i] - p0) / (p25 - p0)) * 64 + 0.5)) + mat1 = numpy.clip(mat1, 0, 64) * (data[:, i] < p25) + mat2 = (numpy.uint8(((data[:, i] - p25) / (p75 - p25)) * 128 + 0.5) + 64) + mat2 = numpy.clip(mat2, 64, 192) * ((data[:, i] >= p25) & (data[:, i] < p75)) + mat3 = (numpy.uint8(((data[:, i] - p75) / (p100 - p75)) * 63 + 0.5) + 192) + mat3 = numpy.clip(mat3, 192, 255) * (data[:, i] >= p75) + c_data[:, i] = mat1 + mat2 + mat3 + + fh.create_dataset(dataset_id, + data=c_data, + maxshape=(None, None), + compression="gzip", + 
fletcher32=True) + +def _read_dataset(h5f, dataset_id): + data = h5f[dataset_id][()] + if data.ndim == 1: + data = data[:, numpy.newaxis] + return data + +def _read_segment(h5f, dataset_id, s, e): + data = h5f[dataset_id][s:e] + return data + +def _read_dataset_htk(h5f, dataset_id): + (A, B) = h5f[dataset_id + "comp"][()] + data = (h5f[dataset_id][()] + B) / A + if data.ndim == 1: + data = data[:, numpy.newaxis] + return data + +def _read_segment_htk(h5f, dataset_id, e, s): + (A, B) = h5f[dataset_id + "comp"][()] + data = (h5f[dataset_id][s:e, :] + B) / A + return data + +def read_dataset_percentile(h5f, dataset_id): + # read the header + (_min_val, _range) = h5f[dataset_id + "_min_range"][()] + c_header = h5f[dataset_id + "_header"][()] + _header = numpy.full(c_header.shape, _min_val) + _header += c_header * _range * 1.52590218966964e-05 + + # decompress the data + c_data = h5f[dataset_id][()] + mat1 = (_header[:,[0]] + (_header[:,[1]] - _header[:,[0]]) * c_data.T * (1/64)) * (c_data.T <= 64) + mat2 = (_header[:,[1]] + (_header[:,[2]] - _header[:,[1]]) * (c_data.T - 64) * (1/128)) * ((c_data.T > 64) & (c_data.T<=192)) + mat3 = (_header[:,[2]] + (_header[:,[3]] - _header[:,[2]]) * (c_data.T - 192) * (1/63)) * (c_data.T > 192) + return (mat1+mat2+mat3).T + +def _read_segment_percentile(h5f, dataset_id, s, e): + # read the header + (_min_val, _range) = h5f[dataset_id + "_min_range"][()] + c_header = h5f[dataset_id + "_header"][()] + _header = numpy.full(c_header.shape, _min_val) + _header += c_header * _range * 1.52590218966964e-05 + + c_data = h5f[dataset_id][()][s:e, :] + mat1 = (_header[:,[0]] + (_header[:,[1]] - _header[:,[0]]) * c_data.T * (1/64)) * (c_data.T <= 64) + mat2 = (_header[:,[1]] + (_header[:,[2]] - _header[:,[1]]) * (c_data.T - 64) * (1/128)) * ((c_data.T > 64) & (c_data.T<=192)) + mat3 = (_header[:,[2]] + (_header[:,[3]] - _header[:,[2]]) * (c_data.T - 192) * (1/63)) * (c_data.T > 192) + return (mat1+mat2+mat3).T + + +def _write_show(show, + fh, + 
cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label): + if cep is not None: + fh.create_dataset(show + '/cep', data=cep.astype('float32'), + maxshape=(None, None), + compression="gzip", + fletcher32=True) + if cep_mean is not None: + fh.create_dataset(show + '/cep_mean', data=cep_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if cep_std is not None: + fh.create_dataset(show + '/cep_std', data=cep_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if energy is not None: + energy = energy.squeeze() + fh.create_dataset(show + '/energy', data=energy.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if energy_mean is not None: + fh.create_dataset(show + '/energy_mean', data=energy_mean) + if energy_std is not None: + fh.create_dataset(show + '/energy_std', data=energy_std) + if fb is not None: + fh.create_dataset(show + '/fb', data=fb.astype('float32'), + maxshape=(None, None), + compression="gzip", + fletcher32=True) + if fb_mean is not None: + fh.create_dataset(show + '/fb_mean', data=fb_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if fb_std is not None: + fh.create_dataset(show + '/fb_std', data=fb_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if bnf is not None: + fh.create_dataset(show + '/bnf', data=bnf.astype('float32'), + maxshape=(None, None), + compression="gzip", + fletcher32=True) + if bnf_mean is not None: + fh.create_dataset(show + '/bnf_mean', data=bnf_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if bnf_std is not None: + fh.create_dataset(show + '/bnf_std', data=bnf_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if label is not None and not show + "/vad" in fh: + fh.create_dataset(show + '/' + "vad", data=label.astype('int8'), 
+ maxshape=(None,), + compression="gzip", + fletcher32=True) + +def _write_show_htk(show, + fh, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label): + if cep is not None: + A_cep = 2 * 32767. / (cep.max() - cep.min()) + B_cep = (cep.max() + cep.min()) * 32767. / (cep.max() - cep.min()) + fh.create_dataset(show + '/cep_comp', data=numpy.array([A_cep, B_cep]).astype('float32'), + maxshape=(2,), + compression="gzip", + fletcher32=True) + fh.create_dataset(show + '/cep', data=(A_cep*cep - B_cep).astype("short"), + maxshape=(None, None), + compression="gzip", + fletcher32=True) + if energy is not None: + A_energy = 2 * 32767. / (energy.max() - energy.min()) + B_energy = (energy.max() + energy.min()) * 32767. / (energy.max() - energy.min()) + fh.create_dataset(show + '/energy_comp', data=numpy.array([A_energy, B_energy]).astype('float32'), + maxshape=(2,), + compression="gzip", + fletcher32=True) + fh.create_dataset(show + '/energy', data=(A_energy * energy - B_energy).astype("short"), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if fb is not None: + A_fb = 2 * 32767. / (fb.max() - fb.min()) + B_fb = (fb.max() + fb.min()) * 32767. / (fb.max() - fb.min()) + fh.create_dataset(show + '/fb_comp', data=numpy.array([A_fb, B_fb]).astype('float32'), + maxshape=(2,), + compression="gzip", + fletcher32=True) + fh.create_dataset(show + '/fb', data=(A_fb * fb - B_fb).astype("short"), + maxshape=(None, None), + compression="gzip", + fletcher32=True) + if bnf is not None: + A_bnf = 2 * 32767. / (bnf.max() - bnf.min()) + B_bnf = (bnf.max() + bnf.min()) * 32767. 
/ (bnf.max() - bnf.min()) + fh.create_dataset(show + '/bnf_comp', data=numpy.array([A_bnf, B_bnf]).astype('float32'), + maxshape=(2,), + compression="gzip", + fletcher32=True) + fh.create_dataset(show + '/bnf', data=(A_bnf * bnf - B_bnf).astype("short"), + maxshape=(None, None), + compression="gzip", + fletcher32=True) + if energy_mean is not None: + fh.create_dataset(show + '/energy_mean', data=energy_mean) + if energy_std is not None: + fh.create_dataset(show + '/energy_std', data=energy_std) + if cep_mean is not None: + fh.create_dataset(show + '/cep_mean', data=cep_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if cep_std is not None: + fh.create_dataset(show + '/cep_std', data=cep_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if fb_mean is not None: + fh.create_dataset(show + '/fb_mean', data=fb_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if fb_std is not None: + fh.create_dataset(show + '/fb_std', data=fb_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if bnf_mean is not None: + fh.create_dataset(show + '/bnf_mean', data=bnf_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if bnf_std is not None: + fh.create_dataset(show + '/bnf_std', data=bnf_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + + if label is not None and not show + "/vad" in fh: + fh.create_dataset(show + '/' + "vad", data=label.astype('int8'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + +def _write_show_percentile(show, + fh, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label): + if cep is not None: + _add_percentile_dataset(fh, show + '/cep', cep) + + if energy is not None: + _add_percentile_dataset(fh, show + '/energy', energy) + + if fb is not None: + 
_add_percentile_dataset(fh, show + '/fb', fb) + + if bnf is not None: + _add_percentile_dataset(fh, show + '/bnf', bnf) + + if cep_mean is not None: + fh.create_dataset(show + '/cep_mean', data=cep_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + + if cep_std is not None: + fh.create_dataset(show + '/cep_std', data=cep_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + + if energy_mean is not None: + fh.create_dataset(show + '/energy_mean', data=energy_mean) + + if energy_std is not None: + fh.create_dataset(show + '/energy_std', data=energy_std) + + if fb_mean is not None: + fh.create_dataset(show + '/fb_mean', data=fb_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if fb_std is not None: + fh.create_dataset(show + '/fb_std', data=fb_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if bnf_mean is not None: + fh.create_dataset(show + '/bnf_mean', data=bnf_mean.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + if bnf_std is not None: + fh.create_dataset(show + '/bnf_std', data=bnf_std.astype('float32'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + + if label is not None and not show + "/vad" in fh: + fh.create_dataset(show + '/' + "vad", data=label.astype('int8'), + maxshape=(None,), + compression="gzip", + fletcher32=True) + + + +def write_hdf5(show, + fh, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label, + compression='percentile'): + """ + :param show: identifier of the show to write + :param fh: HDF5 file handler + :param cep: cepstral coefficients to store + :param cep_mean: pre-computed mean of the cepstral coefficient + :param cep_std: pre-computed standard deviation of the cepstral coefficient + :param energy: energy coefficients to store + :param energy_mean: pre-computed mean of the energy + 
:param energy_std: pre-computed standard deviation of the energy + :param fb: filter-banks coefficients to store + :param fb_mean: pre-computed mean of the filter bank coefficient + :param fb_std: pre-computed standard deviation of the filter bank coefficient + :param bnf: bottle-neck features to store + :param bnf_mean: pre-computed mean of the bottleneck features + :param bnf_std: pre-computed standard deviation of the bottleneck features + :param label: vad labels to store + :param compressed: boolean, default is False + :return: + """ + #write the the type of compression: could be: + # 0 = no compression + # 1 HTK'style compression + # 2 compression percentile + compression_type = {'none':0, 'htk':1, 'percentile':2} + if "compression" not in fh: + fh.create_dataset('compression', data=compression_type[compression]) + else: + assert(fh['compression'][()] == compression_type[compression]) + + if compression == 'none': + _write_show(show, + fh, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label) + elif compression == 'htk': + _write_show_htk(show, + fh, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label) + else: + # Default: use percentile compression + _write_show_percentile(show, + fh, + cep, cep_mean, cep_std, + energy, energy_mean, energy_std, + fb, fb_mean, fb_std, + bnf, bnf_mean, bnf_std, + label) + +def read_hdf5(h5f, show, dataset_list=("cep", "fb", "energy", "vad", "bnf")): + """ + + :param h5f: HDF5 file handler to read from + :param show: identifier of the show to read + :param dataset_list: list of datasets to read and concatenate + :return: + """ + compression_type = {0:'none', 1:'htk', 2:'percentile'} + if "compression" not in h5f: + compression = 'none' + print("Warning, default feature storage mode is now using compression") + else: + compression = compression_type[h5f["compression"][()]] + + if show not in h5f: + 
raise Exception('show {} is not in the HDF5 file'.format(show)) + + # initialize the list of features to concatenate + feat = [] + + if "energy" in dataset_list: + if "/".join((show, "energy")) in h5f: + dataset_id = show + '/energy' + if compression == 'none': + feat.append(_read_dataset(h5f, dataset_id)) + elif compression == 'htk': + feat.append(_read_dataset_htk(h5f, dataset_id)) + else: + feat.append(read_dataset_percentile(h5f, dataset_id)) + else: + raise Exception('energy is not in the HDF5 file') + + if "cep" in dataset_list: + if "/".join((show, "cep")) in h5f: + dataset_id = show + '/cep' + if compression == 'none': + feat.append(_read_dataset(h5f, dataset_id)) + elif compression == 'htk': + feat.append(_read_dataset_htk(h5f, dataset_id)) + else: + feat.append(read_dataset_percentile(h5f, dataset_id)) + else: + raise Exception('cep) is not in the HDF5 file') + + if "fb" in dataset_list: + if "/".join((show, "fb")) in h5f: + dataset_id = show + '/fb' + if compression == 'none': + feat.append(_read_dataset(h5f, dataset_id)) + elif compression == 'htk': + feat.append(_read_dataset_htk(h5f, dataset_id)) + else: + feat.append(read_dataset_percentile(h5f, dataset_id)) + else: + raise Exception('cep) is not in the HDF5 file') + + if "bnf" in dataset_list: + if "/".join((show, "bnf")) in h5f: + dataset_id = show + '/bnf' + if compression == 'none': + feat.append(_read_dataset(h5f, dataset_id)) + elif compression == 'htk': + feat.append(_read_dataset_htk(h5f, dataset_id)) + else: + feat.append(read_dataset_percentile(h5f, dataset_id)) + else: + raise Exception('cep) is not in the HDF5 file') + + feat = numpy.hstack(feat) + + label = None + if "vad" in dataset_list: + if "/".join((show, "vad")) in h5f: + label = h5f.get("/".join((show, "vad")))[()].astype('bool').squeeze() + else: + warnings.warn("Warning...........no VAD in this HDF5 file") + label = numpy.ones(feat.shape[0], dtype='bool') + + return feat.astype(numpy.float32), label + + + +def _rms_energy(x): + 
def bin_interp(upcount, lwcount, upthr, lwthr, margin, tol=0.1):
    """Search for a count whose distance to its threshold equals ``margin``.

    The function looks for a value ``count`` (with an associated threshold
    ``thr``) such that ``count - thr - margin`` is within ``tol`` of zero.

    :param upcount: count associated with the upper bound
    :param lwcount: count associated with the lower bound
    :param upthr: threshold associated with the upper bound
    :param lwthr: threshold associated with the lower bound
    :param margin: target difference between count and threshold
    :param tol: accepted absolute error on the difference; it is relaxed by
        10% at every iteration once more than 20 iterations have been run
    :return: ``upcount`` or ``lwcount`` when one of them already satisfies the
        tolerance, otherwise the count obtained by the iterative search
    """
    # Either bound may already be close enough: no search needed
    if abs(upcount - upthr - margin) < tol:
        return upcount
    if abs(lwcount - lwthr - margin) < tol:
        return lwcount

    # Start from the mid-point of the two bounds
    count = (upcount + lwcount) / 2
    thr = (upthr + lwthr) / 2
    gap = count - thr - margin

    step = 1
    while abs(gap) > tol:
        step += 1
        if step > 20:
            # relax the tolerance so that the loop always terminates
            tol *= 1.1
        if gap > tol:
            # average the current estimate with the upper pair
            count, thr = (upcount + count) / 2, (upthr + thr) / 2
        elif gap < -tol:
            # average the current estimate with the lower pair
            count, thr = (lwcount + count) / 2, (lwthr + thr) / 2
        gap = count - thr - margin

    return count
+ ''' + + if numpy.issubdtype(x.dtype, numpy.integer): + x = x / 2**(nbits-1) + + # Constants + MIN_LOG_OFFSET = 1e-20 + T = 0.03 # Time constant of smoothing in seconds + g = numpy.exp(-1/(T*fs)) + H = 0.20 # Time of handover in seconds + I = int(numpy.ceil(H*fs)) + M = 15.9 # Margin between threshold and ASL in dB + + a = numpy.zeros(nbits-1) # Activity count + c = 0.5**numpy.arange(nbits-1, 0, step=-1) # Threshold level + h = numpy.ones(nbits)*I # Hangover count + s = 0 + sq = 0 + p = 0 + q = 0 + asl = -100 + + L = len(x) + s = sum(abs(x)) + sq = sum(x**2) + dclevel = s/numpy.arange(1, L+1) + lond_term_level = 10*numpy.log10(sq/numpy.arange(1, L+1) + MIN_LOG_OFFSET) + c_dB = 20*numpy.log10(c) + + for i in range(L): + p = g * p + (1-g) * abs(x[i]) + q = g * q + (1-g) * p + + for j in range(nbits-1): + if q >= c[j]: + a[j] += 1 + h[j] = 0 + elif h[j] < I: + a[j] += 1; + h[j] += 1 + + a_dB = -100 * numpy.ones(nbits-1) + + for i in range(nbits-1): + if a[i] != 0: + a_dB[i] = 10*numpy.log10(sq/a[i]) + + delta = a_dB - c_dB + idx = numpy.where(delta <= M)[0] + + if len(idx) != 0: + idx = idx[0] + if idx > 1: + asl = bin_interp(a_dB[idx], a_dB[idx-1], c_dB[idx], c_dB[idx-1], M) + else: + asl = a_dB[idx] + + return asl + +def _add_reverb(signal, reverb_file_name, sample_rate, reverb_level=-26.0, ): + '''Adds reverb (convolutive noise) to a speech signal. + The output speech level is normalized to asl_level. 
@process_parallel_lists
def augment_list(input_path,
                 input_extension,
                 output_path,
                 output_extension,
                 sampling_frequency,
                 show_list,
                 channel_list,
                 audio_file_list=None,
                 feature_file_list=None,
                 noise_file_list=None,
                 snr_list=None,
                 reverb_file_list=None,
                 reverb_levels=None,
                 num_thread=1):
    """
    Degrade a list of audio files by calling ``degrade_audio`` once per entry.

    All per-file lists are walked in lockstep (``zip``); lists that are not
    given are replaced by placeholders filled with ``None`` so that the
    corresponding degradation is skipped by ``degrade_audio``.

    :param input_path: forwarded unchanged to ``degrade_audio``
    :param input_extension: forwarded unchanged to ``degrade_audio``
    :param output_path: forwarded unchanged to ``degrade_audio``
    :param output_extension: forwarded unchanged to ``degrade_audio``
    :param sampling_frequency: sampling frequency passed to ``degrade_audio``
    :param show_list: list of IDs of the show to process
    :param channel_list: list of channel indices corresponding to each show
    :param audio_file_list: list of input audio files if the name is independent from the ID of the show
    :param feature_file_list: list of output audio files if the name is independent from the ID of the show
    :param noise_file_list: list of noise files to add, one per input file; no noise is added when None
    :param snr_list: signal to noise ratio to use for each file, defaults to 5 when noise files are given
    :param reverb_file_list: list of reverberation files to apply, one per input file; no reverb when None
    :param reverb_levels: reverb level to use for each file, defaults to -26 when reverb files are given
    :param num_thread: number of parallel process to run; not read in this body,
        presumably consumed by the ``process_parallel_lists`` decorator (TODO confirm)
    :return: None
    """
    # get the length of the longest list among those that were provided
    max_length = max(len(lst)
                     for lst in (show_list, channel_list, audio_file_list, feature_file_list)
                     if lst is not None)

    def _placeholder():
        # object arrays created by numpy.empty are filled with None
        return numpy.empty(int(max_length), dtype='|O')

    if show_list is None:
        show_list = _placeholder()
    if channel_list is None:
        # a missing channel list used to be accepted above but then broke zip()
        channel_list = _placeholder()
    if audio_file_list is None:
        audio_file_list = _placeholder()
    if feature_file_list is None:
        feature_file_list = _placeholder()
    if noise_file_list is None:
        noise_file_list = _placeholder()
        snr_list = _placeholder()
    elif snr_list is None:
        snr_list = numpy.full(int(max_length), 5.)
    if reverb_file_list is None:
        reverb_file_list = _placeholder()
        reverb_levels = _placeholder()
    elif reverb_levels is None:
        reverb_levels = numpy.full(int(max_length), -26.)

    # show and channel take part in the zip (they bound the iteration) but
    # degrade_audio has no parameter for them.
    for _show, _channel, input_file, output_file, noise_file, snr, reverb_file, reverb_level in zip(
            show_list,
            channel_list,
            audio_file_list,
            feature_file_list,
            noise_file_list,
            snr_list,
            reverb_file_list,
            reverb_levels):
        # Keyword arguments keep every value aligned with the parameter it is
        # meant for; the former positional call passed one argument too many.
        degrade_audio(input_path,
                      input_extension,
                      output_path,
                      output_extension,
                      input_filename=input_file,
                      output_filename=output_file,
                      sampling_frequency=sampling_frequency,
                      noise_file_name=noise_file,
                      snr=snr,
                      reverb_file_name=reverb_file,
                      reverb_level=reverb_level)
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher and Sylvain Meignier + +:mod:`frontend` provides methods to process an audio signal in order to extract +useful parameters for speaker verification. +""" +import numpy +import pandas +import scipy.stats as stats +from scipy.signal import lfilter + + +__author__ = "Anthony Larcher and Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +def rasta_filt(x): + """Apply RASTA filtering to the input signal. + + :param x: the input audio signal to filter. + cols of x = critical bands, rows of x = frame + same for y but after filtering + default filter is single pole at 0.94 + """ + x = x.T + numerator = numpy.arange(.2, -.3, -.1) + denominator = numpy.array([1, -0.94]) + + # Initialize the state. This avoids a big spike at the beginning + # resulting from the dc offset level in each band. + # (this is effectively what rasta/rasta_filt.c does). 
def cms(features, label=None, global_mean=None):
    """Performs cepstral mean subtraction, in place.

    :param features: a feature stream of dimension nframes x dim
        where nframes is the number of frames in the stream and dim the
        dimension of the acoustic features; the array is modified in place
    :param label: a logical vector selecting the frames used to estimate the
        mean; all frames are used if None
    :param global_mean: pre-computed mean to use for feature normalization if given

    :return: None, ``features`` is updated in place
    """
    # If no label file as input: all speech are speech
    if label is None:
        label = numpy.ones(features.shape[0]).astype(bool)

    if global_mean is not None:
        mu = global_mean
    elif label.sum() == 0:
        # No frame selected: there is nothing to estimate the mean from, so
        # subtract zero instead of the (NaN) mean of an empty selection.
        mu = numpy.zeros(features.shape[1])
    else:
        mu = numpy.mean(features[label, :], axis=0)
    features -= mu
def stg(features, label=None, win=301):
    """Performs feature warping (short term Gaussianization) on a sliding window.

    Inside each window the rank of a frame is mapped, dimension by dimension,
    through the inverse CDF of a standard normal distribution. The selected
    frames of ``features`` are overwritten in place; nothing is returned.

    :param features: a feature stream of dimension nframes x dim
        where nframes is the number of frames in the stream and dim the
        dimension of the acoustic features (one frame per row)
    :param label: label of selected frames to compute the Short Term Gaussianization, by default, all frames are used
    :param win: size of the frame window to consider, must be an odd number to get a symmetric context on left and right
    :raises Exception: if ``win`` is even
    """

    # If no label file as input: all speech are speech
    if label is None:
        label = numpy.ones(features.shape[0]).astype(bool)
    speech_features = features[label, :]

    # set to True when the last frame is duplicated to get an odd frame count
    add_a_feature = False
    if win % 2 == 1:
        # one feature per line
        nframes, dim = numpy.shape(speech_features)

        # If the number of frames is not enough for one window
        if nframes < win:
            # if the number of frames is not odd, duplicate the last frame
            # if nframes % 2 == 1:
            if not nframes % 2 == 1:
                nframes += 1
                add_a_feature = True
                speech_features = numpy.concatenate((speech_features, [speech_features[-1, ]]))
            # the whole (odd-length) stream becomes the single window
            win = nframes

        # create the output feature stream
        stg_features = numpy.zeros(numpy.shape(speech_features))

        # Process first window
        # (a double argsort gives the rank of every frame within the window)
        r = numpy.argsort(speech_features[:win, ], axis=0)
        r = numpy.argsort(r, axis=0)
        arg = (r[: (win - 1) // 2] + 0.5) / win
        stg_features[: (win - 1) // 2, :] = stats.norm.ppf(arg, 0, 1)

        # process all following windows except the last one
        for m in range(int((win - 1) / 2), int(nframes - (win - 1) / 2)):
            # window of win frames centred on frame m
            idx = list(range(int(m - (win - 1) / 2), int(m + (win - 1) / 2 + 1)))
            foo = speech_features[idx, :]
            # rank of the centre frame = number of strictly smaller frames + 1
            r = numpy.sum(foo < foo[(win - 1) // 2], axis=0) + 1
            arg = (r - 0.5) / win
            stg_features[m, :] = stats.norm.ppf(arg, 0, 1)

        # Process the last window
        r = numpy.argsort(speech_features[list(range(nframes - win, nframes)), ], axis=0)
        r = numpy.argsort(r, axis=0)
        arg = (r[(win + 1) // 2: win, :] + 0.5) / win

        stg_features[list(range(int(nframes - (win - 1) / 2), nframes)), ] = stats.norm.ppf(arg, 0, 1)
    else:
        # Raise an exception
        raise Exception('Sliding window should have an odd length')

    # wrapFeatures = np.copy(features)
    # drop the duplicated frame before writing the result back
    if add_a_feature:
        stg_features = stg_features[:-1]
    features[label, :] = stg_features
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher and Sylvain Meignier + +:mod:`frontend` provides methods to process an audio signal in order to extract +useful parameters for speaker verification. +""" +import copy +import logging +import numpy +from scipy.fftpack import fft, ifft +from scipy import ndimage + + +__author__ = "Anthony Larcher and Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +def pre_emphasis(input_sig, pre): + """Pre-emphasis of an audio signal. + :param input_sig: the input vector of signal to pre emphasize + :param pre: value that defines the pre-emphasis filter. 
def segment_axis(a, length, overlap=0, axis=None, end='cut', endvalue=0):
    """Generate a new array that chops the given array along the given axis
    into overlapping frames.

    This method has been implemented by Anne Archibald,
    as part of the talk box toolkit
    example::

        segment_axis(arange(10), 4, 2)
        array([[0, 1, 2, 3],
               [2, 3, 4, 5],
               [4, 5, 6, 7],
               [6, 7, 8, 9]])

    :param a: the array to segment
    :param length: the length of each frame
    :param overlap: the number of array elements by which the frames should overlap
    :param axis: the axis to operate on; if None, act on the flattened array
    :param end: what to do with the last frame, if the array is not evenly
            divisible into pieces. Options are:
            - 'cut'   Simply discard the extra values
            - 'wrap'  Copy values from the beginning of the array
            - 'pad'   Pad with a constant value

    :param endvalue: the value to use for end='pad'

    :return: a ndarray with one extra dimension: the segmented axis is
        replaced by (number of frames, length)

    :raises ValueError: if ``overlap`` or ``length`` are out of range, or if
        no complete frame is left in 'cut' mode

    The array is not copied unless necessary (either because it is unevenly
    strided and being flattened or because end is set to 'pad' or 'wrap').
    """

    if axis is None:
        a = numpy.ravel(a)  # may copy
        axis = 0

    l = a.shape[axis]

    if overlap >= length:
        raise ValueError("frames cannot overlap by more than 100%")
    if overlap < 0 or length <= 0:
        raise ValueError("overlap must be nonnegative and length must "
                         "be positive")

    # number of samples between the starts of two consecutive frames
    hop = length - overlap

    if l < length or (l - length) % hop:
        if l > length:
            roundup = length + (1 + (l - length) // hop) * hop
            rounddown = length + ((l - length) // hop) * hop
        else:
            roundup = length
            rounddown = 0
        assert rounddown < l < roundup
        assert roundup == rounddown + hop or (roundup == length and rounddown == 0)
        # work on the last axis so that "..." indexing can be used
        a = a.swapaxes(-1, axis)

        if end == 'cut':
            a = a[..., :rounddown]
        elif end in ['pad', 'wrap']:  # copying will be necessary
            s = list(a.shape)
            s[-1] = roundup
            b = numpy.empty(s, dtype=a.dtype)
            b[..., :l] = a
            if end == 'pad':
                b[..., l:] = endvalue
            elif end == 'wrap':
                b[..., l:] = a[..., :roundup - l]
            a = b

        a = a.swapaxes(-1, axis)
        # The array was cut or extended: refresh the length of the segmented
        # axis, otherwise the checks below run against the original length.
        l = a.shape[axis]

    if l == 0:
        raise ValueError("Not enough data points to segment array " +
                         "in 'cut' mode; try 'pad' or 'wrap'")
    assert l >= length
    assert (l - length) % hop == 0
    n = 1 + (l - length) // hop
    s = a.strides[axis]
    new_shape = a.shape[:axis] + (n, length) + a.shape[axis + 1:]
    new_strides = a.strides[:axis] + (hop * s, s) + a.strides[axis + 1:]

    try:
        return numpy.ndarray.__new__(numpy.ndarray, strides=new_strides,
                                     shape=new_shape, buffer=a, dtype=a.dtype)
    except TypeError:
        logging.debug("Problem with ndarray creation forces copy.")
        a = a.copy()
        # Shape doesn't change but strides does: they must be read again from
        # the copy, whose memory layout may differ from the original array
        s = a.strides[axis]
        new_strides = a.strides[:axis] + (hop * s, s) + a.strides[axis + 1:]
        return numpy.ndarray.__new__(numpy.ndarray, strides=new_strides,
                                     shape=new_shape, buffer=a, dtype=a.dtype)
program is only to process the single file seperated by the silence + section if the silence section is detected, then a counter to number of + buffer is set and pre-processing is required. + + Usage: SpeechENhance(wavefilename, Gain, Noise_floor) + + :param X: input audio signal + :param Gain: default value is 0.9, suggestion range 0.6 to 1.4, + higher value means more subtraction or noise redcution + :param NN: + + :return: a 1-dimensional array of boolean that + is True for high energy frames. + + Copyright 2014 Sun Han Wu and Anthony Larcher + """ + if X.shape[0] < 512: # creer une exception + return X + + num1 = 40 # dsiable buffer number + Alpha = 0.75 # original value is 0.9 + FrameSize = 32 * 2 # 256*2 + FrameShift = int(FrameSize / NN) # FrameSize/2=128 + nfft = FrameSize # = FrameSize + Fmax = int(numpy.floor(nfft / 2) + 1) # 128+1 = 129 + # arising hamming windows + Hamm = 1.08 * (0.54 - 0.46 * numpy.cos(2 * numpy.pi * numpy.arange(FrameSize) / (FrameSize - 1))) + y0 = numpy.zeros(FrameSize - FrameShift) # 128 zeros + + Eabsn = numpy.zeros(Fmax) + Eta1 = Eabsn + + ################################################################### + # initial parameter for noise min + mb = numpy.ones((1 + FrameSize // 2, 4)) * FrameSize / 2 # 129x4 set four buffer * FrameSize/2 + im = 0 + Beta1 = 0.9024 # seems that small value is better; + pxn = numpy.zeros(1 + FrameSize // 2) # 1+FrameSize/2=129 zeros vector + + ################################################################### + old_absx = Eabsn + x = numpy.zeros(FrameSize) + x[FrameSize - FrameShift:FrameSize] = X[ + numpy.arange(numpy.min((int(FrameShift), X.shape[0])))] + + if x.shape[0] < FrameSize: + EOF = 1 + return X + + EOF = 0 + Frame = 0 + + ################################################################### + # add the pre-noise estimates + for i in range(200): + Frame += 1 + fftn = fft(x * Hamm) # get its spectrum + absn = numpy.abs(fftn[0:Fmax]) # get its amplitude + + # add the following part from noise 
estimation algorithm + pxn = Beta1 * pxn + (1 - Beta1) * absn # Beta=0.9231 recursive pxn + im = (im + 1) % 40 # noise_memory=47; im=0 (init) for noise level estimation + + if im: + mb[:, 0] = numpy.minimum(mb[:, 0], pxn) # 129 by 4 im<>0 update the first vector from PXN + else: + mb[:, 1:] = mb[:, :3] # im==0 every 47 time shift pxn to first vector of mb + mb[:, 0] = pxn + # 0-2 vector shifted to 1 to 3 + + pn = 2 * numpy.min(mb, axis=1) # pn = 129x1po(9)=1.5 noise level estimate compensation + # over_sub_noise= oversubtraction factor + + # end of noise detection algotihm + x[:FrameSize - FrameShift] = x[FrameShift:FrameSize] + index1 = numpy.arange(FrameShift * Frame, numpy.min((FrameShift * (Frame + 1), X.shape[0]))) + In_data = X[index1] # fread(ifp, FrameShift, 'short'); + + if In_data.shape[0] < FrameShift: # to check file is out + EOF = 1 + break + else: + x[FrameSize - FrameShift:FrameSize] = In_data # shift new 128 to position 129 to FrameSize location + # end of for loop for noise estimation + + # end of prenoise estimation ************************ + x = numpy.zeros(FrameSize) + x[FrameSize - FrameShift:FrameSize] = X[numpy.arange(numpy.min((int(FrameShift), X.shape[0])))] + + if x.shape[0] < FrameSize: + EOF = 1 + return X + + EOF = 0 + Frame = 0 + + X1 = numpy.zeros(X.shape) + Frame = 0 + + while EOF == 0: + Frame += 1 + xwin = x * Hamm + + fftx = fft(xwin, nfft) # FrameSize FFT + absx = numpy.abs(fftx[0:Fmax]) # Fmax=129,get amplitude of x + argx = fftx[:Fmax] / (absx + numpy.spacing(1)) # normalize x spectrum phase + + absn = absx + + # add the following part from rainer algorithm + pxn = Beta1 * pxn + (1 - Beta1) * absn # s Beta=0.9231 recursive pxn + + im = int((im + 1) % (num1 * NN / 2)) # original =40 noise_memory=47; im=0 (init) for noise level estimation + + if im: + mb[:, 0] = numpy.minimum(mb[:, 0], pxn) # 129 by 4 im<>0 update the first vector from PXN + else: + mb[:, 1:] = mb[:, :3] # im==0 every 47 time shift pxn to first vector of mb + 
def vad_energy(log_energy,
               distrib_nb=3,
               nb_train_it=8,
               flooring=0.0001, ceiling=1.0,
               alpha=2):
    """Select frames whose log-energy exceeds a threshold derived from a
    mixture of Gaussians trained on the normalized log-energy.

    The log-energy is centered and reduced, a mixture with ``distrib_nb``
    components is trained with EM and the threshold is set to the highest
    component mean minus ``alpha`` times the standard deviation of that
    component.

    :param log_energy: vector of log-energy values, one per frame
    :param distrib_nb: number of distributions in the mixture; the initial
        means are evenly spread between -2 and 2
    :param nb_train_it: number of EM iterations
    :param flooring: flooring value passed to the maximization step
    :param ceiling: ceiling value passed to the maximization step
    :param alpha: number of standard deviations subtracted from the highest
        mean to get the threshold
    :return: a tuple (label, threshold) where label is a boolean vector, True
        for frames of the normalized log-energy above the threshold
    """
    # Mixture is not imported at the top of this module; importing it here
    # makes the name available without touching the module level imports.
    from sidekit.mixture import Mixture

    # center and normalize the energy
    log_energy = (log_energy - numpy.mean(log_energy)) / numpy.std(log_energy)

    # Initialize a Mixture with 2 or 3 distributions
    world = Mixture()
    # set the covariance of each component to 1.0 and the mean to mu + meanIncrement
    world.cst = numpy.ones(distrib_nb) / (numpy.pi / 2.0)
    world.det = numpy.ones(distrib_nb)
    world.mu = -2 + 4.0 * numpy.arange(distrib_nb) / (distrib_nb - 1)
    world.mu = world.mu[:, numpy.newaxis]
    world.invcov = numpy.ones((distrib_nb, 1))
    # set equal weights for each component
    world.w = numpy.ones(distrib_nb) / distrib_nb
    world.cov_var_ctl = copy.deepcopy(world.invcov)

    # Initialize the accumulator
    accum = copy.deepcopy(world)

    # Perform nbTrainIt iterations of EM
    for _ in range(nb_train_it):
        accum._reset()
        # E-step
        world._expectation(accum, log_energy)
        # M-step
        world._maximization(accum, ceiling, flooring)

    # Compute threshold from the component with the highest mean
    top = world.mu.argmax()
    threshold = world.mu.max() - alpha * numpy.sqrt(1.0 / world.invcov[top, 0])

    # Apply frame selection with the current threshold
    label = log_energy > threshold
    return label, threshold
def label_fusion(label, win=3):
    """Apply a morphological filtering on the label to remove isolated labels.
    In case the input is a two channel label (2D ndarray of boolean of same
    length) the labels of two channels are fused to remove
    overlapping segments of speech.

    The input is modified in place and returned.

    :param label: input labels given in a 1D or 2D ndarray
    :param win: parameter of the morphological filters (size of the closing
        and opening windows)
    :return: the filtered labels, same shape as the input
    """
    if isinstance(label, numpy.ndarray) and label.ndim == 1:
        # Single channel given as a vector: filter it as one sequence.
        # Iterating over it would visit the frames one by one, and a vector
        # of two frames would be mistaken for two channels.
        closed = ndimage.grey_closing(label, size=win)
        label[:] = ndimage.grey_opening(closed, size=win)
        return label

    channel_nb = len(label)
    if channel_nb == 2:
        # frames active on both channels are removed from both of them
        overlap_label = numpy.logical_and(label[0], label[1])
        label[0] = numpy.logical_and(label[0], ~overlap_label)
        label[1] = numpy.logical_and(label[1], ~overlap_label)

    for idx, lbl in enumerate(label):
        # closing followed by opening, applied channel by channel
        cl = ndimage.grey_closing(lbl, size=win)
        label[idx] = ndimage.grey_opening(cl, size=win)

    return label
+# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . +""" +Copyright 2014-2021 Anthony Larcher and Sylvain Meignier + + :mod:`features_server` provides methods to test gmm models + +""" +import numpy +import warnings +import multiprocessing +import ctypes +import logging + +import sidekit.sv_utils +import sidekit.frontend +from sidekit.mixture import Mixture +from sidekit.statserver import StatServer +from sidekit.features_server import FeaturesServer +from sidekit.bosaris import Ndx +from sidekit.bosaris import Scores + + +__license__ = "LGPL" +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +def gmm_scoring_singleThread(ubm, enroll, ndx, feature_server, score_mat, seg_idx=None): + """Compute log-likelihood ratios for sequences of acoustic feature + frames between a Universal Background Model (UBM) and a list of Gaussian + Mixture Models (GMMs) which only mean vectors differ from the UBM. + + :param ubm: a Mixture object used to compute the denominator + of the likelihood ratios + :param enroll: a StatServer object which stat1 attribute contains mean + super-vectors of the GMMs to use to compute the numerator of the + likelihood ratios. + :param ndx: an Ndx object which define the list of trials to compute + :param feature_server: sidekit.FeaturesServer used to load the acoustic parameters + :param score_mat: a ndarray of scores to fill + :param seg_idx: the list of unique test segments to process. 
+ Those test segments should belong to the list of test segments + in the ndx object. By setting seg_idx=None, all test segments + from the ndx object will be processed + + """ + assert isinstance(ubm, Mixture), 'First parameter should be a Mixture' + assert isinstance(enroll, StatServer), 'Second parameter should be a StatServer' + assert isinstance(ndx, Ndx), 'Third parameter should be a Ndx' + assert isinstance(feature_server, FeaturesServer), 'Fourth parameter should be a FeatureServer' + + if seg_idx is None: + seg_idx = range(ndx.segset.shape[0]) + + for ts in seg_idx: + logging.info('Compute trials involving test segment %d/%d', ts + 1, ndx.segset.shape[0]) + + # Select the models to test with the current segment + models = ndx.modelset[ndx.trialmask[:, ts]] + ind_dict = dict((k, i) for i, k in enumerate(ndx.modelset)) + inter = set(ind_dict.keys()).intersection(models) + idx_ndx = [ind_dict[x] for x in inter] + ind_dict = dict((k, i) for i, k in enumerate(enroll.modelset)) + inter = set(ind_dict.keys()).intersection(models) + idx_enroll = [ind_dict[x] for x in inter] + + # Load feature file + cep, _ = feature_server.load(ndx.segset[ts]) + + llr = numpy.zeros(numpy.array(idx_enroll).shape) + for m in range(llr.shape[0]): + # Compute llk for the current model + if ubm.invcov.ndim == 2: + lp = ubm.compute_log_posterior_probabilities(cep, enroll.stat1[idx_enroll[m], :]) + elif ubm.invcov.ndim == 3: + lp = ubm.compute_log_posterior_probabilities_full(cep, enroll.stat1[idx_enroll[m], :]) + pp_max = numpy.max(lp, axis=1) + log_lk = pp_max + numpy.log(numpy.sum(numpy.exp((lp.transpose() - pp_max).transpose()), axis=1)) + llr[m] = log_lk.mean() + + # Compute and substract llk for the ubm + if ubm.invcov.ndim == 2: + lp = ubm.compute_log_posterior_probabilities(cep) + elif ubm.invcov.ndim == 3: + lp = ubm.compute_log_posterior_probabilities_full(cep) + ppMax = numpy.max(lp, axis=1) + loglk = ppMax + numpy.log(numpy.sum(numpy.exp((lp.transpose() - ppMax).transpose()), 
def gmm_scoring(ubm, enroll, ndx, feature_server, num_thread=1):
    """Compute log-likelihood ratios for sequences of acoustic feature
    frames between a Universal Background Model (UBM) and a list of
    Gaussian Mixture Models (GMMs) which only mean vectors differ
    from the UBM.

    Trials are first filtered against the enrolment models and the test files
    that actually exist; the remaining test segments are then split between
    ``num_thread`` worker processes which fill a shared score matrix.

    :param ubm: a Mixture object used to compute the denominator of the
        likelihood ratios
    :param enroll: a StatServer object which stat1 attribute contains
        mean super-vectors of the GMMs to use to compute the numerator
        of the likelihood ratios.
    :param ndx: an Ndx object which defines the list of trials to compute
    :param feature_server: a FeaturesServer object to load the features
    :param num_thread: number of worker processes to launch in parallel

    :return: a Scores object whose modelset, segset and scoremask come from
        the filtered trial list.
    """
    assert isinstance(ubm, Mixture), 'First parameter should be a Mixture'
    assert isinstance(enroll, StatServer), 'Second parameter should be a StatServer'
    assert isinstance(ndx, Ndx), 'Third parameter should be a Ndx'
    assert isinstance(feature_server, FeaturesServer), 'Fourth parameter should be a FeatureServer'

    # Remove missing models and test segments
    if feature_server.features_extractor is None:
        existing_test_seg, _ = sidekit.sv_utils.check_file_list(ndx.segset,
                                                                feature_server.feature_filename_structure)
        clean_ndx = ndx.filter(enroll.modelset, existing_test_seg, True)
    elif feature_server.features_extractor.audio_filename_structure is not None:
        existing_test_seg, _ = \
            sidekit.sv_utils.check_file_list(ndx.segset, feature_server.features_extractor.audio_filename_structure)
        clean_ndx = ndx.filter(enroll.modelset, existing_test_seg, True)
    else:
        clean_ndx = ndx

    # Allocate the score matrix in shared memory so that every worker process
    # writes into the same buffer.
    s = numpy.zeros(clean_ndx.trialmask.shape)
    dims = s.shape
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        tmp_stat1 = multiprocessing.Array(ctypes.c_double, s.size)
        s = numpy.ctypeslib.as_array(tmp_stat1.get_obj())
        s = s.reshape(dims)

    # Split the list of segments to process between the worker processes
    los = numpy.array_split(numpy.arange(clean_ndx.segset.shape[0]), num_thread)
    jobs = []
    for idx in los:
        # The workers must receive the *filtered* ndx: both the segment indices
        # in `idx` and the shape of `s` refer to clean_ndx, not to the original ndx.
        p = multiprocessing.Process(target=gmm_scoring_singleThread,
                                    args=(ubm, enroll, clean_ndx, feature_server, s, idx))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()

    score = Scores()
    score.scoremat = s
    score.modelset = clean_ndx.modelset
    score.segset = clean_ndx.segset
    score.scoremask = clean_ndx.trialmask
    return score
def _check_missing_model(enroll, test, ndx):
    """Filter the trial list against the available models and segments.

    :param enroll: a StatServer holding the enrolment models
    :param test: a StatServer holding the test segments
    :param ndx: an Ndx object defining the list of trials to perform

    :return: the Ndx restricted to the models of ``enroll`` and the segments
        of ``test``; ``enroll`` and ``test`` are aligned on it through their
        ``align_models`` / ``align_segments`` methods.
    """
    # Remove missing models and test segments
    clean_ndx = ndx.filter(enroll.modelset, test.segset, True)

    # Align StatServers to match the clean_ndx
    enroll.align_models(clean_ndx.modelset)
    test.align_segments(clean_ndx.segset)

    return clean_ndx


def cosine_scoring(enroll, test, ndx, wccn=None, check_missing=True, device=None):
    """Compute the cosine similarities between two sets of vectors. The list of
    trials to perform is given in an Ndx object.

    The function works on deep copies of ``enroll`` and ``test``, so the
    StatServers given by the caller are left untouched.

    :param enroll: a StatServer in which stat1 are i-vectors
    :param test: a StatServer in which stat1 are i-vectors
    :param ndx: an Ndx object defining the list of trials to perform
    :param wccn: numpy.ndarray, if provided, the i-vectors are normalized by using a Within Class Covariance Matrix
    :param check_missing: boolean, if True, check that all models and segments exist
    :param device: torch device on which the score matrix is computed. If None,
        "cuda:0" is used when CUDA is available and the score matrix is smaller
        than 3e9 bytes, "cpu" otherwise. A user-supplied device is replaced by
        the CPU under the same conditions.

    :return: a score object
    """
    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
    enroll_copy = copy.deepcopy(enroll)
    test_copy = copy.deepcopy(test)

    # NOTE: enrolment models that appear several times are not averaged here
    # (the averaging step is disabled in this function).

    # Remove missing models and test segments
    if check_missing:
        clean_ndx = _check_missing_model(enroll_copy, test_copy, ndx)
    else:
        clean_ndx = ndx

    # NOTE(review): enroll_copy and test_copy are two independent deep copies;
    # unless StatServer overrides comparison, the `!=` guards below are always
    # true -- confirm against the StatServer class.
    if wccn is not None:
        enroll_copy.rotate_stat1(wccn)
        if enroll_copy != test_copy:
            test_copy.rotate_stat1(wccn)

    # Cosine scoring: length-normalise both sides, then take all dot products
    enroll_copy.norm_stat1()
    if enroll_copy != test_copy:
        test_copy.norm_stat1()

    # Size of the float32 score matrix, used to decide whether it fits on the GPU
    s_size_in_bytes = enroll_copy.stat1.shape[0] * test_copy.stat1.shape[0] * 4
    gpu_usable = torch.cuda.is_available() and s_size_in_bytes < 3e9
    if device is None:
        device = torch.device("cuda:0" if gpu_usable else "cpu")
    elif not gpu_usable:
        device = torch.device("cpu")

    score = Scores()
    score.scoremat = torch.einsum('ij,kj', torch.FloatTensor(enroll_copy.stat1).to(device),
                                  torch.FloatTensor(test_copy.stat1).to(device)).cpu().numpy()
    score.modelset = clean_ndx.modelset
    score.segset = clean_ndx.segset
    score.scoremask = clean_ndx.trialmask
    return score
def two_covariance_scoring(enroll, test, ndx, W, B, check_missing=True):
    """Compute the 2-covariance scores between two sets of vectors. The list of
    trials to perform is given in an Ndx object. Within and between class
    co-variance matrices have to be pre-computed.

    Unlike the functions that work on deep copies, this one passes ``enroll``
    and ``test`` directly to the alignment step when ``check_missing`` is True.

    :param enroll: a StatServer in which stat1 are i-vectors
    :param test: a StatServer in which stat1 are i-vectors
    :param ndx: an Ndx object defining the list of trials to perform
    :param W: the within-class co-variance matrix to consider
    :param B: the between-class co-variance matrix to consider
    :param check_missing: boolean, default is True, set to False not to check missing models

    :return: a score object
    """
    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
    assert enroll.stat1.shape[1] == W.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
    assert enroll.stat1.shape[1] == B.shape[0], 'I-vectors and co-variance matrix dimension mismatch'

    # If models are not unique, compute the mean per model, display a warning
    if not numpy.unique(enroll.modelset).shape == enroll.modelset.shape:
        logging.warning("Enrollment models are not unique, average i-vectors")
        enroll = enroll.mean_stat_per_model()

    # Remove missing models and test segments
    if check_missing:
        clean_ndx = _check_missing_model(enroll, test, ndx)
    else:
        clean_ndx = ndx

    # Two covariance scoring
    S = numpy.zeros((enroll.modelset.shape[0], test.segset.shape[0]))
    iW = scipy.linalg.inv(W)
    iB = scipy.linalg.inv(B)

    # Quadratic forms applied to the sum of both vectors (G) and to each
    # vector on its own (H)
    G = reduce(numpy.dot, [iW, scipy.linalg.inv(iB + 2*iW), iW])
    H = reduce(numpy.dot, [iW, scipy.linalg.inv(iB + iW), iW])

    # Terms depending only on the model (s2) or only on the test segment (s3)
    s2 = numpy.sum(numpy.dot(enroll.stat1, H) * enroll.stat1, axis=1)
    s3 = numpy.sum(numpy.dot(test.stat1, H) * test.stat1, axis=1)

    for ii in range(enroll.modelset.shape[0]):
        # Row ii: model ii against every test segment at once
        A = enroll.stat1[ii, :] + test.stat1
        s1 = numpy.sum(numpy.dot(A, G) * A, axis=1)
        S[ii, :] = s1 - s3 - s2[ii]

    score = Scores()
    score.scoremat = S
    score.modelset = clean_ndx.modelset
    score.segset = clean_ndx.segset
    score.scoremask = clean_ndx.trialmask
    return score


def PLDA_scoring(enroll,
                 test,
                 ndx,
                 mu,
                 F,
                 G,
                 Sigma,
                 test_uncertainty=None,
                 Vtrans=None,
                 p_known=0.0,
                 scaling_factor=1.,
                 full_model=False):
    """Compute the PLDA scores between two sets of vectors. The list of
    trials to perform is given in an Ndx object. PLDA matrices have to be
    pre-computed. i-vectors are supposed to be whitened before.

    Implements the approach described in [Lee13]_ including scoring
    for partially open-set identification

    This function only validates its inputs and dispatches to
    ``fast_PLDA_scoring`` (default) or ``full_PLDA_scoring``
    (``full_model=True``).

    :param enroll: a StatServer in which stat1 are i-vectors
    :param test: a StatServer in which stat1 are i-vectors
    :param ndx: an Ndx object defining the list of trials to perform
    :param mu: the mean vector of the PLDA gaussian
    :param F: the between-class co-variance matrix of the PLDA
    :param G: the within-class co-variance matrix of the PLDA; only used
        (and only checked) when ``full_model`` is True
    :param Sigma: the residual covariance matrix
    :param test_uncertainty: forwarded unchanged to ``fast_PLDA_scoring``;
        ignored when ``full_model`` is True
    :param Vtrans: forwarded unchanged to ``fast_PLDA_scoring``;
        ignored when ``full_model`` is True
    :param p_known: probability of having a known speaker for open-set
        identification case (=1 for the verification task and =0 for the
        closed-set case)
    :param scaling_factor: scaling factor to be multiplied by the sufficient statistics
    :param full_model: boolean, set to True when using a complete PLDA model (including within class covariance matrix)

    :return: a score object
    """
    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
    assert enroll.stat1.shape[1] == F.shape[0], 'I-vectors and co-variance matrix dimension mismatch'

    if full_model:
        # G is only consumed by the full model, so its shape is only checked here;
        # the fast path must not fail because of a G it never reads.
        assert enroll.stat1.shape[1] == G.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
        return full_PLDA_scoring(enroll, test, ndx, mu, F, G, Sigma, p_known=p_known, scaling_factor=scaling_factor)

    return fast_PLDA_scoring(enroll,
                             test,
                             ndx,
                             mu,
                             F,
                             Sigma,
                             test_uncertainty,
                             Vtrans,
                             p_known=p_known,
                             scaling_factor=scaling_factor,
                             check_missing=True)
numpy as interface are different + B = F.T.dot(invSigma * scaling_factor).dot(I_iv - G.dot(A).dot(G.T).dot(invSigma * scaling_factor)) + K = B.dot(F) + K1 = scipy.linalg.inv(K + I_spk) + K2 = scipy.linalg.inv(2 * K + I_spk) + + # Compute the Gaussian distribution constant + alpha1 = numpy.linalg.slogdet(K1)[1] + alpha2 = numpy.linalg.slogdet(K2)[1] + constant = alpha2 / 2.0 - alpha1 + + # Compute verification scores + score = Scores() + score.scoremat = numpy.zeros(clean_ndx.trialmask.shape) + score.modelset = clean_ndx.modelset + score.segset = clean_ndx.segset + score.scoremask = clean_ndx.trialmask + + # Project data in the space that maximizes the speaker separability + test_tmp = B.dot(test_copy.stat1.T) + enroll_tmp = B.dot(enroll_copy.stat1.T) + + # score qui ne dépend que du segment + tmp1 = test_tmp.T.dot(K1) + + # Compute the part of the score that is only dependent on the test segment + S1 = numpy.empty(test_copy.segset.shape[0]) + for seg_idx in range(test_copy.segset.shape[0]): + S1[seg_idx] = tmp1[seg_idx, :].dot(test_tmp[:, seg_idx])/2. + + # Compute the part of the score that depends only on the model (S2) and on both model and test segment + S2 = numpy.empty(enroll_copy.modelset.shape[0]) + + for model_idx in range(enroll_copy.modelset.shape[0]): + mod_plus_test_seg = test_tmp + numpy.atleast_2d(enroll_tmp[:, model_idx]).T + tmp2 = mod_plus_test_seg.T.dot(K2) + + S2[model_idx] = enroll_tmp[:, model_idx].dot(K1).dot(enroll_tmp[:, model_idx])/2. + score.scoremat[model_idx, :] = numpy.einsum("ij, ji->i", tmp2, mod_plus_test_seg)/2. 
def fast_PLDA_scoring(enroll,
                      test,
                      ndx,
                      mu,
                      F,
                      Sigma,
                      test_uncertainty=None,
                      Vtrans=None,
                      p_known=0.0,
                      scaling_factor=1.,
                      check_missing=True):
    """Compute the PLDA scores between two sets of vectors. The list of
    trials to perform is given in an Ndx object. PLDA matrices have to be
    pre-computed. i-vectors are supposed to be whitened before.

    The function works on deep copies of ``enroll`` and ``test``.

    :param enroll: a StatServer in which stat1 are i-vectors
    :param test: a StatServer in which stat1 are i-vectors
    :param ndx: an Ndx object defining the list of trials to perform
    :param mu: the mean vector of the PLDA gaussian
    :param F: the between-class co-variance matrix of the PLDA
    :param Sigma: the residual covariance matrix
    :param test_uncertainty: not used by this function; kept so that the
        signature stays compatible with ``PLDA_scoring``
    :param Vtrans: not used by this function; kept so that the signature
        stays compatible with ``PLDA_scoring``
    :param p_known: probability of having a known speaker for open-set
        identification case (=1 for the verification task and =0 for the
        closed-set case)
    :param scaling_factor: scaling factor applied to the inverse residual
        covariance and to the final scores
    :param check_missing: boolean, if True, check that all models and segments exist

    :return: a score object
    """
    enroll_ctr = copy.deepcopy(enroll)
    test_ctr = copy.deepcopy(test)

    # If models are not unique, compute the mean per model, display a warning.
    # This is done once, before alignment, so that the enrolment set has one
    # row per model when it is matched against the trial list.
    if not numpy.unique(enroll_ctr.modelset).shape == enroll_ctr.modelset.shape:
        logging.warning("Enrollment models are not unique, average i-vectors")
        enroll_ctr = enroll_ctr.mean_stat_per_model()

    # Remove missing models and test segments
    if check_missing:
        clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx)
    else:
        clean_ndx = ndx

    # Center the i-vectors around the PLDA mean
    enroll_ctr.center_stat1(mu)
    test_ctr.center_stat1(mu)

    # Compute constant component of the PLDA distribution
    invSigma = scipy.linalg.inv(Sigma)
    I_spk = numpy.eye(F.shape[1], dtype='float')

    K = F.T.dot(invSigma * scaling_factor).dot(F)
    K1 = scipy.linalg.inv(K + I_spk)
    K2 = scipy.linalg.inv(2 * K + I_spk)

    # Compute the Gaussian distribution constant
    alpha1 = numpy.linalg.slogdet(K1)[1]
    alpha2 = numpy.linalg.slogdet(K2)[1]
    plda_cst = alpha2 / 2.0 - alpha1

    # Compute intermediate matrices
    Sigma_ac = numpy.dot(F, F.T)
    Sigma_tot = Sigma_ac + Sigma
    Sigma_tot_inv = scipy.linalg.inv(Sigma_tot)

    Tmp = numpy.linalg.inv(Sigma_tot - Sigma_ac.dot(Sigma_tot_inv).dot(Sigma_ac))
    Phi = Sigma_tot_inv - Tmp
    Psi = Sigma_tot_inv.dot(Sigma_ac).dot(Tmp)

    # Compute the different parts of PLDA score: one quadratic term per model
    # and one per test segment (diagonal of X.Phi.X^T)
    model_part = 0.5 * numpy.einsum('ij, ji->i', enroll_ctr.stat1.dot(Phi), enroll_ctr.stat1.T)
    seg_part = 0.5 * numpy.einsum('ij, ji->i', test_ctr.stat1.dot(Phi), test_ctr.stat1.T)

    # Compute verification scores
    score = Scores()
    score.modelset = clean_ndx.modelset
    score.segset = clean_ndx.segset
    score.scoremask = clean_ndx.trialmask

    score.scoremat = model_part[:, numpy.newaxis] + seg_part + plda_cst
    score.scoremat += enroll_ctr.stat1.dot(Psi).dot(test_ctr.stat1.T)
    score.scoremat *= scaling_factor

    # Case of open-set identification, we compute the log-likelihood
    # by taking into account the probability of having a known impostor
    # or an out-of set class
    if p_known != 0:
        N = score.scoremat.shape[0]
        open_set_scores = numpy.empty(score.scoremat.shape)
        tmp = numpy.exp(score.scoremat)
        for ii in range(N):
            # open-set term: average likelihood of all the *other* models
            open_set_scores[ii, :] = score.scoremat[ii, :] \
                - numpy.log(p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / (N - 1) + (1 - p_known))
        score.scoremat = open_set_scores

    return score


# TODO: add multi-session handling (see Themos' function PLDA_scoring_with_test_uncertainty_by_the_book)
# TODO: implement the "by the book" version so as not to use the mean of the i-vectors
def PLDA_scoring_uncertainty(enroll,
                             test,
                             ndx,
                             mu,
                             F,
                             Sigma,
                             p_known=0.0,
                             scaling_factor=1.,
                             test_uncertainty=None,
                             Vtrans=None,
                             check_missing=True):
    """Compute PLDA scores with a per-test-segment uncertainty covariance.

    For every test segment an extra covariance term is built from
    ``test_uncertainty`` and ``Vtrans`` and added to the PLDA covariances of
    both the numerator and the denominator of the log-likelihood ratio.
    The function works on deep copies of ``enroll`` and ``test``.

    :param enroll: a StatServer in which stat1 are i-vectors
    :param test: a StatServer in which stat1 are i-vectors
    :param ndx: an Ndx object defining the list of trials to perform
    :param mu: the mean vector of the PLDA gaussian
    :param F: the between-class co-variance matrix of the PLDA
    :param Sigma: the residual covariance matrix
    :param p_known: probability of having a known speaker for open-set
        identification case (0 disables the open-set correction)
    :param scaling_factor: scaling factor applied to the inverse residual
        covariance and to the final scores
    :param test_uncertainty: 2-D array, row ``t`` is used as the diagonal of
        the uncertainty of test segment ``t``; required
    :param Vtrans: matrix applied on both sides of that diagonal; required
    :param check_missing: boolean, if True, check that all models and segments exist

    :return: a score object
    :raises ValueError: if ``test_uncertainty`` or ``Vtrans`` is not provided
    """
    assert isinstance(enroll, StatServer), 'First parameter should be a StatServer'
    assert isinstance(test, StatServer), 'Second parameter should be a StatServer'
    assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx'
    assert enroll.stat1.shape[1] == test.stat1.shape[1], 'I-vectors dimension mismatch'
    assert enroll.stat1.shape[1] == F.shape[0], 'I-vectors and co-variance matrix dimension mismatch'
    # This function has no within-class matrix G, so there is nothing else to check.

    # Both arguments default to None only to keep the keyword order stable;
    # the scoring loop cannot run without them.
    if test_uncertainty is None or Vtrans is None:
        raise ValueError('test_uncertainty and Vtrans are required for PLDA_scoring_uncertainty')

    enroll_ctr = copy.deepcopy(enroll)
    test_ctr = copy.deepcopy(test)

    # Remove missing models and test segments
    if check_missing:
        clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx)
    else:
        clean_ndx = ndx

    # Center the i-vectors around the PLDA mean
    enroll_ctr.center_stat1(mu)
    test_ctr.center_stat1(mu)

    # Align StatServers to match the clean_ndx
    enroll_ctr.align_models_average(clean_ndx.modelset)
    test_ctr.align_segments(clean_ndx.segset)

    # Compute constant component of the PLDA distribution
    scoremat = numpy.zeros((enroll_ctr.stat1.shape[0], test_ctr.stat1.shape[0]), dtype='float')
    invSigma = scipy.linalg.inv(Sigma)
    I_spk = numpy.eye(F.shape[1])

    K1 = scipy.linalg.inv(I_spk + F.T.dot(invSigma * scaling_factor).dot(F))

    FK1Ft = F.dot(K1).dot(F.T)
    Xtilda_e = FK1Ft.dot(invSigma * scaling_factor).dot(enroll_ctr.stat1.T).T

    # Parts of the covariances that do not depend on the test segment
    cov_den_base = F.dot(F.T) + Sigma
    cov_num_base = FK1Ft + Sigma

    for t in range(test_ctr.stat1.shape[0]):
        xt = test_ctr.stat1[t, :]
        # NOTE(review): the projection below reads the original, un-centered and
        # un-aligned `test`, whereas `t` indexes the aligned `test_ctr`; it also
        # subtracts an outer product of i-vectors from an identity sized by
        # F.shape[1]. Both look suspicious -- confirm against the reference
        # derivation before relying on this function.
        Pr = I_spk - numpy.outer(test.stat1[t, :], test.stat1[t, :])
        Cunc = Pr.dot(Vtrans.transpose()).dot(numpy.diag(test_uncertainty[t, :]).dot(Vtrans)).dot(Pr)
        prec_den = scipy.linalg.inv(cov_den_base + Cunc)

        denom = -0.5 * xt.dot(prec_den).dot(xt) + 0.5 * numpy.linalg.slogdet(prec_den)[1]
        prec_num = scipy.linalg.inv(cov_num_base + Cunc)
        Xec = Xtilda_e - xt
        numer = -0.5 * numpy.einsum('ij, ji->i', Xec.dot(prec_num), Xec.T) + 0.5 * numpy.linalg.slogdet(prec_num)[1]
        scoremat[:, t] = scaling_factor * (numer - denom)

    # Compute verification scores
    score = Scores()
    score.modelset = clean_ndx.modelset
    score.segset = clean_ndx.segset
    score.scoremask = clean_ndx.trialmask

    score.scoremat = scoremat

    # Case of open-set identification, we compute the log-likelihood
    # by taking into account the probability of having a known impostor
    # or an out-of set class
    if p_known != 0:
        N = score.scoremat.shape[0]
        open_set_scores = numpy.empty(score.scoremat.shape)
        tmp = numpy.exp(score.scoremat)
        for ii in range(N):
            # open-set term: average likelihood of all the *other* models
            open_set_scores[ii, :] = score.scoremat[ii, :] \
                - numpy.log(p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / (N - 1) + (1 - p_known))
        score.scoremat = open_set_scores

    return score
def _check_missing_model(enroll, test, ndx):
    """Filter the trial list against the available models and segments.

    :param enroll: a StatServer holding the enrolment models
    :param test: a StatServer holding the test segments
    :param ndx: an Ndx object defining the list of trials to perform

    :return: the Ndx restricted to the models of ``enroll`` and the segments
        of ``test``; ``enroll`` and ``test`` are aligned on it through their
        ``align_models`` / ``align_segments`` methods.
    """
    # Remove missing models and test segments
    clean_ndx = ndx.filter(enroll.modelset, test.segset, True)

    # Align StatServers to match the clean_ndx
    enroll.align_models(clean_ndx.modelset)
    test.align_segments(clean_ndx.segset)

    return clean_ndx


def jfa_scoring(ubm, enroll, test, ndx, mean, sigma, V, U, D, batch_size=100, num_thread=1, check_missing=True):
    """Compute a verification score as a channel point estimate
    of the log-likelihood ratio. Detail of this scoring can be found in
    [Glembek09].

    [Glembek09] Ondrej Glembek, Lukas Burget, Najim Dehak, Niko Brummer,
    and Patrick Kenny, "Comparison of scoring methods used in speaker
    recognition with joint factor analysis."
    in Acoustics, Speech and Signal Processing, 2009. ICASSP 2009.
    IEEE International Conference on. IEEE, 2009.

    Note that input statistics should not be whitened as
    it is done within this function.

    ``enroll`` and ``test`` are passed to the alignment and whitening steps
    directly (no copy is made first), so callers should not expect the
    objects they pass in to be left as they were.

    :param ubm: the Mixture object used to compute sufficient statistics
    :param enroll: a StatServer object which contains zero- and first-order
        statistics.
    :param test: a StatServer object which contains zero- and first-order
        statistics.
    :param ndx: an Ndx object which trial mask will be copied into the output
        Scores object
    :param mean: mean vector of the JFA model
    :param sigma: residual covariance vector of the JFA model
    :param V: between class covariance matrix of the JFA model
    :param U: within class covariance matrix of the JFA model
    :param D: MAP covariance matrix for the JFA model
    :param batch_size: size of the batch to reduce memory footprint
    :param num_thread: number of parallel process to run
    :param check_missing: boolean, if True, check that all model exist

    :return: a Scores object
    """
    # logging is imported locally because this module does not import it at file level
    import logging

    assert isinstance(ubm, Mixture), '1st parameter must be a Mixture'
    assert isinstance(enroll, StatServer), '2nd parameter must be a StatServer'
    assert isinstance(test, StatServer), '3rd parameter must be a StatServer'
    assert isinstance(ndx, Ndx), '4th parameter must be a Ndx'

    # Remove missing models and test segments
    if check_missing:
        clean_ndx = _check_missing_model(enroll, test, ndx)
    else:
        clean_ndx = ndx

    logging.debug("clean_ndx.trialmask shape = %s", clean_ndx.trialmask.shape)

    # Sum enrolment statistics per model in case of multi-session
    # NOTE(review): the output modelset is taken from this summed StatServer
    # while the mask comes from clean_ndx -- presumably both share the same
    # model order; verify against sum_stat_per_model.
    enroll = enroll.sum_stat_per_model()[0]

    # Whiten enroll and test statistics
    enroll.whiten_stat1(ubm.get_mean_super_vector(), ubm.get_invcov_super_vector())
    test.whiten_stat1(ubm.get_mean_super_vector(), ubm.get_invcov_super_vector())

    # Estimate Vy and DZ from the enrollment (the channel factors are not needed here)
    trn_y, _, trn_z = enroll.estimate_hidden(mean, sigma, V, U, D, batch_size=batch_size, num_thread=num_thread)
    M = ((trn_y.stat1.dot(V.T)) + (trn_z.stat1 * D))

    # Estimate Ux from the test, on a copy so that `test` keeps its statistics
    tmp = copy.deepcopy(test)
    _, test_x, _ = tmp.estimate_hidden(mean, sigma, None, U, None, batch_size, num_thread)

    # remove Ux weighted from the test statistics
    Ux = copy.deepcopy(test)
    Ux.stat1 = test_x.stat1.dot(U.T)
    test = test.subtract_weighted_stat1(Ux)

    # sum zero-order statistics for each test segment
    test_stat0_sum = test.stat0.sum(axis=1)

    # Compute score as the dot product of the enrollment supervector and the first
    # order statistics divided by the sum of the zero-order stats
    scores = Scores()
    scores.modelset = enroll.modelset
    scores.segset = test.segset
    scores.scoremask = clean_ndx.trialmask
    scores.scoremat = M.dot((test.stat1 / test_stat0_sum[:, None]).T)

    return scores
+ +""" +Copyright 2014-2021 Anthony Larcher +""" + +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +from .svm import * +from .svmutil import * diff --git a/sidekit/sidekit/libsvm/libsvm.so.2 b/sidekit/sidekit/libsvm/libsvm.so.2 new file mode 100755 index 0000000..a46f679 Binary files /dev/null and b/sidekit/sidekit/libsvm/libsvm.so.2 differ diff --git a/sidekit/sidekit/libsvm/svm.py b/sidekit/sidekit/libsvm/svm.py new file mode 100755 index 0000000..9abdad1 --- /dev/null +++ b/sidekit/sidekit/libsvm/svm.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python + +""" +Copyright (c) 2000-2014 Chih-Chung Chang and Chih-Jen Lin +All rights reserved. +""" + +from ctypes import * +from ctypes import c_int, c_double +from ctypes.util import find_library +from os import path +import sys + + +try: + dirname = path.dirname(path.abspath(__file__)) + if sys.platform == 'win32': + libsvm = CDLL(path.join(dirname, r'libsvm.dll')) + else: + libsvm = CDLL(path.join(dirname, 'libsvm.so.2')) +except: + # For unix the prefix 'lib' is not considered. 
# Symbolic names for LIBSVM's integer codes; a name's position is its value.
SVM_TYPE = ['C_SVC', 'NU_SVC', 'ONE_CLASS', 'EPSILON_SVR', 'NU_SVR']
KERNEL_TYPE = ['LINEAR', 'POLY', 'RBF', 'SIGMOID', 'PRECOMPUTED']
# Publish every name as a module-level integer constant (C_SVC = 0, ...,
# PRECOMPUTED = 4) without going through exec().
for _table in (SVM_TYPE, KERNEL_TYPE):
    for _code, _label in enumerate(_table):
        globals()[_label] = _code

# ctypes prototype of the print callback: takes a C string, returns nothing.
PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p)


def print_null(s):
    """Discard ``s``; a do-nothing replacement for a print function."""
    return


def genFields(names, types):
    """Pair field names with ctypes types, as expected by ``Structure._fields_``.

    :param names: iterable of field names
    :param types: iterable of ctypes types, in the same order as ``names``
    :return: list of ``(name, type)`` tuples
    """
    return list(zip(names, types))


def fillprototype(f, restype, argtypes):
    """Declare the C prototype of a foreign function.

    :param f: ctypes function object to annotate
    :param restype: ctypes return type (or None)
    :param argtypes: list of ctypes argument types
    """
    f.restype = restype
    f.argtypes = argtypes


class svm_node(Structure):
    """One ``index:value`` entry of a sparse feature vector."""
    _names = ["index", "value"]
    _types = [c_int, c_double]
    _fields_ = genFields(_names, _types)

    def __str__(self):
        return '%d:%g' % (self.index, self.value)


def gen_svm_nodearray(xi, feature_max=None, isKernel=None):
    """Convert one instance into a ctypes array of :class:`svm_node`.

    The returned array is terminated by a node whose index is -1.

    :param xi: the instance, either a dictionary ``{index: value}`` or a list
        or tuple of values. A list/tuple is shifted so that its first value
        gets index 1, unless ``isKernel`` is true.
    :param feature_max: if set (and non-zero), indices greater than this
        value are dropped; must be an int
    :param isKernel: if true the instance is a row of a precomputed kernel:
        indices start at 0 and zero values are kept
    :return: a tuple ``(node_array, max_idx)`` where ``max_idx`` is the
        largest index stored (0 if no node was stored)
    :raise TypeError: if ``xi`` is not a dictionary, list or tuple
    """
    if isinstance(xi, dict):
        index_range = xi.keys()
    elif isinstance(xi, (list, tuple)):
        if not isKernel:
            # idx should start from 1; list() so that tuples can be shifted too
            xi = [0] + list(xi)
        index_range = range(len(xi))
    else:
        raise TypeError('xi should be a dictionary, list or tuple')

    if feature_max:
        assert(isinstance(feature_max, int))

    # Keep indices within feature_max and, for sparse (non-kernel) input,
    # only the entries holding a non-zero value.
    index_range = sorted(
        j for j in index_range
        if (not feature_max or j <= feature_max) and (isKernel or xi[j] != 0)
    )

    ret = (svm_node * (len(index_range) + 1))()
    ret[-1].index = -1  # terminator node
    for idx, j in enumerate(index_range):
        ret[idx].index = j
        ret[idx].value = xi[j]
    max_idx = index_range[-1] if index_range else 0
    return ret, max_idx
class svm_parameter(Structure):
    """ctypes structure holding the LIBSVM training parameters.

    The structure fields are filled from a command-line style option string
    (or list of tokens), see :meth:`parse_options`. ``cross_validation``,
    ``nr_fold`` and ``print_func`` are plain Python attributes, not fields
    of the C structure.
    """
    _names = ["svm_type", "kernel_type", "degree", "gamma", "coef0",
              "cache_size", "eps", "C", "nr_weight", "weight_label", "weight",
              "nu", "p", "shrinking", "probability"]
    _types = [c_int, c_int, c_int, c_double, c_double,
              c_double, c_double, c_double, c_int, POINTER(c_int), POINTER(c_double),
              c_double, c_double, c_int, c_int]
    _fields_ = genFields(_names, _types)

    def __init__(self, options=None):
        """
        :param options: option string or list of option tokens; None means
            no option (all defaults)
        """
        self.parse_options('' if options is None else options)

    def __str__(self):
        # structure fields first, then the extra Python-side attributes
        attrs = svm_parameter._names + list(self.__dict__.keys())
        return ''.join(' %s: %s\n' % (attr, getattr(self, attr)) for attr in attrs).strip()

    def set_to_default_values(self):
        """Reset every field and Python-side attribute to its default."""
        self.svm_type = C_SVC
        self.kernel_type = RBF
        self.degree = 3
        self.gamma = 0
        self.coef0 = 0
        self.nu = 0.5
        self.cache_size = 100
        self.C = 1
        self.eps = 0.001
        self.p = 0.1
        self.shrinking = 1
        self.probability = 0
        self.nr_weight = 0
        self.weight_label = (c_int * 0)()
        self.weight = (c_double * 0)()
        self.cross_validation = False
        self.nr_fold = 0
        self.print_func = None

    def parse_options(self, options):
        """Reset to the defaults, then apply ``options``.

        :param options: a string of space separated options, or a list of
            already split tokens
        :raise TypeError: if ``options`` is neither a list nor a str
        :raise ValueError: on an unknown option or a fold number below 2
        """
        if isinstance(options, list):
            argv = options
        elif isinstance(options, str):
            argv = options.split()
        else:
            raise TypeError("arg 1 should be a list or a str.")
        self.set_to_default_values()
        self.print_func = cast(None, PRINT_STRING_FUN)

        # flag -> name of the field that receives the next token
        int_fields = {"-s": "svm_type", "-t": "kernel_type", "-d": "degree",
                      "-h": "shrinking", "-b": "probability"}
        float_fields = {"-g": "gamma", "-r": "coef0", "-n": "nu", "-m": "cache_size",
                        "-c": "C", "-e": "eps", "-p": "p"}
        class_weights = []  # (label, weight) pairs given as "-w<label> <weight>"

        pos = 0
        while pos < len(argv):
            flag = argv[pos]
            if flag in int_fields:
                pos += 1
                setattr(self, int_fields[flag], int(argv[pos]))
            elif flag in float_fields:
                pos += 1
                setattr(self, float_fields[flag], float(argv[pos]))
            elif flag == "-q":
                self.print_func = PRINT_STRING_FUN(print_null)
            elif flag == "-v":
                pos += 1
                self.cross_validation = 1
                self.nr_fold = int(argv[pos])
                if self.nr_fold < 2:
                    raise ValueError("n-fold cross validation: n must >= 2")
            elif flag.startswith("-w"):
                pos += 1
                self.nr_weight += 1
                class_weights.append((int(flag[2:]), float(argv[pos])))
            else:
                raise ValueError("Wrong options")
            pos += 1

        libsvm.svm_set_print_string_function(self.print_func)
        # allocate the C arrays, then fill them through the structure fields
        self.weight_label = (c_int * self.nr_weight)()
        self.weight = (c_double * self.nr_weight)()
        for slot, (label, weight) in enumerate(class_weights):
            self.weight[slot] = weight
            self.weight_label[slot] = label
def toPyModel(model_ptr):
    """
    toPyModel(model_ptr) -> svm_model

    Dereference a ctypes POINTER(svm_model) and tag the resulting object as
    created by the C library.

    :param model_ptr: ctypes pointer to dereference
    :return: the object the pointer refers to
    :raise ValueError: if the pointer is null
    """
    if model_ptr:
        model = model_ptr.contents
        model.__createfrom__ = 'C'
        return model
    raise ValueError("Null pointer")
POINTER(c_int)]) +fillprototype(libsvm.svm_get_sv_indices, None, [POINTER(svm_model), POINTER(c_int)]) +fillprototype(libsvm.svm_get_nr_sv, c_int, [POINTER(svm_model)]) +fillprototype(libsvm.svm_get_svr_probability, c_double, [POINTER(svm_model)]) + +fillprototype(libsvm.svm_predict_values, c_double, [POINTER(svm_model), POINTER(svm_node), POINTER(c_double)]) +fillprototype(libsvm.svm_predict, c_double, [POINTER(svm_model), POINTER(svm_node)]) +fillprototype(libsvm.svm_predict_probability, c_double, [POINTER(svm_model), POINTER(svm_node), POINTER(c_double)]) + +fillprototype(libsvm.svm_free_model_content, None, [POINTER(svm_model)]) +fillprototype(libsvm.svm_free_and_destroy_model, None, [POINTER(POINTER(svm_model))]) +fillprototype(libsvm.svm_destroy_param, None, [POINTER(svm_parameter)]) + +fillprototype(libsvm.svm_check_parameter, c_char_p, [POINTER(svm_problem), POINTER(svm_parameter)]) +fillprototype(libsvm.svm_check_probability_model, c_int, [POINTER(svm_model)]) +fillprototype(libsvm.svm_set_print_string_function, None, [PRINT_STRING_FUN]) diff --git a/sidekit/sidekit/libsvm/svmutil.py b/sidekit/sidekit/libsvm/svmutil.py new file mode 100755 index 0000000..e64743f --- /dev/null +++ b/sidekit/sidekit/libsvm/svmutil.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python + +""" +Copyright (c) 2000-2014 Chih-Chung Chang and Chih-Jen Lin +All rights reserved. 
+""" + +import sys +import os +import pickle +from .svm import svm_node, svm_problem, svm_parameter, svm_model, toPyModel, SVM_TYPE, KERNEL_TYPE, \ + gen_svm_nodearray, print_null +from ctypes import c_int, c_double + +sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path + +def save_svm(svm_file_name, w, b): + """ + Save SVM weights and biais in PICKLE format + :return: + """ + if not os.path.exists(os.path.dirname(svm_file_name)): + os.makedirs(os.path.dirname(svm_file_name)) + with open(svm_file_name, "wb") as f: + pickle.dump((w, b), f) + + +def read_svm(svm_file_name): + """Read SVM model in PICKLE format + + :param svm_file_name: name of the file to read from + """ + with open(svm_file_name, "rb") as f: + (w, b) = pickle.load(f) + return w, b + + +def svm_read_problem(data_file_name): + """ + svm_read_problem(data_file_name) -> [y, x] + + Read LIBSVM-format data from data_file_name and return labels y + and data instances x. + :param data_file_name: name of the file to load from + """ + prob_y = [] + prob_x = [] + for line in open(data_file_name): + line = line.split(None, 1) + # In case an instance with all zero features + if len(line) == 1: + line += [''] + label, features = line + xi = {} + for e in features.split(): + ind, val = e.split(":") + xi[int(ind)] = float(val) + prob_y += [float(label)] + prob_x += [xi] + return prob_y, prob_x + + +def svm_load_model(model_file_name): + """ + svm_load_model(model_file_name) -> model + + Load a LIBSVM model from model_file_name and return. + :param model_file_name: file name to load from + """ + model = svm_load_model(model_file_name.encode()) + if not model: + print("can't open model file %s" % model_file_name) + return None + model = toPyModel(model) + return model + + +def svm_save_model(model_file_name, model): + """ + svm_save_model(model_file_name, model) -> None + + Save a LIBSVM model to the file model_file_name. 
+ :param model_file_name: file name to write to + :param model: model to save + """ + svm_save_model(model_file_name.encode(), model) + + +def evaluations(ty, pv): + """ + evaluations(ty, pv) -> (ACC, MSE, SCC) + + Calculate accuracy, mean squared error and squared correlation coefficient + using the true values (ty) and predicted values (pv). + """ + if len(ty) != len(pv): + raise ValueError("len(ty) must equal to len(pv)") + total_correct = total_error = 0 + sumv = sumy = sumvv = sumyy = sumvy = 0 + for v, y in zip(pv, ty): + if y == v: + total_correct += 1 + total_error += (v-y)*(v-y) + sumv += v + sumy += y + sumvv += v*v + sumyy += y*y + sumvy += v*y + l = len(ty) + ACC = 100.0*total_correct/l + MSE = total_error/l + try: + SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy)) + except: + SCC = float('nan') + return ACC, MSE, SCC + + +def svm_train(arg1, arg2=None, arg3=None): + """ + svm_train(y, x [, options]) -> model | ACC | MSE + svm_train(prob [, options]) -> model | ACC | MSE + svm_train(prob, param) -> model | ACC| MSE + + Train an SVM model from data (y, x) or an svm_problem prob using + \'options\' or an svm_parameter param. + If \'-v\' is specified in \'options\' (i.e., cross validation) + either accuracy (ACC) or mean-squared error (MSE) is returned. 
+ options: + + - -s svm_type : set type of SVM (default 0) + + - 0 -- C-SVC (multi-class classification) + - 1 -- nu-SVC (multi-class classification) + - 2 -- one-class SVM + - 3 -- epsilon-SVR (regression) + - 4 -- nu-SVR (regression) + + - -t kernel_type : set type of kernel function (default 2) + + - 0 -- linear: u\'\*v + - 1 -- polynomial: (gamma\*u\'\*v + coef0)^degree + - 2 -- radial basis function: exp(-gamma\*|u-v|^2) + - 3 -- sigmoid: tanh(gamma\*u\'\*v + coef0) + - 4 -- precomputed kernel (kernel values in training_set_file) + + - -d degree : set degree in kernel function (default 3) + - -g gamma : set gamma in kernel function (default 1/num_features) + - -r coef0 : set coef0 in kernel function (default 0) + - -c cost : set the parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1) + - -n nu : set the parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5) + - -p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1) + - -m cachesize : set cache memory size in MB (default 100) + - -e epsilon : set tolerance of termination criterion (default 0.001) + - -h shrinking : whether to use the shrinking heuristics, 0 or 1 (default 1) + - -b probability_estimates : whether to train a SVC or SVR model for probability estimates, 0 or 1 (default 0) + - -wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1) + - -v n: n-fold cross validation mode + - -q : quiet mode (no outputs) + """ + prob, param = None, None + if isinstance(arg1, (list, tuple)): + assert isinstance(arg2, (list, tuple)) + y, x, options = arg1, arg2, arg3 + param = svm_parameter(options) + prob = svm_problem(y, x, isKernel=(param.kernel_type == 'PRECOMPUTED')) + elif isinstance(arg1, svm_problem): + prob = arg1 + if isinstance(arg2, svm_parameter): + param = arg2 + else: + param = svm_parameter(arg2) + if prob is None or param is None: + raise TypeError("Wrong types for the arguments") + + if param.kernel_type == 'PRECOMPUTED': + for xi in 
prob.x_space: + idx, val = xi[0].index, xi[0].value + if xi[0].index != 0: + raise ValueError('Wrong input format: first column must be 0:sample_serial_number') + if val <= 0 or val > prob.n: + raise ValueError('Wrong input format: sample_serial_number out of range') + + if param.gamma == 0 and prob.n > 0: + param.gamma = 1.0 / prob.n + libsvm.svm_set_print_string_function(param.print_func) + err_msg = libsvm.svm_check_parameter(prob, param) + if err_msg: + raise ValueError('Error: %s' % err_msg) + + if param.cross_validation: + l, nr_fold = prob.l, param.nr_fold + target = (c_double * l)() # pytype: disable=not-callable + libsvm.svm_cross_validation(prob, param, nr_fold, target) + ACC, MSE, SCC = evaluations(prob.y[:l], target[:l]) + if param.svm_type in ['EPSILON_SVR', 'NU_SVR']: + print("Cross Validation Mean squared error = %g" % MSE) + print("Cross Validation Squared correlation coefficient = %g" % SCC) + return MSE + else: + print("Cross Validation Accuracy = %g%%" % ACC) + return ACC + else: + m = svm_train(prob, param) + m = toPyModel(m) + + # If prob is destroyed, data including SVs pointed by m can remain. + m.x_space = prob.x_space + return m + + +def svm_predict(y, x, m, options=""): + """svm_predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals) + + Predict data (y, x) with the SVM model m. + options: + + - "-b" probability_estimates: whether to predict probability estimates, + 0 or 1 (default 0); for one-class SVM only 0 is supported. + - "-q" : quiet mode (no outputs). + + The return tuple contains + + - p_labels: a list of predicted labels + - p_acc: a tuple including accuracy (for classification), + mean-squared error, and squared correlation coefficient (for regression). + - p_vals: a list of decision values or probability estimates + (if \'-b 1\' is specified). If k is the number of classes, + for decision values, each element includes results of predicting k(k-1)/2 binary-class SVMs. 
+ For probabilities, each element contains k values indicating the probability that the testing instance + is in each class. + + .. note:: that the order of classes here is the same as \'model.label\' field in the model structure. + """ + + def info(s): + print(s) + + predict_probability = 0 + argv = options.split() + i = 0 + while i < len(argv): + if argv[i] == '-b': + i += 1 + predict_probability = int(argv[i]) + elif argv[i] == '-q': + info = print_null + else: + raise ValueError("Wrong options") + i += 1 + + svm_type = m.get_svm_type() + is_prob_model = m.is_probability_model() + nr_class = m.get_nr_class() + pred_labels = [] + pred_values = [] + + if predict_probability: + if not is_prob_model: + raise ValueError("Model does not support probabiliy estimates") + + if svm_type in ['NU_SVR', 'EPSILON_SVR']: + info("Prob. model for test data: target value = predicted value + z,\n" + + "z: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g".format(m.get_svr_probability())) + nr_class = 0 + + prob_estimates = (c_double * nr_class)() # pytype: disable=not-callable + for xi in x: + xi, idx = gen_svm_nodearray(xi, isKernel=(m.param.kernel_type == 'PRECOMPUTED')) + label = libsvm.svm_predict_probability(m, xi, prob_estimates) + values = prob_estimates[:nr_class] + pred_labels += [label] + pred_values += [values] + else: + if is_prob_model: + info("Model supports probability estimates, but disabled in predicton.") + if svm_type in ('ONE_CLASS', 'EPSILON_SVR', 'NU_SVC'): + nr_classifier = 1 + else: + nr_classifier = nr_class*(nr_class-1)//2 + dec_values = (c_double * nr_classifier)() # pytype: disable=not-callable + for xi in x: + xi, idx = gen_svm_nodearray(xi, isKernel=(m.param.kernel_type == 'PRECOMPUTED')) + label = libsvm.svm_predict_values(m, xi, dec_values) + if nr_class == 1: + values = [1] + else: + values = dec_values[:nr_classifier] + pred_labels += [label] + pred_values += [values] + + ACC, MSE, SCC = evaluations(y, pred_labels) + l = len(y) + if svm_type 
in ['EPSILON_SVR', 'NU_SVR']: + info("Mean squared error = %g (regression)" % MSE) + info("Squared correlation coefficient = %g (regression)" % SCC) + else: + info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(l*ACC/100), l)) + + return pred_labels, (ACC, MSE, SCC), pred_values diff --git a/sidekit/sidekit/lid_utils.py b/sidekit/sidekit/lid_utils.py new file mode 100755 index 0000000..8359aec --- /dev/null +++ b/sidekit/sidekit/lid_utils.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher + +:mod:`lid_utils` provides utilities to perform Language identification. 
+""" + +import numpy + +from sidekit.mixture import Mixture +from sidekit.statserver import StatServer +from sidekit.bosaris import Scores +import sidekit.sv_utils +import sidekit.frontend + +__license__ = "LGPL" +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +def log_sum_exp(x): + """ + :param x: input vector + """ + xmax = x.max(axis=0) + xnorm = x - xmax + ex = numpy.exp(xnorm) + return xmax + numpy.log(ex.sum(axis=0)) + + +def compute_log_likelihood_ratio(M, p_tar=0.5): + """ + Compute log-likelihood ratio for closed-set identification. + + :param M: a matrix of log-likelihood of shape nb_models x nb_test_segments + :param p_tar: probability of a trial to be from a target + + :return: a matrix of log-likelihood ration of shape + nb_models x nb_test_segments + """ + llr = numpy.empty(M.shape) + log_prior = numpy.ones((M.shape[0] - 1, 1)) * numpy.log((1 - p_tar) / (M.shape[0] - 1)) + for ii in range(M.shape[0]): + llr[ii, :] = numpy.log(p_tar) + M[ii, :] - log_sum_exp(M[~(numpy.arange(M.shape[0]) == ii)] + log_prior) + + return llr + + +def gaussian_backend_train(train_ss): + """ + Take a StatServer of training examples as input + output a StatServer mean for each class and a full tied co-variance matrix + :param train_ss: sidekit.StatServer containing training data + """ + + # Compute parameters of the Gaussian backend (common covariance and constant) + gb_sigma = train_ss.get_within_covariance_stat1() + + # Compute mean of each class + gb_mean = train_ss.mean_stat_per_model() + + # Compute the normalization constant + gb_cst = - 0.5 * (numpy.linalg.slogdet(gb_sigma)[1] + train_ss.stat1.shape[1] * numpy.log(2 * numpy.pi)) + + return gb_mean, gb_sigma, gb_cst + + +def gaussian_backend_train_hetero(train_ss, alpha=0.1): + """ + Take a StatServer of training examples as input + 
def _gaussian_backend_train(data, label):
    """
    Wrap raw vectors in a StatServer and train a Gaussian backend on it.

    :param data: ndarray of training vectors, one per row
    :param label: class label of each row, used both as model and segment names
    :return: the output of gaussian_backend_train
    """
    nb_vectors = data.shape[0]

    train_ss = StatServer()
    train_ss.segset = label
    train_ss.modelset = label
    train_ss.stat1 = data
    train_ss.stat0 = numpy.ones((nb_vectors, 1))
    train_ss.start = numpy.empty(nb_vectors, dtype="object")
    train_ss.stop = numpy.empty(nb_vectors, dtype="object")

    return gaussian_backend_train(train_ss)


def gaussian_backend_test(test_ss, params, diag=False, compute_llr=True):
    """
    Process data through a Gaussian-Backend which parameters (mean and variance)
    have been estimated using Gaussian_Backend_Train.

    If compute_llr is set to true, return the log-likelihood ratio, if not,
    return the log-likelihood on each Gaussian distribution. Default is True.

    :param test_ss: a StatServer which stat1 are vectors to classify
    :param params: Gaussian Backend parameters, a tuple of mean, covariance
        and constant computed with Gaussian_Backend_Train
    :param diag: boolean, if true: use the diagonal version of the covariance
        matrix (the covariance may be given as a full matrix or as a vector
        of variances), if not the full version
    :param compute_llr: boolean, if true, return the log-likelihood ratio, if not,
        return the log-likelihood on each Gaussian distribution.
    :return: a Scores object of shape nb_models x nb_test_segments
    """
    gb_mean, gb_sigma, gb_cst = params
    nb_models = gb_mean.modelset.shape[0]

    scores = Scores()
    scores.modelset = gb_mean.modelset
    scores.segset = test_ss.segset
    scores.scoremat = numpy.ones((nb_models, test_ss.segset.shape[0]))
    scores.scoremask = numpy.ones(scores.scoremat.shape, dtype='bool')

    if diag:
        gb_gmm = Mixture()
        gb_gmm.w = numpy.ones(nb_models, dtype='float') / nb_models
        gb_gmm.mu = gb_mean.stat1
        if gb_sigma.ndim == 2:
            # full matrix: keep the diagonal only
            gb_gmm.invcov = numpy.tile(1 / numpy.diag(gb_sigma), (nb_models, 1))
        elif gb_sigma.ndim == 1:
            # already a vector of variances
            gb_gmm.invcov = numpy.tile(1. / gb_sigma, (nb_models, 1))
        gb_gmm._compute_all()

        scores.scoremat = gb_gmm.compute_log_posterior_probabilities(test_ss.stat1).T

    else:
        assert gb_sigma.ndim == 2
        scores.scoremat *= gb_cst

        inv_sigma = numpy.linalg.inv(gb_sigma)

        # Terms that do not depend on the language are computed once
        projected = test_ss.stat1.dot(inv_sigma)
        test_term = numpy.sum(projected * test_ss.stat1, axis=1)

        # Compute scores for all trials per language
        for lang in range(nb_models):
            mean = gb_mean.stat1[lang, :]
            scores.scoremat[lang, :] -= 0.5 * (mean.dot(inv_sigma).dot(mean.T) -
                                               2 * numpy.sum(projected * mean, axis=1) +
                                               test_term)

    if compute_llr:
        scores.scoremat = compute_log_likelihood_ratio(scores.scoremat)

    assert scores.validate()
    return scores
+ """ + gb_mean, gb_sigma, gb_cst = params + + scores = sidekit.Scores() + scores.modelset = gb_mean.modelset + scores.segset = test_ss.segset + scores.scoremat = numpy.ones((gb_mean.modelset.shape[0], test_ss.segset.shape[0])) + scores.scoremask = numpy.ones(scores.scoremat.shape, dtype='bool') + + if diag: + + gb_gmm = sidekit.Mixture() + gb_gmm.w = numpy.ones(gb_mean.modelset.shape[0], dtype='float') / gb_mean.modelset.shape[0] + gb_gmm.mu = gb_mean.stat1 + gb_gmm.invcov = numpy.empty(gb_gmm.mu.shape) + for l in range(len(gb_sigma)): + if gb_sigma[0].ndim == 2: + gb_gmm.invcov[l, :] = 1 / numpy.diag(gb_sigma[l]) + elif gb_sigma[0].ndim == 1: + gb_gmm.invcov[l, :] = 1 / gb_sigma[l] + gb_gmm._compute_all() + + scores.scoremat = gb_gmm.compute_log_posterior_probabilities(test_ss.stat1).T + + else: + assert gb_sigma[0].ndim == 2 + for lang in range(gb_mean.modelset.shape[0]): + scores.scoremat[lang, :] *= gb_cst[lang] + + inv_sigma = numpy.linalg.inv(gb_sigma) + + # Compute scores for all trials per language + for lang in range(gb_mean.modelset.shape[0]): + scores.scoremat[lang, :] -= 0.5 * ( + gb_mean.stat1[lang, :].dot(inv_sigma[lang]).dot(gb_mean.stat1[lang, :].T) - + 2 * numpy.sum(test_ss.stat1.dot(inv_sigma[lang]) * + gb_mean.stat1[lang, :], axis=1) + + numpy.sum(test_ss.stat1.dot(inv_sigma[lang]) * test_ss.stat1, axis=1)) + + if compute_llr: + scores.scoremat = compute_log_likelihood_ratio(scores.scoremat) + + assert scores.validate() + return scores diff --git a/sidekit/sidekit/mixture.py b/sidekit/sidekit/mixture.py new file mode 100755 index 0000000..0636a4c --- /dev/null +++ b/sidekit/sidekit/mixture.py @@ -0,0 +1,1029 @@ + +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. 
def sum_log_probabilities(lp):
    """Sum log probabilities in a secure manner to avoid extreme values

    :param lp: numpy array of log-probabilities to sum, one row per frame
    :return: a tuple (pp, llk): pp is exp(lp) with each row normalized by
        its own sum, llk is the sum over the rows of the log of that sum
    """
    row_max = numpy.max(lp, axis=1)
    shifted = numpy.exp(lp - row_max[:, numpy.newaxis])
    log_lk = row_max + numpy.log(shifted.sum(axis=1))

    # rows whose maximum is not finite: fall back on the maximum itself
    degenerate = ~numpy.isfinite(row_max)
    if degenerate.any():
        log_lk[degenerate] = row_max[degenerate]

    pp = numpy.exp(lp - log_lk[:, numpy.newaxis])
    return pp, log_lk.sum()


def vad_energy(log_energy,
               distrib_nb=3,
               nb_train_it=8,
               flooring=0.0001, ceiling=1.0,
               alpha=2):
    """
    Select frames by thresholding the log-energy; the threshold is derived
    from a Mixture trained on the normalized log-energy.

    :param log_energy: array of log-energy values
    :param distrib_nb: number of distributions of the Mixture
    :param nb_train_it: number of EM iterations
    :param flooring: passed to the maximization step
    :param ceiling: passed to the maximization step
    :param alpha: number of standard deviations, of the distribution with
        the highest mean, subtracted from that mean to get the threshold
    :return: a tuple (label, threshold): label is the boolean array
        ``normalized log_energy > threshold``
    """
    # center and normalize the energy
    log_energy = (log_energy - numpy.mean(log_energy)) / numpy.std(log_energy)

    # Initial Mixture: unit variances, equal weights, means evenly spread
    # between -2 and 2
    means = -2 + 4.0 * numpy.arange(distrib_nb) / (distrib_nb - 1)

    world = Mixture()
    world.cst = numpy.ones(distrib_nb) / (numpy.pi / 2.0)
    world.det = numpy.ones(distrib_nb)
    world.mu = means[:, numpy.newaxis]
    world.invcov = numpy.ones((distrib_nb, 1))
    world.w = numpy.ones(distrib_nb) / distrib_nb
    world.cov_var_ctl = copy.deepcopy(world.invcov)

    # The accumulator starts as a copy of the initial Mixture
    accum = copy.deepcopy(world)

    # nb_train_it iterations of EM
    for _ in range(nb_train_it):
        accum._reset()
        world._expectation(accum, log_energy)            # E-step
        world._maximization(accum, ceiling, flooring)    # M-step

    # Threshold: highest mean minus alpha standard deviations of that distribution
    top = world.mu.argmax()
    threshold = world.mu.max() - alpha * numpy.sqrt(1.0 / world.invcov[top, 0])

    # Apply frame selection with the current threshold
    label = log_energy > threshold
    return label, threshold
+ + :attr w: array of weight parameters + :attr mu: ndarray of mean parameters, each line is one distribution + :attr invcov: ndarray of inverse co-variance parameters, 2-dimensional + for diagonal co-variance distribution 3-dimensional for full co-variance + :attr invchol: 3-dimensional ndarray containing upper cholesky + decomposition of the inverse co-variance matrices + :attr cst: array of constant computed for each distribution + :attr det: array of determinant for each distribution + + """ + @staticmethod + def read_alize(file_name): + """ + + :param file_name: + :return: + """ + """Read a Mixture in alize raw format + + :param mixtureFileName: name of the file to read from + """ + mixture = Mixture() + + with open(file_name, 'rb') as f: + distrib_nb = struct.unpack("I", f.read(4))[0] + vect_size = struct.unpack("': + distrib_nb = int(w[1]) + mixture.w.resize(distrib_nb) + mixture.cst.resize(distrib_nb) + mixture.det.resize(distrib_nb) + + if w[0] == '': + begin_hmm = True + + if w[0] == '': + state2 = True + + if begin_hmm & state2: + + if w[0].upper() == '': + distrib = int(w[1]) - 1 + mixture.w[distrib] = numpy.double(w[2]) + + elif w[0].upper() == '': + if vect_size == 0: + vect_size = int(w[1]) + mixture.mu.resize(distrib_nb, vect_size) + i += 1 + mixture.mu[distrib, :] = numpy.double(lines[i].split()) + + elif w[0].upper() == '': + if mixture.invcov.shape[0] == 0: + vect_size = int(w[1]) + mixture.invcov.resize(distrib_nb, vect_size) + i += 1 + C = numpy.double(lines[i].split()) + mixture.invcov[distrib, :] = 1 / C + + elif w[0].upper() == '': + raise Exception("we don't manage full covariance model") + elif w[0].upper() == '': + mixture.cst[distrib] = numpy.exp(-.05 * numpy.double(w[1])) + mixture._compute_all() + return mixture + + def __init__(self, + mixture_file_name='', + name='empty'): + """Initialize a Mixture from a file or as an empty Mixture. 
+ + :param mixture_file_name: name of the file to read from, if empty, initialize + an empty mixture + """ + self.w = numpy.array([]) + self.mu = numpy.array([]) + self.invcov = numpy.array([]) + self.invchol = numpy.array([]) + self.cov_var_ctl = numpy.array([]) + self.cst = numpy.array([]) + self.det = numpy.array([]) + self.name = name + self.A = 0 + + if mixture_file_name != '': + self.read(mixture_file_name) + + @accepts('Mixture', 'Mixture', debug=2) + def __add__(self, other): + """Overide the sum for a mixture. + Weight, means and inv_covariances are added, det and cst are + set to 0 + """ + new_mixture = Mixture() + new_mixture.w = self.w + other.w + new_mixture.mu = self.mu + other.mu + new_mixture.invcov = self.invcov + other.invcov + return new_mixture + + def init_from_diag(self, diag_mixture): + """ + + :param diag_mixture: + """ + distrib_nb = diag_mixture.w.shape[0] + dim = diag_mixture.mu.shape[1] + + self.w = diag_mixture.w + self.cst = diag_mixture.cst + self.det = diag_mixture.det + self.mu = diag_mixture.mu + + self.invcov = numpy.empty((distrib_nb, dim, dim)) + self.invchol = numpy.empty((distrib_nb, dim, dim)) + for gg in range(distrib_nb): + self.invcov[gg] = numpy.diag(diag_mixture.invcov[gg, :]) + self.invchol[gg] = numpy.linalg.cholesky(self.invcov[gg]) + self.cov_var_ctl = numpy.diag(diag_mixture.cov_var_ctl) + self.name = diag_mixture.name + self.A = numpy.zeros(self.cst.shape) # we keep zero here as it is not used for full covariance distributions + + def _serialize(self): + """ + Serialization is necessary to share the memomry when running multiprocesses + """ + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + + sh = self.w.shape + tmp = multiprocessing.Array(ctypes.c_double, self.w.size) + self.w = numpy.ctypeslib.as_array(tmp.get_obj()) + self.w = self.w.reshape(sh) + + sh = self.mu.shape + tmp = multiprocessing.Array(ctypes.c_double, self.mu.size) + self.mu = 
numpy.ctypeslib.as_array(tmp.get_obj()) + self.mu = self.mu.reshape(sh) + + sh = self.invcov.shape + tmp = multiprocessing.Array(ctypes.c_double, self.invcov.size) + self.invcov = numpy.ctypeslib.as_array(tmp.get_obj()) + self.invcov = self.invcov.reshape(sh) + + sh = self.cov_var_ctl.shape + tmp = multiprocessing.Array(ctypes.c_double, self.cov_var_ctl.size) + self.cov_var_ctl = numpy.ctypeslib.as_array(tmp.get_obj()) + self.cov_var_ctl = self.cov_var_ctl.reshape(sh) + + sh = self.cst.shape + tmp = multiprocessing.Array(ctypes.c_double, self.cst.size) + self.cst = numpy.ctypeslib.as_array(tmp.get_obj()) + self.cst = self.cst.reshape(sh) + + sh = self.det.shape + tmp = multiprocessing.Array(ctypes.c_double, self.det.size) + self.det = numpy.ctypeslib.as_array(tmp.get_obj()) + self.det = self.det.reshape(sh) + + def get_distrib_nb(self): + """ + Return the number of Gaussian distributions in the mixture + :return: then number of distributions + """ + return self.w.shape[0] + + def read(self, mixture_file_name, prefix=''): + """Read a Mixture in hdf5 format + + :param mixture_file_name: name of the file to read from + :param prefix: + """ + with h5py.File(mixture_file_name, 'r') as f: + self.w = f.get(prefix+'w')[()] + self.w.resize(numpy.max(self.w.shape)) + self.mu = f.get(prefix+'mu')[()] + self.invcov = f.get(prefix+'invcov')[()] + self.invchol = f.get(prefix+'invchol')[()] + self.cov_var_ctl = f.get(prefix+'cov_var_ctl')[()] + self.cst = f.get(prefix+'cst')[()] + self.det = f.get(prefix+'det')[()] + self.A = f.get(prefix+'a')[()] + + @check_path_existance + def write_alize(self, mixture_file_name): + """Save a mixture in alize raw format + + :param mixture_file_name: name of the file to write in + """ + with open(mixture_file_name, 'wb') as of: + # write the number of distributions per state + of.write(struct.pack("ijm', tmp, self.invchol) + lp = numpy.log(self.w[:, numpy.newaxis]) + numpy.log(self.cst[:, numpy.newaxis]) - 0.5 * (a * a).sum(-1) + + return lp.T + 
@staticmethod
def variance_control(cov, flooring, ceiling, cov_ctl):
    """Clamp a covariance array between a floor and a ceiling, in place.

    The bounds are expressed relatively to a reference covariance:
    ``flooring * cov_ctl`` and ``ceiling * cov_ctl``.

    :param cov: covariance to control, modified in place
    :param flooring: float, multiplier of ``cov_ctl`` giving the lower bound
    :param ceiling: float, multiplier of ``cov_ctl`` giving the upper bound
    :param cov_ctl: reference co-variance the bounds are derived from

    :return: ``cov`` itself, after clamping
    """
    lower = flooring * cov_ctl
    upper = ceiling * cov_ctl

    # Both masks are evaluated on the untouched input before any assignment.
    below, above = numpy.less_equal(cov, lower), numpy.greater_equal(cov, upper)

    cov[below] = lower[below]
    cov[above] = upper[above]
    return cov
def _expectation(self, accum, cep):
    """Expectation step of the EM algorithm: accumulate the zero, first and
    second order statistics of a set of frames into ``accum``.

    The log-probabilities are computed with the diagonal routine when
    ``self.invcov`` is 2-dimensional and with the full-covariance routine
    when it is 3-dimensional.

    :param accum: a Mixture object whose ``w``, ``mu`` and ``invcov`` are
        incremented in place with the accumulated statistics
    :param cep: a set of input feature frames, one frame per row; a
        1-dimensional input is turned into a single-column matrix

    :return loglk: the log-likelihood returned by ``sum_log_probabilities``
        for the input set of feature frames.
    """
    # A 1-D input becomes a column: N frames of dimension 1
    if cep.ndim == 1:
        cep = cep[:, numpy.newaxis]
    # NOTE(review): if invcov is neither 2-D nor 3-D, ``lp`` is never bound
    # and the call to sum_log_probabilities below fails with a NameError.
    if self.invcov.ndim == 2:
        lp = self.compute_log_posterior_probabilities(cep)
    elif self.invcov.ndim == 3:
        lp = self.compute_log_posterior_probabilities_full(cep)
    # presumably pp holds the per-frame posteriors (frames x distributions)
    # -- TODO confirm against sum_log_probabilities
    pp, loglk = sum_log_probabilities(lp)

    # zero order statistics
    accum.w += pp.sum(0)
    # first order statistics
    accum.mu += numpy.dot(cep.T, pp).T
    # second order statistics
    if self.invcov.ndim == 2:
        accum.invcov += numpy.dot(numpy.square(cep.T), pp).T  # version for diagonal covariance
    elif self.invcov.ndim == 3:
        # per-frame outer product of each frame with itself
        tmp = numpy.einsum('ijk,ilk->ijl', cep[:, :, numpy.newaxis], cep[:, :, numpy.newaxis])
        # sum over the frames of the outer products weighted by pp
        accum.invcov += numpy.einsum('ijk,im->mjk', tmp, pp)

    # return the log-likelihood
    return loglk
def _maximization(self, accum, ceil_cov=10, floor_cov=1e-2):
    """Re-estimate the parameters of the model which maximize the likelihood
    on the data.

    Weights, means and inverse covariances are recomputed from the
    statistics stored in ``accum``, then ``_compute_all`` is called.

    :param accum: a Mixture in which statistics computed during the E step
        are stored
    :param ceil_cov: a constant; multiplier of ``cov_var_ctl`` used as upper
        bound of the covariance, default is 10 (diagonal case only)
    :param floor_cov: a constant; multiplier of ``cov_var_ctl`` used as lower
        bound of the covariance, default is 1e-2 (diagonal case only)
    """
    self.w = accum.w / numpy.sum(accum.w)
    self.mu = accum.mu / accum.w[:, numpy.newaxis]
    if accum.invcov.ndim == 2:
        # diagonal case: second moment minus squared mean
        cov = accum.invcov / accum.w[:, numpy.newaxis] - numpy.square(self.mu)
        cov = Mixture.variance_control(cov, floor_cov, ceil_cov, self.cov_var_ctl)
        self.invcov = 1.0 / cov
    elif accum.invcov.ndim == 3:
        # full case: second moment minus the outer product of each mean
        cov = accum.invcov / accum.w[:, numpy.newaxis, numpy.newaxis] \
              - numpy.einsum('ijk,ilk->ijl', self.mu[:, :, numpy.newaxis], self.mu[:, :, numpy.newaxis])
        # ADD VARIANCE CONTROL
        # (no flooring/ceiling is applied to full covariances here)
        for gg in range(self.w.shape[0]):
            # written in place: self.invcov and self.invchol must already
            # be allocated with one matrix per distribution
            self.invcov[gg] = numpy.linalg.inv(cov[gg])
            self.invchol[gg] = numpy.linalg.cholesky(self.invcov[gg]).T
    self._compute_all()
4, 4, 8, 8, 8, 8, 8, 8), + num_thread=1, + llk_gain=0.01, + save_partial=False, + output_file_name="ubm", + ceil_cov=10, + floor_cov=1e-2): + """Expectation-Maximization estimation of the Mixture parameters. + + :param features_server: sidekit.FeaturesServer used to load data + :param feature_list: list of feature files to train the GMM + :param distrib_nb: final number of distributions + :param iterations: list of iteration number for each step of the learning process + :param num_thread: number of thread to launch for parallel computing + :param llk_gain: limit of the training gain. Stop the training when gain between + two iterations is less than this value + :param save_partial: name of the file to save intermediate mixtures, + if True, save before each split of the distributions + :param ceil_cov: + :param floor_cov: + + :return llk: a list of log-likelihoods obtained after each iteration + """ + llk = [] + + process_idmap = isinstance(feature_list, IdMap) + start_list = None + stop_list = None + if process_idmap: + start_list = feature_list.start + stop_list = feature_list.stop + session_list = feature_list.rightids + else: + session_list = feature_list + init_session_list = session_list[:20] + init_start_list = None + init_stop_list = None + if start_list is not None: + init_start_list = start_list[:20] + init_stop_list = stop_list[:20] + self._init(features_server, init_session_list, start_list=init_start_list, stop_list=init_stop_list, num_thread=num_thread) + # for N iterations: + for it in iterations[:int(numpy.log2(distrib_nb))]: + # Save current model before spliting + if save_partial: + self.write('{}_{}g.h5'.format(output_file_name, self.get_distrib_nb()), prefix='') + + self._split_ditribution() + + # initialize the accumulator + accum = copy.deepcopy(self) + + for i in range(it): + accum._reset() + + # serialize the accum + accum._serialize() + llk_acc = numpy.zeros(1) + sh = llk_acc.shape + with warnings.catch_warnings(): + 
warnings.simplefilter('ignore', RuntimeWarning) + tmp = multiprocessing.Array(ctypes.c_double, llk_acc.size) + llk_acc = numpy.ctypeslib.as_array(tmp.get_obj()) + llk_acc = llk_acc.reshape(sh) + + logging.debug('Expectation') + # E step + if process_idmap: + self._expectation_list_idmap(stat_acc=accum, + feature_list=session_list, + start_list=start_list, + stop_list=stop_list, + feature_server=features_server, + llk_acc=llk_acc, + num_thread=num_thread) + else: + self._expectation_list(stat_acc=accum, + feature_list=session_list, + feature_server=features_server, + llk_acc=llk_acc, + num_thread=num_thread) + llk.append(llk_acc[0] / numpy.sum(accum.w)) + + # M step + logging.debug('Maximisation') + self._maximization(accum, ceil_cov=ceil_cov, floor_cov=floor_cov) + if i > 0: + # gain = llk[-1] - llk[-2] + # if gain < llk_gain: + # logging.debug( + # 'EM (break) distrib_nb: %d %i/%d gain: %f -- %s, %d', + # self.mu.shape[0], i + 1, it, gain, self.name, + # len(cep)) + # break + # else: + # logging.debug( + # 'EM (continu) distrib_nb: %d %i/%d gain: %f -- %s, %d', + # self.mu.shape[0], i + 1, it, gain, self.name, + # len(cep)) + # break + pass + else: + # logging.debug( + # 'EM (start) distrib_nb: %d %i/%i llk: %f -- %s, %d', + # self.mu.shape[0], i + 1, it, llk[-1], + # self.name, len(cep)) + pass + + return llk + + def EM_uniform(self, cep, distrib_nb, iteration_min=3, iteration_max=10, + llk_gain=0.01, do_init=True): + + """Expectation-Maximization estimation of the Mixture parameters. 
+ + :param cep: set of feature frames to consider + :param cep: set of feature frames to consider + :param distrib_nb: number of distributions + :param iteration_min: minimum number of iterations to perform + :param iteration_max: maximum number of iterations to perform + :param llk_gain: gain in term of likelihood, stop the training when the gain is less than this value + :param do_init: boolean, if True initialize the GMM from the training data + + :return llk: a list of log-likelihoods obtained after each iteration + + """ + llk = [] + + if do_init: + self._init_uniform(cep, distrib_nb) + accum = copy.deepcopy(self) + + for i in range(0, iteration_max): + accum._reset() + # serialize the accum + accum._serialize() + llk_acc = numpy.zeros(1) + sh = llk_acc.shape + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + tmp = multiprocessing.Array(ctypes.c_double, llk_acc.size) + llk_acc = numpy.ctypeslib.as_array(tmp.get_obj()) + llk_acc = llk_acc.reshape(sh) + + # E step + # llk.append(self._expectation_parallel(accum, cep, num_thread) / cep.shape[0]) + # self._expectation(accum,cep) + llk.append(self._expectation(accum, cep) / cep.shape[0]) + + # M step + self._maximization(accum) + if i > 0: + gain = llk[-1] - llk[-2] + if gain < llk_gain and i >= iteration_min: + logging.debug( + 'EM (break) distrib_nb: %d %i/%d gain: %f -- %s, %d', + self.mu.shape[0], i + 1, iteration_max, gain, self.name, + len(cep)) + break + else: + logging.debug( + 'EM (continu) distrib_nb: %d %i/%d gain: %f -- %s, %d', + self.mu.shape[0], i + 1, iteration_max, gain, self.name, + len(cep)) + else: + logging.debug( + 'EM (start) distrib_nb: %d %i/%i llk: %f -- %s, %d', + self.mu.shape[0], i + 1, iteration_max, llk[-1], + self.name, len(cep)) + return llk + + def _init_uniform(self, cep, distrib_nb): + """ + + :param cep: matrix of acoustic frames + :param distrib_nb: number of distributions + """ + + # Load data to initialize the mixture + # self._init(fs, cep) 
+ cov_tmp = copy.deepcopy(self.invcov) + nb = cep.shape[0] + self.w = numpy.full(distrib_nb, 1.0 / distrib_nb, "d") + self.cst = numpy.zeros(distrib_nb, "d") + self.det = numpy.zeros(distrib_nb, "d") + + for i in range(0, distrib_nb): + start = nb // distrib_nb * i + end = max(start + 10, nb) + mean = numpy.mean(cep[start:end, :], axis=0) + cov = (cep[start:end, :]**2).mean(0) + if i == 0: + self.mu = mean + self.invcov = 1./cov[None] + else: + self.mu = numpy.vstack((self.mu, mean)) + self.invcov = numpy.vstack((self.invcov, cov)) + self.cov_var_ctl = 1.0 / copy.deepcopy(self.invcov) + + self._compute_all() + + def EM_diag2full(self, diagonal_mixture, features_server, featureList, iterations=2, num_thread=1): + """Expectation-Maximization estimation of the Mixture parameters. + + :param features_server: sidekit.FeaturesServer used to load data + :param featureList: list of feature files to train the GMM + :param iterations: list of iteration number for each step of the learning process + :param num_thread: number of thread to launch for parallel computing + + :return llk: a list of log-likelihoods obtained after each iteration + """ + llk = [] + + # Convert the covariance matrices into full ones + distrib_nb = diagonal_mixture.w.shape[0] + dim = diagonal_mixture.mu.shape[1] + + self.w = diagonal_mixture.w + self.cst = diagonal_mixture.cst + self.det = diagonal_mixture.det + self.mu = diagonal_mixture.mu + + self.invcov = numpy.empty((distrib_nb, dim, dim)) + self.invchol = numpy.empty((distrib_nb, dim, dim)) + for gg in range(distrib_nb): + self.invcov[gg] = numpy.diag(diagonal_mixture.invcov[gg, :]) + self.invchol[gg] = numpy.linalg.cholesky(self.invcov[gg]) + self.cov_var_ctl = numpy.diag(diagonal_mixture.cov_var_ctl) + self.name = diagonal_mixture.name + self.A = numpy.zeros(self.cst.shape) # we keep zero here as it is not used for full covariance distributions + + # Create Accumulator + accum = copy.deepcopy(self) + + # Run iterations of EM + for it in 
def merge(self, model_list):
    """
    Replace the content of this Mixture by the concatenation of a list of
    Mixtures. The concatenated weights are divided by their sum.

    :param model_list: a list of Mixture objects to merge
    """
    def gather(stack, attribute):
        # Collect one attribute over all models and stack the values
        return stack([getattr(model, attribute) for model in model_list])

    self.w = gather(numpy.hstack, 'w')
    self.w /= self.w.sum()

    # Per-distribution matrices are stacked row-wise
    for attribute in ('mu', 'invcov', 'invchol', 'cov_var_ctl'):
        setattr(self, attribute, gather(numpy.vstack, attribute))
    # Per-distribution scalars are concatenated
    for attribute in ('cst', 'det'):
        setattr(self, attribute, gather(numpy.hstack, attribute))

    self.name = "_".join([model.name for model in model_list])
    self.A = gather(numpy.hstack, 'A')

    self._compute_all()
    assert self.validate(), "Error while merging models"
+ +""" +Copyright 2014-2021 Anthony Larcher and Sylvain Meignier + +:mod:`nnet` provides methods to manage Neural Networks using PyTorch +""" + + +from .xsets import IdMapSetPerSpeaker +from .xsets import SideSet +from .xsets import SideSampler +from .xvector import Xtractor +from .xvector import xtrain +from .xvector import extract_embeddings +from .pooling import MeanStdPooling +from .pooling import AttentivePooling +from .pooling import GruPooling +from .res_net import ResBlock +from .res_net import PreResNet34 +from .res_net import PreFastResNet34 +from .res_net import PreHalfResNet34 +from .sincnet import SincNet +from .preprocessor import RawPreprocessor +from .preprocessor import MfccFrontEnd +from .preprocessor import MelSpecFrontEnd + + +__author__ = "Anthony Larcher and Sylvain Meignier" +__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier" +__license__ = "LGPL" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' diff --git a/sidekit/sidekit/nnet/augmentation.py b/sidekit/sidekit/nnet/augmentation.py new file mode 100644 index 0000000..7f19dfc --- /dev/null +++ b/sidekit/sidekit/nnet/augmentation.py @@ -0,0 +1,308 @@ +# coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
class PreEmphasis(torch.nn.Module):
    """
    Apply pre-emphasis filtering: ``y[t] = x[t] - coef * x[t - 1]``
    """

    def __init__(self, coef: float = 0.97):
        """
        :param coef: pre-emphasis coefficient applied to the previous sample
        """
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
        # Registered as a buffer so that it follows the module across devices
        # without being a trainable parameter.
        self.register_buffer(
            'flipped_filter',
            torch.tensor([-self.coef, 1.], dtype=torch.float32).unsqueeze(0).unsqueeze(0)
        )

    def forward(self, input_signal: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the pre-emphasis filtering

        :param input_signal: the input signal, a 2-dimensional tensor with
            the samples on the last dimension
        :return: the filtered signal, same shape as the input
        """
        assert input_signal.dim() == 2, 'The number of dimensions of input tensor must be 2!'
        # Add a channel dimension for conv1d, then reflect-pad one sample on
        # the left so that the output has the same length as the input.
        padded = torch.nn.functional.pad(input_signal.unsqueeze(1), (1, 0), 'reflect')
        return torch.nn.functional.conv1d(padded, self.flipped_filter).squeeze(1)
def data_augmentation(speech,
                      sample_rate,
                      transform_dict,
                      transform_number,
                      noise_df=None,
                      rir_df=None,
                      babble_noise=True):
    """
    Perform data augmentation on an input signal.
    Each speech chunk is augmented by using 'transform_number' transformations that are picked up randomly from a
    dictionary of possible transformations. The selected transformations are always applied in the same order:
    stretch, add_reverb, add_noise, phone_filtering, filtering, codec.

    :param speech: the input signal to be augmented, a 2-dimensional tensor with the samples on the last dimension
    :param sample_rate: sampling rate of the input signal to augment
    :param transform_dict: the dictionary of possibles augmentations to apply; the entries "add_reverb" and
        "add_noise" must provide a "data_path"
    :param transform_number: the number of transformations to apply on each chunk
    :param rir_df: a pandas dataframe object including the list of RIR signals to chose from; default is None
    :param noise_df: a pandas dataframe object including the list of NOISE signals to chose from; default is None
    :param babble_noise: boolean; when False the noise is only picked among music and noise, when True it can also
        be overlapping speech or babble noise (typically turned to False when the task includes overlapping
        speech detection).

    :return: augmented signal

    tranformation
        pipeline: add_noise,add_reverb
        add_noise:
            noise_db_csv: filename.csv
            snr: 5,6,7,8,9,10,11,12,13,14,15
        add_reverb:
            rir_db_csv: filename.csv
        codec: true
        phone_filtering: true
    """
    # Select the data augmentation randomly
    transform_names = list(transform_dict.keys())
    aug_idx = random.sample(range(len(transform_names)), k=transform_number)
    augmentations = {transform_names[idx] for idx in aug_idx}

    if "stretch" in augmentations:
        rate = random.uniform(0.8, 1.2)
        # torchaudio.functional has no TimeStretch; the waveform is stretched
        # with the sox "tempo" effect, which changes the duration but not the pitch.
        # NOTE(review): the number of samples changes with the rate.
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[["tempo", "{:.4f}".format(rate)]])

    if "add_reverb" in augmentations:
        rir_nfo = rir_df.iloc[random.randrange(rir_df.shape[0])].file_id
        rir_fn = transform_dict["add_reverb"]["data_path"] + rir_nfo  # TODO harmonize with noise
        rir, rir_fs = torchaudio.load(rir_fn)
        assert rir_fs == sample_rate
        # The convolved signal is truncated to the length of the input
        speech = torch.tensor(signal.convolve(speech, rir, mode='full')[:, :speech.shape[1]])

    if "add_noise" in augmentations:
        noise_path = transform_dict["add_noise"]["data_path"]

        # Pick a noise type
        noise = torch.zeros_like(speech)
        if not babble_noise:
            noise_idx = random.randrange(1, 3)
        else:
            noise_idx = random.randrange(0, 4)

        # speech
        if noise_idx == 0:
            # Pick a SNR level
            # TODO make SNRs configurable by noise type
            snr_db = random.randint(13, 20)
            pick_count = random.randint(3, 7)
            index_list = random.sample(range(noise_df.loc['speech'].shape[0]), k=pick_count)
            for idx in index_list:
                noise_row = noise_df.loc['speech'].iloc[idx]
                noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
            noise /= pick_count
        # music
        elif noise_idx == 1:
            snr_db = random.randint(5, 15)
            noise_row = noise_df.loc['music'].iloc[random.randrange(noise_df.loc['music'].shape[0])]
            noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
        # noise
        elif noise_idx == 2:
            snr_db = random.randint(0, 15)
            noise_row = noise_df.loc['noise'].iloc[random.randrange(noise_df.loc['noise'].shape[0])]
            noise += load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
        # babble noise with different volume
        elif noise_idx == 3:
            snr_db = random.randint(13, 20)
            pick_count = random.randint(5, 10)  # Randomly select 5 to 10 speakers
            index_list = random.choices(range(noise_df.loc['speech'].shape[0]), k=pick_count)

            noise = torch.zeros(1, speech.shape[1])
            for idx in index_list:
                noise_row = noise_df.loc['speech'].iloc[idx]
                noise_ = load_noise_seg(noise_row, speech.shape, sample_rate, noise_path)
                # Randomly select volume level (5-15d)
                transform = torchaudio.transforms.Vol(gain=random.randint(5, 15), gain_type='db')
                noise += transform(noise_)
            noise /= pick_count

        # Scale the speech relatively to the noise to reach the selected SNR
        speech_power = speech.norm(p=2)
        noise_power = noise.norm(p=2)
        snr = 10 ** (snr_db / 20)
        scale = snr * noise_power / speech_power
        speech = (scale * speech + noise) / 2

    if "phone_filtering" in augmentations:
        final_shape = speech.shape[1]
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[
                ["lowpass", "4000"],
                ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"],
                ["rate", "16000"],
            ])
        speech = speech[:, :final_shape]

    if "filtering" in augmentations:
        effects = [
            ["bandpass", "2000", "3500"],
            ["bandstop", "200", "500"]]
        # the module is torchaudio.sox_effects ("sox_eefects" raised an AttributeError)
        speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
            speech,
            sample_rate,
            effects=[effects[random.randint(0, 1)]],
        )

    if "codec" in augmentations:
        final_shape = speech.shape[1]
        configs = [
            ({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"),
            ({"format": "gsm"}, "GSM-FR"),
            ({"format": "mp3", "compression": -9}, "MP3"),
            ({"format": "vorbis", "compression": -1}, "Vorbis")
        ]
        param, title = random.choice(configs)
        speech = torchaudio.functional.apply_codec(speech, sample_rate, **param)
        speech = speech[:, :final_shape]

    return speech
num_frames=int(noise_duration * sample_rate)) + assert noise_sr == sample_rate + + if noise_seg.shape[1] < speech_shape[1]: + noise_seg = torch.tensor(numpy.resize(noise_seg.numpy(), speech_shape)) + return noise_seg diff --git a/sidekit/sidekit/nnet/loss.py b/sidekit/sidekit/nnet/loss.py new file mode 100644 index 0000000..832bf14 --- /dev/null +++ b/sidekit/sidekit/nnet/loss.py @@ -0,0 +1,437 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . 
class ArcMarginModel(torch.nn.Module):
    """
    Additive angular margin (ArcFace-style) classification head.

    Embeddings and class weights are L2-normalised, the margin ``m`` is added to the
    angle of the target class and the resulting cosines are scaled by ``s``.
    """
    def __init__(self, args):
        """
        :param args: configuration object providing ``emb_size``, ``easy_margin``,
            ``margin_m`` and ``margin_s``. The number of classes is read from
            ``args.num_classes``.
            NOTE(review): the previous code used an undefined global ``num_classes``
            and could not be instantiated; confirm the attribute name against callers.
        """
        super(ArcMarginModel, self).__init__()

        self.weight = torch.nn.Parameter(torch.empty(args.num_classes, args.emb_size))
        torch.nn.init.xavier_uniform_(self.weight)

        self.easy_margin = args.easy_margin
        self.m = args.margin_m
        self.s = args.margin_s

        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        # cosine below which theta + m would exceed pi
        self.th = math.cos(math.pi - self.m)
        # penalty subtracted from the cosine in that case
        self.mm = math.sin(math.pi - self.m) * self.m

    def forward(self, input, label):
        """
        Compute the margin-penalised, scaled logits.

        :param input: batch of embeddings, one row per sample
        :param label: class index of each sample
        :return: logits with one row per sample and one column per class
        """
        x = torch.nn.functional.normalize(input)
        W = torch.nn.functional.normalize(self.weight)
        cosine = torch.nn.functional.linear(x, W)
        # clamp guards against tiny negative values caused by rounding (sqrt -> NaN)
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # one-hot mask created on the same device and dtype as the logits
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output
class Am_softmax(torch.nn.Module):
    """
    Additive margin softmax (CosFace-style) head.

    Implementation of the additive margin softmax loss in https://arxiv.org/abs/1801.05599
    """
    def __init__(self, embedding_size=512, classnum=51332, m=0.35, s=30.):
        """
        :param embedding_size: dimension of the input embeddings
        :param classnum: number of classes
        :param m: additive margin subtracted from the target cosine
            (0.35 is the value recommended by the paper)
        :param s: scaling factor of the logits, see normface https://arxiv.org/abs/1704.06369
        """
        super(Am_softmax, self).__init__()
        self.classnum = classnum
        self.kernel = Parameter(torch.Tensor(embedding_size, classnum))
        # initial kernel
        self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5)
        self.m = m
        self.s = s

    def forward(self, embbedings, label):
        """
        Compute the scaled cosine logits with the margin applied to the target class.

        :param embbedings: batch of embeddings, one row per sample; they are multiplied
            as-is with the normalised kernel (presumably already L2-normalised)
        :param label: class index of each sample
        :return: logits with one row per sample and one column per class
        """
        # normalise each class vector (column of the kernel) to unit L2 norm
        kernel_norm = torch.div(self.kernel, torch.norm(self.kernel, 2, 0, True))
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
        # One-hot mask of the target classes. The margin is subtracted through the mask
        # rather than by indexing with a uint8 tensor, which is deprecated in PyTorch.
        one_hot = torch.zeros_like(cos_theta)
        one_hot.scatter_(1, label.long().view(-1, 1), 1.0)
        output = cos_theta - self.m * one_hot  # only change the target-class output
        # scale up in order to make softmax work, first introduced in normface
        return output * self.s
class ArcMarginProduct(torch.nn.Module):
    """
    Implement of large margin arc distance:

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        s: norm of input feature
        m: margin
        cos(theta + m)
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        """
        :param in_features: size of each input sample
        :param out_features: size of each output sample (number of classes)
        :param s: scaling factor applied to the cosines
        :param m: angular margin added to the target-class angle
        :param easy_margin: if True, the margin is only applied where cos(theta) > 0
        """
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        torch.nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # derive cos_m, sin_m, th and mm from the margin in a single place
        self.change_params()

    def change_params(self, s=None, m=None):
        """
        Update the scale and/or the margin and recompute the derived constants.

        :param s: new scaling factor, the current one is kept when None
        :param m: new margin, the current one is kept when None
        """
        if s is not None:
            self.s = s
        if m is not None:
            self.m = m
        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        # cosine below which theta + m would exceed pi
        self.th = math.cos(math.pi - self.m)
        # penalty subtracted from the cosine in that case
        self.mm = math.sin(math.pi - self.m) * self.m

    def forward(self, input, target=None):
        """
        Compute the scaled cosine logits, with the angular margin when targets are given.

        :param input: batch of embeddings, one row per sample
        :param target: class index of each sample, or None
        :return: ``cosine * s`` alone when ``target`` is None, otherwise the tuple
            ``(margin logits, cosine * s)``
        """
        # cos(theta)
        cosine = torch.nn.functional.linear(torch.nn.functional.normalize(input),
                                            torch.nn.functional.normalize(self.weight))
        # identity test: "== None" on a tensor relies on comparison fallbacks
        if target is None:
            return cosine * self.s
        # cos(theta + m)
        sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m

        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)

        one_hot = torch.zeros_like(cosine)
        # scatter_ requires int64 indices
        one_hot.scatter_(1, target.view(-1, 1).long(), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output = output * self.s

        return output, cosine * self.s
class AngularProximityMagnet(torch.nn.Module):
    """
    Angular prototypical loss combined with a binary cross-entropy "magnet" term
    computed on the positive/anchor similarity matrix.

    from https://github.com/clovaai/voxceleb_trainer/blob/3bfd557fab5a3e6cd59d717f5029b3a20d22a281/loss/angleproto.py
    """
    def __init__(self, spk_count, emb_dim=256, batch_size=512, init_w=10.0, init_b=-5.0, **kwargs):
        """
        :param spk_count: number of speakers (output size of the linear classifier)
        :param emb_dim: dimension of the embeddings
        :param batch_size: not used by this module, kept for interface compatibility
        :param init_w: initial scale applied to the cosine similarity matrix
        :param init_b: initial bias added to the scaled cosine similarity matrix
        :param kwargs: ignored
        """
        super(AngularProximityMagnet, self).__init__()

        self.test_normalize = True

        self.w = torch.nn.Parameter(torch.tensor(init_w))
        self.b1 = torch.nn.Parameter(torch.tensor(init_b))
        self.b2 = torch.nn.Parameter(torch.tensor(+5.54))

        self.cce_backend = torch.nn.Sequential(OrderedDict([
            ("linear8", torch.nn.Linear(emb_dim, spk_count))
        ]))

        self.criterion = torch.nn.CrossEntropyLoss()
        self.magnet_criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')

    def forward(self, x, target=None):
        """
        :param x: batch of embeddings (2-D); with a target, consecutive rows are grouped
            by pairs (positive, anchor) — presumably two segments of the same speaker
        :param target: only tested against None to select the training path; its values
            are not used in the loss
        :return: ``(x, cce_prediction)`` when ``target`` is None, otherwise
            ``(batch_loss, cce_prediction)``
        """
        assert x.size()[1] >= 2

        cce_prediction = self.cce_backend(x)

        if target is None:
            return x, cce_prediction

        x = x.reshape(-1, 2, x.size()[-1]).squeeze(1)
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        pair_count = int(out_positive.shape[0])

        ap_sim_matrix = torch.nn.functional.cosine_similarity(out_positive.unsqueeze(-1),
                                                              out_anchor.unsqueeze(-1).transpose(0, 2))
        # The clamped scale was previously computed and discarded; use it so that the
        # scale cannot become non-positive.
        scale = torch.clamp(self.w, min=1e-6)
        ap_sim_matrix = ap_sim_matrix * scale + self.b1

        cos_sim_matrix = torch.mm(out_positive, out_anchor.T)
        cos_sim_matrix = cos_sim_matrix + self.b2
        # constant offset log(p / (1 - p)) with p = 1 / pair_count
        cos_sim_matrix = cos_sim_matrix + float(numpy.log(1 / pair_count / (1 - 1 / pair_count)))

        # Targets follow the device of the input instead of a hard-coded "cuda:0",
        # so the loss also runs on CPU or on another GPU.
        pair_index = torch.arange(0, pair_count, device=x.device)
        # matching positive/anchor pairs lie on the diagonal
        mask = torch.eye(pair_count, device=x.device)
        batch_loss = self.criterion(ap_sim_matrix, pair_index) \
            + self.magnet_criterion(cos_sim_matrix.flatten().unsqueeze(1), mask.flatten().unsqueeze(1))
        return batch_loss, cce_prediction
class MeanStdPooling(torch.nn.Module):
    """
    Statistics pooling: concatenation of the mean and of the standard deviation
    computed along dimension 2 of the input
    """
    def __init__(self):
        """
        The layer has no parameter
        """
        super().__init__()

    def forward(self, x):
        """
        :param x: tensor pooled along its dimension 2
        :return: the means followed by the standard deviations, concatenated on dimension 1
        """
        statistics = (x.mean(dim=2), x.std(dim=2))
        return torch.cat(statistics, dim=1)
class GruPooling(torch.nn.Module):
    """
    Pooling performed by a recurrent network: the output of the GRU at the last
    time step is used as pooled representation
    """
    def __init__(self, input_size, gru_node, nb_gru_layer):
        """
        :param input_size: number of input channels (features of the batch-norm and of the GRU)
        :param gru_node: hidden size of the GRU
        :param nb_gru_layer: number of stacked GRU layers
        """
        super().__init__()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope=0.3)
        self.bn_before_gru = torch.nn.BatchNorm1d(num_features=input_size)
        self.gru = torch.nn.GRU(input_size=input_size,
                                hidden_size=gru_node,
                                num_layers=nb_gru_layer,
                                batch_first=True)

    def forward(self, x):
        """
        :param x: tensor of shape (batch, filt, time)
        :return: output of the last GRU layer at the last time step
        """
        activated = self.lrelu_keras(self.bn_before_gru(x))
        # (batch, filt, time) >> (batch, time, filt)
        sequence = activated.permute(0, 2, 1)
        self.gru.flatten_parameters()
        outputs, _ = self.gru(sequence)
        return outputs[:, -1, :]
class MfccFrontEnd(torch.nn.Module):
    """
    Front-end that extracts MFCC coefficients from an audio signal:
    pre-emphasis, MFCC computation then instance normalisation of the coefficients
    """
    def __init__(self,
                 pre_emphasis=0.97,
                 sample_rate=16000,
                 n_fft=2048,
                 f_min=133.333,
                 f_max=6855.4976,
                 win_length=1024,
                 window_fn=torch.hann_window,
                 hop_length=512,
                 power=2.0,
                 n_mels=100,
                 n_mfcc=80):
        """
        :param pre_emphasis: coefficient given to the PreEmphasis module
        :param sample_rate: sampling rate given to the MFCC transform
        :param n_fft: ``n_fft`` of the underlying Mel spectrogram
        :param f_min: ``f_min`` of the underlying Mel spectrogram
        :param f_max: ``f_max`` of the underlying Mel spectrogram
        :param win_length: ``win_length`` of the underlying Mel spectrogram
        :param window_fn: ``window_fn`` of the underlying Mel spectrogram
        :param hop_length: ``hop_length`` of the underlying Mel spectrogram
        :param power: ``power`` of the underlying Mel spectrogram
        :param n_mels: ``n_mels`` of the underlying Mel spectrogram
        :param n_mfcc: number of MFCC coefficients
        """
        super().__init__()

        # keep every setting as an attribute
        self.pre_emphasis = pre_emphasis
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.f_min = f_min
        self.f_max = f_max
        self.win_length = win_length
        self.window_fn = window_fn
        self.hop_length = hop_length
        self.power = power
        self.n_mels = n_mels
        self.n_mfcc = n_mfcc

        self.PreEmphasis = PreEmphasis(self.pre_emphasis)

        # arguments forwarded by the MFCC transform to its Mel spectrogram
        self.melkwargs = dict(n_fft=self.n_fft,
                              f_min=self.f_min,
                              f_max=self.f_max,
                              win_length=self.win_length,
                              window_fn=self.window_fn,
                              hop_length=self.hop_length,
                              power=self.power,
                              n_mels=self.n_mels)

        self.MFCC = torchaudio.transforms.MFCC(sample_rate=self.sample_rate,
                                               n_mfcc=self.n_mfcc,
                                               dct_type=2,
                                               log_mels=True,
                                               melkwargs=self.melkwargs)

        self.CMVN = torch.nn.InstanceNorm1d(self.n_mfcc)

    def forward(self, x):
        """
        :param x: audio signal given to the PreEmphasis module
        :return: the normalised MFCC coefficients
        """
        # no gradient is tracked and autocast is disabled for the feature extraction
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
            emphasized = self.PreEmphasis(x)
            coefficients = self.MFCC(emphasized)
            normalized = self.CMVN(coefficients)
        return normalized
class RawPreprocessor(torch.nn.Module):
    """
    Pre-process the raw audio signal by using a SincNet architecture
    [ADD REF]
    """
    def __init__(self, nb_samp, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50, sample_rate=16000):
        """
        :param nb_samp: number of features normalised by the input LayerNorm
        :param in_channels: ``in_channels`` of the SincConv1d layer
        :param out_channels: ``out_channels`` of the SincConv1d layer, also the number of
            features of the batch-norm that follows
        :param kernel_size: ``kernel_size`` of the SincConv1d layer
        :param stride: ``stride`` of the SincConv1d layer
        :param padding: ``padding`` of the SincConv1d layer
        :param dilation: ``dilation`` of the SincConv1d layer
        :param bias: ``bias`` of the SincConv1d layer
        :param groups: ``groups`` of the SincConv1d layer
        :param min_low_hz: ``min_low_hz`` of the SincConv1d layer
        :param min_band_hz: ``min_band_hz`` of the SincConv1d layer
        :param sample_rate: ``sample_rate`` of the SincConv1d layer
        """
        super().__init__()
        self.ln = LayerNorm(nb_samp)
        sinc_settings = dict(in_channels=in_channels,
                             out_channels=out_channels,
                             kernel_size=kernel_size,
                             sample_rate=sample_rate,
                             stride=stride,
                             padding=padding,
                             dilation=dilation,
                             bias=bias,
                             groups=groups,
                             min_low_hz=min_low_hz,
                             min_band_hz=min_band_hz)
        self.first_conv = SincConv1d(**sinc_settings)
        self.first_bn = torch.nn.BatchNorm1d(num_features=out_channels)
        self.lrelu = torch.nn.LeakyReLU()
        self.lrelu_keras = torch.nn.LeakyReLU(negative_slope=0.3)

    def forward(self, x):
        """
        :param x: batch of raw signals, indexed as (batch, samples)
        :return: the filtered signal after max-pooling, batch-norm and LeakyReLU
        """
        batch_size, sample_count = x.shape[0], x.shape[1]
        # normalise each signal then add the channel dimension expected by the convolution
        signal = self.ln(x).view(batch_size, 1, sample_count)
        filtered = torch.abs(self.first_conv(signal))
        pooled = torch.nn.functional.max_pool1d(filtered, 3)
        return self.lrelu_keras(self.first_bn(pooled))
class FeatureMapScaling(torch.nn.Module):
    """
    Scale and/or shift each feature map by a gate computed from its average over
    the last dimension (linear layer followed by a sigmoid)
    """
    def __init__(self, nb_dim, do_add = True, do_mul = True):
        """
        :param nb_dim: number of feature maps (input and output size of the linear layer)
        :param do_add: if True, the gate is added to the feature maps
        :param do_mul: if True, the feature maps are multiplied by the gate
        """
        super().__init__()
        self.fc = torch.nn.Linear(nb_dim, nb_dim)
        self.sig = torch.nn.Sigmoid()
        self.do_add = do_add
        self.do_mul = do_mul

    def forward(self, x):
        """
        :param x: feature maps, averaged over their last dimension to compute the gate
        :return: the scaled and/or shifted feature maps
        """
        batch_size, map_count = x.size(0), x.size(1)

        # one average value per feature map
        pooled = torch.nn.functional.adaptive_avg_pool1d(x, 1).view(batch_size, -1)
        # one gate per feature map, broadcast over the last dimension
        gate = self.sig(self.fc(pooled)).view(batch_size, map_count, -1)

        scaled = x * gate if self.do_mul else x
        return scaled + gate if self.do_add else scaled
class ResBlock(torch.nn.Module):
    """
    Pre-activation residual block made of two 3x3 convolutions, with a 1x1
    convolution on the skip connection when the number of channels changes.
    """
    def __init__(self, in_channels, out_channels, stride, is_first=False):
        """
        :param in_channels: number of channels of the input
        :param out_channels: number of channels of the output
        :param stride: stride given to both 3x3 convolutions.
            NOTE(review): the skip connection is not strided, so the residual sum only
            has matching shapes for stride=1 — confirm the intended use of other values.
        :param is_first: boolean, True if this block is the first of the model, if not,
            apply a BatchNorm layer (and the activation) first
        """
        super(ResBlock, self).__init__()
        self.is_first = is_first
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.expansion = self.out_channels // self.in_channels

        self.resample = None
        if self.in_channels != self.out_channels:
            self.resample = torch.nn.Sequential(
                torch.nn.Conv2d(in_channels=self.in_channels,
                                out_channels=self.out_channels,
                                kernel_size=1),
                # Sized on the real number of output channels: in_channels * expansion
                # differs from it when out_channels is not a multiple of in_channels.
                torch.nn.BatchNorm2d(self.out_channels),
            )

        if not self.is_first:
            self.batch_norm1 = torch.nn.BatchNorm2d(num_features=self.in_channels)

        self.activation = torch.nn.LeakyReLU()

        self.conv1 = torch.nn.Conv2d(in_channels=self.in_channels,
                                     out_channels=self.out_channels,
                                     kernel_size=(3, 3),
                                     stride=stride,
                                     padding=1,
                                     padding_mode='zeros',
                                     dilation=1)
        self.conv2 = torch.nn.Conv2d(in_channels=self.out_channels,
                                     out_channels=self.out_channels,
                                     stride=stride,
                                     kernel_size=(3, 3),
                                     padding=1,
                                     padding_mode='zeros',
                                     dilation=1)

        # the same batch-norm layer is applied after both convolutions
        self.batch_norm2 = torch.nn.BatchNorm2d(num_features=self.out_channels)

    def forward(self, x):
        """
        :param x: input feature maps, with ``in_channels`` channels on dimension 1
        :return: activated sum of the convolution path and of the skip connection
        """
        identity = x

        if not self.is_first:
            out = self.activation(self.batch_norm1(x))
        else:
            out = x

        out = self.conv1(out)
        out = self.batch_norm2(out)
        out = self.activation(out)

        out = self.conv2(out)
        out = self.batch_norm2(out)

        # Resample whenever the projection exists: testing "expansion != 1" skipped it
        # for channel counts that are not multiples of each other.
        if self.resample is not None:
            identity = self.resample(identity)
        out += identity

        out = self.activation(out)
        return out
class Bottleneck(torch.nn.Module):
    """
    Bottleneck residual block: 1x1, 3x3 and 1x1 convolutions, each followed by a
    batch-norm, with a projection on the skip connection when the shape changes.
    """
    # Default expansion, available on the class itself: the _make_layer methods of the
    # networks read ``block.expansion`` on the class (as BasicBlock provides), which
    # failed when the value only existed on instances.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, expansion=4):
        """
        :param in_planes: number of channels of the input
        :param planes: number of channels of the two first convolutions
        :param stride: stride of the 3x3 convolution and of the projection
        :param expansion: ratio between the number of output channels and ``planes``
        """
        super(Bottleneck, self).__init__()

        # the instance value overrides the class default
        self.expansion = expansion

        self.conv1 = torch.nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = torch.nn.BatchNorm2d(planes)
        self.conv2 = torch.nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = torch.nn.BatchNorm2d(planes)
        self.conv3 = torch.nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = torch.nn.BatchNorm2d(self.expansion*planes)

        # identity skip connection unless the resolution or the number of channels changes
        self.shortcut = torch.nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = torch.nn.Sequential(
                torch.nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                torch.nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        """
        :param x: input feature maps, with ``in_planes`` channels on dimension 1
        :return: ReLU of the sum of the convolution path and of the skip connection
        """
        out = torch.nn.functional.relu(self.bn1(self.conv1(x)))
        out = torch.nn.functional.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = torch.nn.functional.relu(out)
        return out
forward(self, x): + """ + + :param x: + :return: + """ + out = self.MFCC(x) + out = self.CMVN(out) + out = out.unsqueeze(1) + out = torch.nn.functional.relu(self.bn1(self.conv1(out))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = self.layer5(out) + out = self.layer6(out) + out = self.layer7(out) + out = torch.flatten(out, start_dim=1, end_dim=2) + out = self.stat_pooling(out) + out = self.before_embedding(out) + return out + + +class PreResNet34(torch.nn.Module): + """ + Networks that contains only the ResNet part until pooling, with NO classification layers + """ + def __init__(self, block=BasicBlock, num_blocks=[3, 1, 3, 1, 5, 1, 2], speaker_number=10): + super(PreResNet34, self).__init__() + self.in_planes = 128 + + self.speaker_number = speaker_number + self.conv1 = torch.nn.Conv2d(1, 128, kernel_size=3, + stride=1, padding=1, bias=False) + + self.bn1 = torch.nn.BatchNorm2d(128) + + # With block = [3, 1, 3, 1, 5, 1, 2] + self.layer1 = self._make_layer(block, 128, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=1) + self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=2) + self.layer5 = self._make_layer(block, 256, num_blocks[4], stride=1) + self.layer6 = self._make_layer(block, 256, num_blocks[5], stride=2) + self.layer7 = self._make_layer(block, 256, num_blocks[5], stride=1) + + + def _make_layer(self, block, planes, num_blocks, stride): + """ + + :param block: + :param planes: + :param num_blocks: + :param stride: + :return: + """ + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return torch.nn.Sequential(*layers) + + def forward(self, x): + """ + + :param x: + :return: + """ + out = x.unsqueeze(1) + out = 
torch.nn.functional.relu(self.bn1(self.conv1(out))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = self.layer5(out) + out = self.layer6(out) + out = self.layer7(out) + out = torch.flatten(out, start_dim=1, end_dim=2) + return out + + +class PreHalfResNet34(torch.nn.Module): + """ + Networks that contains only the ResNet part until pooling, with NO classification layers + """ + def __init__(self, block=BasicBlock, num_blocks=[3, 4, 6, 3], speaker_number=10): + super(PreHalfResNet34, self).__init__() + self.in_planes = 32 + self.speaker_number = speaker_number + + self.conv1 = torch.nn.Conv2d(1, 32, kernel_size=3, + stride=(1, 1), padding=1, bias=False) + self.bn1 = torch.nn.BatchNorm2d(32) + + # With block = [3, 4, 6, 3] + self.layer1 = self._make_layer(block, 32, num_blocks[0], stride=(1, 1)) + self.layer2 = self._make_layer(block, 64, num_blocks[1], stride=(2, 2)) + self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=(2, 2)) + self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=(2, 2)) + + + def _make_layer(self, block, planes, num_blocks, stride): + """ + + :param block: + :param planes: + :param num_blocks: + :param stride: + :return: + """ + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return torch.nn.Sequential(*layers) + + def forward(self, x): + """ + + :param x: + :return: + """ + out = x.unsqueeze(1) + out = out.contiguous(memory_format=torch.channels_last) + out = torch.nn.functional.relu(self.bn1(self.conv1(out))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = out.contiguous(memory_format=torch.contiguous_format) + out = torch.flatten(out, start_dim=1, end_dim=2) + return out + + +class PreFastResNet34(torch.nn.Module): + """ + Networks that contains only the ResNet part until 
pooling, with NO classification layers + """ + def __init__(self, block=BasicBlock, num_blocks=[3, 4, 6, 3], speaker_number=10): + super(PreFastResNet34, self).__init__() + self.in_planes = 16 + self.speaker_number = speaker_number + + self.conv1 = torch.nn.Conv2d(1, 16, kernel_size=7, + stride=(2, 1), padding=3, bias=False) + self.bn1 = torch.nn.BatchNorm2d(16) + + # With block = [3, 4, 6, 3] + self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=(2, 2)) + self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=(2, 2)) + self.layer4 = self._make_layer(block, 128, num_blocks[3], stride=(1, 1)) + + + def _make_layer(self, block, planes, num_blocks, stride): + """ + + :param block: + :param planes: + :param num_blocks: + :param stride: + :return: + """ + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return torch.nn.Sequential(*layers) + + def forward(self, x): + """ + + :param x: + :return: + """ + out = x.unsqueeze(1) + out = out.contiguous(memory_format=torch.channels_last) + out = torch.nn.functional.relu(self.bn1(self.conv1(out))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = out.contiguous(memory_format=torch.contiguous_format) + out = torch.flatten(out, start_dim=1, end_dim=2) + return out + + +def ResNet34(): + return ResNet(BasicBlock, [3, 1, 3, 1, 5, 1, 2]) + +# TODO create a flexible class that allows to generate different RESNET sequential classes and manage the sizes \ No newline at end of file diff --git a/sidekit/sidekit/nnet/sincnet.py b/sidekit/sidekit/nnet/sincnet.py new file mode 100644 index 0000000..8110ba9 --- /dev/null +++ b/sidekit/sidekit/nnet/sincnet.py @@ -0,0 +1,427 @@ +# The MIT License (MIT) +# +# Copyright (c) 2019 Mirco Ravanelli +# Copyright (c) 2019-2020 
CNRS +# Copyright (c) 2020-2021 LIUM +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# AUTHOR +# Hervé Bredin - http://herve.niderb.fr +# Anthony Larcher + +# Part of this code was taken from https://github.com/mravanelli/SincNet +# (see above license terms). + +# Please give proper credit to the authors if you are using SincNet-based +# models by citing their paper: + +# Mirco Ravanelli, Yoshua Bengio. +# "Speaker Recognition from raw waveform with SincNet". +# SLT 2018. https://arxiv.org/abs/1808.00158 + +from typing import List +import numpy +import torch +import math + + +class SincConv1d(torch.nn.Module): + """Sinc-based 1D convolution + + Parameters + ---------- + in_channels : `int` + Should be 1. + out_channels : `int` + Number of filters. + kernel_size : `int` + Filter length. + stride : `int`, optional + Defaults to 1. + sample_rate : `int`, optional + Sample rate. Defaults to 16000. + min_low_hz: `int`, optional + Defaults to 50. 
+ min_band_hz: `int`, optional + Defaults to 50. + + Usage + ----- + Same as `torch.nn.Conv1d` + + Reference + --------- + Mirco Ravanelli, Yoshua Bengio. "Speaker Recognition from raw waveform with + SincNet". SLT 2018. https://arxiv.org/abs/1808.00158 + """ + + @staticmethod + def to_mel(hz): + return 2595 * numpy.log10(1 + hz / 700) + + @staticmethod + def to_hz(mel): + return 700 * (10 ** (mel / 2595) - 1) + + def __init__(self, + out_channels, + kernel_size, + sample_rate=16000, + in_channels=1, + stride=1, + padding=0, + dilation=1, + bias=False, + groups=1, + min_low_hz=50, + min_band_hz=50): + + super(SincConv1d, self).__init__() + + if in_channels != 1: + msg = f"SincConv1d only supports one input channel. (Here, in_channels = {in_channels})." + raise ValueError(msg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + self.padding = padding + self.dilation = dilation + self.sample_rate = sample_rate + self.min_low_hz = min_low_hz + self.min_band_hz = min_band_hz + + self.kernel_size = kernel_size + if kernel_size % 2 == 0: + self.kernel_size=self.kernel_size+1 + + if bias: + raise ValueError("SincConv1d does not support bias.") + if groups > 1: + raise ValueError("SincConv1d does not support groups.") + + # initialize filterbanks such that they are equally spaced in Mel scale + low_hz = 30 + high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz) + + mel = numpy.linspace(self.to_mel(low_hz), + self.to_mel(high_hz), + self.out_channels + 1) + hz = self.to_hz(mel) + + # filter lower frequency (out_channels, 1) + self.low_hz_ = torch.nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1)) + + # filter frequency band (out_channels, 1) + self.band_hz_ = torch.nn.Parameter(torch.Tensor(numpy.diff(hz)).view(-1, 1)) + + # Half Hamming window + n_lin = torch.linspace(0, (self.kernel_size / 2) - 1, steps=int((self.kernel_size / 2))) + self.window_ = 0.54 - 0.46 * torch.cos(2 * math.pi * n_lin / self.kernel_size) + + # 
(kernel_size, 1) + # Due to symmetry, I only need half of the time axes + n = (self.kernel_size - 1) / 2.0 + self.n_ = 2 * math.pi * torch.arange(-n, 0).view(1, -1) / self.sample_rate + + def forward(self, waveforms): + """Get sinc filters activations + + Parameters + ---------- + waveforms : `torch.Tensor` (batch_size, 1, n_samples) + Batch of waveforms. + + Returns + ------- + features : `torch.Tensor` (batch_size, out_channels, n_samples_out) + Batch of sinc filters activations. + """ + self.n_ = self.n_.to(waveforms.device) + self.window_ = self.window_.to(waveforms.device) + + low = self.min_low_hz + torch.abs(self.low_hz_) + + high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_), + self.min_low_hz, + self.sample_rate / 2) + band = (high - low)[:, 0] + + f_times_t_low = torch.matmul(low, self.n_) + f_times_t_high = torch.matmul(high, self.n_) + + # Equivalent to Eq.4 of the reference paper + # expanded the sinc and simplified the terms. + # This way I avoid several useless computations. + band_pass_left = ((torch.sin(f_times_t_high) - torch.sin(f_times_t_low)) / (self.n_ / 2)) * self.window_ + band_pass_center = 2 * band.view(-1, 1) + band_pass_right = torch.flip(band_pass_left, dims=[1]) + + band_pass = torch.cat([band_pass_left, band_pass_center, band_pass_right], dim=1) + band_pass = band_pass / (2 * band[:, None]) + + self.filters = band_pass.view(self.out_channels, 1, self.kernel_size) + + + return torch.nn.functional.conv1d(waveforms, + self.filters, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + bias=None, + groups=1) + + +class SincNet(torch.nn.Module): + """SincNet (learnable) feature extraction + + Parameters + ---------- + waveform_normalize : `bool`, optional + Standardize waveforms (to zero mean and unit standard deviation) and + apply (learnable) affine transform. Defaults to True. 
+ instance_normalize : `bool`, optional + Standardize internal representation (to zero mean and unit standard + deviation) and apply (learnable) affine transform. Defaults to True. + + + Reference + --------- + Mirco Ravanelli, Yoshua Bengio. "Speaker Recognition from raw waveform with + SincNet". SLT 2018. https://arxiv.org/abs/1808.00158 + + """ + + ''' + @staticmethod + def get_alignment(task: Task, **kwargs): + """Get frame alignment""" + return "strict" + + @staticmethod + def get_resolution( + task: Task, + sample_rate: int = 16000, + kernel_size: List[int] = [251, 5, 5], + stride: List[int] = [1, 1, 1], + max_pool: List[int] = [3, 3, 3], + **kwargs, + ) -> SlidingWindow: + """Get frame resolution + + Parameters + ---------- + task : Task + sample_rate : int, optional + kerne_size : list of int, optional + stride : list of int, optional + max_pool : list of int, optional + + Returns + ------- + resolution : SlidingWindow + Frame resolution. + """ + + # https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807 + padding = 0 + receptive_field, jump, start = 1, 1, 0.5 + for ks, s, mp in zip(kernel_size, stride, max_pool): + # increase due to (Sinc)Conv1d + receptive_field += (ks - 1) * jump + start += ((ks - 1) / 2 - padding) * jump + jump *= s + # increase in receptive field due to MaxPool1d + receptive_field += (mp - 1) * jump + start += ((mp - 1) / 2 - padding) * jump + jump *= mp + + return SlidingWindow( + duration=receptive_field / sample_rate, step=jump / sample_rate, start=0.0 + ) + ''' + + def __init__( + self, + waveform_normalize=True, + sample_rate=16000, + min_low_hz=50, + min_band_hz=50, + out_channels=[80, 60, 60], + kernel_size: List[int] = [251, 5, 5], + stride=[1, 1, 1], + max_pool=[3, 3, 3], + instance_normalize=True, + activation="leaky_relu", + dropout=0.0, + ): + super().__init__() + + # check parameters values + n_layers = len(out_channels) + if len(kernel_size) != n_layers: + msg = ( 
+ f"out_channels ({len(out_channels):d}) and kernel_size " + f"({len(kernel_size):d}) should have the same length." + ) + raise ValueError(msg) + if len(stride) != n_layers: + msg = ( + f"out_channels ({len(out_channels):d}) and stride " + f"({len(stride):d}) should have the same length." + ) + raise ValueError(msg) + if len(max_pool) != n_layers: + msg = ( + f"out_channels ({len(out_channels):d}) and max_pool " + f"({len(max_pool):d}) should have the same length." + ) + raise ValueError(msg) + + # Waveform normalization + self.waveform_normalize = waveform_normalize + if self.waveform_normalize: + self.waveform_normalize_ = torch.nn.InstanceNorm1d(1, affine=True) + + # SincNet-specific parameters + self.sample_rate = sample_rate + self.min_low_hz = min_low_hz + self.min_band_hz = min_band_hz + + # Conv1D parameters + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.conv1d_ = torch.nn.ModuleList([]) + + # Max-pooling parameters + self.max_pool = max_pool + self.max_pool1d_ = torch.nn.ModuleList([]) + + # Instance normalization + self.instance_normalize = instance_normalize + if self.instance_normalize: + self.instance_norm1d_ = torch.nn.ModuleList([]) + + config = zip(self.out_channels, self.kernel_size, self.stride, self.max_pool) + + in_channels = None + for i, (out_channels, kernel_size, stride, max_pool) in enumerate(config): + + # 1D convolution + if i > 0: + conv1d = torch.nn.Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=0, + dilation=1, + groups=1, + bias=True, + ) + else: + conv1d = SincConv1d( + out_channels, + kernel_size, + sample_rate=self.sample_rate, + in_channels=1, + min_low_hz=self.min_low_hz, + min_band_hz=self.min_band_hz, + stride=stride, + padding=0, + dilation=1, + bias=False, + groups=1, + ) + self.conv1d_.append(conv1d) + + # 1D max-pooling + max_pool1d = torch.nn.MaxPool1d(max_pool, stride=max_pool, padding=0, dilation=1) + self.max_pool1d_.append(max_pool1d) 
+ + # 1D instance normalization + if self.instance_normalize: + instance_norm1d = torch.nn.InstanceNorm1d(out_channels, affine=True) + self.instance_norm1d_.append(instance_norm1d) + + in_channels = out_channels + + # Activation function + self.activation = activation + if self.activation == "leaky_relu": + self.activation_ = torch.nn.LeakyReLU(negative_slope=0.2) + else: + msg = f'Only "leaky_relu" activation is supported.' + raise ValueError(msg) + + # Dropout + self.dropout = dropout + if self.dropout: + self.dropout_ = torch.nn.Dropout(p=self.dropout) + + def forward(self, waveforms): + """Extract SincNet features + + Parameters + ---------- + waveforms : (batch_size, n_samples, 1) + Batch of waveforms + + Returns + ------- + features : (batch_size, n_frames, out_channels[-1]) + """ + waveforms = waveforms[:, :, None] + output = waveforms.transpose(1, 2) + + # standardize waveforms + if self.waveform_normalize: + output = self.waveform_normalize_(output) + + layers = zip(self.conv1d_, self.max_pool1d_) + for i, (conv1d, max_pool1d) in enumerate(layers): + + output = conv1d(output) + if i == 0: + output = torch.abs(output) + + output = max_pool1d(output) + + if self.instance_normalize: + output = self.instance_norm1d_[i](output) + + output = self.activation_(output) + + if self.dropout: + output = self.dropout_(output) + + #return output.transpose(1, 2) + return output + + def dimension(): + doc = "Output features dimension." + + def fget(self): + return self.out_channels[-1] + + return locals() + + dimension = property(**dimension()) diff --git a/sidekit/sidekit/nnet/xsets.py b/sidekit/sidekit/nnet/xsets.py new file mode 100644 index 0000000..094bb6e --- /dev/null +++ b/sidekit/sidekit/nnet/xsets.py @@ -0,0 +1,589 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. 
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher + +""" + +import math +import numpy +import pandas +import random +import torch +import torchaudio +import tqdm +import soundfile +import yaml + +from torch.utils.data import Dataset +from .augmentation import data_augmentation +from ..bosaris.idmap import IdMap + +__license__ = "LGPL" +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2015-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +class SideSampler(torch.utils.data.Sampler): + """ + Data Sampler used to generate uniformly distributed batches + """ + + def __init__(self, + data_source, + spk_count, + examples_per_speaker, + samples_per_speaker, + batch_size, + seed=0, + rank=0, + num_process=1, + num_replicas=1): + """[summary] + + Args: + data_source ([type]): [description] + spk_count ([type]): [description] + examples_per_speaker ([type]): [description] + samples_per_speaker ([type]): [description] + batch_size ([type]): [description] + num_replicas: number of GPUs for parallel computing + """ + self.train_sessions = data_source + self.labels_to_indices = dict() + self.spk_count = spk_count + self.examples_per_speaker = examples_per_speaker + self.samples_per_speaker 
= samples_per_speaker + self.epoch = 0 + self.seed = seed + self.rank = rank + self.num_process = num_process + self.num_replicas = num_replicas + + assert batch_size % examples_per_speaker == 0 + assert (self.samples_per_speaker * self.spk_count * self.examples_per_speaker) % self.num_process == 0 + + self.batch_size = batch_size // (self.examples_per_speaker * self.num_replicas) + #self.batch_size = batch_size // self.examples_per_speaker + + # reference all segment indexes per speaker + for idx in range(self.spk_count): + self.labels_to_indices[idx] = list() + for idx, value in enumerate(self.train_sessions): + self.labels_to_indices[value].append(idx) + # shuffle segments per speaker + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + for idx, ldlist in enumerate(self.labels_to_indices.values()): + ldlist = numpy.array(ldlist) + self.labels_to_indices[idx] = ldlist[torch.randperm(ldlist.shape[0], generator=g).numpy()] + + self.segment_cursors = numpy.zeros((len(self.labels_to_indices),), dtype=numpy.int) + + + def __iter__(self): + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + numpy.random.seed(self.seed + self.epoch) + + # Generate batches per speaker + straight = numpy.arange(self.spk_count) + indices = numpy.ones((self.samples_per_speaker, self.spk_count), dtype=numpy.int) * straight + batch_cursor = 0 + # each line of "indices" represents all speaker indexes (shuffled in a different way) + for idx in range(self.samples_per_speaker): + if batch_cursor == 0: + indices[idx, :] = numpy.random.permutation(straight) + else: + # if one batch is split between the end of previous line and the beginning of current line + # we make sure no speaker is present twice in this batch + probs = numpy.ones_like(straight) + probs[indices[idx-1, -batch_cursor:]] = 0 + probs = probs/numpy.sum(probs) + indices[idx, :self.batch_size - batch_cursor] = numpy.random.choice(self.spk_count, self.batch_size - batch_cursor, replace=False, p=probs) + probs 
= numpy.ones_like(straight) + probs[indices[idx, :self.batch_size - batch_cursor]] = 0 + to_pick = numpy.sum(probs).astype(numpy.int) + probs = probs/numpy.sum(probs) + indices[idx, self.batch_size - batch_cursor:] = numpy.random.choice(self.spk_count, to_pick, replace=False, p=probs) + + assert numpy.sum(indices[idx, :]) == numpy.sum(straight) + batch_cursor = (batch_cursor + indices.shape[1]) % self.batch_size + + # now we have the speaker indexes to sample in batches + batch_matrix = numpy.repeat(indices, self.examples_per_speaker, axis=1).flatten() + + # we want to convert the speaker indexes into segment indexes + self.index_iterator = numpy.zeros_like(batch_matrix) + + # keep track of next segment index to sample for each speaker + for idx, value in enumerate(batch_matrix): + if self.segment_cursors[value] > len(self.labels_to_indices[value]) - 1: + self.labels_to_indices[value] = self.labels_to_indices[value][torch.randperm(self.labels_to_indices[value].shape[0], generator=g)] + self.segment_cursors[value] = 0 + self.index_iterator[idx] = self.labels_to_indices[value][self.segment_cursors[value]] + self.segment_cursors[value] += 1 + #self.index_iterator = self.index_iterator.reshape(-1, self.num_process * self.examples_per_speaker)[:, self.rank * self.examples_per_speaker:(self.rank + 1) * self.examples_per_speaker].flatten() + + self.index_iterator = numpy.repeat(self.index_iterator, self.num_replicas) + self.index_iterator = self.index_iterator.reshape(-1, self.num_process * self.examples_per_speaker * self.num_replicas)[:, self.rank * self.examples_per_speaker * self.num_replicas:(self.rank + 1) * self.examples_per_speaker * self.num_replicas].flatten() + + return iter(self.index_iterator) + + def __len__(self) -> int: + #return (self.samples_per_speaker * self.spk_count * self.examples_per_speaker) // self.num_process + return (self.samples_per_speaker * self.spk_count * self.examples_per_speaker * self.num_replicas) // self.num_process + + + + def 
set_epoch(self, epoch: int) -> None: + self.epoch = epoch + + +class SideSet(Dataset): + + def __init__(self, + dataset, + set_type="train", + chunk_per_segment=1, + transform_number=1, + overlap=0., + dataset_df=None, + min_duration=0.165, + output_format="pytorch", + ): + """ + + :param dataset_yaml: name of the YAML file describing the dataset + :param set_type: string, can be "train" or "validation" + :param chunk_per_segment: number of chunks to select for each segment + default is 1 and -1 means select all possible chunks + """ + self.data_path = dataset["data_path"] + self.sample_rate = int(dataset["sample_rate"]) + self.data_file_extension = dataset["data_file_extension"] + self.transformation = '' + self.min_duration = min_duration + self.output_format = output_format + self.transform_number = transform_number + + if set_type == "train": + self.duration = dataset["train"]["duration"] + self.transformation = dataset["train"]["transformation"] + else: + self.duration = dataset["valid"]["duration"] + self.transformation = dataset["valid"]["transformation"] + + self.sample_number = int(self.duration * self.sample_rate) + self.overlap = int(overlap * self.sample_rate) + + # Load the dataset description as pandas.dataframe + if dataset_df is None: + df = pandas.read_csv(dataset["dataset_description"]) + else: + assert isinstance(dataset_df, pandas.DataFrame) + df = dataset_df + + # From each segment which duration is longer than the chosen one + # select the requested segments + if set_type == "train": + tmp_sessions = df.loc[df['duration'] > self.duration] + else: + if not "duration" == '': + tmp_sessions = df.loc[df['duration'] > self.duration] + else: + self.sessions = df + + # Create lists for each column of the dataframe + df_dict = dict(zip(df.columns, [[], [], [], [], [], [], []])) + df_dict["file_start"] = list() + df_dict["file_duration"] = list() + + # For each segment, get all possible segments with the current overlap + for idx in 
tqdm.trange(len(tmp_sessions), desc='indexing all ' + set_type + ' segments', mininterval=1, disable=None): + current_session = tmp_sessions.iloc[idx] + + # Compute possible starts + possible_starts = numpy.arange(0, + int(self.sample_rate * (current_session.duration - self.duration)), + self.sample_number + ) + int(self.sample_rate * (current_session.duration % self.duration / 2)) + possible_starts += int(self.sample_rate * current_session.start) + + # Select max(seg_nb, possible_segments) segments + if chunk_per_segment == -1: + starts = possible_starts + chunk_nb = len(possible_starts) + else: + chunk_nb = min(len(possible_starts), chunk_per_segment) + starts = numpy.random.permutation(possible_starts)[:chunk_nb] + + # Once we know how many segments are selected, create the other fields to fill the DataFrame + for ii in range(chunk_nb): + df_dict["database"].append(current_session.database) + df_dict["speaker_id"].append(current_session.speaker_id) + df_dict["file_id"].append(current_session.file_id) + df_dict["start"].append(starts[ii]) + df_dict["duration"].append(self.duration) + df_dict["file_start"].append(current_session.start) + df_dict["file_duration"].append(current_session.duration) + df_dict["speaker_idx"].append(current_session.speaker_idx) + df_dict["gender"].append(current_session.gender) + + self.sessions = pandas.DataFrame.from_dict(df_dict) + self.len = len(self.sessions) + + self.transform = dict() + if (self.transformation["pipeline"] != '') and (self.transformation["pipeline"] is not None): + transforms = self.transformation["pipeline"].split(',') + if "add_noise" in transforms: + self.transform["add_noise"] = self.transformation["add_noise"] + if "add_reverb" in transforms: + self.transform["add_reverb"] = self.transformation["add_reverb"] + if "codec" in transforms: + self.transform["codec"] = [] + if "phone_filtering" in transforms: + self.transform["phone_filtering"] = [] + + self.noise_df = None + if "add_noise" in self.transform: + 
noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"]) + noise_df = noise_df.loc[noise_df.duration > self.duration] + self.noise_df = noise_df.set_index(noise_df.type) + + self.rir_df = None + if "add_reverb" in self.transform: + tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"]) + tmp_rir_df = tmp_rir_df.loc[tmp_rir_df["type"] == "simulated_rirs"] + # load the RIR database + self.rir_df = tmp_rir_df.set_index(tmp_rir_df.type) + + + def __getitem__(self, index): + """ + + :return: + """ + # Check the size of the file + current_session = self.sessions.iloc[index] + + # TODO is this required ? + nfo = torchaudio.info(f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}") + original_start = int(current_session['start']) + if self.overlap > 0: + lowest_shift = self.overlap/2 + highest_shift = self.overlap/2 + if original_start < (current_session['file_start']*self.sample_rate + self.sample_number/2): + lowest_shift = int(original_start - current_session['file_start']*self.sample_rate) + if original_start + self.sample_number > (current_session['file_start'] + current_session['file_duration'])*self.sample_rate - self.sample_number/2: + highest_shift = int((current_session['file_start'] + current_session['file_duration'])*self.sample_rate - (original_start + self.sample_number)) + start_frame = original_start + int(random.uniform(-lowest_shift, highest_shift)) + else: + start_frame = original_start + + conversion_rate = nfo.sample_rate // self.sample_rate + + if start_frame + conversion_rate * self.sample_number >= nfo.num_frames: + start_frame = numpy.min(nfo.num_frames - conversion_rate * self.sample_number - 1) + + speech, speech_fs = torchaudio.load(f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}", + frame_offset=conversion_rate*start_frame, + num_frames=conversion_rate*self.sample_number) + + if nfo.sample_rate != self.sample_rate: + speech = 
torchaudio.transforms.Resample(nfo.sample_rate, self.sample_rate).forward(speech) + + speech += 10e-6 * torch.randn(speech.shape) + + if len(self.transform) > 0: + speech = data_augmentation(speech, + self.sample_rate, + self.transform, + self.transform_number, + noise_df=self.noise_df, + rir_df=self.rir_df) + + speaker_idx = current_session["speaker_idx"] + + if self.output_format == "pytorch": + return speech, torch.tensor(speaker_idx) + else: + return speech, speaker_idx + + def __len__(self): + """ + + :param self: + :return: + """ + return self.len + +def get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.append(["rate", f'{resample}']) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + + +class IdMapSet(Dataset): + """ + DataSet that provide data according to a sidekit.IdMap object + """ + + def __init__(self, + idmap_name, + data_path, + file_extension, + transform_pipeline={}, + transform_number=1, + sliding_window=False, + window_len=3., + window_shift=1.5, + sample_rate=16000, + min_duration=0.165 + ): + """ + + :param data_root_name: + :param idmap_name: + """ + if isinstance(idmap_name, IdMap): + self.idmap = idmap_name + else: + self.idmap = IdMap(idmap_name) + + self.data_path = data_path + self.file_extension = file_extension + self.len = self.idmap.leftids.shape[0] + self.transformation = transform_pipeline + self.min_duration = min_duration + self.sample_rate = sample_rate + self.sliding_window = sliding_window + self.window_len = int(window_len * self.sample_rate) + self.window_shift = int(window_shift * self.sample_rate) + self.transform_number = transform_number + + self.noise_df = None + if "add_noise" in self.transformation: + # Load the noise dataset, filter according to the duration + noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"]) + tmp_df = noise_df.loc[noise_df['duration'] > self.duration] + self.noise_df = tmp_df['file_id'].tolist() + + 
self.rir_df = None + if "add_reverb" in self.transformation: + # load the RIR database + tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"]) + self.rir_df = zip(tmp_rir_df['file_id'].tolist(), tmp_rir_df['channel'].tolist()) + + def __getitem__(self, index): + """ + + :param index: + :return: + """ + # Read start and stop and convert to time in seconds + if self.idmap.start[index] is None: + start = 0 + else: + start = int(self.idmap.start[index] * 0.01 * self.sample_rate) + + if self.idmap.stop[index] is None: + speech, speech_fs = torchaudio.load(f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}") + duration = int(speech.shape[1] - start) + else: + duration = int(self.idmap.stop[index] * 0.01 * self.sample_rate) - start + # add this in case the segment is too short + if duration <= self.min_duration * self.sample_rate: + middle = start + duration // 2 + start = int(max(0, int(middle - (self.min_duration * self.sample_rate / 2)))) + duration = int(self.min_duration * self.sample_rate) + speech, speech_fs = torchaudio.load(f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}", + frame_offset=start, + num_frames=duration) + + speech += 10e-6 * torch.randn(speech.shape) + + if self.sliding_window: + speech = speech.squeeze().unfold(0, self.window_len, self.window_shift) + #middle_points = numpy.arange(start + self.window_len / 2, + # start + duration - self.window_len / 2, + # self.window_shift) + #starts = middle_points - self.window_shift / 2 + #stops = middle_points + self.window_shift / 2 + #starts[0] = start + #stops[-1] = start + duration + #stop = stops + #start = starts + stop = start + duration + else: + stop = start + duration + + if len(self.transformation.keys()) > 0: + speech = data_augmentation(speech, + speech_fs, + self.transformation, + self.transform_number, + noise_df=self.noise_df, + rir_df=self.rir_df) + + + speech = speech.squeeze() + + return speech, self.idmap.leftids[index], 
class IdMapSetPerSpeaker(Dataset):
    """
    DataSet that provides, for each speaker of a sidekit.IdMap object,
    the concatenation of all the segments listed for that speaker.
    """

    def __init__(self,
                 idmap_name,
                 data_path,
                 file_extension,
                 transform_pipeline=None,
                 transform_number=1,
                 sample_rate=16000,
                 min_duration=0.165
                 ):
        """
        :param idmap_name: an IdMap object, or anything accepted by ``IdMap(...)``
        :param data_path: root directory of the audio files
        :param file_extension: extension of the audio files, without the dot
        :param transform_pipeline: dictionary describing the augmentation
            pipeline (``None`` or empty means no augmentation)
        :param transform_number: forwarded to ``data_augmentation``
        :param sample_rate: sampling rate used to convert start/stop to frames
        :param min_duration: minimum duration in seconds; shorter segments
            are widened around their middle
        """
        if isinstance(idmap_name, IdMap):
            self.idmap = idmap_name
        else:
            self.idmap = IdMap(idmap_name)

        # A fresh dict per instance instead of a shared mutable default.
        self.transformation = {} if transform_pipeline is None else transform_pipeline

        self.data_path = data_path
        self.file_extension = file_extension
        self.transform_number = transform_number
        self.min_duration = min_duration
        self.sample_rate = sample_rate

        speakers = set(self.idmap.leftids)
        self.len = len(speakers)
        self.speaker_list = list(speakers)

        # One output entry per speaker; __getitem__ indexes this IdMap.
        self.output_im = IdMap()
        self.output_im.leftids = numpy.unique(self.idmap.leftids)
        self.output_im.rightids = self.output_im.leftids
        self.output_im.start = numpy.empty(self.output_im.rightids.shape[0], "|O")
        self.output_im.stop = numpy.empty(self.output_im.rightids.shape[0], "|O")

        self.noise_df = None
        if "add_noise" in self.transformation:
            # Load the noise dataset, filter according to the duration.
            # The original compared against ``self.duration``, which this class
            # never defines (AttributeError); ``min_duration`` is the only
            # duration it knows.
            # NOTE(review): confirm the intended threshold. Also confirm that
            # data_augmentation accepts a list of file ids here — SideSet
            # passes a DataFrame indexed by type.
            noise_df = pandas.read_csv(self.transformation["add_noise"]["noise_db_csv"])
            tmp_df = noise_df.loc[noise_df['duration'] > self.min_duration]
            self.noise_df = tmp_df['file_id'].tolist()

        self.rir_df = None
        if "add_reverb" in self.transformation:
            # load the RIR database
            tmp_rir_df = pandas.read_csv(self.transformation["add_reverb"]["rir_db_csv"])
            # Materialised as a list: a bare zip() is exhausted after the
            # first item that iterates over it.
            self.rir_df = list(zip(tmp_rir_df['file_id'].tolist(), tmp_rir_df['channel'].tolist()))

    def _load_segment(self, seg_id, seg_start, seg_stop):
        """Load one segment; return (speech, sample_rate, start_frame, num_frames)."""
        # start/stop are multiplied by 0.01 * sample_rate to obtain frames
        start = 0 if seg_start is None else int(seg_start * 0.01 * self.sample_rate)
        path = f"{self.data_path}/{seg_id}.{self.file_extension}"

        if seg_stop is None:
            # No stop given: read from ``start`` to the end of THIS segment's
            # file (the original measured the file of rightids[index] instead).
            speech, speech_fs = torchaudio.load(path, frame_offset=start)
            return speech, speech_fs, start, int(speech.shape[1])

        duration = int(seg_stop * 0.01 * self.sample_rate) - start
        min_frames = self.min_duration * self.sample_rate
        # add this in case the segment is too short
        if duration <= min_frames:
            middle = start + duration // 2
            start = int(max(0, int(middle - (min_frames / 2))))
            duration = int(min_frames)

        speech, speech_fs = torchaudio.load(path,
                                            frame_offset=start,
                                            num_frames=duration)
        return speech, speech_fs, start, duration

    def __getitem__(self, index):
        """
        :param index: index of the speaker in ``self.output_im``
        :return: (speech, speaker id, speaker id, start, stop) where start and
            stop are those of the last segment loaded for this speaker
        """
        spk_id = self.output_im.leftids[index]
        tmp_data = []

        # Loop on all segments from the given speaker to load data
        for sid, seg_id, seg_start, seg_stop in zip(self.idmap.leftids, self.idmap.rightids,
                                                    self.idmap.start, self.idmap.stop):
            if sid != spk_id:
                continue
            speech, speech_fs, start, duration = self._load_segment(seg_id, seg_start, seg_stop)
            speech += 10e-6 * torch.randn(speech.shape)
            tmp_data.append(speech)

        speech = torch.cat(tmp_data, dim=1)
        speech += 10e-6 * torch.randn(speech.shape)

        if self.transformation:
            speech = data_augmentation(speech,
                                       speech_fs,
                                       self.transformation,
                                       self.transform_number,
                                       noise_df=self.noise_df,
                                       rir_df=self.rir_df)

        stop = start + duration
        speech = speech.squeeze()

        # ``index`` addresses output_im (one entry per speaker), not the
        # per-segment idmap, so the ids are taken from output_im.
        return speech, spk_id, self.output_im.rightids[index], start, stop

    def __len__(self):
        """
        :return: number of distinct speakers
        """
        return self.len
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see http://www.gnu.org/licenses/. + +""" +Copyright 2014-2021 Anthony Larcher, Yevhenii Prokopalo +""" + + +import logging +import math +import os +import numpy +import random +import pandas +import shutil +import torch +import tqdm +import yaml +import pickle + +from collections import OrderedDict +from torch.utils.data import DataLoader +from sklearn.model_selection import train_test_split +from .pooling import MeanStdPooling +from .pooling import AttentivePooling +from .pooling import GruPooling +from .preprocessor import MfccFrontEnd +from .preprocessor import MelSpecFrontEnd +from .preprocessor import BNFrontEnd +from .preprocessor import RawPreprocessor +from .xsets import SideSet +from .xsets import IdMapSet +from .xsets import IdMapSetPerSpeaker +from .xsets import SideSampler +from .res_net import ResBlockWFMS +from .res_net import ResBlock +from .res_net import PreFastResNet34 +from .res_net import PreHalfResNet34 +from .res_net import PreResNet34 +from ..bosaris import IdMap +from ..bosaris import Key +from ..bosaris import Ndx +from ..statserver import StatServer +from ..iv_scoring import cosine_scoring +from .sincnet import SincNet +from ..bosaris.detplot import rocch, rocch2eer +from .loss import
def eer(negatives, positives):
    """
    Logarithmic complexity EER computation

    Both score sets are sorted (positives ascending, negatives descending).
    A search with halving jumps moves one index in each sorted array towards
    the point where the two sequences cross, then a local scan keeps the
    operating point where the two rates computed from the indices are closest.

    :param negatives: negative_scores (numpy array): impostor scores
    :param positives: positive_scores (numpy array): genuine scores
    :return: float: Equal Error Rate (EER), as the mean of the two rates at
        the retained point; the integer 0 is returned when an index becomes
        negative during the search
    """
    positives = numpy.sort(positives)
    negatives = numpy.sort(negatives)[::-1]

    pos_count = positives.shape[0]
    neg_count = negatives.shape[0]

    # NOTE(review): indexing [0] raises IndexError on an empty score array.
    p_score = positives[0]
    n_score = negatives[0]

    p_index = 0
    n_index = 0

    # Jump sizes start at half of each array and are halved at every iteration.
    next_p_jump = pos_count//2
    next_n_jump = neg_count//2

    # kdx counts the iterations; it is never read afterwards.
    kdx = 0
    while True:
        kdx += 1
        if p_index < 0 or n_index < 0:
            return 0
        # NOTE(review): an index equal to the count is already out of range for
        # indexing, and 100 is on a different scale from the fractions returned
        # elsewhere in this function — confirm the intent.
        if p_index > pos_count or n_index > neg_count:
            return 100
        # Both indices always move together, forward or backward.
        if p_score < n_score:
            p_index = p_index + next_p_jump
            n_index = n_index + next_n_jump
            if next_p_jump == 0 and next_n_jump == 0:
                break
        elif p_score >= n_score:
            p_index = p_index - next_p_jump
            n_index = n_index - next_n_jump
            if next_p_jump == 0 and next_n_jump == 0:
                break

        p_score = positives[p_index]
        n_score = negatives[n_index]
        next_p_jump = next_p_jump//2
        next_n_jump = next_n_jump//2

    # Smallest |tfr - tfa| seen so far; 100 is larger than any reachable gap.
    eer_predicate = 100

    # tfr / tfa: rates derived from the current indices (presumably the
    # false-rejection and false-acceptance rates — confirm).
    tfr = (abs(p_index))/pos_count
    tfa = (1+abs(n_index))/neg_count
    if (p_score == n_score and tfr == tfa):
        return tfr

    # Advance until the positive score is no longer below the negative one,
    # moving p_index first and n_index only once p_index is at its last slot.
    while positives[p_index] < negatives[n_index]:
        if p_index < pos_count - 1:
            p_index += 1
        elif n_index < neg_count - 1:
            n_index += 1
        else:
            break

    while positives[p_index] > negatives[n_index] and n_index >= 1:
        n_index -= 1

    tfr = (1+p_index)/pos_count
    tfa = (1+n_index)/neg_count

    # NOTE(review): p_index is incremented without a bound check here.
    while tfa > tfr:
        p_index += 1
        while positives[p_index] > negatives[n_index] and n_index >= 1:
            n_index -= 1
        tfr = (1+p_index)/pos_count
        tfa = (1+n_index)/neg_count

    # From here on: keep a candidate while the gap between the two rates does
    # not grow; return the last kept value as soon as it does.
    if abs(tfr - tfa) <= eer_predicate:
        eer_predicate = abs(tfr - tfa)
        eer = (tfr + tfa) / 2
    else:
        return eer

    tfr = p_index/pos_count
    tfa = (1+n_index)/neg_count
    if abs(tfr - tfa) <= eer_predicate:
        eer_predicate = abs(tfr - tfa)
        eer = (tfr + tfa) / 2
    else:
        return eer

    # NOTE(review): negatives[n_index + 1] and positives[p_index - 1] are read
    # without bound checks; the loop only exits through the returns below.
    while True:
        while negatives[n_index + 1] <= positives[p_index - 1]:
            p_index -= 1
            tfr = p_index/pos_count
            tfa = (1+n_index)/neg_count
            if abs(tfr - tfa) <= eer_predicate:
                eer_predicate = abs(tfr - tfa)
                eer = (tfr + tfa) / 2
            else:
                return eer
        while negatives[n_index + 1] > positives[p_index - 1]:
            n_index += 1
            tfr = p_index/pos_count
            tfa = (1+n_index)/neg_count
            if abs(tfr - tfa) <= eer_predicate:
                eer_predicate = abs(tfr - tfa)
                eer = (tfr + tfa) / 2
            else:
                return eer

    return eer
class TrainingMonitor():
    """
    Keep track of training / validation / test metrics across epochs,
    handle early-stopping patience and log the results to a file.
    """
    def __init__(self,
                 output_file,
                 log_interval=10,
                 patience=numpy.inf,
                 best_accuracy=0.0,
                 best_eer_epoch=1,
                 best_eer=100,
                 compute_test_eer=False,
                 best_accuracy_epoch=1
                 ):
        """
        :param output_file: path of the log file
        :param log_interval: stored as ``self.log_interval``
        :param patience: number of non-improving updates tolerated
        :param best_accuracy: best validation accuracy known so far
        :param best_eer_epoch: epoch at which ``best_eer`` was obtained
        :param best_eer: best EER known so far
        :param compute_test_eer: when True, a provided test EER (rather than
            the validation EER) drives ``is_best`` and the patience
        :param best_accuracy_epoch: epoch at which ``best_accuracy`` was obtained
        """
        self.current_epoch = 0
        self.log_interval = log_interval
        self.init_patience = patience
        self.current_patience = patience
        self.best_accuracy = best_accuracy
        # Read by display_final(); the original never defined it.
        self.best_accuracy_epoch = best_accuracy_epoch
        self.best_eer_epoch = best_eer_epoch
        self.best_eer = best_eer
        self.compute_test_eer = compute_test_eer
        self.test_eer = []

        self.training_loss = []
        self.training_acc = []

        self.val_loss = []
        self.val_acc = []
        self.val_eer = []

        self.is_best = True

        # Initialize the logger
        logging_format = '%(asctime)-15s %(message)s'
        logging.basicConfig(level=logging.DEBUG, format=logging_format, datefmt='%m-%d %H:%M')
        self.logger = logging.getLogger('Monitoring')
        self.logger.setLevel(logging.DEBUG)

        # 'Monitoring' is a process-wide logger: attach a file handler only if
        # this file is not already registered, otherwise every new monitor
        # would duplicate each log line and leak a file descriptor.
        log_path = os.path.abspath(output_file)
        already_attached = any(isinstance(handler, logging.FileHandler)
                               and handler.baseFilename == log_path
                               for handler in self.logger.handlers)
        if not already_attached:
            # create file handler which logs even debug messages
            fh = logging.FileHandler(output_file)
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            fh.setFormatter(formatter)
            fh.setLevel(logging.DEBUG)
            self.logger.addHandler(fh)

    def display(self):
        """
        Log the latest validation metrics and, if any, the latest test EER.

        :return:
        """
        # Each line is emitted only when its metrics were recorded, so calling
        # display() without test EER (or before any update) no longer raises.
        if self.val_acc and self.val_eer:
            self.logger.critical(f"***Validation metrics - Cross validation accuracy = {self.val_acc[-1]} %, EER = {self.val_eer[-1] * 100} %")
        if self.test_eer:
            self.logger.critical(f"***Test metrics - Test EER = {self.test_eer[-1] * 100} %")

    def display_final(self):
        """
        Log the best accuracy and the epoch at which it was obtained.

        :return:
        """
        self.logger.critical(f"Best accuracy {self.best_accuracy * 100.} obtained at epoch {self.best_accuracy_epoch}")

    def _track_eer(self, eer_value):
        """Update best EER, ``is_best`` flag and patience from a new EER value."""
        self.is_best = eer_value < self.best_eer
        self.best_eer = min(eer_value, self.best_eer)
        if self.is_best:
            # current_epoch equals the ``epoch`` argument whenever one was
            # given, and avoids storing None otherwise.
            self.best_eer_epoch = self.current_epoch
            self.current_patience = self.init_patience
        else:
            self.current_patience -= 1

    def update(self,
               epoch=None,
               training_acc=None,
               training_loss=None,
               test_eer=None,
               val_eer=None,
               val_loss=None,
               val_acc=None):
        """
        Record the metrics that are provided (``None`` values are ignored)
        and refresh the best-model bookkeeping.

        :param epoch: current epoch number
        :param training_acc: training accuracy to append
        :param training_loss: training loss to append
        :param test_eer: test EER, only used when ``compute_test_eer`` is True
        :param val_eer: validation EER to append
        :param val_loss: validation loss to append
        :param val_acc: validation accuracy to append
        :return:
        """
        if epoch is not None:
            self.current_epoch = epoch
        if training_acc is not None:
            self.training_acc.append(training_acc)
        if training_loss is not None:
            self.training_loss.append(training_loss)
        if val_eer is not None:
            self.val_eer.append(val_eer)
        if val_loss is not None:
            self.val_loss.append(val_loss)
        if val_acc is not None:
            self.val_acc.append(val_acc)
            # remember best accuracy and the epoch it was reached at
            if val_acc > self.best_accuracy:
                self.best_accuracy = val_acc
                self.best_accuracy_epoch = self.current_epoch

        # The test EER takes precedence over the validation EER when enabled.
        if self.compute_test_eer and test_eer is not None:
            self.test_eer.append(test_eer)
            self._track_eer(test_eer)
        elif val_eer is not None:
            self._track_eer(val_eer)
self.activation = torch.nn.LeakyReLU(0.2) + + self.preprocessor = MfccFrontEnd() + self.feature_size = self.preprocessor.n_mfcc + + self.sequence_network = torch.nn.Sequential(OrderedDict([ + ("conv1", torch.nn.Conv1d(self.feature_size, 512, 5, dilation=1)), + ("activation1", torch.nn.LeakyReLU(0.2)), + ("batch_norm1", torch.nn.BatchNorm1d(512)), + ("conv2", torch.nn.Conv1d(512, 512, 3, dilation=2)), + ("activation2", torch.nn.LeakyReLU(0.2)), + ("batch_norm2", torch.nn.BatchNorm1d(512)), + ("conv3", torch.nn.Conv1d(512, 512, 3, dilation=3)), + ("activation3", torch.nn.LeakyReLU(0.2)), + ("batch_norm3", torch.nn.BatchNorm1d(512)), + ("conv4", torch.nn.Conv1d(512, 512, 1)), + ("activation4", torch.nn.LeakyReLU(0.2)), + ("batch_norm4", torch.nn.BatchNorm1d(512)), + ("conv5", torch.nn.Conv1d(512, 1536, 1)), + ("activation5", torch.nn.LeakyReLU(0.2)), + ("batch_norm5", torch.nn.BatchNorm1d(1536)) + ])) + + self.embedding_size = embedding_size + + self.stat_pooling = MeanStdPooling() + self.stat_pooling_weight_decay = 0 + self.before_speaker_embedding = torch.nn.Sequential(OrderedDict([ + ("linear6", torch.nn.Linear(3072, self.embedding_size)) + ])) + + if self.loss == "aam": + self.after_speaker_embedding = ArcMarginProduct(self.embedding_size, + int(self.speaker_number), + s=64, + m=0.2, + easy_margin=False) + elif self.loss == "cce": + self.after_speaker_embedding = torch.nn.Sequential(OrderedDict([ + ("activation6", torch.nn.LeakyReLU(0.2)), + ("batch_norm6", torch.nn.BatchNorm1d(512)), + ("dropout6", torch.nn.Dropout(p=0.05)), + ("linear7", torch.nn.Linear(512, 512)), + ("activation7", torch.nn.LeakyReLU(0.2)), + ("batch_norm7", torch.nn.BatchNorm1d(512)), + ("linear8", torch.nn.Linear(512, int(self.speaker_number))) + ])) + + self.preprocessor_weight_decay = 0.0002 + self.sequence_network_weight_decay = 0.0002 + self.before_speaker_embedding_weight_decay = 0.002 + self.after_speaker_embedding_weight_decay = 0.002 + + elif model_archi == "resnet34": + + 
self.preprocessor = MelSpecFrontEnd(n_mels=80) + self.sequence_network = PreResNet34() + self.embedding_size = embedding_size + + self.before_speaker_embedding = torch.nn.Linear(in_features=5120, + out_features=self.embedding_size) + + self.stat_pooling = AttentivePooling(256, 80, global_context=True) + + self.loss = "aam" + self.after_speaker_embedding = ArcMarginProduct(self.embedding_size, + int(self.speaker_number), + s = 30.0, + m = 0.20, + easy_margin = False) + + self.preprocessor_weight_decay = 0.00002 + self.sequence_network_weight_decay = 0.00002 + self.stat_pooling_weight_decay = 0.00002 + self.before_speaker_embedding_weight_decay = 0.00002 + self.after_speaker_embedding_weight_decay = 0.0002 + + elif model_archi == "fastresnet34": + self.preprocessor = MelSpecFrontEnd() + self.sequence_network = PreFastResNet34() + self.embedding_size = embedding_size + + self.before_speaker_embedding = torch.nn.Linear(in_features = 2560, + out_features = self.embedding_size) + + self.stat_pooling = AttentivePooling(128, 80, global_context=False) + self.stat_pooling_weight_decay = 0 + + self.loss = loss + if self.loss == "aam": + self.after_speaker_embedding = ArcMarginProduct(self.embedding_size, + int(self.speaker_number), + s = 30, + m = 0.2, + easy_margin = False) + + elif self.loss == 'aps': + self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number)) + elif self.loss == 'smn': + self.after_speaker_embedding = AngularProximityMagnet(int(self.speaker_number)) + + self.preprocessor_weight_decay = 0.00002 + self.sequence_network_weight_decay = 0.00002 + self.stat_pooling_weight_decay = 0.00002 + self.before_speaker_embedding_weight_decay = 0.00002 + self.after_speaker_embedding_weight_decay = 0.0002 + + elif model_archi == "bn_halfresnet34": + self.preprocessor = BNFrontEnd() + self.sequence_network = PreHalfResNet34() + + self.embedding_size = embedding_size + #self.embedding_size = 256 + #self.before_speaker_embedding = torch.nn.Linear(in_features 
= 5120, + # out_features = self.embedding_size) + + ars_bn_dim = 256 + self.before_speaker_embedding = torch.nn.Sequential(OrderedDict([ + ("lin_be", torch.nn.Linear(in_features = int((5120/80)*ars_bn_dim), out_features = self.embedding_size, bias=False)), + ("bn_be", torch.nn.BatchNorm1d(self.embedding_size)) + ])) + + self.stat_pooling = AttentivePooling(256, ars_bn_dim, global_context=True) + + self.loss = loss + if self.loss == "aam": + self.after_speaker_embedding = ArcMarginProduct(self.embedding_size, + int(self.speaker_number), + s = 30, + m = 0.2, + easy_margin = False) + elif self.loss == 'aps': + self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number)) + self.preprocessor_weight_decay = 0.00002 + self.sequence_network_weight_decay = 0.00002 + self.stat_pooling_weight_decay = 0.00002 + self.before_speaker_embedding_weight_decay = 0.00002 + self.after_speaker_embedding_weight_decay = 0.000 + + + + elif model_archi == "halfresnet34": + self.preprocessor = MelSpecFrontEnd(n_fft=1024, + win_length=400, + hop_length=160, + n_mels=80) + self.sequence_network = PreHalfResNet34() + + self.embedding_size = embedding_size + #self.embedding_size = 256 + #self.before_speaker_embedding = torch.nn.Linear(in_features = 5120, + # out_features = self.embedding_size) + + self.before_speaker_embedding = torch.nn.Sequential(OrderedDict([ + ("lin_be", torch.nn.Linear(in_features = 5120, out_features = self.embedding_size, bias=False)), + ("bn_be", torch.nn.BatchNorm1d(self.embedding_size)) + ])) + + self.stat_pooling = AttentivePooling(256, 80, global_context=True) + + self.loss = loss + if self.loss == "aam": + self.after_speaker_embedding = ArcMarginProduct(self.embedding_size, + int(self.speaker_number), + s = 30, + m = 0.2, + easy_margin = False) + elif self.loss == 'aps': + self.after_speaker_embedding = SoftmaxAngularProto(int(self.speaker_number)) + self.preprocessor_weight_decay = 0.00002 + self.sequence_network_weight_decay = 0.00002 + 
self.stat_pooling_weight_decay = 0.00002 + self.before_speaker_embedding_weight_decay = 0.00002 + self.after_speaker_embedding_weight_decay = 0.000 + + elif model_archi == "rawnet2": + + if loss not in ["cce", 'aam']: + raise NotImplementedError(f"The valid loss are for now cce and aam ") + else: + self.loss = loss + + self.input_nbdim = 2 + + filts = [128, [128, 128], [128, 256], [256, 256]] + self.norm_embedding = True + + self.preprocessor = RawPreprocessor(nb_samp=48000, + in_channels=1, + out_channels=filts[0], + kernel_size=3) + + self.sequence_network = torch.nn.Sequential(OrderedDict([ + ("block0", ResBlockWFMS(nb_filts=filts[1], first=True)), + ("block1", ResBlockWFMS(nb_filts=filts[1])), + ("block2", ResBlockWFMS(nb_filts=filts[2])), + ("block3", ResBlockWFMS(nb_filts=[filts[2][1], filts[2][1]])), + ("block4", ResBlockWFMS(nb_filts=[filts[2][1], filts[2][1]])), + ("block5", ResBlockWFMS(nb_filts=[filts[2][1], filts[2][1]])) + ])) + + self.stat_pooling = GruPooling(input_size=filts[2][-1], + gru_node=1024, + nb_gru_layer=1) + + self.before_speaker_embedding = torch.nn.Linear(in_features = 1024, + out_features = 1024) + + if self.loss == "aam": + if loss == 'aam': + self.after_speaker_embedding = ArcLinear(1024, + int(self.speaker_number), + margin=aam_margin, s=aam_s) + elif self.loss == "cce": + self.after_speaker_embedding = torch.nn.Linear(in_features = 1024, + out_features = int(self.speaker_number), + bias = True) + + self.preprocessor_weight_decay = 0.000 + self.sequence_network_weight_decay = 0.000 + self.stat_pooling_weight_decay = 0.000 + self.before_speaker_embedding_weight_decay = 0.00 + self.after_speaker_embedding_weight_decay = 0.00 + + else: + is_first_resblock = True + + if isinstance(model_archi, dict): + cfg = model_archi + else: + # Load Yaml configuration + with open(model_archi, 'r') as fh: + cfg = yaml.load(fh, Loader=yaml.FullLoader) + + self.loss = cfg["loss"]["type"] + if self.loss == "aam": + self.aam_margin = 
cfg["loss"]["aam_margin"] + self.aam_s = cfg["loss"]["aam_s"] + + """ + Prepare Preprocessor + """ + self.preprocessor = None + if "preprocessor" in cfg: + if cfg['preprocessor']["type"] == "sincnet": + self.preprocessor = SincNet( + waveform_normalize=cfg['preprocessor']["waveform_normalize"], + sample_rate=cfg['preprocessor']["sample_rate"], + min_low_hz=cfg['preprocessor']["min_low_hz"], + min_band_hz=cfg['preprocessor']["min_band_hz"], + out_channels=cfg['preprocessor']["out_channels"], + kernel_size=cfg['preprocessor']["kernel_size"], + stride=cfg['preprocessor']["stride"], + max_pool=cfg['preprocessor']["max_pool"], + instance_normalize=cfg['preprocessor']["instance_normalize"], + activation=cfg['preprocessor']["activation"], + dropout=cfg['preprocessor']["dropout"] + ) + self.feature_size = self.preprocessor.dimension + elif cfg['preprocessor']["type"] == "rawnet2": + self.preprocessor = RawPreprocessor(nb_samp=int(cfg['preprocessor']["sampling_frequency"] * cfg['preprocessor']["duration"]), + in_channels=1, + out_channels=cfg["feature_size"], + kernel_size=cfg['preprocessor']["kernel_size"], + stride=cfg['preprocessor']["stride"], + padding=cfg['preprocessor']["padding"], + dilation=cfg['preprocessor']["dilation"]) + self.feature_size = cfg["feature_size"] + self.preprocessor_weight_decay = 0.000 + + """ + Prepare sequence network + """ + # Get Feature size + if self.feature_size is None: + self.feature_size = cfg["preprocessor"]["feature_size"] + + input_size = self.feature_size + + # Get activation function + if cfg["activation"] == 'LeakyReLU': + self.activation = torch.nn.LeakyReLU(0.2) + elif cfg["activation"] == 'PReLU': + self.activation = torch.nn.PReLU() + elif cfg["activation"] == 'ReLU6': + self.activation = torch.nn.ReLU6() + else: + self.activation = torch.nn.ReLU() + + # Create sequential object for the first part of the network + segmental_layers = [] + for k in cfg["segmental"].keys(): + if k.startswith("lin"): + segmental_layers.append((k, 
torch.nn.Linear(input_size, + cfg["segmental"][k]["output"]))) + input_size = cfg["segmental"][k]["output"] + + elif k.startswith("conv2D"): + segmental_layers.append((k, torch.nn.Conv2d(in_channels=1, + out_channels=entry_conv_out_channels, + kernel_size=entry_conv_kernel_size, + padding=3, + stride=1))) + + elif k.startswith("conv"): + segmental_layers.append((k, torch.nn.Conv1d(input_size, + cfg["segmental"][k]["output_channels"], + kernel_size=cfg["segmental"][k]["kernel_size"], + dilation=cfg["segmental"][k]["dilation"]))) + input_size = cfg["segmental"][k]["output_channels"] + + elif k.startswith("ctrans"): + segmental_layers.append((k, torch.nn.ConvTranspose1d(input_size, + cfg["segmental"][k]["output_channels"], + kernel_size=cfg["segmental"][k]["kernel_size"], + dilation=cfg["segmental"][k]["dilation"]))) + elif k.startswith("activation"): + segmental_layers.append((k, self.activation)) + + elif k.startswith('batch_norm'): + segmental_layers.append((k, torch.nn.BatchNorm1d(input_size))) + + elif k.startswith('resblock'): + segmental_layers.append((ResBlock(cfg["segmental"][k]["input_channel"], + cfg["segmental"][k]["output_channel"], + is_first_resblock))) + is_first_resblock = False + + self.sequence_network = torch.nn.Sequential(OrderedDict(segmental_layers)) + self.sequence_network_weight_decay = cfg["segmental"]["weight_decay"] + + """ + Pooling + """ + self.stat_pooling = MeanStdPooling() + tmp_input_size = input_size * 2 + if cfg["stat_pooling"]["type"] == "GRU": + self.stat_pooling = GruPooling(input_size=cfg["stat_pooling"]["input_size"], + gru_node=cfg["stat_pooling"]["gru_node"], + nb_gru_layer=cfg["stat_pooling"]["nb_gru_layer"]) + tmp_input_size = cfg["stat_pooling"]["gru_node"] + + self.stat_pooling_weight_decay = cfg["stat_pooling"]["weight_decay"] + + """ + Prepare last part of the network (after pooling) + """ + # Create sequential object for the second part of the network + input_size = tmp_input_size + before_embedding_layers = [] + for k 
in cfg["before_embedding"].keys(): + if k.startswith("lin"): + if cfg["before_embedding"][k]["output"] == "speaker_number": + before_embedding_layers.append((k, torch.nn.Linear(input_size, self.speaker_number))) + else: + before_embedding_layers.append((k, torch.nn.Linear(input_size, + cfg["before_embedding"][k]["output"]))) + input_size = cfg["before_embedding"][k]["output"] + + elif k.startswith("activation"): + before_embedding_layers.append((k, self.activation)) + + elif k.startswith('batch_norm'): + before_embedding_layers.append((k, torch.nn.BatchNorm1d(input_size))) + + elif k.startswith('dropout'): + before_embedding_layers.append((k, torch.nn.Dropout(p=cfg["before_embedding"][k]))) + + self.embedding_size = input_size + self.before_speaker_embedding = torch.nn.Sequential(OrderedDict(before_embedding_layers)) + self.before_speaker_embedding_weight_decay = cfg["before_embedding"]["weight_decay"] + + # if loss_criteria is "cce" + # Create sequential object for the second part of the network + if self.loss == "cce": + after_embedding_layers = [] + for k in cfg["after_embedding"].keys(): + if k.startswith("lin"): + if cfg["after_embedding"][k]["output"] == "speaker_number": + after_embedding_layers.append((k, torch.nn.Linear(input_size, self.speaker_number))) + else: + after_embedding_layers.append((k, torch.nn.Linear(input_size, + cfg["after_embedding"][k]["output"]))) + input_size = cfg["after_embedding"][k]["output"] + + elif k.startswith('arc'): + after_embedding_layers.append((k, ArcLinear(input_size, + self.speaker_number, + margin=self.aam_margin, + s=self.aam_s))) + + elif k.startswith("activation"): + after_embedding_layers.append((k, self.activation)) + + elif k.startswith('batch_norm'): + after_embedding_layers.append((k, torch.nn.BatchNorm1d(input_size))) + + elif k.startswith('dropout'): + after_embedding_layers.append((k, torch.nn.Dropout(p=cfg["after_embedding"][k]))) + + self.after_speaker_embedding = 
def fill_dict(target_dict, source_dict, prefix = ""):
    """
    Recursively override the default values of target_dict, in place, with
    the values found in source_dict.

    Only the keys already present in target_dict are considered. A nested
    dictionary is descended into only when source_dict also holds a dictionary
    under that key; a plain value is replaced only when source_dict provides a
    value that is not None.

    :param target_dict: output dictionary that is initialized with default values
    :param source_dict: input dictionary
    :param prefix: string extended by a tab at each recursion level (not used otherwise)
    :return:
    """
    for key in target_dict:
        if key not in source_dict:
            # nothing provided for this option: keep the default
            continue

        default = target_dict[key]
        provided = source_dict[key]

        if isinstance(default, dict):
            # nested options: recurse only when the source is nested as well
            if isinstance(provided, dict):
                fill_dict(default, provided, prefix + "\t")
        elif provided is not None:
            target_dict[key] = provided
def _load_description(description):
    """Return the content of a YAML file when given an existing path, else the object itself."""
    if isinstance(description, str) and os.path.isfile(description):
        with open(description, 'r') as fh:
            # NOTE(review): FullLoader can still build some Python objects;
            # only use it with configuration files you trust.
            return yaml.load(fh, Loader=yaml.FullLoader)
    return description


def update_training_dictionary(dataset_description,
                               model_description,
                               training_description,
                               kwargs=None):
    """
    Build the dataset, model and training option dictionaries.

    Defaults are created first, then overwritten with the values of the three
    descriptions (through fill_dict), then with the manual overrides found in
    kwargs.

    :param dataset_description: path to a YAML file or an already loaded dictionary
    :param model_description: path to a YAML file or an already loaded dictionary
    :param training_description: path to a YAML file or an already loaded dictionary
    :param kwargs: optional dictionary of overrides; the recognised keys are
        "lr", "batch_size", "optimizer", "scheduler", "margin" and "aam_s".
        ``None`` means no override.
    :return: the tuple (dataset_opts, model_opts, training_opts)
    """
    # The default is None: without this guard `"lr" in kwargs` raises TypeError.
    kwargs = {} if kwargs is None else kwargs

    tmp_data_dict = _load_description(dataset_description)
    tmp_model_dict = _load_description(model_description)
    tmp_train_dict = _load_description(training_description)

    # Default dataset options
    dataset_opts = {
        "data_path": None,
        "dataset_csv": None,
        "stratify": False,
        "data_file_extension": ".wav",
        "sample_rate": 16000,
        "validation_ratio": 0.1,
        "batch_size": 64,
        "train": {
            "duration": 4.,
            "chunk_per_segment": -1,
            "overlap": 3.9,
            "sampler": {
                "examples_per_speaker": 1,
                "samples_per_speaker": 100,
                "augmentation_replica": 1,
            },
            "transform_number": 2,
            "transformation": {
                "pipeline": "",
                "add_noise": {"noise_db_csv": "", "data_path": ""},
                "add_reverb": {"rir_db_csv": "", "data_path": ""},
            },
        },
        "valid": {
            "duration": 2.,
            "transformation": {
                "pipeline": "",
                "add_noise": {"noise_db_csv": "", "data_path": ""},
                # "rir_db_csv" mirrors the training section. Without it
                # fill_dict drops the value given by the user. The historical
                # "noise_db_csv" key is kept for backward compatibility.
                "add_reverb": {"noise_db_csv": "", "rir_db_csv": "", "data_path": ""},
            },
        },
        "test": {
            "idmap": "",
            "ndx": "",
            "key": "",
            "data_path": "",
        },
    }

    # Default model options
    model_opts = {
        "speaker_number": None,
        "embedding_size": 256,
        "loss": {"type": "aam", "aam_margin": 0.2, "aam_s": 30},
        "initial_model_name": None,
        "reset_parts": [],
        "freeze_parts": [],
        # NOTE(review): "fastresnet" is not one of the names tested by
        # get_network ("fastresnet34" is) - confirm the intended default.
        "model_type": "fastresnet",
        "preprocessor": {"type": "mel_spec", "feature_size": 80},
    }

    # Default training options
    training_opts = {
        "log_file": "sidekit.log",
        "numpy_seed": 0,
        "torch_seed": 0,
        "random_seed": 0,
        "deterministic": False,
        "epochs": 100,
        "lr": 1e-3,
        "patience": 50,
        "multi_gpu": False,
        "num_cpu": 5,
        "mixed_precision": False,
        "clipping": False,
        "optimizer": {"type": "sgd", "options": None},
        "scheduler": {
            "type": "ReduceLROnPlateau",
            "step_size_up": 10,
            "base_lr": 1e-8,
            "mode": "triangular2",
        },
        "compute_test_eer": False,
        "log_interval": 10,
        "validation_frequency": 1,
        "tmp_model_name": "tmp_model.pt",
        "best_model_name": "best_model.pt",
        # NOTE(review): stored as a string while the other frequencies are integers.
        "checkpoint_frequency": "10",
    }

    # Use options from the YAML config files
    fill_dict(dataset_opts, tmp_data_dict)
    fill_dict(model_opts, tmp_model_dict)
    fill_dict(training_opts, tmp_train_dict)

    # Overwrite with manually given parameters:
    # (key in kwargs, dictionary to update, key to update)
    overrides = (
        ("lr", training_opts, "lr"),
        ("batch_size", dataset_opts, "batch_size"),
        ("optimizer", training_opts["optimizer"], "type"),
        ("scheduler", training_opts["scheduler"], "type"),
        ("margin", model_opts["loss"], "aam_margin"),
        ("aam_s", model_opts["loss"], "aam_s"),
    )
    for option, destination, key in overrides:
        if option in kwargs:
            destination[key] = kwargs[option]

    return dataset_opts, model_opts, training_opts
def get_network(model_opts, local_rank):
    """
    Create the Xtractor described by model_opts and optionally initialise it
    from an existing checkpoint.

    :param model_opts: dictionary of model options. Keys read here:
        "model_type", "speaker_number", "loss", "embedding_size",
        "initial_model_name", "reset_parts" and "freeze_parts".
    :param local_rank: rank of the current process; the model description is
        only logged when it is lower than 1
    :return: the model
    """
    known_archis = ("xvector", "rawnet2", "resnet34", "fastresnet34",
                    "halfresnet34", "bn_halfresnet34")
    # A known name is passed as is; any other value means a custom model that
    # is described by the whole option dictionary.
    archi = model_opts["model_type"] if model_opts["model_type"] in known_archis else model_opts
    model = Xtractor(model_opts["speaker_number"],
                     archi,
                     loss=model_opts["loss"]["type"],
                     embedding_size=model_opts["embedding_size"])

    # Load the model if it exists
    initial_model = model_opts["initial_model_name"]
    if initial_model is not None and os.path.isfile(initial_model):
        logging.critical(f"*** Load model from = {model_opts['initial_model_name']}")
        # The model is still on CPU here: map the checkpoint to CPU so that
        # loading neither needs nor fills the GPU it was saved from.
        checkpoint = torch.load(initial_model, map_location="cpu")

        # Remove all layers that we don't want to reload
        reset_parts = tuple(model_opts["reset_parts"])
        pretrained_dict = {k: v for k, v in checkpoint["model_state_dict"].items()
                           if not (reset_parts and k.startswith(reset_parts))}

        new_model_dict = model.state_dict()
        new_model_dict.update(pretrained_dict)
        model.load_state_dict(new_model_dict)

        # Freeze required layers. The test used to look at "reset_parts",
        # which froze the freshly re-initialised layers and left the
        # "freeze_parts" option without any effect.
        # NOTE(review): kept inside the checkpoint branch, as the flattened
        # source suggests - confirm against the original layout.
        freeze_parts = model_opts.get("freeze_parts", [])
        for name, param in model.named_parameters():
            if name.split(".")[0] in freeze_parts:
                param.requires_grad = False

    loss_opts = model_opts["loss"]
    if loss_opts["type"] == "aam" and not (loss_opts["aam_margin"] == 0.2 and loss_opts["aam_s"] == 30):
        model.after_speaker_embedding.change_params(loss_opts["aam_s"], loss_opts["aam_margin"])
        print(f"Modified AAM: margin = {model.after_speaker_embedding.m} and s = {model.after_speaker_embedding.s}")

    if local_rank < 1:
        logging.info(model)
        # Trainable parameters of the embedding extractor only (the
        # preprocessor and the output layer are not counted).
        counted_parts = (model.sequence_network,
                         model.before_speaker_embedding,
                         model.stat_pooling)
        logging.info("Model_parameters_count: {:d}".format(
            sum(p.numel()
                for part in counted_parts
                for p in part.parameters()
                if p.requires_grad)))

    return model
def get_loaders(dataset_opts, training_opts, model_opts, local_rank=0):
    """
    Set the dataloaders according to the dataset options.

    The dataframe is loaded from the CSV file and split into a training and a
    validation part, then a dataset, a sampler and a loader are created.

    :param dataset_opts: dictionary of dataset options
    :param training_opts: dictionary of training options
    :param model_opts: dictionary of model options
    :param local_rank: rank of the current process, used to shift the torch
        seed and as rank of the sampler in multi-GPU mode
    :return: the tuple (training_loader, validation_loader, side_sampler,
        tar_indices, non_indices) where the last two are boolean matrices
        selecting the target and non-target validation trials
    """
    df = pandas.read_csv(dataset_opts["dataset_csv"])

    stratify = df["speaker_idx"] if dataset_opts["stratify"] else None
    # No random_state is given: the split depends on the global numpy seed.
    training_df, validation_df = train_test_split(df,
                                                  test_size=dataset_opts["validation_ratio"],
                                                  stratify=stratify)

    torch.manual_seed(training_opts['torch_seed'] + local_rank)
    torch.cuda.manual_seed(training_opts['torch_seed'] + local_rank)

    # Development mode (DEV4S=True): tiny batches and datasets cached on disk.
    train_DEV4S = "/tmp/DEV4S_train_side_set.pl"
    val_DEV4S = "/tmp/DEV4S_val_side_set.pl"
    dev_mode = os.environ.get('DEV4S') == 'True'
    if dev_mode:
        dataset_opts["batch_size"] = 4

    if dev_mode and os.path.isfile(train_DEV4S) and os.environ.get('DEV4S_KeepDataPrep') != "True":
        # SECURITY NOTE(review): unpickling files from a fixed, world-writable
        # /tmp path executes whatever another local user stored there.
        # Acceptable for a private development box only.
        with open(train_DEV4S, "rb") as f:
            training_set = pickle.load(f)
        with open(val_DEV4S, "rb") as f:
            validation_set = pickle.load(f)
    else:
        # NOTE(review): chunk_per_segment is hard-coded to -1 and the value of
        # dataset_opts["train"]["chunk_per_segment"] is not used here.
        training_set = SideSet(dataset_opts,
                               set_type="train",
                               chunk_per_segment=-1,
                               transform_number=dataset_opts['train']['transform_number'],
                               overlap=dataset_opts['train']['overlap'],
                               dataset_df=training_df,
                               output_format="pytorch",
                               )

        validation_set = SideSet(dataset_opts,
                                 set_type="validation",
                                 dataset_df=validation_df,
                                 output_format="pytorch")

    if dev_mode and not os.path.isfile(train_DEV4S):
        with open(train_DEV4S, "wb") as f:
            pickle.dump(training_set, f)
        with open(val_DEV4S, "wb") as f:
            pickle.dump(validation_set, f)

    samples_per_speaker = 2 if model_opts["loss"]["type"] == 'aps' else 1

    sampler_opts = dataset_opts["train"]["sampler"]
    if training_opts["multi_gpu"]:
        world_size = int(os.environ['WORLD_SIZE'])
        assert dataset_opts["batch_size"] % world_size == 0
        assert dataset_opts["batch_size"] % samples_per_speaker == 0
        sampler_rank = local_rank
    else:
        world_size = 1
        sampler_rank = 0

    # Number of sampler entries per process and per batch
    batch_size = dataset_opts["batch_size"] // (world_size * sampler_opts["examples_per_speaker"])

    side_sampler = SideSampler(data_source=training_set.sessions['speaker_idx'],
                               spk_count=model_opts["speaker_number"],
                               examples_per_speaker=sampler_opts["examples_per_speaker"],
                               samples_per_speaker=sampler_opts["samples_per_speaker"],
                               batch_size=batch_size,
                               seed=training_opts['torch_seed'],
                               rank=sampler_rank,
                               num_process=world_size,
                               num_replicas=sampler_opts["augmentation_replica"]
                               )

    training_loader = DataLoader(training_set,
                                 batch_size=batch_size * sampler_opts["augmentation_replica"],
                                 shuffle=False,
                                 drop_last=True,
                                 pin_memory=True,
                                 sampler=side_sampler,
                                 num_workers=training_opts["num_cpu"],
                                 persistent_workers=False,
                                 worker_init_fn=seed_worker)

    validation_loader = DataLoader(validation_set,
                                   batch_size=batch_size,
                                   drop_last=False,
                                   pin_memory=True,
                                   num_workers=training_opts["num_cpu"],
                                   persistent_workers=False,
                                   worker_init_fn=seed_worker)

    # Compute indices for target and non-target trials once only to avoid
    # recomputing for each epoch. A 64-bit tensor is used: a ShortTensor wraps
    # around above 32767, which would pair unrelated speakers on large sets.
    classes = torch.as_tensor(validation_set.sessions['speaker_idx'].to_numpy(), dtype=torch.long)
    mask = classes.unsqueeze(1) == classes.unsqueeze(1).T
    tar_indices = torch.tril(mask, -1).numpy()
    non_indices = torch.tril(~mask, -1).numpy()

    # Select a subset of non-target trials to reduce the number of tests
    tar_non_ratio = numpy.sum(tar_indices) / numpy.sum(non_indices)
    non_indices *= (numpy.random.rand(*non_indices.shape) < tar_non_ratio)

    return training_loader, validation_loader, side_sampler, tar_indices, non_indices
def get_optimizer(model, model_opts, train_opts, training_loader):
    """
    Create the optimizer and the learning rate scheduler.

    One parameter group is created per part of the network so that each part
    uses its own weight decay.

    :param model: an Xtractor, or a wrapper exposing the Xtractor as ``model.module``
    :param model_opts: dictionary of model options (not used at the moment)
    :param train_opts: dictionary of training options; "lr", "optimizer" and
        "scheduler" are read
    :param training_loader: training loader, its length is the step size of
        the "StepLR" and "StepLR2" schedulers
    :return: the tuple (optimizer, scheduler)
    """
    optimizer_type = train_opts["optimizer"]["type"]
    if optimizer_type == 'adam':
        _optimizer = torch.optim.Adam
        _options = {'lr': train_opts["lr"]}
    elif optimizer_type == 'rmsprop':
        _optimizer = torch.optim.RMSprop
        _options = {'lr': train_opts["lr"]}
    else:  # train_opts["optimizer"]["type"] == 'sgd'
        _optimizer = torch.optim.SGD
        _options = {'lr': train_opts["lr"], 'momentum': 0.9}
    # NOTE(review): train_opts["optimizer"]["options"] is never applied.

    # isinstance (rather than an exact type test) keeps subclasses of
    # Xtractor from being treated as a wrapped model.
    network = model if isinstance(model, Xtractor) else model.module

    param_list = []
    if network.preprocessor is not None:
        param_list.append({'params': network.preprocessor.parameters(),
                           'weight_decay': network.preprocessor_weight_decay})
    for part in ("sequence_network", "stat_pooling",
                 "before_speaker_embedding", "after_speaker_embedding"):
        param_list.append({'params': getattr(network, part).parameters(),
                           'weight_decay': getattr(network, part + "_weight_decay")})

    optimizer = _optimizer(param_list, **_options)

    scheduler_type = train_opts["scheduler"]["type"]
    if scheduler_type == 'CyclicLR':
        # Adam has no momentum to cycle
        cycle_momentum = optimizer_type != "adam"
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer=optimizer,
                                                      base_lr=train_opts["scheduler"]["base_lr"],
                                                      max_lr=train_opts["lr"],
                                                      step_size_up=train_opts["scheduler"]["step_size_up"],
                                                      step_size_down=None,
                                                      cycle_momentum=cycle_momentum,
                                                      mode=train_opts["scheduler"]["mode"])

    elif scheduler_type == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                         milestones=[10000, 50000, 100000],
                                                         gamma=0.5)

    elif scheduler_type in ("StepLR", "StepLR2"):
        # Both decay once per pass over the loader; only the factor differs.
        gamma = 0.95 if scheduler_type == "StepLR" else 0.5
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,
                                                    step_size=1 * len(training_loader),
                                                    gamma=gamma)
    else:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                               mode='min',
                                                               factor=0.5,
                                                               patience=3000,
                                                               verbose=True)

    return optimizer, scheduler
def save_model(model, training_monitor, model_opts, training_opts, optimizer, scheduler, epoch=None):
    """
    Save a checkpoint of the current model through save_checkpoint.

    :param model: an Xtractor, or a wrapper exposing the Xtractor as ``model.module``
    :param training_monitor: monitor providing the current epoch, the best
        accuracy and the ``is_best`` flag
    :param model_opts: dictionary of model options, stored as 'model_archi'
    :param training_opts: dictionary of training options; gives the names of
        the temporary and best model files
    :param optimizer: optimizer whose state is stored
    :param scheduler: scheduler, stored as an object
    :param epoch: when not None, "_epoch<epoch>" is appended to the name of
        the best model file
    :return: None
    """
    best_name = training_opts["best_model_name"]
    tmp_name = training_opts["tmp_model_name"]

    if epoch is not None:
        best_name = best_name + f"_epoch{epoch}"

    # TODO: to be reworked
    # Save the bare network so that the checkpoint can be reloaded without
    # the parallel wrapper.
    network = model if isinstance(model, Xtractor) else model.module

    save_checkpoint({
        'epoch': training_monitor.current_epoch,
        'model_state_dict': network.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'accuracy': training_monitor.best_accuracy,
        'scheduler': scheduler,
        'speaker_number': network.speaker_number,
        'model_archi': model_opts,
        'loss': model_opts["loss"]["type"]
    }, training_monitor.is_best, filename=tmp_name, best_filename=best_name)
class AAMScheduler():
    """
    Scheduler of the additive angular margin.

    For now we only update the margin.
    """
    def __init__(self, original_margin, final_margin, final_steps_nb, update_frequency, mode='lin', Tau=1, verbose=True):
        """
        :param original_margin: margin used before the first update
        :param final_margin: margin to reach
        :param final_steps_nb: number of steps after which the linear schedule
            reaches final_margin (the exponential one uses it as time scale)
        :param update_frequency: the margin is recomputed every
            update_frequency steps
        :param mode: "lin" for a linear schedule, any other value for an
            exponential one
        :param Tau: stored, not used by the current schedules
        :param verbose: stored, not used by the current schedules
        """
        self.current_margin = original_margin
        self.original_margin = original_margin
        self.final_margin = final_margin
        self.final_steps_nb = final_steps_nb
        self.update_frequency = update_frequency
        self.mode = mode
        self.Tau = Tau
        self.verbose = verbose
        self._counter = 0

    def __step__(self):
        """
        Count one step and return the margin to use.

        :return: the current margin
        """
        self._counter += 1

        if self._counter % self.update_frequency == 0:
            # update the parameters
            if self.mode == "lin":
                # Clamp the progress: without it the margin keeps growing
                # past final_margin once final_steps_nb is exceeded.
                progress = min(1.0, self._counter / self.final_steps_nb)
            else:
                progress = 1 - numpy.exp(-self._counter / (self.final_steps_nb / 7))
            self.current_margin = self.original_margin + \
                (self.final_margin - self.original_margin) * progress

        return self.current_margin

    def step(self):
        """Public alias of ``__step__``."""
        return self.__step__()
def xtrain(dataset_description,
           model_description,
           training_description,
           **kwargs):
    """
    Train an x-vector extractor.

    TODO (refactoring): when restarting from an existing model, reload the
    optimizer and the scheduler.

    :param dataset_description: path to a YAML file or dictionary describing the dataset
    :param model_description: path to a YAML file or dictionary describing the model
    :param training_description: path to a YAML file or dictionary describing the training
    :param kwargs: manual overrides, see update_training_dictionary
    :return: the best equal error rate stored in the training monitor
    """
    torch.multiprocessing.set_start_method('forkserver', force=True)

    local_rank = -1
    if "RANK" in os.environ:
        local_rank = int(os.environ['RANK'])

    # WORLD_SIZE is only exported by the distributed launcher: default to a
    # single process instead of failing with KeyError on single-GPU runs.
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    # Test to optimize
    # torch.backends.cudnn.benchmark = True
    torch.autograd.profiler.emit_nvtx(enabled=False)

    dataset_opts, model_opts, training_opts = update_training_dictionary(dataset_description,
                                                                         model_description,
                                                                         training_description,
                                                                         kwargs)
    if os.environ.get('DEV4S') == 'True':
        training_opts["multi_gpu"] = False

    # Initialize the training monitor
    monitor = TrainingMonitor(output_file=training_opts["log_file"],
                              patience=training_opts["patience"],
                              best_accuracy=0.0,
                              best_eer_epoch=1,
                              best_eer=100,
                              compute_test_eer=training_opts["compute_test_eer"])

    # Make PyTorch deterministic when requested
    torch.backends.cudnn.deterministic = bool(training_opts["deterministic"])

    # Set all the seeds
    random.seed(training_opts["random_seed"])
    numpy.random.seed(training_opts["numpy_seed"])  # Set the random seed of numpy for the data split.
    torch.manual_seed(training_opts["torch_seed"])
    torch.cuda.manual_seed(training_opts["torch_seed"])

    # Display the entire configurations as YAML dictionaries
    if local_rank < 1:
        monitor.logger.info("\n*********************************\nDataset options\n*********************************\n")
        monitor.logger.info(yaml.dump(dataset_opts, default_flow_style=False))
        monitor.logger.info("\n*********************************\nModel options\n*********************************\n")
        monitor.logger.info(yaml.dump(model_opts, default_flow_style=False))
        monitor.logger.info("\n*********************************\nTraining options\n*********************************\n")
        monitor.logger.info(yaml.dump(training_opts, default_flow_style=False))

    # Initialize the model
    model = get_network(model_opts, local_rank)
    if local_rank < 1:
        monitor.logger.info(model)

    # Read before the model is wrapped for distributed training
    embedding_size = model.embedding_size

    # Set the device and manage parallel processing
    torch.cuda.set_device(local_rank)
    if local_rank >= 0:
        device = torch.device(local_rank)
    else:
        device = torch.device("cuda")

    if training_opts["multi_gpu"]:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    model.to(device)

    # [HOW TO] from https://gist.github.com/sgraaf/5b0caa3a320f28c27c12b5efeb35aa4c
    #   - Add the following line right after "if __name__ == '__main__':" in your main script :
    #         parser.add_argument('--local_rank', type=int, default=-1, metavar='N', help='Local process rank.')
    #   - Then, in your shell :
    #         export NUM_NODES=1
    #         export NUM_GPUS_PER_NODE=2
    #         export NODE_RANK=0
    #         export WORLD_SIZE=$(($NUM_NODES * $NUM_GPUS_PER_NODE))
    #         python -m torch.distributed.launch \
    #             --nproc_per_node=$NUM_GPUS_PER_NODE \
    #             --nnodes=$NUM_NODES \
    #             --node_rank $NODE_RANK \
    #             train_xvector.py ...
    if training_opts["multi_gpu"]:
        if local_rank < 1:
            print("Let's use", world_size, "GPUs!")
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[local_rank],
                                                          output_device=local_rank)
    else:
        print("Train on a single GPU")

    # Initialise data loaders
    training_loader, validation_loader, \
        sampler, validation_tar_indices, validation_non_indices = get_loaders(dataset_opts,
                                                                              training_opts,
                                                                              model_opts,
                                                                              local_rank)

    if local_rank < 1:
        monitor.logger.info(f"Start training process")
        monitor.logger.info(f"Use \t{world_size} \tgpus")
        monitor.logger.info(f"Use \t{training_opts['num_cpu']} \tcpus")

        monitor.logger.info(f"Validation EER will be measured using")
        monitor.logger.info(f"\t {numpy.sum(validation_tar_indices)} target trials and")
        monitor.logger.info(f"\t {numpy.sum(validation_non_indices)} non-target trials")

    # Create optimizer and scheduler
    optimizer, scheduler = get_optimizer(model, model_opts, training_opts, training_loader)

    scaler = None
    if training_opts["mixed_precision"]:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(1, training_opts["epochs"] + 1):

        monitor.update(epoch=epoch)

        # Stop when the patience of the monitor is exhausted
        if monitor.current_patience == 0:
            print(f"Stopping at epoch {epoch} for cause of patience")
            break

        sampler.set_epoch(epoch)
        if training_opts["multi_gpu"]:
            torch.distributed.barrier()

        # Process one epoch and return the current model. The "clipping"
        # option is forwarded; it used to be silently ignored.
        model = train_epoch(model,
                            training_opts,
                            monitor,
                            training_loader,
                            optimizer,
                            scheduler,
                            device,
                            scaler=scaler,
                            clipping=training_opts["clipping"])

        # Cross validation
        if math.fmod(epoch, training_opts["validation_frequency"]) == 0:
            val_acc, val_loss, val_eer = cross_validation(model,
                                                          validation_loader,
                                                          device,
                                                          [len(validation_loader.dataset), embedding_size],
                                                          validation_tar_indices,
                                                          validation_non_indices,
                                                          training_opts["mixed_precision"])

            test_eer = None
            if training_opts["compute_test_eer"] and local_rank < 1:
                test_eer = test_metrics(model, device, model_opts, dataset_opts, training_opts)

            monitor.update(test_eer=test_eer,
                           val_eer=val_eer,
                           val_loss=val_loss,
                           val_acc=val_acc)

            if local_rank < 1:
                monitor.display()

            # Save the current model and if needed update the best one
            # TODO: add an option that keeps the models of given epochs
            # (for instance before a change of learning rate)
            if local_rank < 1:
                save_model(model, monitor, model_opts, training_opts, optimizer, scheduler, epoch)

    for ii in range(world_size):
        monitor.logger.info(torch.cuda.memory_summary(ii))

    # TODO: handle the display with the training monitor
    if local_rank < 1:
        monitor.display_final()

    return monitor.best_eer
def _train_batch_loss(model, criterion, loss_criteria, data, target, device, mixed_precision):
    """Run the model on one batch and return (loss, scores used to measure the accuracy)."""
    if loss_criteria == 'aam':
        output, _ = model(data, target=target)
        # The output layer may return (scores with margin, scores without
        # margin); accept a plain tensor as well.
        if isinstance(output, (tuple, list)):
            output, no_margin_output = output
        else:
            no_margin_output = output
        return criterion(output, target), no_margin_output

    if loss_criteria == 'smn':
        (loss, output), _ = model(data, target=target)
        return loss + criterion(output, target), output

    if loss_criteria == 'aps':
        (first, output), _ = model(data, target=target)
        if mixed_precision:
            # NOTE(review): the mixed precision path historically takes the
            # first element as the loss while the other path treats it as a
            # similarity matrix. Both are kept as they were - confirm which
            # one matches the output layer.
            return first, output
        pair_targets = torch.arange(0, int(data.shape[0] / 2), device=device)
        return criterion(first, pair_targets) + criterion(output, target), output

    output, _ = model(data, target=None)
    return criterion(output, target), output


def train_epoch(model,
                training_opts,
                training_monitor,
                training_loader,
                optimizer,
                scheduler,
                device,
                scaler=None,
                clipping=False,
                aam_scheduler=None):
    """
    Train the model for one pass over the training loader.

    :param model: an Xtractor, or a wrapper exposing the Xtractor as ``model.module``
    :param training_opts: dictionary of training options; "log_interval" is read
    :param training_monitor: monitor updated and used for logging every
        "log_interval" batches
    :param training_loader: loader yielding (data, target) batches
    :param optimizer: optimizer to step
    :param scheduler: learning rate scheduler, stepped after every batch
    :param device: device on which the batches are moved
    :param scaler: gradient scaler; when given, mixed precision is used
    :param clipping: when True the gradient norm is clipped to 1
    :param aam_scheduler: optional margin scheduler, stepped after every batch
    :return: the model
    """
    model.train()
    criterion = torch.nn.CrossEntropyLoss(reduction='mean')

    network = model if isinstance(model, Xtractor) else model.module
    loss_criteria = network.loss

    # Statistics accumulated since the last log line
    accuracy = 0.0
    running_loss = 0.0
    batch_count = 0
    for batch_idx, (data, target) in enumerate(training_loader):
        # NOTE(review): squeeze() also removes the batch axis of a batch of size 1.
        data = data.squeeze().to(device)
        target = target.squeeze().to(device)
        optimizer.zero_grad(set_to_none=True)

        if scaler is not None:
            with torch.cuda.amp.autocast():
                loss, scores = _train_batch_loss(model, criterion, loss_criteria,
                                                 data, target, device, True)
            scaler.scale(loss).backward()
            if clipping:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss, scores = _train_batch_loss(model, criterion, loss_criteria,
                                             data, target, device, False)
            loss.backward()
            if clipping:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            optimizer.step()

        running_loss += loss.item()
        accuracy += (torch.argmax(scores.data, 1) == target).sum().item()
        batch_count += 1

        if math.fmod(batch_idx, training_opts["log_interval"]) == 0:
            batch_size = target.shape[0]
            # The counters are reset after each log line, so the accuracy is
            # relative to the batches seen since then.
            interval_accuracy = 100.0 * accuracy / (batch_count * batch_size)
            training_monitor.update(training_loss=loss.item(),
                                    training_acc=interval_accuracy)

            training_monitor.logger.info('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.3f}\tLr: {}'.format(
                training_monitor.current_epoch,
                batch_idx + 1,
                len(training_loader),
                100. * batch_idx / len(training_loader),
                running_loss / batch_count,
                interval_accuracy,
                # Read from the optimizer: get_last_lr() is not available on
                # every scheduler (e.g. ReduceLROnPlateau).
                optimizer.param_groups[0]['lr'],
            ))
            running_loss = 0.0
            accuracy = 0.0
            batch_count = 0

        # The schedulers are configured in number of batches, so they are
        # stepped once per batch.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(training_monitor.best_eer)
        else:
            scheduler.step()
        if aam_scheduler is not None:
            # NOTE(review): elsewhere the margin of the output layer is
            # reported as ".m" - confirm that ".margin" is the attribute it reads.
            network.after_speaker_embedding.margin = aam_scheduler.__step__()
    return model
def cross_validation(model, validation_loader, device, validation_shape, tar_indices, non_indices, mixed_precision=False):
    """
    Evaluate the model on the validation set.

    :param model: an Xtractor, or a wrapper exposing the Xtractor as ``model.module``
    :param validation_loader: loader yielding (data, target) batches
    :param device: device on which the batches are moved
    :param validation_shape: [number of validation segments, embedding size],
        used to allocate the embedding matrix
    :param tar_indices: boolean matrix selecting the target trials
    :param non_indices: boolean matrix selecting the non-target trials
    :param mixed_precision: when True the forward pass runs under autocast
    :return: the tuple (accuracy in percent, mean loss per batch, equal error rate)
    """
    model.eval()
    network = model if isinstance(model, Xtractor) else model.module
    loss_criteria = network.loss

    accuracy = 0.0
    loss = 0.0
    criterion = torch.nn.CrossEntropyLoss()
    embeddings = torch.zeros(validation_shape)
    cursor = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(tqdm.tqdm(validation_loader, desc='validation compute', mininterval=1, disable=None)):
            if target.dim() != 1:
                target = target.squeeze()
            target = target.to(device)
            batch_size = target.shape[0]
            data = data.squeeze().to(device)
            with torch.cuda.amp.autocast(enabled=mixed_precision):
                output, batch_embeddings = model(data, target=None, is_eval=True)
                if loss_criteria == 'cce':
                    batch_embeddings = l2_norm(batch_embeddings)
                if loss_criteria == 'smn':
                    batch_embeddings, batch_predictions = output
                else:
                    batch_predictions = output
                accuracy += (torch.argmax(batch_predictions.data, 1) == target).sum()
                loss += criterion(batch_predictions, target)
            embeddings[cursor:cursor + batch_size, :] = batch_embeddings.detach().cpu()
            cursor += batch_size

    # Score every pair of segments; large sets are scored on CPU
    local_device = "cpu" if embeddings.shape[0] > 3e4 else device
    embeddings = embeddings.to(local_device)
    scores = torch.einsum('ij,kj', embeddings, embeddings).cpu().numpy()
    negatives = scores[non_indices]
    positives = scores[tar_indices]

    pmiss, pfa = rocch(positives, negatives)
    equal_error_rate = rocch2eer(pmiss, pfa)

    # The criterion already averages over the batch, so the sum of the batch
    # losses is divided by the number of batches only (it used to be divided
    # by the size of the last batch as well).
    return (100. * accuracy.cpu().numpy() / validation_shape[0],
            loss.cpu().numpy() / (batch_idx + 1),
            equal_error_rate)
def extract_embeddings(idmap_name,
                       model_filename,
                       data_root_name,
                       device,
                       batch_size=1,
                       file_extension="wav",
                       transform_pipeline=None,
                       sliding_window=False,
                       win_duration=3.,
                       win_shift=1.5,
                       num_thread=1,
                       sample_rate=16000,
                       mixed_precision=False,
                       norm_embeddings=True):
    """
    Extract one embedding per segment (or per window) of an IdMap.

    :param idmap_name: an IdMap or the name of a file to load it from
    :param model_filename: name of a checkpoint file or an already loaded model
    :param data_root_name: directory where the audio files are stored
    :param device: device used for the extraction
    :param batch_size: size of the batches, forced to 1 with a sliding window
    :param file_extension: extension of the audio files
    :param transform_pipeline: transformations applied to the signal,
        ``None`` (default) means an empty dictionary
    :param sliding_window: when True embeddings are extracted on sliding windows
    :param win_duration: duration of the windows, also used as minimum duration
    :param win_shift: shift between two windows
    :param num_thread: number of workers of the data loader
    :param sample_rate: sample rate of the audio
    :param mixed_precision: when True the forward pass runs under autocast
    :param norm_embeddings: forwarded to the model as ``norm_embedding``
    :return: a StatServer containing the embeddings in ``stat1``
    """
    # A fresh dictionary per call instead of a shared mutable default
    if transform_pipeline is None:
        transform_pipeline = {}

    if sliding_window:
        batch_size = 1

    # Load the model
    if isinstance(model_filename, str):
        checkpoint = torch.load(model_filename, map_location=device)
        speaker_number = checkpoint["speaker_number"]
        model_opts = checkpoint["model_archi"]
        # Only fall back to 256 when the checkpoint does not store the size;
        # overwriting it breaks the loading of models trained with another one.
        model_opts.setdefault("embedding_size", 256)
        model = Xtractor(speaker_number,
                         model_archi=model_opts["model_type"],
                         loss=model_opts["loss"]["type"],
                         embedding_size=model_opts["embedding_size"])
        model.load_state_dict(checkpoint["model_state_dict"])
    else:
        model = model_filename

    idmap = idmap_name if isinstance(idmap_name, IdMap) else IdMap(idmap_name)

    # Create dataset to load the data
    dataset = IdMapSet(idmap_name=idmap,
                       data_path=data_root_name,
                       file_extension=file_extension,
                       transform_pipeline=transform_pipeline,
                       transform_number=0,
                       sliding_window=sliding_window,
                       window_len=win_duration,
                       window_shift=win_shift,
                       sample_rate=sample_rate,
                       min_duration=win_duration
                       )

    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            drop_last=False,
                            pin_memory=True,
                            num_workers=num_thread)

    with torch.no_grad():
        model.eval()
        model.to(device)

        embed = []
        modelset = []
        segset = []
        starts = []

        for idx, (data, mod, seg, start, stop) in enumerate(tqdm.tqdm(dataloader,
                                                                      desc='xvector extraction',
                                                                      mininterval=1,
                                                                      disable=None)):

            if data.dim() > 2:
                data = data.squeeze()

            with torch.cuda.amp.autocast(enabled=mixed_precision):
                # Process long inputs by chunks of about 100 rows
                tmp_data = torch.split(data, data.shape[0] // (max(1, data.shape[0] // 100)))
                for td in tmp_data:
                    _, vec = model(x=td.to(device), is_eval=True, norm_embedding=norm_embeddings)
                    embed.append(vec.detach().cpu())

            modelset.extend(mod * data.shape[0])
            segset.extend(seg * data.shape[0])
            if sliding_window:
                tmp_start = numpy.arange(0, data.shape[0] * win_shift, win_shift)
                starts.extend(tmp_start * sample_rate + start.detach().cpu().numpy())
            else:
                starts.append(start.numpy())

        embeddings = StatServer()
        embeddings.stat1 = numpy.concatenate(embed)
        embeddings.modelset = numpy.array(modelset).astype('>U')
        embeddings.segset = numpy.array(segset).astype('>U')
        embeddings.start = numpy.array(starts).squeeze()
        # NOTE(review): win_duration is added as is to the start values -
        # confirm that both use the same unit.
        embeddings.stop = embeddings.start + win_duration
        embeddings.stat0 = numpy.ones((embeddings.modelset.shape[0], 1))

    return embeddings
def extract_embeddings_per_speaker(idmap_name,
                                   model_filename,
                                   data_root_name,
                                   device,
                                   file_extension="wav",
                                   transform_pipeline=None,
                                   sample_rate=16000,
                                   mixed_precision=False,
                                   num_thread=1):
    """
    Extract one embedding per entry of the per-speaker dataset built from an IdMap.

    :param idmap_name: an IdMap or the name of a file to load it from
    :param model_filename: name of a checkpoint file or an already loaded model
    :param data_root_name: directory where the audio files are stored
    :param device: device used for the extraction
    :param file_extension: extension of the audio files
    :param transform_pipeline: transformations applied to the signal,
        ``None`` (default) means an empty dictionary
    :param sample_rate: sample rate of the audio
    :param mixed_precision: when True the forward pass runs under autocast
    :param num_thread: number of workers of the data loader
    :return: a StatServer containing the embeddings in ``stat1``
    """
    # A fresh dictionary per call instead of a shared mutable default
    if transform_pipeline is None:
        transform_pipeline = {}

    # Load the model
    if isinstance(model_filename, str):
        checkpoint = torch.load(model_filename, map_location=device)
        speaker_number = checkpoint["speaker_number"]
        model_opts = checkpoint["model_archi"]
        # Rebuild the network with the embedding size stored in the
        # checkpoint when there is one, as extract_embeddings does.
        size_opts = {}
        if "embedding_size" in model_opts:
            size_opts["embedding_size"] = model_opts["embedding_size"]
        model = Xtractor(speaker_number,
                         model_archi=model_opts["model_type"],
                         loss=model_opts["loss"]["type"],
                         **size_opts)
        model.load_state_dict(checkpoint["model_state_dict"])
    else:
        model = model_filename

    idmap = idmap_name if isinstance(idmap_name, IdMap) else IdMap(idmap_name)

    # Create dataset to load the data
    dataset = IdMapSetPerSpeaker(idmap_name=idmap,
                                 data_path=data_root_name,
                                 file_extension=file_extension,
                                 transform_pipeline=transform_pipeline,
                                 sample_rate=sample_rate,
                                 min_duration=1.)

    dataloader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            drop_last=False,
                            pin_memory=True,
                            num_workers=num_thread)

    with torch.no_grad():
        model.eval()
        model.to(device)

        # Get the size of embeddings to extract
        emb_size = model.embedding_size

        # Create the StatServer
        embeddings = StatServer()
        embeddings.modelset = dataset.output_im.leftids
        embeddings.segset = dataset.output_im.rightids
        embeddings.start = dataset.output_im.start
        embeddings.stop = dataset.output_im.stop
        embeddings.stat0 = numpy.ones((embeddings.modelset.shape[0], 1))
        embeddings.stat1 = numpy.ones((embeddings.modelset.shape[0], emb_size))

        # Process the data
        max_length = 20000000
        with torch.cuda.amp.autocast(enabled=mixed_precision):
            for idx, (data, mod, seg, start, stop) in enumerate(tqdm.tqdm(dataloader,
                                                                          desc='xvector extraction',
                                                                          mininterval=1)):
                # Truncate very long inputs along the last axis
                if data.shape[1] > max_length:
                    data = data[..., :max_length]
                _, vec = model(x=data.to(device), is_eval=True, norm_embedding=True)
                embeddings.stat1[idx, :] = vec.detach().cpu()

    return embeddings
Xtractor(speaker_number, model_archi=model_opts["model_type"], loss=model_opts["loss"]["type"]) + model.load_state_dict(checkpoint["model_state_dict"]) + else: + model = model_filename + + if isinstance(idmap_name, IdMap): + idmap = idmap_name + else: + idmap = IdMap(idmap_name) + + # Create dataset to load the data + dataset = IdMapSetPerSpeaker(idmap_name=idmap, + data_path=data_root_name, + file_extension=file_extension, + transform_pipeline=transform_pipeline, + sample_rate=sample_rate, + min_duration=1.) + + dataloader = DataLoader(dataset, + batch_size=1, + shuffle=False, + drop_last=False, + pin_memory=True, + num_workers=num_thread) + + with torch.no_grad(): + model.eval() + model.to(device) + + # Get the size of embeddings to extract + emb_size = model.embedding_size + + # Create the StatServer + embeddings = StatServer() + embeddings.modelset = dataset.output_im.leftids + embeddings.segset = dataset.output_im.rightids + embeddings.start = dataset.output_im.start + embeddings.stop = dataset.output_im.stop + embeddings.stat0 = numpy.ones((embeddings.modelset.shape[0], 1)) + embeddings.stat1 = numpy.ones((embeddings.modelset.shape[0], emb_size)) + + # Process the data + with torch.cuda.amp.autocast(enabled=mixed_precision): + for idx, (data, mod, seg, start, stop) in enumerate(tqdm.tqdm(dataloader, + desc='xvector extraction', + mininterval=1)): + if data.shape[1] > 20000000: + data = data[..., :20000000] + _, vec = model(x=data.to(device), is_eval=True, norm_embedding=True) + embeddings.stat1[idx, :] = vec.detach().cpu() + + return embeddings + diff --git a/sidekit/sidekit/score_normalization.py b/sidekit/sidekit/score_normalization.py new file mode 100644 index 0000000..65360e9 --- /dev/null +++ b/sidekit/sidekit/score_normalization.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. 
def _as_column(values):
    """Return *values* as a 2-D column so it broadcasts over the rows of a score matrix."""
    return numpy.asarray(values).reshape(-1, 1)


def znorm(enrol_test_scores, enrol_imp_scores, sym=False):
    """
    Z-normalise scores: each model (row) is centred and scaled with the statistics of
    the scores of that same model against an impostor cohort.

    imp_scores are formed by scoring each enrollment utterance with all files from the
    impostor cohort, thus enrol_test_scores.modelset and enrol_imp_scores.modelset must
    be the same. This function assumes that all models from enrol_test_scores are in
    enrol_imp_scores.

    Note that ``enrol_imp_scores`` is sorted in place.

    :param enrol_test_scores: scores to normalise (not modified, a sorted copy is returned)
    :param enrol_imp_scores: scores of the same models against the impostor cohort
    :param sym: if True, ``enrol_imp_scores`` is a square cohort-versus-cohort matrix and
        the diagonal (each file scored against itself) is excluded from the statistics
    :return: a sorted copy of ``enrol_test_scores`` with a z-normalised ``scoremat``
    """
    scores_znorm = copy.deepcopy(enrol_test_scores)
    scores_znorm.sort()
    enrol_imp_scores.sort()

    cohort = enrol_imp_scores.scoremat
    if sym:
        # Leave the diagonal out of both the mean and the deviation
        cohort_size = cohort.shape[1] - 1
        mean_per_model = (cohort.sum(1) - numpy.diag(cohort)) / cohort_size
        squared_dev = numpy.square(cohort - _as_column(mean_per_model))
        # Square root of the variance: the non symmetric branch divides by a standard
        # deviation, so this one has to as well
        std_per_model = numpy.sqrt((squared_dev.sum(1) - numpy.diag(squared_dev)) / cohort_size)
    else:
        mean_per_model = cohort.mean(1)
        std_per_model = cohort.std(1)

    # Statistics are per model, i.e. per row: they must be applied as columns, otherwise
    # numpy broadcasts them along the segment axis
    scores_znorm.scoremat = (scores_znorm.scoremat - _as_column(mean_per_model)) / _as_column(std_per_model)

    return scores_znorm


def tnorm(enrol_test_scores, imp_test_scores):
    """
    T-normalise scores: each test segment (column) is centred and scaled with the
    statistics of the scores of impostor models against that same segment.

    Note that ``imp_test_scores`` is sorted in place.

    :param enrol_test_scores: scores to normalise (not modified, a sorted copy is returned)
    :param imp_test_scores: scores of the impostor models against the same test segments
    :return: a sorted copy of ``enrol_test_scores`` with a t-normalised ``scoremat``
    """
    scores_tnorm = copy.deepcopy(enrol_test_scores)
    scores_tnorm.sort()
    imp_test_scores.sort()

    # Per segment statistics are row vectors and broadcast over the rows as expected
    mean_per_segment = imp_test_scores.scoremat.mean(0)
    std_per_segment = imp_test_scores.scoremat.std(0)
    scores_tnorm.scoremat = (scores_tnorm.scoremat - mean_per_segment) / std_per_segment

    return scores_tnorm


def ztnorm(enrol_test_scores, enrol_imp_scores, imp_test_scores, imp_imp_scores):
    """
    Apply z-norm followed by t-norm.

    Z-norm is applied to ``enrol_test_scores`` using ``enrol_imp_scores`` and to
    ``imp_test_scores`` using ``imp_imp_scores`` (symmetric mode); the z-normalised
    impostor scores are then used to t-normalise the z-normalised trial scores.

    :param enrol_test_scores: scores to normalise
    :param enrol_imp_scores: enrollment models against the impostor cohort
    :param imp_test_scores: impostor cohort against the test segments
    :param imp_imp_scores: impostor cohort against itself
    :return: the zt-normalised scores
    """
    z_enrol_test_scores = znorm(enrol_test_scores, enrol_imp_scores)
    z_imp_test_scores = znorm(imp_test_scores, imp_imp_scores, sym=True)

    return tnorm(z_enrol_test_scores, z_imp_test_scores)


def snorm(enrol_test_scores, enrol_imp_scores, imp_test_scores):
    """
    S-normalise scores: average of the z-normalised and the t-normalised scores.

    :param enrol_test_scores: scores to normalise
    :param enrol_imp_scores: enrollment models against the impostor cohort
    :param imp_test_scores: impostor cohort against the test segments
    :return: the s-normalised scores
    """
    z_enrol_test_scores = znorm(enrol_test_scores, enrol_imp_scores)
    t_enrol_test_scores = tnorm(enrol_test_scores, imp_test_scores)

    # Start from the (sorted) z-normalised object so that the labels stay aligned
    # with the averaged matrix, which is built from sorted matrices
    s_enrol_test_scores = copy.deepcopy(z_enrol_test_scores)
    s_enrol_test_scores.scoremat = 0.5 * (z_enrol_test_scores.scoremat + t_enrol_test_scores.scoremat)

    return s_enrol_test_scores


def asnorm(enrol_test_scores):
    """
    Placeholder for adaptive s-norm: not implemented, always returns None.

    :param enrol_test_scores: unused
    """
    pass
def read_vect(filename):
    """Read vector in ALIZE binary format and return an array

    The file starts with an 8-byte header (two little-endian 32-bit integers) which is
    read and ignored; the rest of the file is interpreted as doubles.

    :param filename: name of the file to read from

    :return: a numpy.ndarray object
    """
    with open(filename, 'rb') as f:
        # Skip the header
        struct.unpack("<2l", f.read(8))
        data = array.array("d")
        # array.fromstring was removed in Python 3.9, frombytes is its replacement
        data.frombytes(f.read())
    return numpy.array(data)


def read_matrix(filename):
    """Read matrix in ALIZE binary format and return a ndarray

    The file starts with an 8-byte header (two little-endian 32-bit integers) giving
    the number of rows and columns; the rest of the file is interpreted as doubles.

    :param filename: name of the file to read from

    :return: a numpy.ndarray object
    """
    with open(filename, 'rb') as f:
        m_dim = struct.unpack("<2l", f.read(8))
        data = array.array("d")
        # array.fromstring was removed in Python 3.9, frombytes is its replacement
        data.frombytes(f.read())
    T = numpy.array(data)
    # In-place resize to the dimensions announced in the header
    T.resize(m_dim[0], m_dim[1])
    return T
def total_variability(stat_server_file_name,
                      ubm,
                      tv_rank,
                      nb_iter=20,
                      min_div=True,
                      tv_init=None,
                      save_init=False,
                      output_file_name=None):
    """
    Train a total variability model using multiple process on multiple nodes with MPI.

    Example of how to train a total variability matrix using MPI.
    Here is what your script should look like:

        ----------------------------------------------------------------

        import sidekit

        fa = sidekit.FactorAnalyser()
        fa.total_variability_mpi("/lium/spk1/larcher/expe/MPI_TV/data/statserver.h5",
                                 ubm,
                                 tv_rank,
                                 nb_iter=tv_iteration,
                                 min_div=True,
                                 tv_init=tv_new_init2,
                                 output_file_name="data/TV_mpi")

        ----------------------------------------------------------------

    This script should be run using mpirun command (see MPI4PY website for
    more information about how to use it
        http://pythonhosted.org/mpi4py/
    )

        mpirun --hostfile hostfile ./my_script.py

    The communicator is always ``MPI.COMM_WORLD``. Intermediate and final models are
    written by rank 0 only; nothing is returned.

    :param stat_server_file_name: name of the StatServer file to load, or a list of such names
        (make sure you provide absolute path and that it is accessible from all your nodes).
    :param ubm: a Mixture object
    :param tv_rank: rank of the total variability model
    :param nb_iter: number of EM iteration
    :param min_div: boolean, if True, apply minimum divergence re-estimation
    :param tv_init: initial matrix to start the EM iterations with, random if None
    :param save_init: boolean, if True, rank 0 writes the initial model to
        ``output_file_name + "_init.h5"``
    :param output_file_name: name of the file where to save the matrix (without extension),
        "temporary_factor_analyser" when None
    """
    comm = MPI.COMM_WORLD

    comm.Barrier()

    # this lines allows to process a single StatServer or a list of StatServers
    if not isinstance(stat_server_file_name, list):
        stat_server_file_name = [stat_server_file_name]

    # Initialize useful variables
    sv_size = ubm.get_mean_super_vector().shape[0]
    nb_distrib, feature_size = ubm.mu.shape
    upper_triangle_indices = numpy.triu_indices(tv_rank)

    # Initialize the FactorAnalyser, mean and Sigma are initialized at ZEROS as statistics are centered
    factor_analyser = FactorAnalyser()
    factor_analyser.mean = numpy.zeros(ubm.get_mean_super_vector().shape)
    if tv_init is None:
        factor_analyser.F = numpy.random.randn(sv_size, tv_rank).astype(data_type)
    else:
        factor_analyser.F = tv_init
    factor_analyser.Sigma = numpy.zeros(ubm.get_mean_super_vector().shape)

    # Each rank draws its own random matrix: share the one of rank 0 so that the
    # statistics accumulated during the first E-step all refer to the same model
    factor_analyser.F = comm.bcast(factor_analyser.F, root=0)

    # Save init if required
    if comm.rank == 0:
        if output_file_name is None:
            output_file_name = "temporary_factor_analyser"
        if save_init:
            factor_analyser.write(output_file_name + "_init.h5")

    # Iterative training of the FactorAnalyser
    for it in range(nb_iter):
        if comm.rank == 0:
            logging.critical("Start it {}".format(it))

        # Local accumulators; _A and _R only store the upper triangle of symmetric matrices
        _A = numpy.zeros((nb_distrib, tv_rank * (tv_rank + 1) // 2), dtype=data_type)
        _C = numpy.zeros((tv_rank, sv_size), dtype=data_type)
        _R = numpy.zeros((tv_rank * (tv_rank + 1) // 2), dtype=data_type)

        if comm.rank == 0:
            total_session_nb = 0

        # E-step
        for stat_server_file in stat_server_file_name:

            with h5py.File(stat_server_file, 'r') as fh:
                nb_sessions = fh["segset"].shape[0]

                if comm.rank == 0:
                    total_session_nb += nb_sessions

                comm.Barrier()
                if comm.rank == 0:
                    logging.critical("Process file: {}".format(stat_server_file))

                # Allocate a list of sessions to process to each node
                local_session_idx = numpy.array_split(range(nb_sessions), comm.size)
                stat0 = fh['stat0'][local_session_idx[comm.rank], :]
                stat1 = fh['stat1'][local_session_idx[comm.rank], :]
                e_h, e_hh = e_on_batch(stat0, stat1, ubm, factor_analyser.F)

                _A += stat0.T.dot(e_hh)
                _C += e_h.T.dot(stat1)
                _R += numpy.sum(e_hh, axis=0)

            comm.Barrier()

        comm.Barrier()

        # Sum all statistics
        if comm.rank == 0:
            # only processor 0 will actually get the data
            total_A = numpy.zeros_like(_A)
            total_C = numpy.zeros_like(_C)
            total_R = numpy.zeros_like(_R)
        else:
            total_A = [None] * _A.shape[0]
            total_C = None
            total_R = None

        # Accumulate _A, using a list in order to avoid limitations of MPI (impossible to reduce matrices bigger
        # than 4GB)
        for ii in range(_A.shape[0]):
            _tmp = copy.deepcopy(_A[ii])
            if comm.rank == 0:
                _total_A = numpy.zeros_like(total_A[ii])
            else:
                _total_A = None

            comm.Reduce(
                [_tmp, MPI.FLOAT],
                [_total_A, MPI.FLOAT],
                op=MPI.SUM,
                root=0
            )
            if comm.rank == 0:
                total_A[ii] = copy.deepcopy(_total_A)

        comm.Reduce(
            [_C, MPI.FLOAT],
            [total_C, MPI.FLOAT],
            op=MPI.SUM,
            root=0
        )

        comm.Reduce(
            [_R, MPI.FLOAT],
            [total_R, MPI.FLOAT],
            op=MPI.SUM,
            root=0
        )

        comm.Barrier()

        # M-step
        if comm.rank == 0:

            total_R /= total_session_nb
            _A_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
            for c in range(nb_distrib):
                distrib_idx = range(c * feature_size, (c + 1) * feature_size)
                # Rebuild the full symmetric matrix from its stored upper triangle
                _A_tmp[upper_triangle_indices] = _A_tmp.T[upper_triangle_indices] = total_A[c, :]
                factor_analyser.F[distrib_idx, :] = scipy.linalg.solve(_A_tmp, total_C[:, distrib_idx]).T

            # minimum divergence
            if min_div:
                _R_tmp = numpy.zeros((tv_rank, tv_rank), dtype=data_type)
                _R_tmp[upper_triangle_indices] = _R_tmp.T[upper_triangle_indices] = total_R
                ch = scipy.linalg.cholesky(_R_tmp)
                factor_analyser.F = factor_analyser.F.dot(ch)

            # Save the current FactorAnalyser
            if output_file_name is not None:
                if it < nb_iter - 1:
                    factor_analyser.write(output_file_name + "_it-{}.h5".format(it))
                else:
                    factor_analyser.write(output_file_name + ".h5")
        # Send the updated matrix to every rank for the next E-step
        factor_analyser.F = comm.bcast(factor_analyser.F, root=0)
        comm.Barrier()


def extract_ivector(tv,
                    stat_server_file_name,
                    ubm,
                    output_file_name,
                    uncertainty=False,
                    prefix=''):
    """
    Estimate i-vectors for a given StatServer using multiple process on multiple nodes.

    The communicator is always ``MPI.COMM_WORLD``. Sessions are split between ranks,
    the i-vectors are gathered on rank 0 which writes the output; nothing is returned.

    :param tv: object holding the total variability matrix in its ``F`` attribute
    :param stat_server_file_name: file name of the sufficient statistics StatServer HDF5 file
    :param ubm: Mixture object (the UBM)
    :param output_file_name: name of the file to save the i-vectors StatServer in HDF5 format
    :param uncertainty: boolean, if True, saves a matrix with uncertainty matrices (diagonal of the matrices)
        in a file named after ``output_file_name`` with "_uncertainty" inserted before the extension
    :param prefix: prefixe of the dataset to read from in HDF5 file
    """
    assert(isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"

    comm = MPI.COMM_WORLD

    comm.Barrier()

    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

    # Set useful variables
    tv_rank = tv.F.shape[1]
    feature_size = ubm.mu.shape[1]
    nb_distrib = ubm.w.shape[0]

    # Get the number of sessions to process
    with h5py.File(stat_server_file_name, 'r') as fh:
        nb_sessions = fh["segset"].shape[0]

    # Work on each node with different data
    indices = numpy.array_split(numpy.arange(nb_sessions), comm.size, axis=0)
    # Number of values sent by each rank and their offsets in the gathered buffer
    sendcounts = numpy.array([idx.shape[0] * tv.F.shape[1] for idx in indices])
    displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))

    stat_server = StatServer.read_subset(stat_server_file_name, indices[comm.rank])

    # Whiten the statistics for diagonal or full models
    if gmm_covariance == "diag":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. / ubm.get_invcov_super_vector())
    elif gmm_covariance == "full":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)

    # Estimate i-vectors; only rank 0 allocates the buffers receiving all results
    if comm.rank == 0:
        iv = numpy.zeros((nb_sessions, tv_rank))
        iv_sigma = numpy.zeros((nb_sessions, tv_rank))
    else:
        iv = None
        iv_sigma = None

    local_iv = numpy.zeros((stat_server.modelset.shape[0], tv_rank))
    local_iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))

    # Replicate stat0
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
    for sess in range(stat_server.segset.shape[0]):

        inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + (tv.F.T * stat_server.stat0[sess, index_map]).dot(tv.F))

        Aux = tv.F.T.dot(stat_server.stat1[sess, :])
        local_iv[sess, :] = Aux.dot(inv_lambda)
        local_iv_sigma[sess, :] = numpy.diag(inv_lambda + numpy.outer(local_iv[sess, :], local_iv[sess, :]))
    comm.Barrier()

    comm.Gatherv(local_iv, [iv, sendcounts, displacements, MPI.DOUBLE], root=0)
    comm.Gatherv(local_iv_sigma, [iv_sigma, sendcounts, displacements, MPI.DOUBLE], root=0)

    if comm.rank == 0:

        # Labels and boundaries are copied from the input file
        with h5py.File(stat_server_file_name, 'r') as fh:
            iv_stat_server = StatServer()
            iv_stat_server.modelset = fh.get(prefix+"modelset")[()]
            iv_stat_server.segset = fh.get(prefix+"segset")[()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                iv_stat_server.modelset = iv_stat_server.modelset.astype('U', copy=False)
                iv_stat_server.segset = iv_stat_server.segset.astype('U', copy=False)

            # Entries stored as -1 are left unset in the object arrays
            tmpstart = fh.get(prefix+"start")[()]
            tmpstop = fh.get(prefix+"stop")[()]
            iv_stat_server.start = numpy.empty(fh[prefix+"start"].shape, '|O')
            iv_stat_server.stop = numpy.empty(fh[prefix+"stop"].shape, '|O')
            iv_stat_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_stat_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
            iv_stat_server.stat0 = numpy.ones((nb_sessions, 1))
            iv_stat_server.stat1 = iv

        iv_stat_server.write(output_file_name)
        if uncertainty:
            path = os.path.splitext(output_file_name)
            write_matrix_hdf5(iv_sigma, path[0] + "_uncertainty" + path[1])


def EM_split(ubm,
             features_server,
             feature_list,
             distrib_nb,
             output_filename,
             iterations=(1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8),
             llk_gain=0.01,
             save_partial=False,
             ceil_cov=10,
             floor_cov=1e-2,
             num_thread=1):
    """Expectation-Maximization estimation of the Mixture parameters.

    The communicator is always ``MPI.COMM_WORLD``. Frames are scattered between ranks,
    accumulators are reduced on rank 0 which runs the M-step and writes the model.
    The log-likelihoods are accumulated on rank 0 but the function returns nothing.

    :param ubm: Mixture to train; on rank 0 it is re-initialised from the features
    :param features_server: object whose ``stack_features_parallel`` method loads the features
    :param feature_list: list passed to ``stack_features_parallel``
    :param distrib_nb: final number of distributions
    :param output_filename: name of the file to save the mixture (without "_<n>g.h5" suffix)
    :param iterations: list of iteration number for each step of the learning process
    :param llk_gain: limit of the training gain; currently unused, no early stopping is performed
    :param save_partial: if True, save the mixture before each split of the distributions
    :param ceil_cov: passed to the M-step
    :param floor_cov: passed to the M-step
    :param num_thread: passed to ``stack_features_parallel``
    """

    # Load the features
    # NOTE(review): every rank loads the features although only rank 0 uses them
    features = features_server.stack_features_parallel(feature_list, num_thread=num_thread)

    comm = MPI.COMM_WORLD
    comm.Barrier()

    if comm.rank == 0:
        llk = []

        # Initialize the mixture
        n_frames, feature_size = features.shape
        mu = features.mean(axis=0)
        cov = numpy.mean(features**2, axis=0)
        ubm.mu = mu[None]
        ubm.invcov = 1./cov[None]
        ubm.w = numpy.asarray([1.0])
        ubm.cst = numpy.zeros(ubm.w.shape)
        ubm.det = numpy.zeros(ubm.w.shape)
        ubm.cov_var_ctl = 1.0 / copy.deepcopy(ubm.invcov)
        ubm._compute_all()

    else:
        n_frames = None
        feature_size = None
        features = None

    comm.Barrier()

    # Broadcast the UBM on each process
    ubm = comm.bcast(ubm, root=0)

    # Send n_frames and feature_size to all process
    n_frames = comm.bcast(n_frames, root=0)
    feature_size = comm.bcast(feature_size, root=0)

    # Compute the size of all matrices to scatter to each process
    indices = numpy.array_split(numpy.arange(n_frames), comm.size, axis=0)
    sendcounts = numpy.array([idx.shape[0] * feature_size for idx in indices])
    displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))

    # Scatter features on all process
    local_features = numpy.empty((indices[comm.rank].shape[0], feature_size))

    comm.Scatterv([features, tuple(sendcounts), tuple(displacements), MPI.DOUBLE], local_features)
    comm.Barrier()

    # One entry of iterations per split: log2(distrib_nb) splits are performed
    for nbg, it in enumerate(iterations[:int(numpy.log2(distrib_nb))]):

        if comm.rank == 0:
            logging.critical("Start training model with {} distributions".format(2**nbg))
            # Save current model before spliting
            if save_partial:
                ubm.write(output_filename + '_{}g.h5'.format(ubm.get_distrib_nb()), prefix='')

        ubm._split_ditribution()

        if comm.rank == 0:
            accum = copy.deepcopy(ubm)
        else:
            accum = Mixture()
            accum.w = accum.mu = accum.invcov = None

        # Create one accumulator for each process
        local_accum = copy.deepcopy(ubm)
        for i in range(it):

            local_accum._reset()

            if comm.rank == 0:
                logging.critical("\titeration {} / {}".format(i+1, it))
                # Receive buffer of the reduction below: it must hold a double
                _tmp_llk = numpy.array(0.0)
                accum._reset()

            else:
                _tmp_llk = numpy.array([None])

            # E step
            logging.critical("\nStart E-step, rank {}".format(comm.rank))
            local_llk = numpy.array(ubm._expectation(local_accum, local_features), dtype=numpy.float64)

            # Reduce all accumulators in process 0
            comm.Barrier()
            comm.Reduce(
                [local_accum.w, MPI.DOUBLE],
                [accum.w, MPI.DOUBLE],
                op=MPI.SUM,
                root=0
            )

            comm.Reduce(
                [local_accum.mu, MPI.DOUBLE],
                [accum.mu, MPI.DOUBLE],
                op=MPI.SUM,
                root=0
            )

            comm.Reduce(
                [local_accum.invcov, MPI.DOUBLE],
                [accum.invcov, MPI.DOUBLE],
                op=MPI.SUM,
                root=0
            )

            comm.Reduce(
                [local_llk, MPI.DOUBLE],
                [_tmp_llk, MPI.DOUBLE],
                op=MPI.SUM,
                root=0
            )
            comm.Barrier()

            if comm.rank == 0:
                llk.append(_tmp_llk / numpy.sum(accum.w))

                # M step
                logging.critical("\nStart M-step, rank {}".format(comm.rank))
                ubm._maximization(accum, ceil_cov=ceil_cov, floor_cov=floor_cov)

            # Send the new Mixture to all process
            comm.Barrier()
            ubm = comm.bcast(ubm, root=0)
            comm.Barrier()
    if comm.rank == 0:
        ubm.write(output_filename + '_{}g.h5'.format(ubm.get_distrib_nb()), prefix='')
def coroutine(func):
    """
    Decorator that allows to forget about the first call of a coroutine .next()
    method or .send(None)
    This call is done inside the decorator
    :param func: the coroutine to decorate
    :return: a function returning the coroutine already advanced to its first yield
    """
    import functools

    @functools.wraps(func)
    def start(*args, **kwargs):
        cr = func(*args, **kwargs)
        next(cr)
        return cr
    return start


def deprecated(func):
    """
    Decorator logging a warning the first time the decorated function is called.

    :param func: function to decorate
    :return: the decorated function
    """
    import functools

    # Mutable counter shared by all calls of the wrapper
    count = [0]

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        count[0] += 1
        if count[0] == 1:
            logging.warning(func.__name__ + ' is deprecated')
        return func(*args, **kwargs)
    return wrapper


def check_path_existance(func):
    """ Decorator for a function wich prototype is:

        func(features, outputFileName)

    This decorator gets the path included in 'outputFileName' if any
    and check if this path exists; if not the path is created.
    The file name is the second positional argument; when it is given by keyword
    it is looked up under the name of the second parameter of ``func``.
    The value returned by ``func`` is returned to the caller.
    :param func: function to decorate
    :return: the decorated function
    """
    import functools
    import inspect

    # Name of the second parameter, used when the file name is passed by keyword
    try:
        param_names = list(inspect.signature(func).parameters)
    except (TypeError, ValueError):
        param_names = []
    path_param = param_names[1] if len(param_names) > 1 else None

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if len(args) > 1:
            file_name = args[1]
        elif path_param is not None and path_param in kwargs:
            file_name = kwargs[path_param]
        else:
            # Nothing to prepare, let func report its own missing argument
            file_name = None

        if file_name is not None:
            dir_name = os.path.dirname(file_name)  # get the path
            # Create the directory if it dosn't exist
            if dir_name:
                os.makedirs(dir_name, exist_ok=True)
        # Do the job
        return func(*args, **kwargs)
    return wrapper


def process_parallel_lists(func):
    """
    Decorator that is used to parallelize process.
    This decorator takes a function with any list of arguments including
    "num_thread" and parallelize the process by creating "num_thread" number of
    parallel process or threads.

    The choice of process or threas depends on the value of the global variable
    "PARALLEL_MODULE" that is defined in  ./sidekit/__init__.py

    Parallelization is done as follow:
        - all arguments have to be given to the decorator with their names
          any other case might limit the parallelization.
        - the function that is decorated is called by "num_thread" concurrent
          process (or threads) with the list of arguments that is given
          to the decorator except special arguments (see below)

    Special arguments:
        Special arguments are the one that lead to parallelization.
        There are 3 types of special arguments which name end with a special
        suffix:

        - arguments which names are "*_list" or "*_indices" are lists
          or numpy arrays that will be split (equally or almost) and
          each sub-list will be passed as an argument for a process/thread

        - arguments which names are "_acc" are duplicated and each thread is
          given a copy of this accumulator. At the end of the function, all
          accumulators will be summed to return a unique accumulatore; thus
          any object passed as a "*_acc" argument has to implement
          a "+" operator

        - arguments which names are "*_server" are duplicated using a deepcopy
          of the original argument. This is mostly used to pass servers such
          as FeaturesServer as arguments

    The wrapper never returns the value of the decorated function.

    :param func: function to decorate

    """
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        """
        Split the keyword arguments between workers, run them and sum the accumulators.

        :param args: positional arguments, passed unchanged to every worker
        :param kwargs: keyword arguments, split or duplicated according to their suffix
        :return: None
        """

        if len(args) > 1:
            print("Warning, some arguments are not named, computation might not be parallelized")

        num_thread = 1
        if "num_thread" in kwargs.keys():
            num_thread = kwargs["num_thread"]

        # Build one dictionary of keyword arguments per thread
        if PARALLEL_MODULE in ['threading', 'multiprocessing'] and num_thread > 1:

            # If arguments end with _list or _indices,
            # set number of Threads to the minimum length of the lists and raise a warning
            list_length = numpy.inf
            for k, v in kwargs.items():
                # If v is a list or a numpy.array
                if k.endswith("_list") or k.endswith("_indices"):
                    list_length = min(list_length, len(list(v)))
            num_thread = min(num_thread, list_length)

            # Create a list of dictionaries, one per thread, and initialize
            # them with the keys
            parallel_kwargs = []
            for ii in range(num_thread):
                parallel_kwargs.append(dict(zip(kwargs.keys(),
                                                [None]*len(kwargs.keys()))))

            for k, v in kwargs.items():

                # If v is a list or a numpy.array
                if k.endswith("_list") or k.endswith("_indices"):
                    sub_lists = numpy.array_split(v, num_thread)
                    for ii in range(num_thread):
                        parallel_kwargs[ii][k] = sub_lists[ii]  # the ii-th sub_list is used for the thread ii

                elif k == "num_thread":
                    for ii in range(num_thread):
                        parallel_kwargs[ii][k] = 1

                # If v is an accumulator (meaning k ends with "_acc")
                # the same object is handed to each thread
                elif k.endswith("_acc"):
                    for ii in range(num_thread):
                        parallel_kwargs[ii][k] = v

                # Duplicate servers for each thread
                elif k.endswith("_server") or k.endswith("_extractor"):
                    for ii in range(num_thread):
                        parallel_kwargs[ii][k] = copy.deepcopy(v)

                # All other parameters are just given to each thread
                else:
                    for ii in range(num_thread):
                        parallel_kwargs[ii][k] = v

            if PARALLEL_MODULE == 'multiprocessing':
                import multiprocessing
                jobs = []
                multiprocessing.freeze_support()
                for idx in range(num_thread):
                    p = multiprocessing.Process(target=func, args=args, kwargs=parallel_kwargs[idx])
                    jobs.append(p)
                    p.start()
                for p in jobs:
                    p.join()

            elif PARALLEL_MODULE == 'threading':
                import threading
                jobs = []
                for idx in range(num_thread):
                    p = threading.Thread(target=func, args=args, kwargs=parallel_kwargs[idx])
                    jobs.append(p)
                    p.start()
                for p in jobs:
                    p.join()

            # NOTE(review): not reachable, the enclosing test only lets
            # 'threading' and 'multiprocessing' through
            elif PARALLEL_MODULE == 'MPI':
                # TODO
                print("ParallelProcess using MPI is not implemented yet")
                pass

            # Sum accumulators if any
            # NOTE(review): every thread received the very object stored in kwargs,
            # so this adds the accumulator to itself -- confirm this is intended
            for k, v in kwargs.items():
                if k.endswith("_acc"):
                    for ii in range(num_thread):
                        if isinstance(kwargs[k], list):
                            kwargs[k][0] += parallel_kwargs[ii][k][0]
                        else:
                            kwargs[k] += parallel_kwargs[ii][k]

        else:
            logging.debug("No Parallel processing with this module")
            func(*args, **kwargs)

    return wrapper


def accepts(*types, **kw):
    """Function decorator. Checks decorated function's arguments are
    of the expected types.

    Sources: https://wiki.python.org/moin/PythonDecoratorLibrary#Type_Enforcement_.28accepts.2Freturns.29

    The check compares the class *name* of each positional argument with the
    corresponding entry of ``types``.

    Parameters:
        types -- The expected types of the inputs to the decorated function.
                 Must specify type for each parameter.
        kw    -- Optional specification of 'debug' level (this is the only valid
                 keyword argument, no other should be given).
                 debug = ( 0 | 1 | 2 )
                 0: no check, 1: print a warning on mismatch (default),
                 2: raise TypeError on mismatch

    Raises KeyError when keyword arguments are given without 'debug'.
    """
    if not kw:
        # default level: MEDIUM
        debug = 1
    else:
        debug = kw['debug']

    def decorator(f):
        def newf(*args):
            if debug == 0:
                return f(*args)
            assert len(args) == len(types)
            argtypes = tuple([a.__class__.__name__ for a in args])
            if argtypes != types:
                print("argtypes = {} and types = {}".format(argtypes, types))
                msg = info(f.__name__, types, argtypes, 0)
                if debug == 1:
                    print('TypeWarning: ', msg)
                elif debug == 2:
                    raise TypeError(msg)
            return f(*args)
        newf.__name__ = f.__name__
        return newf
    return decorator


def info(fname, expected, actual, flag):
    """Convenience function returns nicely formatted error/warning msg.
    :param fname: name of the decorated function
    :param expected: expected types
    :param actual: actual types
    :param flag: 0 to describe the arguments ("accepts" / "was given"),
        1 to describe the result ("returns" / "result is")
    :return: the message
    """
    def _format(type_names):
        # Keep what precedes the first quote of each printed entry
        return ', '.join([str(t).split("'")[0] for t in type_names])

    expected, actual = _format(expected), _format(actual)
    msg = "'{}' method ".format(fname)\
          + ("accepts", "returns")[flag] + " ({}), but ".format(expected)\
          + ("was given", "result is")[flag] + " ({})".format(actual)
    return msg
+# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + + +""" +Copyright 2014-2021 Anthony Larcher + +:mod:`statserver` provides methods to manage zero and first statistics. +""" +import copy +import ctypes +import h5py +import logging +import multiprocessing +import numpy +import os +import scipy +import sys +import tqdm +import warnings + +from sidekit.bosaris import IdMap +from sidekit.mixture import Mixture +from sidekit.features_server import FeaturesServer +from sidekit.sidekit_wrappers import process_parallel_lists, deprecated, check_path_existance +import sidekit.frontend +from sidekit import STAT_TYPE + + +ct = ctypes.c_double +if STAT_TYPE == numpy.float32: + ct = ctypes.c_float + + +__license__ = "LGPL" +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +def compute_llk(stat, V, sigma, U=None): + # Compute Likelihood + (n, d) = stat.stat1.shape + centered_data = stat.stat1 - stat.get_mean_stat1() + + if sigma.ndim == 2: + sigma_tot = numpy.dot(V, V.T) + sigma + else: + sigma_tot = numpy.dot(V, V.T) + numpy.diag(sigma) + if U is not None: + sigma_tot += numpy.dot(U, U.T) + + E, junk = scipy.linalg.eigh(sigma_tot) + log_det = numpy.sum(numpy.log(E)) + + return 
def sum_log_probabilities(lp):
    """Normalise rows of log-probabilities without overflowing.

    The maximum of each row is subtracted before exponentiating, so the
    exponentials stay bounded even for very small or very large inputs.

    :param lp: 2-D ndarray of log-probabilities, one row per observation

    :return: a tuple (pp, log_lk) where log_lk holds, for each row, the log of
        the sum of the exponentials and pp = exp(lp - log_lk) row by row
    """
    row_max = numpy.max(lp, axis=1)
    shifted = (lp.transpose() - row_max).transpose()
    log_lk = row_max + numpy.log(numpy.exp(shifted).sum(axis=1))

    # Rows whose maximum is not finite keep that maximum as their total
    not_finite = ~numpy.isfinite(row_max)
    if not_finite.any():
        log_lk[not_finite] = row_max[not_finite]

    pp = numpy.exp((lp.transpose() - log_lk).transpose())
    return pp, log_lk
def load_existing_statistics_hdf5(statserver, statserver_file_name):
    """Load required statistics into the StatServer by reading from a file
    in hdf5 format.

    Statistics are copied for every segment of ``statserver`` that is also
    present in the file; the other rows of ``statserver`` are left untouched.
    If ``statserver`` holds no statistics yet, ``stat0`` and ``stat1`` are
    first allocated (filled with zeros) with the width found in the file.

    :param statserver: sidekit.StatServer to fill
    :param statserver_file_name: name of the file to read from

    :raise Exception: if ``statserver`` already holds statistics whose width
        differs from the width of the statistics stored in the file
    """
    assert os.path.isfile(statserver_file_name), "statserver_file_name does not exist"

    # Load the StatServer
    ss = StatServer(statserver_file_name)

    if statserver.stat0.shape[0] > 0:
        # The two servers may hold different sessions (rows); only the width of
        # the statistics (columns) has to agree for the copy below to be valid.
        if (ss.stat0.shape[1] != statserver.stat0.shape[1]
                or ss.stat1.shape[1] != statserver.stat1.shape[1]):
            raise Exception('Mismatched statistic dimensions')
    else:
        statserver.stat0 = numpy.zeros((statserver.modelset.shape[0], ss.stat0.shape[1]), dtype=STAT_TYPE)
        statserver.stat1 = numpy.zeros((statserver.modelset.shape[0], ss.stat1.shape[1]), dtype=STAT_TYPE)

    # Row of the first occurrence of each segment stored in the file
    stored_row = {}
    for row, seg in enumerate(ss.segset):
        stored_row.setdefault(seg, row)

    # Pair each row of statserver with the matching row of the file
    seg_idx = []
    stat_idx = []
    for row, seg in enumerate(statserver.segset):
        if seg in stored_row:
            seg_idx.append(row)
            stat_idx.append(stored_row[seg])

    # Copy statistics
    statserver.stat0[seg_idx, :] = ss.stat0[stat_idx, :]
    statserver.stat1[seg_idx, :] = ss.stat1[stat_idx, :]
+ """ + self.modelset = numpy.empty(0, dtype="|O") + self.segset = numpy.empty(0, dtype="|O") + self.start = numpy.empty(0, dtype="|O") + self.stop = numpy.empty(0, dtype="|O") + self.stat0 = numpy.array([], dtype=STAT_TYPE) + self.stat1 = numpy.array([], dtype=STAT_TYPE) + + if ubm is not None: + distrib_nb = ubm.w.shape[0] + feature_size = ubm.mu.shape[1] + + if statserver_file_name is None: + pass + # initialize + elif isinstance(statserver_file_name, IdMap): + self.modelset = statserver_file_name.leftids + self.segset = statserver_file_name.rightids + self.start = statserver_file_name.start + self.stop = statserver_file_name.stop + self.stat0 = numpy.empty((self.segset.shape[0], distrib_nb), dtype=STAT_TYPE) + self.stat1 = numpy.empty((self.segset.shape[0], distrib_nb * feature_size), dtype=STAT_TYPE) + + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + tmp_stat0 = multiprocessing.Array(ct, self.stat0.size) + self.stat0 = numpy.ctypeslib.as_array(tmp_stat0.get_obj()) + self.stat0 = self.stat0.reshape(self.segset.shape[0], distrib_nb) + + tmp_stat1 = multiprocessing.Array(ct, self.stat1.size) + self.stat1 = numpy.ctypeslib.as_array(tmp_stat1.get_obj()) + self.stat1 = self.stat1.reshape(self.segset.shape[0], distrib_nb * feature_size) + + # initialize by reading an existing StatServer + elif isinstance(statserver_file_name, str) and index is None: + tmp = StatServer.read(statserver_file_name) + self.modelset = tmp.modelset + self.segset = tmp.segset + self.start = tmp.start + self.stop = tmp.stop + self.stat0 = tmp.stat0.astype(STAT_TYPE) + self.stat1 = tmp.stat1.astype(STAT_TYPE) + + with warnings.catch_warnings(): + size = self.stat0.shape + warnings.simplefilter('ignore', RuntimeWarning) + tmp_stat0 = multiprocessing.Array(ct, self.stat0.size) + self.stat0 = numpy.ctypeslib.as_array(tmp_stat0.get_obj()) + self.stat0 = self.stat0.reshape(size) + + size = self.stat1.shape + tmp_stat1 = multiprocessing.Array(ct, self.stat1.size) 
def validate(self, warn=False):
    """Validate the structure and content of the StatServer.

    Check consistency between the different attributes of
    the StatServer:
    - modelset is one-dimensional
    - modelset, segset, start and stop have the same shape
    - stat0 and stat1 are two-dimensional with one row per session
    - the width of stat1 is a multiple of the width of stat0

    A StatServer whose statistics are not two-dimensional (for instance an
    empty one) is reported as invalid instead of raising an IndexError, and
    statistics with zero columns are consistent only if both are empty.

    :param warn: boolean optional, if True, log a warning when the segset
        contains duplicated segments

    :return: True if the StatServer is consistent, False otherwise
    """
    ok = (self.modelset.ndim == 1
          and self.modelset.shape == self.segset.shape == self.start.shape == self.stop.shape
          and self.stat0.ndim == 2
          and self.stat1.ndim == 2
          and self.stat0.shape[0] == self.stat1.shape[0] == self.modelset.shape[0])

    if ok:
        distrib_nb = self.stat0.shape[1]
        sv_size = self.stat1.shape[1]
        if distrib_nb == 0:
            # Avoid a modulo by zero: nothing accumulated yet on either side
            ok = sv_size == 0
        else:
            ok = sv_size % distrib_nb == 0

    if warn and (self.segset.shape != numpy.unique(self.segset).shape):
        logging.warning('Duplicated segments in StatServer')
    return ok
@staticmethod
def read(statserver_file_name, prefix=''):
    """Read StatServer in hdf5 format

    :param statserver_file_name: name of the file to read from
    :param prefix: prefix of the datasets to read from in the HDF5 file

    :return: the StatServer built from the content of the file
    """
    with h5py.File(statserver_file_name, "r") as h5f:
        loaded = StatServer()

        model_ids = h5f.get(prefix+"modelset")[()]
        segment_ids = h5f.get(prefix+"segset")[()]
        # if running python 3, need a conversion to unicode
        if sys.version_info[0] == 3:
            model_ids = model_ids.astype('U', copy=False)
            segment_ids = segment_ids.astype('U', copy=False)
        loaded.modelset = model_ids
        loaded.segset = segment_ids

        stored_start = h5f.get(prefix+"start")[()]
        stored_stop = h5f.get(prefix+"stop")[()]
        # Only the boundaries different from -1 are copied; the others keep the
        # initial content of the freshly allocated object arrays
        for name, stored in (("start", stored_start), ("stop", stored_stop)):
            bounds = numpy.empty(h5f[prefix+name].shape, '|O')
            defined = stored != -1
            bounds[defined] = stored[defined]
            setattr(loaded, name, bounds)

        loaded.stat0 = h5f.get(prefix+"stat0")[()].astype(dtype=STAT_TYPE)
        loaded.stat1 = h5f.get(prefix+"stat1")[()].astype(dtype=STAT_TYPE)

        assert loaded.validate(), "Error: wrong StatServer format"
        return loaded
def to_hdf5(self, h5f, prefix='', mode='w'):
    """Write the StatServer into an HDF5 file that is already open.

    :param h5f: handle of the open HDF5 file (or group) to write in
    :param prefix: string prepended to the name of every dataset
    :param mode: "w" creates the datasets; any other value appends the
        content of this StatServer at the end of the existing datasets
    """
    assert self.validate(), "Error: wrong StatServer format"

    # Boundaries that convert to nan (e.g. None) are stored as -1
    start = copy.deepcopy(self.start)
    start[numpy.isnan(self.start.astype('float'))] = -1
    start = start.astype('int32', copy=False)

    stop = copy.deepcopy(self.stop)
    stop[numpy.isnan(self.stop.astype('float'))] = -1
    stop = stop.astype('int32', copy=False)

    # Create the datasets, resizable along the session axis
    if mode == "w":
        h5f.create_dataset(prefix + "modelset", data=self.modelset.astype('S'),
                           maxshape=(None,),
                           compression="gzip",
                           fletcher32=True)
        h5f.create_dataset(prefix + "segset", data=self.segset.astype('S'),
                           maxshape=(None,),
                           compression="gzip",
                           fletcher32=True)
        h5f.create_dataset(prefix + "stat0", data=self.stat0.astype(numpy.float32),
                           maxshape=(None, self.stat0.shape[1]),
                           compression="gzip",
                           fletcher32=True)
        h5f.create_dataset(prefix + "stat1", data=self.stat1.astype(numpy.float32),
                           maxshape=(None, self.stat1.shape[1]),
                           compression="gzip",
                           fletcher32=True)
        h5f.create_dataset(prefix + "start", data=start,
                           maxshape=(None,),
                           compression="gzip",
                           fletcher32=True)
        h5f.create_dataset(prefix + "stop", data=stop,
                           maxshape=(None,),
                           compression="gzip",
                           fletcher32=True)

    # Otherwise extend all datasets and add the new data
    else:
        # Number of sessions already stored, read from the handle given as argument
        previous_size = h5f[prefix + "modelset"].shape[0]

        # Extend the size of each dataset
        h5f[prefix + "modelset"].resize((previous_size + self.modelset.shape[0],))
        h5f[prefix + "segset"].resize((previous_size + self.segset.shape[0],))
        h5f[prefix + "start"].resize((previous_size + start.shape[0],))
        h5f[prefix + "stop"].resize((previous_size + stop.shape[0],))
        h5f[prefix + "stat0"].resize((previous_size + self.stat0.shape[0], self.stat0.shape[1]))
        h5f[prefix + "stat1"].resize((previous_size + self.stat1.shape[0], self.stat1.shape[1]))

        # add the new data; WARNING: no check is done on the new data, beware of duplicated entries
        h5f[prefix + "modelset"][previous_size:] = self.modelset.astype('S')
        h5f[prefix + "segset"][previous_size:] = self.segset.astype('S')
        h5f[prefix + "start"][previous_size:] = start
        h5f[prefix + "stop"][previous_size:] = stop
        h5f[prefix + "stat0"][previous_size:, :] = self.stat0.astype(STAT_TYPE)
        h5f[prefix + "stat1"][previous_size:, :] = self.stat1.astype(STAT_TYPE)
def get_model_stat1_by_index(self, mod_idx):
    """Return first-order statistics of the model at position mod_idx
    in the sorted list of unique model IDs.

    :param mod_idx: integer, index of the unique model which stat1 will be
        returned

    :return: a matrix of first-order statistics as a ndarray, one row per
        session of the selected model
    """
    unique_models = numpy.unique(self.modelset)
    return self.stat1[self.modelset == unique_models[mod_idx], :]
def get_model_segments_by_index(self, mod_idx):
    """Return the segments belonging to the model at position mod_idx
    in the sorted list of unique model IDs.

    :param mod_idx: index of the model which list of segments will be
        returned

    :return: a ndarray of the segments belonging to the model
    """
    select_seg = (self.modelset == numpy.unique(self.modelset)[mod_idx])
    # segset is one-dimensional: it takes the boolean mask alone, as in get_model_segments
    return self.segset[select_seg]
+ + :param model_list: ndarray of strings, list of models to match + """ + indx = numpy.array([numpy.argwhere(self.modelset == v)[0][0] for v in model_list]) + self.segset = self.segset[indx] + self.modelset = self.modelset[indx] + self.start = self.start[indx] + self.stop = self.stop[indx] + self.stat0 = self.stat0[indx, :] + self.stat1 = self.stat1[indx, :] + + @process_parallel_lists + def accumulate_stat(self, ubm, feature_server, seg_indices=None, channel_extension=("", "_b"), num_thread=1): + """Compute statistics for a list of sessions which indices + are given in segIndices. + + :param ubm: a Mixture object used to compute the statistics + :param feature_server: featureServer object + :param seg_indices: list of indices of segments to process + if segIndices is an empty list, process all segments. + :param channel_extension: tuple of strings, extension of first and second channel for stereo files, default + is ("", "_b") + :param num_thread: number of parallel process to run + """ + assert isinstance(ubm, Mixture), 'First parameter has to be a Mixture' + assert isinstance(feature_server, FeaturesServer), 'Second parameter has to be a FeaturesServer' + + if (seg_indices is None) \ + or (self.stat0.shape[0] != self.segset.shape[0]) \ + or (self.stat1.shape[0] != self.segset.shape[0]): + self.stat0 = numpy.zeros((self.segset.shape[0], ubm.distrib_nb()), dtype=STAT_TYPE) + self.stat1 = numpy.zeros((self.segset.shape[0], ubm.sv_size()), dtype=STAT_TYPE) + seg_indices = range(self.segset.shape[0]) + feature_server.keep_all_features = True + + for count, idx in enumerate(seg_indices): + logging.debug('Compute statistics for {}'.format(self.segset[idx])) + + show = self.segset[idx] + + # If using a FeaturesExtractor, get the channel number by checking the extension of the show + channel = 0 + if feature_server.features_extractor is not None and show.endswith(channel_extension[1]): + channel = 1 + show = show[:show.rfind(channel_extension[channel])] + + cep, vad = 
def accumulate_stat_per_speaker(self, ubm, input_feature_filename, idmap, feature_server, channel_extension=("", "_b")):
    """Compute zero and first-order statistics for each speaker of one show.

    The features are obtained per speaker from
    ``feature_server.get_features_per_speaker``; the statistics of the n-th
    speaker returned are written in the n-th row of stat0 and stat1.

    :param ubm: a Mixture object used to compute the statistics
    :param input_feature_filename: name of the show to process
    :param idmap: passed unchanged to ``feature_server.get_features_per_speaker``
    :param feature_server: FeaturesServer object providing the features
    :param channel_extension: tuple of strings, extension of first and second channel for stereo files, default
        is ("", "_b")

    :raise Exception: if the dimension of the features differs from the dimension of the ubm
    """
    assert isinstance(ubm, Mixture), 'First parameter has to be a Mixture'
    assert isinstance(feature_server, FeaturesServer), 'Second parameter has to be a FeaturesServer'

    show = input_feature_filename

    # If using a FeaturesExtractor, get the channel number by checking the extension of the show
    channel = 0
    if feature_server.features_extractor is not None and show.endswith(channel_extension[1]):
        channel = 1

    show = show[:show.rfind(channel_extension[channel])]

    features_per_speaker = feature_server.get_features_per_speaker(show, idmap, channel)

    for idx, (spk_id, data) in enumerate(features_per_speaker.items()):
        cep, vad = data
        # Report the selected frames of every speaker; no speaker label is assumed to exist
        logging.debug('{} has {} frames'.format(spk_id, vad.sum()))
        cep = cep[vad]

        # Verify that frame dimension is equal to gmm dimension
        if ubm.dim() != cep.shape[1]:
            raise Exception('dimension of ubm and features differ: {:d} / {:d}'.format(ubm.dim(), cep.shape[1]))

        if ubm.invcov.ndim == 2:
            lp = ubm.compute_log_posterior_probabilities(cep)
        else:
            lp = ubm.compute_log_posterior_probabilities_full(cep)
        pp, _ = sum_log_probabilities(lp)
        # Compute 0th-order statistics
        self.stat0[idx, :] = pp.sum(0)
        # Compute 1st-order statistics
        self.stat1[idx, :] = numpy.reshape(numpy.transpose(numpy.dot(cep.transpose(), pp)),
                                           ubm.sv_size()).astype(STAT_TYPE)
def subtract_weighted_stat1(self, sts):
    """Subtract the stat1 from the sts StatServer to the stat1 of
    the current StatServer after multiplying by the zero-order statistics
    from the current statserver

    The sessions of sts are matched to the sessions of the current
    StatServer through the sorted order of their segset. The current
    StatServer is not modified.

    :param sts: a StatServer

    :return: a new StatServer

    :raise Exception: if the two StatServers do not hold the same models,
        the same segments and statistics of the same shape
    """
    # check the compatibility of the two statservers
    # exact match of the sessions and dimensions of the stat0 and stat1;
    # shapes are compared first so that the ID comparison always gets arrays of equal length
    compatible = (sts.stat0.shape == self.stat0.shape
                  and sts.stat1.shape == self.stat1.shape
                  and numpy.array_equal(numpy.sort(sts.modelset), numpy.sort(self.modelset))
                  and numpy.array_equal(numpy.sort(sts.segset), numpy.sort(self.segset)))
    if not compatible:
        raise Exception('Statserver are not compatible')

    new_sts = copy.deepcopy(self)

    # align sts according to self.segset
    idx = self.segset.argsort()
    idx_sts = sts.segset.argsort()
    new_sts.stat1[idx, :] = sts.stat1[idx_sts, :]

    # Subtract the stat1; the repeat count must be an integer, hence the floor division
    dim = self.stat1.shape[1] // self.stat0.shape[1]
    index_map = numpy.repeat(numpy.arange(self.stat0.shape[1]), dim)
    new_sts.stat1 = self.stat1 - (self.stat0[:, index_map] * new_sts.stat1)

    return new_sts
def whiten_cholesky_stat1(self, mu, sigma):
    """Whiten first-order statistics by using Cholesky decomposition of
    the inverse of Sigma (two-dimensional sigma), or by dividing by the
    square root of sigma (one-dimensional sigma).

    The statistics are centered with ``center_stat1(mu)`` before being scaled.

    :param mu: array, mean vector to be subtracted from the statistics
    :param sigma: ndarray, co-variance matrix or covariance super-vector

    :raise Exception: if sigma has neither one nor two dimensions
    """
    if sigma.ndim not in (1, 2):
        raise Exception('Wrong dimension of Sigma, must be 1 or 2')

    if sigma.ndim == 1:
        self.center_stat1(mu)
        self.stat1 = self.stat1 / numpy.sqrt(sigma)
        return

    # Transposed Cholesky factor of the inverse co-variance matrix,
    # computed before the statistics are touched
    whitening = scipy.linalg.cholesky(scipy.linalg.inv(sigma)).T

    # Whitening of the first-order statistics
    self.center_stat1(mu)
    self.stat1 = self.stat1.dot(whitening)
def get_between_covariance_stat1(self):
    """Compute and return the between-class covariance matrix of the
    first-order statistics.

    Each class (unique model ID) contributes the outer product of the
    difference between its mean and the overall mean, weighted by its number
    of sessions; the sum is divided by the total number of sessions.

    :return: the between-class co-variance matrix of the first-order
        statistics as a ndarray.
    """
    vect_size = self.stat1.shape[1]
    speakers = numpy.unique(self.modelset)
    between = numpy.zeros((vect_size, vect_size))

    # Overall mean of the first-order statistics
    global_mean = self.get_mean_stat1()

    # Accumulate the weighted contribution of each class
    for speaker_id in speakers:
        sessions = self.get_model_stat1(speaker_id)
        offset = numpy.mean(sessions, axis=0) - global_mean
        between += sessions.shape[0] * numpy.outer(offset, offset)

    between /= self.stat1.shape[0]
    return between
Columns of the LDA matrix are ordered + according to the corresponding eigenvalues in descending order. + + :param rank: integer, rank of the LDA matrix to return + + :return: the LDA matrix of rank "rank" as a ndarray + """ + vect_size = self.stat1.shape[1] + unique_speaker = numpy.unique(self.modelset) + + mu = self.get_mean_stat1() + + class_means = numpy.zeros((unique_speaker.shape[0], vect_size)) + Sw = numpy.zeros((vect_size, vect_size)) + + spk_idx = 0 + for speaker_id in unique_speaker: + spk_sessions = self.get_model_stat1(speaker_id) \ + - numpy.mean(self.get_model_stat1(speaker_id), axis=0) + Sw += numpy.dot(spk_sessions.transpose(), spk_sessions) / spk_sessions.shape[0] + class_means[spk_idx, :] = numpy.mean(self.get_model_stat1(speaker_id), axis=0) + spk_idx += 1 + + # Compute Between-class scatter matrix + class_means = class_means - mu + Sb = numpy.dot(class_means.transpose(), class_means) + + # Compute the Eigenvectors & eigenvalues of the discrimination matrix + DiscriminationMatrix = numpy.dot(Sb, scipy.linalg.inv(Sw)).transpose() + eigen_values, eigen_vectors = scipy.linalg.eigh(DiscriminationMatrix) + eigen_values = eigen_values.real + eigen_vectors = eigen_vectors.real + + # Rearrange the eigenvectors according to decreasing eigenvalues + # get indexes of the rank top eigen values + idx = eigen_values.real.argsort()[-rank:][::-1] + L = eigen_vectors[:, idx] + return L + + def get_mahalanobis_matrix_stat1(self): + """Compute and return Mahalanobis matrix of first-order statistics. + + :return: the mahalanobis matrix computed on the first-order + statistics as a ndarray + """ + W = self.get_within_covariance_stat1() + M = scipy.linalg.inv(W) + return M + + def get_wccn_choleski_stat1(self): + """Compute and return the lower Cholesky decomposition matrix of the + Within Class Co-variance Normalization matrix on the first-order + statistics. 
def get_nap_matrix_stat1(self, co_rank):
    """Build the Nuisance Attribute Projection matrix from the
    first-order statistics.

    The eigen decomposition is run on the session-by-session Gram matrix
    of the statistics (scaled by the vector size); the leading
    eigenvectors are then mapped back to the feature space and rescaled.

    :param co_rank: co-rank of the Nuisance Attribute Projection matrix

    :return: the NAP matrix as a ndarray with one column per retained
        eigenvalue, ordered by decreasing eigenvalue
    """
    stats = self.stat1
    dim = stats.shape[1]

    gram = numpy.dot(stats, stats.transpose()) / dim
    evals, evecs = scipy.linalg.eigh(gram)
    evals = evals.real

    # Indices of the co_rank largest eigenvalues, largest first
    top = evals.argsort()[-co_rank:][::-1]

    scaling = numpy.diag(1 / numpy.sqrt(dim * evals[top]))
    projected = numpy.dot(stats.transpose(), evecs[:, top])
    return numpy.dot(projected, scaling)
def adapt_mean_map_multisession(self, ubm, r=16, norm=False):
    """Maximum A Posteriori adaptation of the mean super-vector of ubm,
    one adapted model per distinct entry of the modelset. The statistics
    of all the segments of a model are summed before adaptation.

    :param ubm: a Mixture object to adapt
    :param r: float, the relevance factor for MAP adaptation
    :param norm: boolean, normalize by using the UBM co-variance.
        Default is False. The normalization is only applied when
        ``ubm.invcov`` has 2 dimensions.

    :return: a StatServer with 1 as stat0 and the MAP adapted super-vectors
        as stat1
    """
    adapted = StatServer()
    adapted.modelset = numpy.unique(self.modelset)
    adapted.segset = numpy.unique(self.modelset)
    model_shape = adapted.modelset.shape
    adapted.start = numpy.empty(model_shape, dtype="|O")
    adapted.stop = numpy.empty(model_shape, dtype="|O")
    adapted.stat0 = numpy.ones((model_shape[0], 1), dtype=STAT_TYPE)

    # Map every super-vector coefficient to the index of its distribution
    index_map = numpy.repeat(numpy.arange(ubm.distrib_nb()), ubm.dim())

    # Sum the statistics per model
    summed = self.sum_stat_per_model()[0]
    occupancy = summed.stat0

    # Adapt mean vectors
    alpha = occupancy / (occupancy + r)
    M = summed.stat1 / occupancy[:, index_map]
    M[numpy.isnan(M)] = 0  # Replace NaN due to divide by zeros
    data_weight = alpha[:, index_map]
    prior = numpy.tile(ubm.get_mean_super_vector(), (M.shape[0], 1))
    M = data_weight * M + (1 - data_weight) * prior

    if norm and ubm.invcov.ndim == 2:
        # Normalization corresponds to KL divergence
        w = numpy.repeat(ubm.w, ubm.dim())
        M = M * numpy.sqrt(w * ubm.get_invcov_super_vector())

    adapted.stat1 = M.astype(dtype=STAT_TYPE)
    adapted.validate()
    return adapted
+ "Simplification and optimization of I-Vector extraction," + in IEEE International Conference on Acoustics, Speech, and Signal + Processing, ICASSP, 2011, 4516-4519 + + :param ubm: a Mixture used as UBM for i-vector estimation + :param W: fix matrix pre-computed using the weights from + the UBM and the total variability matrix + :param Tnorm: total variability matrix pre-normalized using + the co-variance of the UBM + :param delta: men vector if re-estimated using minimum divergence + criteria + + :return: a StatServer which zero-order statistics are 1 + and first-order statistics are approximated i-vectors. + """ + # check consistency of dimensions for delta, Tnorm, W, ubm + assert ubm.get_invcov_super_vector().shape[0] == Tnorm.shape[0], \ + 'UBM and TV matrix dimension are not consistent' + if delta.shape == (0, ): + delta = numpy.zeros(ubm.get_invcov_super_vector().shape) + assert ubm.get_invcov_super_vector().shape[0] == delta.shape[0],\ + 'Minimum divergence mean and TV matrix dimension not consistent' + assert W.shape[0] == Tnorm.shape[1], 'W and TV matrix dimension are not consistent' + ivector_size = Tnorm.shape[1] + + # Sum stat0 + sumStat0 = self.stat0.sum(axis=1) + + # Center and normalize first-order statistics + # for the case of diagonal covariance UBM + self.whiten_stat1(delta, 1./ubm.get_invcov_super_vector()) + + X = numpy.dot(self.stat1, Tnorm) + + enroll_iv = StatServer() + enroll_iv.modelset = self.modelset + enroll_iv.segset = self.segset + enroll_iv.stat0 = numpy.ones((enroll_iv.segset.shape[0], 1)) + enroll_iv.stat1 = numpy.zeros((enroll_iv.segset.shape[0], ivector_size)) + for iv in range(self.stat0.shape[0]): # loop on i-vector + logging.debug('Estimate i-vector [ %d / %d ]', iv + 1, self.stat0.shape[0]) + # Compute precision matrix + L = numpy.eye(ivector_size) + sumStat0[iv] * W + # Estimate i-vector + enroll_iv.stat1[iv, :] = scipy.linalg.solve(L, X[iv, :]) + + return enroll_iv + + def ivector_extraction_eigen_decomposition(self, + 
ubm, + Q, + D_bar_c, + Tnorm, + delta=numpy.array([])): + """Compute i-vectors using the eigen decomposition approximation. + For more information, refers to[Glembeck09]_ + + :param ubm: a Mixture used as UBM for i-vector estimation + :param Q: Q matrix as described in [Glembeck11] + :param D_bar_c: matrices as described in [Glembeck11] + :param Tnorm: total variability matrix pre-normalized using + the co-variance of the UBM + :param delta: men vector if re-estimated using minimum divergence + criteria + + :return: a StatServer which zero-order statistics are 1 + and first-order statistics are approximated i-vectors. + """ + # check consistency of dimensions for delta, Tnorm, Q, D_bar_c, ubm + assert ubm.get_invcov_super_vector().shape[0] == Tnorm.shape[0], \ + 'UBM and TV matrix dimension not consistent' + if delta.shape == (0, ): + delta = numpy.zeros(ubm.get_invcov_super_vector().shape) + assert ubm.get_invcov_super_vector().shape[0] == delta.shape[0], \ + 'Minimum divergence mean and TV matrix dimension not consistent' + assert D_bar_c.shape[1] == Tnorm.shape[1], \ + 'D_bar_c and TV matrix dimension are not consistent' + assert D_bar_c.shape[0] == ubm.w.shape[0], \ + 'D_bar_c and UBM dimension are not consistent' + + ivector_size = Tnorm.shape[1] + + # Center and normalize first-order statistics + # for the case of diagonal covariance UBM + self.whiten_stat1(delta, 1./ubm.get_invcov_super_vector()) + + X = numpy.dot(self.stat1, Tnorm) + + enroll_iv = StatServer() + enroll_iv.modelset = self.modelset + enroll_iv.segset = self.segset + enroll_iv.stat0 = numpy.ones((enroll_iv.segset.shape[0], 1), dtype=STAT_TYPE) + enroll_iv.stat1 = numpy.zeros((enroll_iv.segset.shape[0], ivector_size), dtype=STAT_TYPE) + for iv in range(self.stat0.shape[0]): # loop on i-vector + logging.debug('Estimate i-vector [ %d / %d ]', iv + 1, self.stat0.shape[0]) + + # Compute precision matrix + diag_L = 1 + numpy.sum(numpy.dot(numpy.diag(self.stat0[iv, :]), D_bar_c), axis=0) + + # 
def estimate_spectral_norm_stat1(self, it=1, mode='efr'):
    """Compute meta-parameters for Spectral Normalization as described
    in [Bousquet11]_

    Can be used to perform Eigen Factor Radial or Spherical Nuisance
    Normalization. Default behavior is equivalent to Length Norm as
    described in [Garcia-Romero11]_

    The meta-parameters are estimated on a deep copy of the current
    object, which is whitened and length-normalized after each iteration;
    the statistics of ``self`` are left unchanged.

    :param it: integer, number of iterations to perform
    :param mode: string, can be
        - efr for Eigen Factor Radial
        - sphNorm, for Spherical Nuisance Normalization

    :return: a tuple of two lists:
        - a list of mean vectors
        - a list of co-variance matrices as ndarrays

    :raises ValueError: if ``mode`` is neither 'efr' nor 'sphNorm'
    """
    # Fail early: an unknown mode would otherwise append no covariance and
    # crash later with an unrelated IndexError
    if mode not in ('efr', 'sphNorm'):
        raise ValueError("Unknown mode '{}', must be 'efr' or 'sphNorm'".format(mode))

    spectral_norm_mean = []
    spectral_norm_cov = []
    tmp_iv = copy.deepcopy(self)

    for _ in range(it):
        # Estimate mean and covariance matrix on the current statistics
        mean = tmp_iv.get_mean_stat1()
        if mode == 'efr':
            cov = tmp_iv.get_total_covariance_stat1()
        else:
            cov = tmp_iv.get_within_covariance_stat1()
        spectral_norm_mean.append(mean)
        spectral_norm_cov.append(cov)

        # Center, whiten and length-normalize before the next iteration
        tmp_iv.whiten_stat1(mean, cov)
        tmp_iv.norm_stat1()

    return spectral_norm_mean, spectral_norm_cov
def sum_stat_per_model(self):
    """Sum the zero- and first-order statistics per model and store them
    in a new StatServer.

    Models are listed in the order returned by ``numpy.unique`` on the
    modelset; the segset of the new StatServer is a copy of its modelset.

    :return: a tuple ``(sts_per_model, session_per_model)`` where
        ``sts_per_model`` is a StatServer with the statistics summed per
        model and ``session_per_model`` is a ndarray giving the number of
        sessions summed for each model
    """
    models = numpy.unique(self.modelset)
    model_nb = models.shape[0]

    sts_per_model = sidekit.StatServer()
    sts_per_model.modelset = models
    sts_per_model.segset = copy.deepcopy(models)
    sts_per_model.stat0 = numpy.zeros((model_nb, self.stat0.shape[1]), dtype=STAT_TYPE)
    sts_per_model.stat1 = numpy.zeros((model_nb, self.stat1.shape[1]), dtype=STAT_TYPE)
    sts_per_model.start = numpy.empty(sts_per_model.segset.shape, '|O')
    sts_per_model.stop = numpy.empty(sts_per_model.segset.shape, '|O')

    session_per_model = numpy.zeros(model_nb)

    for idx, model in enumerate(models):
        # Extract the first-order statistics of the model once; they give
        # both the sum and the number of sessions
        model_stat1 = self.get_model_stat1(model)
        sts_per_model.stat0[idx, :] = self.get_model_stat0(model).sum(axis=0)
        sts_per_model.stat1[idx, :] = model_stat1.sum(axis=0)
        session_per_model[idx] = model_stat1.shape[0]

    return sts_per_model, session_per_model
+ + :return: a StatServer with the statistics averaged per model + """ + sts_per_model = sidekit.StatServer() + sts_per_model.modelset = numpy.unique(self.modelset) + sts_per_model.segset = copy.deepcopy(sts_per_model.modelset) + sts_per_model.stat0 = numpy.zeros((sts_per_model.modelset.shape[0], self.stat0.shape[1]), dtype=STAT_TYPE) + sts_per_model.stat1 = numpy.zeros((sts_per_model.modelset.shape[0], self.stat1.shape[1]), dtype=STAT_TYPE) + sts_per_model.start = numpy.empty(sts_per_model.segset.shape, '|O') + sts_per_model.stop = numpy.empty(sts_per_model.segset.shape, '|O') + + for idx, model in enumerate(sts_per_model.modelset): + sts_per_model.stat0[idx, :] = self.get_model_stat0(model).mean(axis=0) + sts_per_model.stat1[idx, :] = self.get_model_stat1(model).mean(axis=0) + return sts_per_model + + def _expectation(self, phi, mean, sigma, session_per_model, batch_size=100, num_thread=1): + """ + dans cette version, on considère que les stats NE sont PAS blanchis avant + """ + warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning) + + r = phi.shape[-1] + d = int(self.stat1.shape[1] / self.stat0.shape[1]) + C = self.stat0.shape[1] + + """Whiten the statistics and multiply the covariance matrix by the + square root of the inverse of the residual covariance""" + self.whiten_stat1(mean, sigma) + phi_white = copy.deepcopy(phi) + if sigma.ndim == 2: + eigen_values, eigen_vectors = scipy.linalg.eigh(sigma) + ind = eigen_values.real.argsort()[::-1] + eigen_values = eigen_values.real[ind] + eigen_vectors = eigen_vectors.real[:, ind] + sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real) + sqr_inv_sigma = numpy.dot(eigen_vectors, numpy.diag(sqr_inv_eval_sigma)) + phi_white = sqr_inv_sigma.T.dot(phi) + elif sigma.ndim == 1: + sqr_inv_sigma = 1/numpy.sqrt(sigma) + phi_white = phi * sqr_inv_sigma[:, None] + + # Replicate self.stat0 + index_map = numpy.repeat(numpy.arange(C), d) + _stat0 = self.stat0[:, index_map] + + # Create accumulators for the list 
of models to process + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + _A = numpy.zeros((C, r, r), dtype=STAT_TYPE) + tmp_A = multiprocessing.Array(ct, _A.size) + _A = numpy.ctypeslib.as_array(tmp_A.get_obj()) + _A = _A.reshape(C, r, r) + + _C = numpy.zeros((r, d * C), dtype=STAT_TYPE) + + _R = numpy.zeros((r, r), dtype=STAT_TYPE) + _r = numpy.zeros(r, dtype=STAT_TYPE) + + # Process in batches in order to reduce the memory requirement + batch_nb = int(numpy.floor(self.segset.shape[0]/float(batch_size) + 0.999)) + + for batch in range(batch_nb): + batch_start = batch * batch_size + batch_stop = min((batch + 1) * batch_size, self.segset.shape[0]) + batch_len = batch_stop - batch_start + + # Allocate the memory to save time + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + e_h = numpy.zeros((batch_len, r), dtype=STAT_TYPE) + tmp_e_h = multiprocessing.Array(ct, e_h.size) + e_h = numpy.ctypeslib.as_array(tmp_e_h.get_obj()) + e_h = e_h.reshape(batch_len, r) + + e_hh = numpy.zeros((batch_len, r, r), dtype=STAT_TYPE) + tmp_e_hh = multiprocessing.Array(ct, e_hh.size) + e_hh = numpy.ctypeslib.as_array(tmp_e_hh.get_obj()) + e_hh = e_hh.reshape(batch_len, r, r) + + # loop on model id's + fa_model_loop(batch_start=batch_start, mini_batch_indices=numpy.arange(batch_len), + r=r, phi_white=phi_white, phi=phi, sigma=sigma, + stat0=_stat0, stat1=self.stat1, + e_h=e_h, e_hh=e_hh, num_thread=num_thread) + + # Accumulate for minimum divergence step + _r += numpy.sum(e_h * session_per_model[batch_start:batch_stop, None], axis=0) + _R += numpy.sum(e_hh, axis=0) + + if sqr_inv_sigma.ndim == 2: + _C += e_h.T.dot(self.stat1[batch_start:batch_stop, :]).dot(scipy.linalg.inv(sqr_inv_sigma)) + elif sqr_inv_sigma.ndim == 1: + _C += e_h.T.dot(self.stat1[batch_start:batch_stop, :]) / sqr_inv_sigma + + # Parallelized loop on the model id's + fa_distribution_loop(distrib_indices=numpy.arange(C), + _A=_A, + stat0=self.stat0, + 
def _maximization(self, phi, _A, _C, _R=None, sigma_obs=None, session_number=None):
    """M-step of the (deprecated) factor analysis estimation.

    The rows of ``phi`` belonging to each distribution are re-estimated
    in place by solving a linear system built from the accumulators.

    :param phi: factor loading matrix, updated in place block by block
    :param _A: accumulators, one square matrix per distribution
    :param _C: accumulator with one column per super-vector coefficient
    :param _R: optional matrix used for the minimum divergence step;
        when given, ``phi`` is multiplied by its Cholesky factor
    :param sigma_obs: optional co-variance of the observations; when
        given, the residual co-variance is re-estimated
    :param session_number: number of sessions used to normalize the
        residual co-variance update

    :return: a tuple (phi, sigma); sigma is None when ``sigma_obs`` is None
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)

    distrib_nb = self.stat0.shape[1]
    feat_dim = self.stat1.shape[1] // distrib_nb

    for c in range(distrib_nb):
        lo = c * feat_dim
        hi = lo + feat_dim
        phi[lo:hi, :] = scipy.linalg.solve(_A[c], _C[:, lo:hi]).T

    # Update the residual covariance if needed
    # (only for the full co-variance case of PLDA)
    if sigma_obs is None:
        sigma = None
    else:
        sigma = sigma_obs - phi.dot(_C) / session_number

    # Minimum divergence step
    if _R is None:
        return phi, sigma

    print('applyminDiv reestimation')
    return phi.dot(scipy.linalg.cholesky(_R)), sigma
sigma = sigma_obs + + # Estimate F by iterating the EM algorithm + for it in range(itNb): + logging.info('Estimate between class covariance, it %d / %d', + it + 1, itNb) + + # Dans la fonction estimate_between_class + model_shifted_stat = copy.deepcopy(self) + + # subtract channel effect, Ux, if already estimated + if Ux is not None: + model_shifted_stat = model_shifted_stat.subtract_weighted_stat1(Ux) + + # Sum statistics per speaker + model_shifted_stat, session_per_model = model_shifted_stat.sum_stat_per_model() + # subtract residual, Dz, if already estimated + if Dz is not None: + model_shifted_stat = model_shifted_stat.subtract(Dz) + + # E-step + print("E_step") + _A, _C, _R = model_shifted_stat._expectation(V, mean, sigma, session_per_model, batch_size, num_thread) + + if not minDiv: + _R = None + + # M-step + print("M_step") + if re_estimate_residual: + V, sigma = model_shifted_stat._maximization(V, _A, _C, _R, sigma_obs, session_per_model.sum()) + else: + V = model_shifted_stat._maximization(V, _A, _C, _R)[0] + + if sigma.ndim == 2: + logging.info('Likelihood after iteration %d / %f', it + 1, compute_llk(self, V, sigma)) + + del model_shifted_stat + + if save_partial: + sidekit.sidekit_io.write_fa_hdf5((mean, V, None, None, sigma), + "Partial_plda_{}_between_class.h5".format(it)) + + return V, sigma + + def estimate_within_class(self, + it_nb, + U, + mean, + sigma_obs, + batch_size=100, + Vy=None, + Dz=None, + min_div=True, + num_thread=1, + save_partial=False): + """ + Estimate the factor loading matrix for the within class covariance + + :param it_nb: number of iterations to estimate the within class covariance matrix + :param U: initial within class covariance matrix + :param mean: mean of the input data + :param sigma_obs: co-variance matrix of the input data + :param batch_size: number of sessions to process per batch to optimize memory usage + :param Vy: statserver of supervectors + :param Dz: statserver of supervectors + :param min_div: boolean, if 
def estimate_map(self, itNb, D, mean, Sigma, Vy=None, Ux=None, num_thread=1, save_partial=False):
    """Estimate the diagonal MAP matrix by iterating the EM algorithm.

    The statistics are processed on a deep copy of the current object:
    they are centered, optionally shifted by ``Vy`` and ``Ux``, and then
    summed per model.

    :param itNb: number of iterations to estimate the MAP covariance matrix
    :param D: Maximum a Posteriori matrix to estimate, as a vector
    :param mean: mean of the input parameters
    :param Sigma: residual covariance matrix
    :param Vy: statserver of supervectors
    :param Ux: statserver of supervectors
    :param num_thread: number of parallel process to run (not used by
        this method)
    :param save_partial: if set, the MAP matrix is saved after each
        iteration in "<prefix>_<iteration>_map.h5"; a string is used as
        prefix, any other true value falls back to "Partial_plda"

    :return: the MAP covariance matrix into a vector as it is diagonal
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)
    model_shifted_stat = copy.deepcopy(self)

    logging.info('Estimate MAP matrix')
    # subtract speaker and channel if already estimated
    model_shifted_stat.center_stat1(mean)
    if Vy is not None:
        model_shifted_stat = model_shifted_stat.subtract_weighted_stat1(Vy)
    if Ux is not None:
        model_shifted_stat = model_shifted_stat.subtract_weighted_stat1(Ux)

    # Sum statistics per speaker
    model_shifted_stat = model_shifted_stat.sum_stat_per_model()[0]

    C = model_shifted_stat.stat0.shape[1]
    # Integer division: numpy.repeat rejects a float number of repetitions
    d = model_shifted_stat.stat1.shape[1] // C

    # Replicate stat0 so that it has one column per super-vector coefficient
    index_map = numpy.repeat(numpy.arange(C), d)
    _stat0 = model_shifted_stat.stat0[:, index_map]

    # A boolean flag cannot be concatenated to a file name
    prefix = save_partial if isinstance(save_partial, str) else "Partial_plda"

    # Estimate D by iterating the EM algorithm
    for it in range(itNb):
        logging.info('Estimate MAP covariance, it %d / %d', it + 1, itNb)

        # E step
        e_h = numpy.zeros(model_shifted_stat.stat1.shape, dtype=STAT_TYPE)
        _A = numpy.zeros(D.shape, dtype=STAT_TYPE)
        _C = numpy.zeros(D.shape, dtype=STAT_TYPE)
        for idx in range(model_shifted_stat.modelset.shape[0]):
            Lambda = numpy.ones(D.shape) + (_stat0[idx, :] * D**2 / Sigma)
            e_h[idx] = model_shifted_stat.stat1[idx] * D / (Lambda * Sigma)
            _A = _A + (1/Lambda + e_h[idx]**2) * _stat0[idx, :]
            _C = _C + e_h[idx] * model_shifted_stat.stat1[idx]

        # M step
        D = _C / _A

        if save_partial:
            # One file per iteration instead of a literal "{}" in the name
            sidekit.sidekit_io.write_fa_hdf5((None, None, None, D, None),
                                             "{}_{}_map.h5".format(prefix, it))

    return D
num_thread: number of parallel process to run + """ + warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning) + if V is None: + V = numpy.zeros((self.stat1.shape[1], 0), dtype=STAT_TYPE) + if U is None: + U = numpy.zeros((self.stat1.shape[1], 0), dtype=STAT_TYPE) + W = numpy.hstack((V, U)) + + # Estimate yx + r = W.shape[1] + d = int(self.stat1.shape[1] / self.stat0.shape[1]) + C = self.stat0.shape[1] + + self.whiten_stat1(mean, sigma) + W_white = copy.deepcopy(W) + if sigma.ndim == 2: + eigenvalues, eigenvectors = scipy.linalg.eigh(sigma) + ind = eigenvalues.real.argsort()[::-1] + eigenvalues = eigenvalues.real[ind] + eigenvectors = eigenvectors.real[:, ind] + sqr_inv_eval_sigma = 1 / numpy.sqrt(eigenvalues.real) + sqr_inv_sigma = numpy.dot(eigenvectors, numpy.diag(sqr_inv_eval_sigma)) + W_white = sqr_inv_sigma.T.dot(W) + elif sigma.ndim == 1: + sqr_inv_sigma = 1/numpy.sqrt(sigma) + W_white = W * sqr_inv_sigma[:, None] + + # Replicate self.stat0 + index_map = numpy.repeat(numpy.arange(C), d) + _stat0 = self.stat0[:, index_map] + + y = sidekit.StatServer() + y.modelset = copy.deepcopy(self.modelset) + y.segset = copy.deepcopy(self.segset) + y.start = copy.deepcopy(self.start) + y.stop = copy.deepcopy(self.stop) + y.stat0 = numpy.ones((self.modelset.shape[0], 1)) + y.stat1 = numpy.ones((self.modelset.shape[0], V.shape[1])) + + x = sidekit.StatServer() + x.modelset = copy.deepcopy(self.modelset) + x.segset = copy.deepcopy(self.segset) + x.start = copy.deepcopy(self.start) + x.stop = copy.deepcopy(self.stop) + x.stat0 = numpy.ones((self.modelset.shape[0], 1)) + x.stat1 = numpy.ones((self.modelset.shape[0], U.shape[1])) + + z = sidekit.StatServer() + if D is not None: + z.modelset = copy.deepcopy(self.modelset) + z.segset = copy.deepcopy(self.segset) + z.stat0 = numpy.ones((self.modelset.shape[0], 1), dtype=STAT_TYPE) + z.stat1 = numpy.ones((self.modelset.shape[0], D.shape[0]), dtype=STAT_TYPE) + + VUyx = copy.deepcopy(self) + + # Process in batches 
def factor_analysis(self, rank_f, rank_g=0, rank_h=None, re_estimate_residual=False,
                    it_nb=(10, 10, 10), min_div=True, ubm=None,
                    batch_size=100, num_thread=1, save_partial=False, init_matrices=(None, None, None)):
    """Estimate a factor analysis model (deprecated, use FactorAnalyser).

    The three stages (between class matrix F, within class matrix G and
    diagonal MAP matrix H) are run one after the other; a stage is skipped,
    and its initial matrix returned, when its rank or its number of
    iterations is 0.

    :param rank_f: rank of the between class variability matrix
    :param rank_g: rank of the within class variability matrix
    :param rank_h: None to disable the MAP matrix; any other value makes
        the MAP matrix full rank (size of the super-vectors)
    :param re_estimate_residual: boolean, if True, the residual covariance
        matrix is re-estimated (use for PLDA)
    :param it_nb: tuple of three integers; number of iterations to run
        for F, G, H estimation
    :param min_div: boolean, if True, re-estimate the covariance matrices
        according to the minimum divergence criteria
    :param ubm: origin of the space; should be None for PLDA and be a
        Mixture object for JFA or TV
    :param batch_size: number of sessions to process in one batch for
        memory optimization
    :param num_thread: number of thread to run in parallel
    :param save_partial: name of the file to save intermediate models,
        forwarded to the estimation methods
    :param init_matrices: tuple of three optional matrices (F, G, H) to
        initialize the model, default is (None, None, None)

    :return: a tuple (mean, F, G, H, sigma): the mean vector, the between
        class factor loading matrix, the within class factor loading
        matrix, the diagonal MAP matrix (as a vector) and the residual
        covariance matrix
    """
    warnings.warn("deprecated, use FactorAnalyser module", DeprecationWarning)

    F_init, G_init, H_init = init_matrices

    # Statistics are not whitened here; only the origin of the space (mean)
    # and the covariance of the observations are computed
    vect_size = self.stat1.shape[1]
    if ubm is None:
        mean = self.stat1.mean(axis=0)
        Sigma_obs = self.get_total_covariance_stat1()
        if F_init is None:
            # Initialize F with the leading eigenvectors of the covariance
            evals, evecs = scipy.linalg.eigh(Sigma_obs)
            idx = numpy.argsort(evals)[::-1]
            F_init = evecs[:, idx][:, :rank_f]
    else:
        mean = ubm.get_mean_super_vector()
        Sigma_obs = 1. / ubm.get_invcov_super_vector()
        if F_init is None:
            F_init = numpy.random.randn(vect_size, rank_f).astype(dtype=STAT_TYPE)

    if G_init is None:
        G_init = numpy.random.randn(vect_size, rank_g)

    # H is empty or full-rank
    rank_h = vect_size if rank_h is not None else 0
    if H_init is None:
        H_init = numpy.random.randn(rank_h).astype(dtype=STAT_TYPE) * Sigma_obs.mean()

    # Matrix with no column, used to disable a term in estimate_hidden
    empty = numpy.zeros((vect_size, 0))

    def _session_level_vy():
        """Estimate y per model and return V.y duplicated for each session."""
        model_stat = self.sum_stat_per_model()[0]
        y = model_stat.estimate_hidden(mean, Sigma_obs, F, empty, None,
                                       batch_size=batch_size,
                                       num_thread=num_thread)[0]
        duplicate_y = numpy.zeros((self.modelset.shape[0], rank_f), dtype=STAT_TYPE)
        for idx, mod in enumerate(y.modelset):
            duplicate_y[self.modelset == mod] = y.stat1[idx]
        Vy = copy.deepcopy(self)
        Vy.stat1 = duplicate_y.dot(F.T)
        return Vy

    # Estimate the between class variability matrix
    if rank_f == 0 or it_nb[0] == 0:
        F = F_init
        sigma = Sigma_obs
    else:
        # For Total Variability estimation each session is considered a
        # class, so the modelset is temporarily replaced by the segset
        session_as_class = rank_g == rank_h == 0 and not re_estimate_residual
        if session_as_class:
            modelset_backup = copy.deepcopy(self.modelset)
            self.modelset = self.segset
        try:
            F, sigma = self.estimate_between_class(it_nb[0],
                                                   F_init,
                                                   mean,
                                                   Sigma_obs,
                                                   batch_size,
                                                   None,
                                                   None,
                                                   min_div,
                                                   num_thread,
                                                   re_estimate_residual,
                                                   save_partial)
        finally:
            # Restore the modelset even if the estimation fails
            if session_as_class:
                self.modelset = modelset_backup

    # Estimate the within class variability matrix
    # (it_nb[1] is the number of iterations dedicated to G)
    if rank_g == 0 or it_nb[1] == 0:
        G = G_init
    else:
        # Vy is estimated per model, duplicated per session, and subtracted
        # from the statistics inside estimate_within_class
        Vy = _session_level_vy()

        G = self.estimate_within_class(it_nb[1],
                                       G_init,
                                       mean,
                                       Sigma_obs,
                                       batch_size,
                                       Vy,
                                       None,
                                       min_div,
                                       num_thread,
                                       save_partial)

    # Estimate the MAP covariance matrix
    if rank_h == 0 or it_nb[2] == 0:
        H = H_init
    else:
        Vy = _session_level_vy()

        # Estimate Ux per session
        tmp_stat = copy.deepcopy(self)
        tmp_stat = tmp_stat.subtract_weighted_stat1(Vy)
        x = tmp_stat.estimate_hidden(mean, Sigma_obs, empty, G, None,
                                     batch_size=batch_size,
                                     num_thread=num_thread)[1]

        Ux = copy.deepcopy(self)
        Ux.stat1 = x.stat1.dot(G.T)

        # Estimate H
        H = self.estimate_map(it_nb[2], H_init,
                              mean,
                              Sigma_obs,
                              Vy,
                              Ux,
                              num_thread=num_thread,
                              save_partial=save_partial)

    return mean, F, G, H, sigma
def generator(self):
    """
    Create a generator which yields (stat0, stat1) of one session at a time.

    The number of sessions is taken from the first dimension of ``self.stat0``.
    """
    for i in range(self.stat0.shape[0]):
        yield self.stat0[i, :], self.stat1[i, :]


@staticmethod
def read_subset(statserver_filename, index, prefix=''):
    """
    Load a subset of the sessions of a statserver stored in HDF5 format.

    :param statserver_filename: name of the statserver in hdf5 format to read from
    :param index: either an IdMap of the sessions to load or an array of
        indices to load. With an IdMap, the sessions stored in the file
        whose (model, segment) couple is in the IdMap are kept, except those
        whose first-order statistics sum to zero. With an array, indices
        larger than the last session are replaced by the last index.
    :param prefix: prefix of the group in HDF5 file
    :return: a StatServer
    """
    with h5py.File(statserver_filename, 'r') as h5f:
        # Each dataset is read from disk only once
        modelset = h5f[prefix + "modelset"][()]
        segset = h5f[prefix + "segset"][()]
        stat1 = h5f[prefix + "stat1"][()]

        if isinstance(index, sidekit.IdMap):
            # (model, segment) couples requested by the IdMap
            wanted = set(zip(index.leftids, index.rightids))
            stored = zip(modelset.astype('U', copy=False), segset.astype('U', copy=False))

            # Sessions with all-zero first-order statistics are discarded.
            # The statistics are read with the prefix, like the other datasets.
            non_zero = stat1.sum(1) != 0
            idx = numpy.array([ii for ii, couple in enumerate(stored)
                               if couple in wanted and non_zero[ii]], dtype=int)
        else:
            # Indices beyond the size of the StatServer are replaced by the last index
            idx = numpy.minimum(numpy.asarray(index, dtype=int), len(modelset) - 1)

        # Create the new StatServer by loading the selected sessions
        statserver = sidekit.StatServer()
        statserver.modelset = modelset[idx].astype('U', copy=False)
        statserver.segset = segset[idx].astype('U', copy=False)

        # -1 marks a missing start/stop in the file: those entries are left
        # unset in the object arrays
        tmpstart = h5f.get(prefix + "start")[()][idx]
        tmpstop = h5f.get(prefix + "stop")[()][idx]
        statserver.start = numpy.empty(idx.shape, '|O')
        statserver.stop = numpy.empty(idx.shape, '|O')
        statserver.start[tmpstart != -1] = tmpstart[tmpstart != -1]
        statserver.stop[tmpstop != -1] = tmpstop[tmpstop != -1]

        statserver.stat0 = h5f[prefix + "stat0"][()][idx, :]
        statserver.stat1 = stat1[idx, :]

        return statserver
a/sidekit/sidekit/sv_utils.py b/sidekit/sidekit/sv_utils.py new file mode 100755 index 0000000..37305e1 --- /dev/null +++ b/sidekit/sidekit/sv_utils.py @@ -0,0 +1,464 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see http://www.gnu.org/licenses/. + +""" +Copyright 2014-2021 Anthony Larcher + +:mod:`sv_utils` provides utilities to facilitate the work with SIDEKIT. 
def save_svm(svm_file_name, w, b):
    """Save SVM weights and bias in gzip-compressed PICKLE format

    The directory of the file is created when it does not exist.

    :param svm_file_name: name of the file to write
    :param w: weight coefficients of the SVM to store
    :param b: bias of the SVM to store
    """
    target_dir = os.path.dirname(svm_file_name)
    # A bare file name has an empty dirname and os.makedirs('') would fail:
    # there is nothing to create in that case.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with gzip.open(svm_file_name, "wb") as f:
        pickle.dump((w, b), f)


def read_svm(svm_file_name):
    """Read SVM model in gzip-compressed PICKLE format

    :param svm_file_name: name of the file to read from

    :return: a tuple of weight (squeezed to remove single dimensions) and bias
    """
    # NOTE(security): pickle.load executes arbitrary code from the file;
    # only read SVM files that come from a trusted source.
    with gzip.open(svm_file_name, "rb") as f:
        (w, b) = pickle.load(f)
    return numpy.squeeze(w), b


def check_file_list(input_file_list, file_name_structure):
    """Check the existence of a list of files in a specific directory
    Return a new list with the existing segments and a list of indices
    of those files in the original list. Return outputFileList and
    idx such that inputFileList[idx] = outputFileList

    :param input_file_list: list (or array) of file names
    :param file_name_structure: structure of the filename to search for,
        each name is inserted with ``file_name_structure.format(name)``

    :return: an array of existing files and the indices
        of the existing files in the input list
    """
    names = numpy.asarray(input_file_list)
    # dtype=bool keeps the mask usable as an index when the list is empty
    exist_files = numpy.array([os.path.isfile(file_name_structure.format(f)) for f in names],
                              dtype=bool)
    return names[exist_files], numpy.flatnonzero(exist_files)


def _normalized_tv_statistics(ubm, T):
    """Return (W, Tnorm, Tnorm_c) shared by the i-vector approximations.

    Tnorm is T with each row scaled by the square root of the UBM inverse
    covariance, Tnorm_c is Tnorm split in one sub-matrix per distribution
    and W is the sum over the distributions of w_c * Tnorm_c' * Tnorm_c.
    """
    sqrt_invcov = numpy.sqrt(ubm.get_invcov_super_vector()[:, numpy.newaxis])
    Tnorm = T * sqrt_invcov

    # Split the Total Variability matrix into sub-matrices per distribution
    distrib_nb = ubm.distrib_nb()
    Tnorm_c = numpy.array_split(Tnorm, distrib_nb)

    W = numpy.zeros((T.shape[1], T.shape[1]))
    for c in range(distrib_nb):
        W = W + ubm.w[c] * numpy.dot(Tnorm_c[c].transpose(), Tnorm_c[c])
    return W, Tnorm, Tnorm_c


def initialize_iv_extraction_weight(ubm, T):
    """
    Estimate matrices W and T for approximation of the i-vectors
    For more information, refers to [Glembeck09]_

    :param ubm: Mixture object, Universal Background Model
    :param T: Raw TotalVariability matrix as a ndarray

    :return:
      W: fix matrix pre-computed using the weights from the UBM and the
          total variability matrix
      Tnorm: total variability matrix pre-normalized using the co-variance
          of the UBM
    """
    W, Tnorm, _ = _normalized_tv_statistics(ubm, T)
    return W, Tnorm


def initialize_iv_extraction_eigen_decomposition(ubm, T):
    """Estimate matrices Q, D_bar_c and Tnorm, for approximation
    of the i-vectors.
    For more information, refers to [Glembeck09]_

    :param ubm: Mixture object, Universal Background Model
    :param T: Raw TotalVariability matrix

    :return:
      Q: Q matrix as described in [Glembeck11]
      D_bar_c: matrices as described in [Glembeck11]
      Tnorm: total variability matrix pre-normalized using the co-variance of the UBM
    """
    # A bare "import scipy" does not guarantee that the linalg sub-module
    # is loaded; import it explicitly.
    import scipy.linalg

    W, Tnorm, Tnorm_c = _normalized_tv_statistics(ubm, T)

    # NOTE(review): W is symmetric, scipy.linalg.eigh would fit as well but
    # returns the eigenvectors in another order; eig is kept so that Q is unchanged.
    eigen_values, Q = scipy.linalg.eig(W)

    # Compute D_bar_c matrix which is the diagonal approximation of Tc' * Tc:
    # diag(Q' * Tc' * Tc * Q) is the sum of squares of each column of Tc * Q
    D_bar_c = numpy.zeros((len(Tnorm_c), T.shape[1]))
    for c, Tc in enumerate(Tnorm_c):
        projected = numpy.dot(Tc, Q)
        D_bar_c[c, :] = numpy.einsum('ij,ij->j', projected, projected)
    return Q, D_bar_c, Tnorm


def initialize_iv_extraction_fse(ubm, T):
    """Placeholder for the estimation of the matrices used to approximate
    the i-vectors by factorized sub-space estimation.
    For more information, refers to [Cumani13]_

    The function is NOT implemented: the arguments are ignored and None is
    returned. The MATLAB code kept in the comments below is the reference
    to port.

    :param ubm: Mixture object, Universal Background Model
    :param T: Raw TotalVariability matrix

    :return: None
    """
    # % Initialize the process
    # %init = 1;
    #
    #
    # Extract i-vectors by using different methods
    #
    # %rank_T = 10;
    # %featureSize = 50;
    # %distribNb = 32;
    # %dictSize = 5;
    # %dictSizePerDis=rank_T; % to be changed later
    #
    # %ubm_file = 'gmm/world32.gmm';
    # %t_file = 'mat/TV_32.matx';
    #
    #
    #
    # Load data
    #
    #
    # % Load UBM for weight parameters that are used in the optimization
    # % function
    # %UBM = ALize_LoadModel(ubm_file);
    #
    # % Load mean from Minimum Divergence re-estimation
    # %sv_mindiv = ALize_LoadVect(minDiv_file)';
    #
    # % Load T matrix
    # %T = ALize_LoadMatrix(t_file)';
    #
    #
    # function [O,PI,Q] = factorized_subspace_estimation(UBM,T,dictSize,outIterNb,inIterNb)
    #
    # rank_T = size(T,2);
    # distribNb = size(UBM.W,2);
    # featureSize = size(UBM.mu,1);
    #
    # % Normalize matrix T
    # sv_invcov = reshape(UBM.invcov,[],1);
    # T_norm = T .* repmat(sqrt(sv_invcov),1,rank_T);
    #
    # Tc{1,distribNb} = [];
    # for ii=0:distribNb-1
    #     % Split the matrix in sub-matrices
    #     Tc{ii+1} = T_norm((ii*featureSize)+1:(ii+1)*featureSize,:);
    # end
    #
    # % Initialize O and Qc by Singular Value Decomposition
    # init_FSE.Qc{distribNb} = [];
    # init_FSE.O{distribNb} = [];
    # PI{distribNb} = [];
    #
    # for cc=1:distribNb
    #     init_FSE.Qc{cc} = zeros(featureSize,featureSize);
    #     init_FSE.O{cc} = zeros(featureSize,featureSize);
    #     PI{cc} = sparse(zeros(featureSize,dictSize*distribNb)); % to be replaced by a sparse matrix
    # end
    #
    # % For each distribution
    # for cc=1:distribNb
    #     fprintf('Initilize matrice for distribution %d / %d\n',cc,distribNb);
    #     % Initialized O with Singular vectors from SVD
    #     [init_FSE.O{cc},~,V] = svd(UBM.W(1,cc)*Tc{cc});
    #     init_FSE.Qc{cc} = V';
    # end
    #
    #
    # % Concatenate Qc to create the matrix Q: TO BE MODIFIED TO DECOUPLE
    # % dictSize FROM THE NUMBER OF DISTRIBUTIONS
    # Q = [];
    # for cc=1:distribNb
    #     Q = [Q;init_FSE.Qc{cc}(1:dictSize,:)];
    # end
    # O = init_FSE.O;
    # clear 'init_FSE'
    #
    #
    # % OUTER iteration process : update Q iteratively
    # for it = 1:outIterNb
    #
    #     fprintf('Start iteration %d / %d for Q re-estimation\n',it,5);
    #
    #     % INNER iteration process: update PI and O iteratively
    #     for pioIT = 1:inIterNb
    #         fprintf(' Start iteration %d / %d for PI and O re-estimation\n',pioIT,10);
    #
    #         % Update PI
    #         %Compute diagonal terms of QQ'
    #         % diagQ = diag(Q*Q');
    #
    #         for cc=1:distribNb
    #
    #             % Compute optimal k and optimal v
    #             % for each line f of PI{cc}
    #             A = O{cc}'*Tc{cc}*Q';
    #             f = 1;
    #             while (f < size(A,1)+1)
    #
    #                 if(f == 1)
    #                     A = O{cc}'*Tc{cc}*Q'; % equation (26)
    #                     PI{cc} = sparse(zeros(featureSize,dictSize*distribNb));
    #                 end
    #
    #                 % Find the optimal index k
    #                 [~,k] = max(A(f,:).^2);
    #                 k_opt = k;
    #
    #                 % Find the optimal value v
    #                 v_opt = A(f,k_opt);
    #
    #                 % Update the line of PI{cc} with the value v_opt in the
    #                 % k_opt-th column
    #                 PI{cc}(f,k_opt) = v_opt;
    #
    #                 % if the column already has a non-zero element,
    #                 % update O and PI
    #                 I = find(PI{cc}(:,k_opt)~=0);
    #                 if size(I,1)>1
    #                     % get indices of the two lines of PI{cc} which
    #                     % have a non-zero element on the same column
    #                     a = I(1);
    #                     b = I(2);
    #
    #                     % Replace column O{cc}(:,a) and O{cc}(:,b)
    #                     Oa = (PI{cc}(a,k_opt)*O{cc}(:,a)+PI{cc}(b,k_opt)*O{cc}(:,b))/
    #                          (sqrt(PI{cc}(a,k_opt)^2+PI{cc}(b,k_opt)^2));
    #                     Ob = (PI{cc}(a,k_opt)*O{cc}(:,b)-PI{cc}(b,k_opt)*O{cc}(:,a))/
    #                          (sqrt(PI{cc}(a,k_opt)^2+PI{cc}(b,k_opt)^2));
    #                     O{cc}(:,a) = Oa;
    #                     O{cc}(:,b) = Ob;
    #
    #                     PI{cc}(a,k_opt) = sqrt(PI{cc}(a,k_opt)^2+PI{cc}(b,k_opt)^2);
    #                     PI{cc}(b,k_opt) = 0;
    #
    #                     f = 0;
    #                 end
    #                 f =f +1;
    #             end
    #         end
    #
    #         obj = computeObjFunc(UBM.W,Tc,O,PI,Q);
    #         fprintf('Objective Function after estimation of PI = %2.10f\n',obj);
    #
    #         % Update O
    #         for cc=1:distribNb
    #
    #             % Compute
    #             Z = PI{cc}*Q*Tc{cc}';
    #
    #             % Compute Singular value decomposition of Z
    #             [Uz,~,Vz] = svd(Z);
    #
    #             % Compute the new O{cc}
    #             O{cc} = Vz*Uz';
    #         end
    #         obj = computeObjFunc(UBM.W,Tc,O,PI,Q);
    #         fprintf('Objective Function after estimation of O = %2.10f\n',obj);
    #     end % END OF INNER ITERATION PROCESS
    #
    #
    #     % Update Q
    #     D = sparse(zeros(size(PI{cc},2),size(PI{cc},2)));
    #     E = zeros(size(PI{cc},2),rank_T);
    #     for cc=1:distribNb
    #         % Accumulate D
    #         D = D + UBM.W(1,cc) * PI{cc}'*PI{cc};
    #
    #         % Accumulate the second term
    #         E = E + UBM.W(1,cc) * PI{cc}'*O{cc}'*Tc{cc};
    #     end
    #     Q = D\E;
    #
    #     % Normalize rows of Q and update PI accordingly
    #
    #     % Compute norm of each row
    #     c1 = bsxfun(@times,Q,Q);
    #     c2 = sum(c1,2);
    #     c3 = sqrt(c2);
    #     Q = bsxfun(@rdivide,Q,c3);
    #
    #     % Update PI accordingly
    #     PI = cellfun(@(x)x*sparse(diag(c3)), PI, 'uni',false);
    #
    #     obj = computeObjFunc(UBM.W,Tc,O,PI,Q);
    #     fprintf('Objective Function after re-estimation of Q = %2.10f\n',obj);
    # end
    # end
    pass


def clean_stat_server(statserver):
    """
    Remove, in place, the sessions whose zero-order statistics sum to zero.

    The number of removed sessions is printed.

    :param statserver: StatServer to clean; modelset, segset, start, stop,
        stat0 and stat1 are filtered consistently
    :return: None
    :raise AssertionError: if ``statserver.validate()`` fails after cleaning
    """
    keep = statserver.stat0.sum(axis=1) != 0.
    statserver.modelset = statserver.modelset[keep]
    statserver.segset = statserver.segset[keep]
    statserver.start = statserver.start[keep]
    statserver.stop = statserver.stop[keep]
    statserver.stat0 = statserver.stat0[keep, :]
    statserver.stat1 = statserver.stat1[keep, :]
    # Explicit raise: an assert statement (and the validation it triggers)
    # would be skipped when Python runs with -O
    if not statserver.validate():
        raise AssertionError("Error after cleaning StatServer")
    print("Removed {} empty sessions in StatServer".format((~keep).sum()))


def parse_mask(mask):
    """
    Convert a textual mask such as ``"[0-12, 14, 20-22]"`` into a list of indices.

    White spaces are ignored. The first and last characters (the brackets)
    are dropped without being checked. Each comma separated item is either
    a single index or an inclusive range ``first-last``.

    :param mask: string describing the indices to select
    :return: list of integer indices, in the order of the mask
    :raise ValueError: if the mask contains anything else than digits,
        ``-`` and ``,`` between the brackets, or an empty or malformed item
    """
    content = re.sub(r"\s", "", mask)[1:-1]
    if not set(content).issubset(set("0123456789-,")):
        raise ValueError("Wrong mask format")
    indices = []
    for item in content.split(','):
        bounds = item.split('-')
        if len(bounds) == 1:
            # a single index is the range index-index
            bounds = bounds * 2
        # isdigit rejects the empty strings produced by ",," or a leading "-"
        if len(bounds) != 2 or not all(bound.isdigit() for bound in bounds):
            raise ValueError("Wrong mask format")
        indices += list(range(int(bounds[0]), int(bounds[1]) + 1))
    return indices


def segment_mean_std_hdf5(input_segment, in_context=False):
    """
    Compute the sum and square sum of all features for one segment.

    :param input_segment: tuple of 5 values: the features server used to
        load the features, the name of the show, the index of the first
        frame, the index of the last frame and a boolean which is True to
        load the segment with its left and right context
    :param in_context: ignored, the value is read from ``input_segment``
        (kept for backward compatibility)
    :return: a tuple of three values, the number of frames, the sum of frames and the sum of squares
    """
    features_server, show, start, stop, in_context = input_segment

    if start is None or stop is None or not in_context:
        feat, _ = features_server.load(show,
                                       start=start,
                                       stop=stop)
    else:
        left, right = features_server.context[0], features_server.context[1]
        # Load the segment of frames plus left and right context
        feat, _ = features_server.load(show,
                                       start=start - left,
                                       stop=stop + right)
        # Get features in context
        feat, _ = features_server.get_context(feat=feat,
                                              label=None,
                                              start=left,
                                              stop=feat.shape[0] - right)

    return feat.shape[0], feat.sum(axis=0), numpy.sum(feat**2, axis=0)


def mean_std_many(features_server, seg_list, in_context=False, num_thread=1):
    """
    Accumulate first and second order moments over a list of segments.

    The segments are processed in parallel by a pool of ``num_thread``
    processes, each one with its own copy of the features server.

    :param features_server: server used to load the features
    :param seg_list: list of show names, or list of tuples
        (show, start, stop); the type of the first element decides
    :param in_context: if True, segments are loaded with their context
    :param num_thread: number of processes of the pool
    :return: a tuple of three values, the number of frames, the mean of the
        features and the mean of the squared features.
        NOTE(review): the third value is E[x^2], not the variance; the
        variance is ``third - second ** 2``.
    :raise ValueError: if seg_list is empty
    :raise TypeError: if the segments are neither tuples nor strings
    """
    if len(seg_list) == 0:
        raise ValueError("seg_list is empty")

    if isinstance(seg_list[0], tuple):
        inputs = [(copy.deepcopy(features_server), seg[0], seg[1], seg[2], in_context) for seg in seg_list]
    elif isinstance(seg_list[0], str):
        inputs = [(copy.deepcopy(features_server), seg, None, None, in_context) for seg in seg_list]
    else:
        raise TypeError("seg_list must contain tuples or strings")

    # The context manager terminates the pool even if a worker fails
    with multiprocessing.Pool(processes=num_thread) as pool:
        res = pool.map(segment_mean_std_hdf5, inputs)

    total_N = 0
    total_F = 0
    total_S = 0
    for N, F, S in res:
        total_N += N
        total_F += F
        total_S += S
    return total_N, total_F / total_N, total_S / total_N


def serialize(M):
    """
    Allocate an array in shared memory with the shape of M.

    The new array is backed by a ``multiprocessing.Array`` of C floats if M
    is float32, of C doubles otherwise. The content of M is NOT copied.

    :param M: ndarray giving the shape and the type of the array to create
    :return: ndarray of the same shape as M, backed by shared memory
    """
    ct = ctypes.c_float if M.dtype == numpy.float32 else ctypes.c_double
    shared = multiprocessing.Array(ct, M.size)
    return numpy.ctypeslib.as_array(shared.get_obj()).reshape(M.shape)
See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with SIDEKIT. If not, see . + +""" +Copyright 2014-2021 Anthony Larcher + +:mod:`svm_scoring` provides functions to perform speaker verification +by using Support Vector Machines. +""" +import ctypes +import numpy +import multiprocessing +import logging +import sidekit.sv_utils +from sidekit.bosaris import Ndx +from sidekit.bosaris import Scores +from sidekit.statserver import StatServer + + +__license__ = "LGPL" +__author__ = "Anthony Larcher" +__copyright__ = "Copyright 2014-2021 Anthony Larcher" +__maintainer__ = "Anthony Larcher" +__email__ = "anthony.larcher@univ-lemans.fr" +__status__ = "Production" +__docformat__ = 'reStructuredText' + + +def svm_scoring_singleThread(svm_filename_structure, test_sv, ndx, score, seg_idx=None): + """Compute scores for SVM verification on a single thread + (two classes only as implementeed at the moment) + + :param svm_filename_structure: structure of the filename where to load the SVM models + :param test_sv: StatServer object of super-vectors. stat0 are set to 1 and stat1 are the super-vector to classify + :param ndx: Ndx object of the trials to perform + :param score: Scores object to fill + :param seg_idx: list of segments to classify. Classify all if the list is empty. 
+ """ + assert isinstance(test_sv, StatServer), 'Second parameter should be a StatServer' + assert isinstance(ndx, Ndx), 'Third parameter should be an Ndx' + + if seg_idx is None: + seg_idx = range(ndx.segset.shape[0]) + + # Load SVM models + Msvm = numpy.zeros((ndx.modelset.shape[0], test_sv.stat1.shape[1])) + bsvm = numpy.zeros(ndx.modelset.shape[0]) + for m in range(ndx.modelset.shape[0]): + svm_file_name = svm_filename_structure.format(ndx.modelset[m]) + w, b = sidekit.sv_utils.read_svm(svm_file_name) + Msvm[m, :] = w + bsvm[m] = b + + # Compute scores against all test segments + for ts in seg_idx: + logging.info('Compute trials involving test segment %d/%d', ts + 1, ndx.segset.shape[0]) + + # Select the models to test with the current segment + models = ndx.modelset[ndx.trialmask[:, ts]] + ind_dict = dict((k, i) for i, k in enumerate(ndx.modelset)) + inter = set(ind_dict.keys()).intersection(models) + idx_ndx = numpy.array([ind_dict[x] for x in inter]) + + scores = numpy.dot(Msvm[idx_ndx, :], test_sv.stat1[ts, :]) + bsvm[idx_ndx] + + # Fill the score matrix + score.scoremat[idx_ndx, ts] = scores + + +def svm_scoring(svm_filename_structure, test_sv, ndx, num_thread=1): + """Compute scores for SVM verification on multiple threads + (two classes only as implementeed at the moment) + + :param svm_filename_structure: structure of the filename where to load the SVM models + :param test_sv: StatServer object of super-vectors. stat0 are set to 1 and stat1 + are the super-vector to classify + :param ndx: Ndx object of the trials to perform + :param num_thread: number of thread to launch in parallel + + :return: a Score object. 
+ """ + # Remove missing models and test segments + existing_models, model_idx = sidekit.sv_utils.check_file_list(ndx.modelset, svm_filename_structure) + clean_ndx = ndx.filter(existing_models, test_sv.segset, True) + + score = Scores() + score.scoremat = numpy.zeros(clean_ndx.trialmask.shape) + score.modelset = clean_ndx.modelset + score.segset = clean_ndx.segset + score.scoremask = clean_ndx.trialmask + + tmp = multiprocessing.Array(ctypes.c_double, score.scoremat.size) + score.scoremat = numpy.ctypeslib.as_array(tmp.get_obj()) + score.scoremat = score.scoremat.reshape(score.modelset.shape[0], score.segset.shape[0]) + + # Split the list of segment to process for multi-threading + los = numpy.array_split(numpy.arange(clean_ndx.segset.shape[0]), num_thread) + + jobs = [] + for idx in los: + p = multiprocessing.Process(target=svm_scoring_singleThread, + args=(svm_filename_structure, test_sv, clean_ndx, score, idx)) + jobs.append(p) + p.start() + for p in jobs: + p.join() + + return score diff --git a/sidekit/sidekit/svm_training.py b/sidekit/sidekit/svm_training.py new file mode 100755 index 0000000..ceedbfb --- /dev/null +++ b/sidekit/sidekit/svm_training.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +# +# This file is part of SIDEKIT. +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is a python package for speaker verification. +# Home page: http://www-lium.univ-lemans.fr/sidekit/ +# +# SIDEKIT is free software: you can redistribute it and/or modify +# it under the terms of the GNU LLesser General Public License as +# published by the Free Software Foundation, either version 3 of the License, +# or (at your option) any later version. +# +# SIDEKIT is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. 
def svm_training_singleThread(K, msn, bsn, svm_dir, background_sv, models, enroll_sv):
    """Train Support Vector Machine classifiers for two classes task
    (as implemented for now but might change in the future to include multi-class
    classification)

    One SVM is trained and saved per model, in ``<svm_dir>/<model>.svm``.

    :param K: pre-computed part of the Gram matrix
    :param msn: maximum number of sessions to train a SVM
    :param bsn: number of session used as background impostors
    :param svm_dir: directory where to store the SVM models
    :param background_sv: StatServer of super-vectors for background impostors. All
          super-vectors are used without selection
    :param models: list of models to train. The models must be included in the
          enroll_sv StatServer
    :param enroll_sv: StatServer of super-vectors used for the target models
    """
    # Buffer sized for the model with the most sessions; the background part
    # is the same for every model
    gram = numpy.zeros((bsn + msn, bsn + msn))
    gram[:bsn, :bsn] = K

    for model in models:
        logging.info('Train SVM model for %s', model)
        # Compute the part of the Kernel which depends on the enrollment data
        enroll_stat1 = enroll_sv.get_model_stat1(model)
        csn = enroll_sv.get_model_segments(model).shape[0]
        size = bsn + csn
        X = numpy.vstack((background_sv.stat1, enroll_stat1))
        gram[:size, bsn:size] = numpy.dot(X, enroll_stat1.transpose())
        gram[bsn:size, :bsn] = gram[:bsn, bsn:size].transpose()

        # Only the sub-part of the kernel which matches the number of
        # sessions of the current model is used: the rows beyond it hold
        # zeros or values of a previous model and are not examples of this one.
        sub_gram = gram[:size, :size]

        # labels of the target examples are set to 1
        # labels of the impostor vectors are set to 2
        labels = [2] * bsn + [1] * csn

        # train the SVM for the current model (where libsvm is used);
        # each row of a precomputed kernel starts with its serial number
        Kernel = [[i + 1] + row.tolist() for i, row in enumerate(sub_gram)]

        # isKernel=True must be set for precomputed kernel
        # Precomputed kernel data (-t 4)
        prob = svm_problem(labels, Kernel, isKernel=True)
        c = 1 / numpy.mean(numpy.diag(sub_gram))
        param = svm_parameter('-t 4 -c {}'.format(c))
        svm = svm_train(prob, param)
        # Compute the weights; the support vector indices start at 1
        w = -numpy.dot(X[numpy.array(svm.get_sv_indices()) - 1, ].transpose(), numpy.array(svm.get_sv_coef()))
        bsvm = svm.rho[0]
        svmFileName = os.path.join(svm_dir, model + '.svm')
        sidekit.sv_utils.save_svm(svmFileName, w, bsvm)


def svm_training(svmDir, background_sv, enroll_sv, num_thread=1):
    """Train Support Vector Machine classifiers for two classes task
    (as implemented for now but might change in the future to include multi-class
    classification)
    Training is parallelized on multiple processes.

    :param svmDir: directory where to store the SVM models
    :param background_sv: StatServer of super-vectors for background impostors. All
          super-vectors are used without selection
    :param enroll_sv: StatServer of super-vectors used for the target models
    :param num_thread: number of processes to launch in parallel
    """
    assert isinstance(background_sv, StatServer), 'Second parameter has to be a StatServer'
    assert isinstance(enroll_sv, StatServer), 'Third parameter has to be a StatServer'

    # The effective Kernel is initialized for the case of multi-session
    # by considering the maximum number of sessions per speaker.
    # For the SVM training, only a subpart of the kernel is used, accordingly
    # to the number of sessions of the current speaker
    K = background_sv.precompute_svm_kernel_stat1()
    unique_models, session_counts = numpy.unique(enroll_sv.modelset, return_counts=True)
    msn = int(session_counts.max())
    bsn = K.shape[0]

    # Split the list of unique model names
    listOfModels = numpy.array_split(unique_models, num_thread)

    # Process each sub-list of models in a separate process
    jobs = []
    for models in listOfModels:
        if len(models) == 0:
            # more processes requested than models: nothing to train
            continue
        p = multiprocessing.Process(target=svm_training_singleThread,
                                    args=(K, msn, bsn, svmDir, background_sv, models, enroll_sv))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()