forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute-cmvn-stats.py
executable file
·169 lines (140 loc) · 6.23 KB
/
compute-cmvn-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python
import argparse
import logging
import kaldiio
import numpy as np
from espnet.transform.transformation import Transformation
from espnet.utils.cli_readers import file_reader_helper
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_utils import is_scipy_wav_style
from espnet.utils.cli_writers import file_writer_helper
def get_parser():
    """Build the command-line parser for the CMVN statistics tool.

    Options: ``--spk2utt``, ``--verbose``/``-V``, ``--in-filetype``,
    ``--out-filetype`` and ``--preprocess-conf``.
    Positionals: ``rspecifier`` and ``wspecifier_or_wxfilename``.

    Returns:
        argparse.ArgumentParser: The configured parser; defaults are shown
        in ``--help`` via ``ArgumentDefaultsHelpFormatter``.
    """
    parser = argparse.ArgumentParser(
        # Adjacent string literals are concatenated verbatim, so every
        # fragment must carry its own separating space/punctuation.
        description='Compute cepstral mean and '
                    'variance normalization statistics. '
                    'If wspecifier provided: per-utterance by default, '
                    'or per-speaker if '
                    'spk2utt option provided; if wxfilename: global',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:utt2spk")')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier for feats. e.g. ark:some.ark')
    parser.add_argument('wspecifier_or_wxfilename', type=str,
                        help='Write specifier. e.g. ark:some.ark')
    return parser
def main():
    """Compute CMVN statistics for the features given on the command line.

    The mode is selected from the second positional argument:

    * it contains ``':'`` (treated as a wspecifier): statistics are kept
      per utterance, or per speaker when ``--spk2utt`` is given, and
      written through ``file_writer_helper``;
    * otherwise (treated as a plain filename): a single global statistic
      is accumulated and saved with ``np.save`` (``npy``) or
      ``kaldiio.save_mat`` (``mat``).

    Each statistic is a float64 array of shape ``(2, D + 1, ...)``:
    row 0 holds the feature sums followed by the frame count, row 1 holds
    the sums of squares followed by ``0``.

    Raises:
        RuntimeError: If no utterance could be read from the rspecifier, or
            if the output file type is not supported in global mode.
        KeyError: In speaker mode, if an utterance is not listed in the
            spk2utt file.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # A ':' in the output argument is what distinguishes a wspecifier
    # (e.g. "ark:some.ark") from a plain output filename.
    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info('Performing as speaker CMVN mode')
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    fields = line.split()
                    # Blank lines and speakers without any utterance carry
                    # no mapping; skip them instead of failing to unpack.
                    if len(fields) < 2:
                        continue
                    spk = fields[0]
                    for utt in fields[1:]:
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

        if args.out_filetype == 'npy':
            logging.warning('--out-filetype npy is allowed only for '
                            'Global CMVN mode, changing to hdf5')
            args.out_filetype = 'hdf5'

    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        # Every utterance maps to the same key (None), so all frames are
        # pooled into one statistic.
        def utt2spk(x):
            return None

        if args.out_filetype == 'hdf5':
            logging.warning('--out-filetype hdf5 is not allowed for '
                            'Global CMVN mode, changing to npy')
            args.out_filetype = 'npy'

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    # idx stays 0 when the reader yields nothing.
    idx = 0
    for idx, (utt, matrix) in enumerate(file_reader_helper(
            args.rspecifier, args.in_filetype), 1):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            _rate, matrix = matrix

        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        # Reduce in float64 as well: summing/squaring in the input dtype
        # loses precision for float32 and can overflow for integer PCM.
        sum_feats[spk] += matrix.sum(axis=0, dtype=np.float64)
        square_sum_feats[spk] += np.square(
            matrix, dtype=np.float64).sum(axis=0)
    logging.info('Processed {} utterances'.format(idx))
    # Explicit check rather than `assert`, which is removed under -O.
    if idx == 0:
        raise RuntimeError(
            'No utterances were read from {}'.format(args.rspecifier))

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot along the feature axis holds the frame count.
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(args.wspecifier_or_wxfilename,
                                filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == 'npy':
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == 'mat':
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError('Not supporting: --out-filetype {}'
                               .format(args.out_filetype))


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()