#!/bin/bash
# Copyright 2017 David Snyder
# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2017 Johns Hopkins University (Author: Daniel Povey)
#
# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b).
#
# Apache 2.0.
# Adapted from the following:
# This script trains a DNN similar to the recipe described in
# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf
. ./cmd.sh
. ./path.sh
set -e
stage=1
train_stage=-1
lrate=001   # digits appended to "0." / "0.0" in stage 8: initial lr 0.001, final lr 0.0001
epochs=1
shrink=10
data=data/train_clean_360
nnet_dir=exp/xvect
egs_dir=$nnet_dir/egs
. ./utils/parse_options.sh
# utt2spk has one "<utt-id> <spk-id>" line per utterance, so this counts the distinct speakers.
num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh.
# The argument --num-repeats is related to the number of times a speaker
# repeats per archive. If it seems like you're getting too many archives
# (e.g., more than 200) try increasing the --frames-per-iter option. The
# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the
# minimum and maximum length (in terms of number of frames) of the features
# in the examples.
#
# To make sense of the egs script, it may be necessary to put an "exit 1"
# command immediately after stage 3. Then, inspect
# exp/<your-dir>/egs/temp/ranges.* . The ranges files specify the examples that
# will be created, and which archives they will be stored in. Each line of
# ranges.* has the following form:
# <utt-id> <local-ark-indx> <global-ark-indx> <start-frame> <end-frame> <spk-id>
# For example:
# 100304-f-sre2006-kacg-A 1 2 4079 881 23
# If you're satisfied with the number of archives (e.g., 50-150 archives is
# reasonable) and with the number of examples per speaker (e.g., 1000-5000
# is reasonable) then you can let the script continue to the later stages.
# Otherwise, try increasing or decreasing the --num-repeats option. You might
# need to fiddle with --frames-per-iter. Increasing this value decreases the
# number of archives and increases the number of examples per archive.
# Decreasing this value increases the number of archives, while decreasing the
# number of examples per archive.
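#
# As an optional, purely illustrative sanity check (not part of the recipe),
# the archive count and the per-speaker example counts can be read off the
# ranges files by hand, e.g.:
#   cat $egs_dir/temp/ranges.* | awk '{print $3}' | sort -n | uniq | wc -l      # number of archives
#   cat $egs_dir/temp/ranges.* | awk '{print $6}' | sort | uniq -c | sort -nr   # examples per speaker id
# (Column 3 is the global archive index and column 6 the speaker id, per the
# format described above.)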
if [ $stage -le 6 ]; then
echo "$0: Getting neural network training egs";
sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \
--nj 8 \
--stage 0 \
--frames-per-iter 100000000 \
--frames-per-iter-diagnostic 100000 \
--min-frames-per-chunk 200 \
--max-frames-per-chunk 400 \
--num-diagnostic-archives 3 \
--num-repeats 50 \
"$data" $egs_dir || exit 1
fi
if [ $stage -le 7 ]; then
echo "$0: creating neural net configs using the xconfig parser";
num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}')
feat_dim=$(cat $egs_dir/info/feat_dim)
# This chunk-size corresponds to the maximum number of frames the
# stats layer is able to pool over. In this script, it corresponds
# to 100 seconds. If the input recording is greater than 100 seconds,
# we will compute multiple xvectors from the same recording and average
# to produce the final xvector.
max_chunk_size=10000
# The smallest number of frames we're comfortable computing an xvector from.
# Note that the hard minimum is given by the left and right context of the
# frame-level layers.
min_chunk_size=25
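# As a quick check on these numbers (assuming the usual 10 ms frame shift):
# 10000 frames is about 100 s and 25 frames about 0.25 s. The frame-level
# layers below splice +/-2, +/-2 and +/-3 frames, i.e. a total context of 15
# frames, so min_chunk_size=25 stays safely above that hard minimum.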
mkdir -p $nnet_dir/configs
cat <<EOF > $nnet_dir/configs/network.xconfig
# Please note that it is important to have an input layer with name=input.
# The frame-level layers
input dim=${feat_dim} name=input
relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
relu-batchnorm-layer name=tdnn4 dim=512
relu-batchnorm-layer name=tdnn5 dim=1500
# The stats pooling layer. Layers after this are segment-level.
# In the config below, the first and last arguments (0 and ${max_chunk_size})
# mean that we pool over an input segment starting at frame 0
# and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1)
# mean that no subsampling is performed.
stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})
# This is where we usually extract the embedding (aka xvector) from.
relu-batchnorm-layer name=tdnn6 dim=512 input=stats
# This is another layer from which the embedding could be extracted,
# but usually the previous one works better.
relu-batchnorm-layer name=tdnn7 dim=512
output-layer name=output include-log-softmax=true dim=${num_targets}
EOF
steps/nnet3/xconfig_to_configs.py \
--xconfig-file $nnet_dir/configs/network.xconfig \
--config-dir $nnet_dir/configs
cp $nnet_dir/configs/final.config $nnet_dir/nnet.config
# These three files will be used by sid/nnet3/xvector/extract_xvectors.sh
echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config
echo "$max_chunk_size" > $nnet_dir/max_chunk_size
echo "$min_chunk_size" > $nnet_dir/min_chunk_size
fi
# --trainer.input-model exp/models/asv_eval/xvect_01709_1/final.raw \
# Dropout schedule (as in the run_xvector_1a.sh recipe this script was copied
# from): off for the first 20% of training, up to 0.1 at the halfway point,
# then back down to 0.
dropout_schedule='0,0@0.20,0.1@0.50,0'
srand=123
if [ $stage -le 8 ]; then
steps/nnet3/train_raw_dnn.py --stage=$train_stage \
--cmd="$train_cmd" \
--trainer.input-model exp/models/asv_eval_b1_anon/xvect_01709_1/final.raw \
--trainer.optimization.proportional-shrink $shrink \
--trainer.optimization.momentum=0.5 \
--trainer.optimization.num-jobs-initial=2 \
--trainer.optimization.num-jobs-final=2 \
--trainer.optimization.initial-effective-lrate=0.$lrate \
--trainer.optimization.final-effective-lrate=0.0$lrate \
--trainer.optimization.minibatch-size=64 \
--trainer.srand=$srand \
--trainer.max-param-change=2 \
--trainer.num-epochs=$epochs \
--trainer.dropout-schedule="$dropout_schedule" \
--trainer.shuffle-buffer-size=1000 \
--egs.frames-per-eg=1 \
--egs.dir="$egs_dir" \
--cleanup.remove-egs false \
--cleanup.preserve-model-interval=5 \
--use-gpu=true \
--dir=$nnet_dir || exit 1
fi
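# On success, the trained network is written to $nnet_dir/final.raw; together
# with extract.config, max_chunk_size and min_chunk_size created above, that
# is what sid/nnet3/xvector/extract_xvectors.sh reads at extraction time.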
exit 0