Skip to content

Commit 4e8b0a1

Browse files
authored
Provide torchscript example in Python for silero-vad (#695)
1 parent ea0c0b5 commit 4e8b0a1

File tree

10 files changed

+490
-1
lines changed

10 files changed

+490
-1
lines changed

.github/workflows/build-doc.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,8 @@ jobs:
254254
ls -lh _static/sense-voice
255255
256256
- name: Release sherpa.pdf
257-
if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push'
257+
# if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push'
258+
if: false
258259
uses: svenstaro/upload-release-action@v2
259260
with:
260261
file_glob: true
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
name: export-silero-vad
2+
3+
on:
4+
push:
5+
branches:
6+
- export-silero-vad
7+
workflow_dispatch:
8+
9+
concurrency:
10+
group: export-silero-vad-${{ github.ref }}
11+
cancel-in-progress: true
12+
13+
jobs:
14+
export-silero-vad:
15+
if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
16+
name: export ${{ matrix.model }}
17+
runs-on: ${{ matrix.os }}
18+
strategy:
19+
fail-fast: false
20+
matrix:
21+
os: [macos-latest]
22+
python-version: ["3.10"]
23+
model: ['v4']
24+
25+
steps:
26+
- uses: actions/checkout@v4
27+
28+
- name: Setup Python ${{ matrix.python-version }}
29+
uses: actions/setup-python@v5
30+
with:
31+
python-version: ${{ matrix.python-version }}
32+
33+
# Python packages needed by the export and test scripts in scripts/silero-vad.
- name: Install dependencies
  shell: bash
  run: |
    pip install torch==1.13.0 torchaudio==0.13.0 soundfile librosa numpy==1.26.4
37+
38+
- name: Export ${{ matrix.model }}
39+
shell: bash
40+
run: |
41+
pushd scripts/silero-vad
42+
model=${{ matrix.model }}
43+
./run-$model.sh
44+
python3 ./export-$model.py
45+
ls -lh
46+
47+
48+
- name: Test ${{ matrix.model }}
49+
shell: bash
50+
run: |
51+
pushd scripts/silero-vad
52+
53+
model=${{ matrix.model }}
54+
python3 ./test-$model.py
55+
ls -lh
56+
57+
- name: Test ${{ matrix.model }} batch
58+
shell: bash
59+
run: |
60+
pushd scripts/silero-vad
61+
62+
model=${{ matrix.model }}
63+
python3 ./test-$model-batch.py
64+
ls -lh
65+
66+
- name: Collect results
67+
shell: bash
68+
run: |
69+
cp scripts/silero-vad/*.pt ./
70+
71+
- name: Release
72+
uses: svenstaro/upload-release-action@v2
73+
with:
74+
file_glob: true
75+
file: ./*.pt
76+
overwrite: true
77+
repo_name: k2-fsa/sherpa
78+
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }}
79+
tag: vad-models
80+
81+
# Copy the exported *.pt files into the vad/ folder of the k2-fsa/sherpa-models
# repository on Hugging Face and push them. Commit and push are best-effort
# (`|| true`) so that a run with nothing new to upload does not fail the job.
- name: Publish ${{ matrix.model }} to huggingface
  shell: bash
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  run: |
    model=${{ matrix.model }}

    # NOTE(review): the address below looks like a placeholder left by e-mail
    # obfuscation in the page this file was copied from; confirm the real
    # committer address before relying on it.
    git config --global user.email "[email protected]"
    git config --global user.name "Fangjun Kuang"

    export GIT_CLONE_PROTECTION_ACTIVE=false

    # Do not download existing LFS objects; we only add new files.
    export GIT_LFS_SKIP_SMUDGE=1

    rm -rf huggingface
    git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface

    mkdir -p ./huggingface/vad

    cp -av *.pt ./huggingface/vad

    cd huggingface

    git status
    ls -lh
    git lfs track "*.pt*"

    git add .
    # Use $model here: no variable named "src" is defined in this step.
    git commit -m "upload $model" || true
    git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true
111+

scripts/silero-vad/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.jit

scripts/silero-vad/export-v4.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
3+
4+
import torch
5+
6+
7+
def main():
    """Re-save the silero-vad v4 TorchScript model with version metadata.

    Loads ``./silero_vad_v4.jit`` and writes it to ``silero-vad-v4.pt``,
    passing a ``"version"`` entry through ``_extra_files``.
    """
    extra_files = {"version": "4"}
    model = torch.jit.load("./silero_vad_v4.jit")
    model.save("silero-vad-v4.pt", _extra_files=extra_files)
13+
14+
15+
if __name__ == "__main__":
16+
main()

scripts/silero-vad/export-v5.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
3+
4+
import torch
5+
6+
7+
class Wrapper(torch.nn.Module):
    """Give the v5 model the same ``audio_forward`` signature as v4.

    The wrapped module is kept as ``self.m`` and its ``sample_rates``
    attribute is mirrored on the wrapper.
    """

    def __init__(self, m):
        super().__init__()
        self.m = m
        self.sample_rates = m.sample_rates

    @torch.jit.export
    def audio_forward(self, x: torch.Tensor, sr: int, window_size: int = 512):
        """Forward ``x`` and ``sr`` to the wrapped model's ``audio_forward``.

        ``window_size`` exists only for interface parity with v4; it is
        accepted and then ignored.
        """
        return self.m.audio_forward(x, sr)
18+
19+
20+
def main():
    """Wrap, script and save the silero-vad v5 model with version metadata.

    Loads ``./silero_vad_v5.jit``, wraps it in ``Wrapper``, compiles the
    wrapper with ``torch.jit.script`` and writes ``silero-vad-v5.pt`` with a
    ``"version"`` entry passed through ``_extra_files``.
    """
    original = torch.jit.load("./silero_vad_v5.jit")
    scripted = torch.jit.script(Wrapper(original))
    scripted.save("silero-vad-v5.pt", _extra_files={"version": "5"})
29+
30+
31+
if __name__ == "__main__":
32+
main()

scripts/silero-vad/run-v4.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

set -ex

# Fetch the silero_vad v4 TorchScript model unless it is already present.
# It can also be downloaded from
#   https://github.com/snakers4/silero-vad/blob/v4.0/files/silero_vad.jit
# Note that silero_vad.jit has been renamed to silero_vad_v4.jit.
[ -f ./silero_vad_v4.jit ] || wget https://huggingface.co/csukuangfj/tmp-files/resolve/main/silero_vad_v4.jit

# Fetch the test wave files that are not available locally yet.
for wav in lei-jun-test.wav Obama.wav; do
  if [ ! -f ./$wav ]; then
    wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$wav
  fi
done

scripts/silero-vad/run-v5.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

set -ex

# Fetch the silero_vad v5 TorchScript model unless it is already present.
# It can also be downloaded from
#   https://github.com/snakers4/silero-vad/blob/v5.1.2/src/silero_vad/data/silero_vad.jit
# Note that silero_vad.jit has been renamed to silero_vad_v5.jit.
[ -f ./silero_vad_v5.jit ] || wget https://huggingface.co/csukuangfj/tmp-files/resolve/main/silero_vad_v5.jit

# Fetch the test wave files that are not available locally yet.
for wav in lei-jun-test.wav Obama.wav; do
  if [ ! -f ./$wav ]; then
    wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$wav
  fi
done

scripts/silero-vad/test-v4-batch.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/env python3
2+
3+
import torch
4+
import numpy as np
5+
import soundfile as sf
6+
import librosa
7+
8+
9+
def load_audio(filename: str) -> np.ndarray:
    """Read the first channel of an audio file as float32 samples at 16 kHz.

    Args:
      filename: Path of the audio file to read with ``soundfile``.

    Returns:
      A contiguous 1-D array holding the first channel. If the file's sample
      rate is not 16000, the samples are resampled to 16000 with ``librosa``.
    """
    audio, sr = sf.read(filename, always_2d=True, dtype="float32")

    # Keep only the first channel.
    mono = np.ascontiguousarray(audio[:, 0])

    if sr == 16000:
        return mono

    return librosa.resample(mono, orig_sr=sr, target_sr=16000)
26+
27+
28+
def frames_to_segments(frames, min_speech_frames, min_silence_frames):
    """Group active frame indexes into (first_frame, last_frame) segments.

    Args:
      frames: Ascending list of frame indexes whose score passed the threshold.
      min_speech_frames: Runs that do not span more than this many frames are
        dropped. It is also the largest gap bridged while growing a run; the
        original code used this value for that purpose and it is kept so the
        printed output is unchanged.
      min_silence_frames: Neighbouring segments separated by fewer frames than
        this are merged into one.

    Returns:
      A list of (first_frame, last_frame) tuples. It is empty when ``frames``
      is empty or when no run is long enough.
    """
    if not frames:
        # No frame passed the threshold, so there is nothing to group.
        return []

    runs = []
    start = last = frames[0]
    for k in frames[1:]:
        if k - last < min_speech_frames:
            # Still inside the current run.
            last = k
            continue
        if last - start > min_speech_frames:
            runs.append((start, last))
        start = last = k
    if last - start > min_speech_frames:
        runs.append((start, last))

    merged = []
    for seg in runs:
        if merged and seg[0] - merged[-1][1] < min_silence_frames:
            merged[-1] = (merged[-1][0], seg[1])
        else:
            merged.append(seg)

    return [seg for seg in merged if seg[1] - seg[0] > min_speech_frames]


@torch.inference_mode()
def main():
    """Run silero-vad v4 on a zero-padded batch of two files and print segments.

    Loads ``./silero-vad-v4.pt``, pads the two test waves to the same length,
    calls ``audio_forward`` once for the whole batch, thresholds the output
    and prints the resulting speech segments (in seconds) for each file.
    """
    m = torch.jit.load("./silero-vad-v4.pt")
    m.eval()

    filenames = ["./lei-jun-test.wav", "./Obama.wav"]

    samples1 = load_audio(filenames[0])
    samples2 = load_audio(filenames[1])
    print(samples1.shape)
    print(samples2.shape)

    samples = torch.nn.utils.rnn.pad_sequence(
        [torch.from_numpy(samples1), torch.from_numpy(samples2)],
        batch_first=True,
        padding_value=0,
    )
    print(samples.shape)

    sample_rate = 16000
    window_size = 512

    out = m.audio_forward(samples, torch.tensor([sample_rate]), window_size)
    # out: (batch_size, num_frames)
    assert out.shape[0] == samples.shape[0], out.shape
    print(out.shape)

    threshold = 0.5
    out = out > threshold

    # Both limits are 0.25 seconds expressed in frames of window_size samples.
    min_speech_duration = 0.25 * sample_rate / window_size
    min_silence_duration = 0.25 * sample_rate / window_size

    # Each row of indexes is (batch_index, frame_index) of an active frame.
    indexes = torch.nonzero(out, as_tuple=False)
    duration = [samples1.shape[0] / sample_rate, samples2.shape[0] / sample_rate]

    for i, filename in enumerate(filenames):
        w = indexes[indexes[:, 0] == i, 1].tolist()
        segments = frames_to_segments(w, min_speech_duration, min_silence_duration)

        print(f"----------{filename}----------")
        for first, last in segments:
            start = first * window_size / sample_rate
            end = last * window_size / sample_rate
            if start > duration[i] or end > duration[i]:
                # Past the real end of this file, i.e. inside the padding.
                break
            print("{:.3f} -- {:.3f}".format(start, end))
97+
98+
99+
if __name__ == "__main__":
100+
main()

0 commit comments

Comments
 (0)