Provide torchscript example in Python for silero-vad (#695)

csukuangfj · web-flow · commit 4e8b0a15ce69 · 2025-01-09T13:42:55.000+08:00
diff --git a/.github/workflows/build-doc.yml b/.github/workflows/build-doc.yml
@@ -254,7 +254,8 @@ jobs:
           ls -lh _static/sense-voice
 
       - name: Release sherpa.pdf
-        if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push'
+        # if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push'
+        if: false
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
diff --git a/.github/workflows/export-silero-vad.yaml b/.github/workflows/export-silero-vad.yaml
@@ -0,0 +1,111 @@
+name: export-silero-vad
+
+on:
+  push:
+    branches:
+      - export-silero-vad
+  workflow_dispatch:
+
+concurrency:
+  group: export-silero-vad-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-silero-vad:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export ${{ matrix.model }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+        model: ['v4']
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          pip install torch==1.13.0 torchaudio==0.13.0 soundfile librosa numpy==1.26.4
+
+      - name: Export ${{ matrix.model }}
+        shell: bash
+        run: |
+          pushd scripts/silero-vad
+          model=${{ matrix.model }}
+          ./run-$model.sh
+          python3 ./export-$model.py
+          ls -lh
+
+
+      - name: Test ${{ matrix.model }}
+        shell: bash
+        run: |
+          pushd scripts/silero-vad
+
+          model=${{ matrix.model }}
+          python3 ./test-$model.py
+          ls -lh
+
+      - name: Test ${{ matrix.model }} batch
+        shell: bash
+        run: |
+          pushd scripts/silero-vad
+
+          model=${{ matrix.model }}
+          python3 ./test-$model-batch.py
+          ls -lh
+
+      - name: Collect results
+        shell: bash
+        run: |
+          cp scripts/silero-vad/*.pt ./
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.pt
+          overwrite: true
+          repo_name: k2-fsa/sherpa
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_TOKEN }}
+          tag: vad-models
+
+      - name: Publish ${{ matrix.model }} to huggingface
+        shell: bash
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          model=${{ matrix.model }}
+
+          git config --global user.email "csukuangfj@gmail.com"
+          git config --global user.name "Fangjun Kuang"
+
+          export GIT_CLONE_PROTECTION_ACTIVE=false
+          # Fetch only LFS pointer files; existing models are not needed here
+          export GIT_LFS_SKIP_SMUDGE=1
+
+          rm -rf huggingface
+          git clone https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models huggingface
+
+          mkdir -p ./huggingface/vad
+          # The *.pt files were copied to the workspace root by "Collect results"
+          cp -av *.pt ./huggingface/vad
+
+          cd huggingface
+
+          git status
+          ls -lh
+          git lfs track "*.pt*"
+
+          git add .
+          git commit -m "upload silero-vad $model" || true
+          git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-models main || true
+
diff --git a/scripts/silero-vad/.gitignore b/scripts/silero-vad/.gitignore
@@ -0,0 +1 @@
+*.jit
diff --git a/scripts/silero-vad/export-v4.py b/scripts/silero-vad/export-v4.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import torch
+
+
+def main():
+    """Re-save silero-vad v4 as silero-vad-v4.pt with a version tag.
+
+    The input ./silero_vad_v4.jit is loaded with torch.jit.load and written
+    back together with an extra file named "version" whose content is "4".
+    """
+    extra_files = {"version": "4"}
+
+    model = torch.jit.load("./silero_vad_v4.jit")
+    model.save("silero-vad-v4.pt", _extra_files=extra_files)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/silero-vad/export-v5.py b/scripts/silero-vad/export-v5.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import torch
+
+
+class Wrapper(torch.nn.Module):
+    """Expose silero-vad v5 through the audio_forward signature used for v4."""
+
+    def __init__(self, m):
+        super().__init__()
+        self.sample_rates = m.sample_rates
+        self.m = m
+
+    @torch.jit.export
+    def audio_forward(self, x: torch.Tensor, sr: int, window_size: int = 512):
+        # window_size exists only for interface compatibility with v4;
+        # it is not forwarded to the wrapped v5 model.
+        return self.m.audio_forward(x, sr)
+
+
+def main():
+    """Wrap ./silero_vad_v5.jit and save it as silero-vad-v5.pt."""
+    extra_files = {"version": "5"}
+
+    original = torch.jit.load("./silero_vad_v5.jit")
+    scripted = torch.jit.script(Wrapper(original))
+    scripted.save("silero-vad-v5.pt", _extra_files=extra_files)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/silero-vad/run-v4.sh b/scripts/silero-vad/run-v4.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+set -ex  # exit on the first failing command and echo every command
+
+if [ ! -f ./silero_vad_v4.jit ]; then  # download only when missing
+  # It is silero_vad v4. You can also download it from
+  # https://github.com/snakers4/silero-vad/blob/v4.0/files/silero_vad.jit
+  #
+  # Note that we have renamed silero_vad.jit to silero_vad_v4.jit
+  #
+  wget https://huggingface.co/csukuangfj/tmp-files/resolve/main/silero_vad_v4.jit
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then  # test wave read by test-v4-batch.py
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./Obama.wav ]; then  # test wave read by test-v4-batch.py
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+fi
diff --git a/scripts/silero-vad/run-v5.sh b/scripts/silero-vad/run-v5.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+set -ex  # exit on the first failing command and echo every command
+
+if [ ! -f ./silero_vad_v5.jit ]; then  # download only when missing
+  # It is silero_vad v5. You can also download it from
+  # https://github.com/snakers4/silero-vad/blob/v5.1.2/src/silero_vad/data/silero_vad.jit
+  #
+  # Note that we have renamed silero_vad.jit to silero_vad_v5.jit
+  #
+  wget https://huggingface.co/csukuangfj/tmp-files/resolve/main/silero_vad_v5.jit
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then  # test wave, the same file run-v4.sh fetches
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./Obama.wav ]; then  # test wave, the same file run-v4.sh fetches
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+fi
diff --git a/scripts/silero-vad/test-v4-batch.py b/scripts/silero-vad/test-v4-batch.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+
+import torch
+import numpy as np
+import soundfile as sf
+import librosa
+
+
+def load_audio(filename: str) -> np.ndarray:
+    """Return the first channel of ``filename`` as float32 samples at 16 kHz.
+
+    A file whose sample rate is not 16000 is resampled with librosa.
+    """
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    samples = np.ascontiguousarray(data)
+
+    if sample_rate != 16000:
+        samples = librosa.resample(
+            samples,
+            orig_sr=sample_rate,
+            target_sr=16000,
+        )
+
+    return samples
+
+
+def find_speech_segments(frames, min_speech_duration, min_silence_duration):
+    """Group speech frame indexes into (start_frame, end_frame) segments.
+
+    Args:
+      frames:
+        Indexes of the frames classified as speech, in ascending order.
+        It may be empty.
+      min_speech_duration:
+        Minimum length of a segment, in frames. It is also used as the
+        largest gap between two speech frames of the same segment.
+      min_silence_duration:
+        Segments separated by fewer frames than this are merged.
+
+    Returns:
+      A list of (start_frame, end_frame) tuples. It is empty if there is
+      no segment that is long enough.
+    """
+    if not frames:
+        # No frame is above the threshold, e.g., for a silent wave
+        return []
+
+    segments = []
+    start = last = frames[0]
+    for k in frames[1:]:
+        if k - last < min_speech_duration:
+            last = k
+            continue
+
+        if last - start > min_speech_duration:
+            segments.append((start, last))
+        start = last = k
+
+    if last - start > min_speech_duration:
+        segments.append((start, last))
+
+    # Merge neighbouring segments separated by a short silence.
+    # segments may be empty here, so do not index it directly.
+    merged = []
+    for s in segments:
+        if merged and s[0] - merged[-1][1] < min_silence_duration:
+            merged[-1] = (merged[-1][0], s[1])
+        else:
+            merged.append(s)
+
+    return [s for s in merged if s[1] - s[0] > min_speech_duration]
+
+
+@torch.inference_mode()
+def main():
+    """Run silero-vad v4 on a batch of two waves and print speech segments."""
+    m = torch.jit.load("./silero-vad-v4.pt")
+    m.eval()
+
+    filenames = ["./lei-jun-test.wav", "./Obama.wav"]
+
+    samples1 = load_audio(filenames[0])
+    samples2 = load_audio(filenames[1])
+    print(samples1.shape)
+    print(samples2.shape)
+
+    # Zero-pad the shorter wave so that both waves fit into one batch
+    samples = torch.nn.utils.rnn.pad_sequence(
+        [torch.from_numpy(samples1), torch.from_numpy(samples2)],
+        batch_first=True,
+        padding_value=0,
+    )
+    print(samples.shape)
+
+    sample_rate = 16000
+
+    window_size = 512
+    out = m.audio_forward(samples, torch.tensor([sample_rate]), window_size)
+    # out: (batch_size, num_frames)
+    assert out.shape[0] == samples.shape[0], out.shape
+    print(out.shape)
+    threshold = 0.5
+    out = out > threshold
+    # 0.25 seconds, expressed in frames of window_size samples
+    min_speech_duration = 0.25 * sample_rate / window_size
+    min_silence_duration = 0.25 * sample_rate / window_size
+
+    indexes = torch.nonzero(out, as_tuple=False)
+    duration = [samples1.shape[0] / sample_rate, samples2.shape[0] / sample_rate]
+
+    for i in range(samples.shape[0]):
+        w = indexes[indexes[:, 0] == i, 1].tolist()
+        segments = find_speech_segments(
+            w, min_speech_duration, min_silence_duration
+        )
+
+        print(f"----------{filenames[i]}----------")
+        for f in segments:
+            start = f[0] * window_size / sample_rate
+            end = f[1] * window_size / sample_rate
+            # Segments past the real end of the wave lie in the zero padding
+            if start > duration[i] or end > duration[i]:
+                break
+            print("{:.3f} -- {:.3f}".format(start, end))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/silero-vad/test-v4.py b/scripts/silero-vad/test-v4.py
diff --git a/scripts/silero-vad/test-v5.py b/scripts/silero-vad/test-v5.py