Merge branch 'SYSTRAN:master' into feature_extractor.py
BBC-Esq authored Oct 25, 2024
2 parents 9c5975c + b2da055 commit a31b95e
Showing 15 changed files with 364 additions and 519 deletions.
4 changes: 2 additions & 2 deletions MANIFEST.in
@@ -1,4 +1,4 @@
include faster_whisper/assets/silero_vad.onnx
include faster_whisper/assets/silero_encoder_v5.onnx
include faster_whisper/assets/silero_decoder_v5.onnx
include requirements.txt
include requirements.conversion.txt
include faster_whisper/assets/pyannote_vad_model.bin
3 changes: 0 additions & 3 deletions README.md
@@ -178,9 +178,6 @@ language_info = model.detect_language_multi_segment("audio.mp3")

### Batched faster-whisper


The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-2-Clause license, and integrates its VAD model into this library. We modified this implementation and replaced the feature extraction with a faster torch-based implementation. The batched version improves speed by up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.

The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.

```python
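# The original snippet is collapsed in this diff view; the lines below are an
# illustrative sketch of batched inference, mirroring the API used by the
# benchmark script added in this commit. Model size, file name, compute type,
# and batch size are example values.
from faster_whisper import BatchedInferencePipeline, WhisperModel

# Load a regular model, then wrap it in the batched pipeline.
model = WhisperModel("large-v3", device="cuda", compute_type="float16")
batched_model = BatchedInferencePipeline(model)

# Transcribe semantically meaningful chunks as batches.
segments, info = batched_model.transcribe("audio.mp3", batch_size=16)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```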
83 changes: 83 additions & 0 deletions benchmark/evaluate_yt_commons.py
@@ -0,0 +1,83 @@
import argparse
import json
import os

from io import BytesIO

from datasets import load_dataset
from evaluate import load
from pytubefix import YouTube
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio


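# Fetch the highest-bitrate audio-only stream for a YouTube link into memory
# and decode it to a waveform for transcription.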
def url_to_audio(row):
buffer = BytesIO()
yt = YouTube(row["link"])
video = (
yt.streams.filter(only_audio=True, mime_type="audio/mp4")
.order_by("bitrate")
.desc()
.first()
)
video.stream_to_buffer(buffer)
buffer.seek(0)
row["audio"] = decode_audio(buffer)
return row


parser = argparse.ArgumentParser(description="WER benchmark")
parser.add_argument(
"--audio_numb",
type=int,
default=None,
help="Specify the number of validation audio files in the dataset."
" Set to None to retrieve all audio files.",
)
args = parser.parse_args()

# define the evaluation metric
wer_metric = load("wer")

with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
normalizer = EnglishTextNormalizer(json.load(f))

dataset = load_dataset("mobiuslabsgmbh/youtube-commons-asr-eval", streaming=True).map(
url_to_audio
)
dataset = iter(
DataLoader(dataset["test"], batch_size=1, prefetch_factor=4, num_workers=2)
)

model = WhisperModel("large-v3", device="cuda")
pipeline = BatchedInferencePipeline(model, device="cuda")


all_transcriptions = []
all_references = []
# iterate over the dataset and run inference
for i, row in tqdm(enumerate(dataset), desc="Evaluating..."):
result, info = pipeline.transcribe(
row["audio"][0],
batch_size=8,
word_timestamps=False,
without_timestamps=True,
)

all_transcriptions.append("".join(segment.text for segment in result))
all_references.append(row["text"][0])
if args.audio_numb and i == (args.audio_numb - 1):
break

# normalize predictions and references
all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions]
all_references = [normalizer(reference) for reference in all_references]

# compute the WER metric
wer = 100 * wer_metric.compute(
predictions=all_transcriptions, references=all_references
)
print("WER: %.3f" % wer)
1 change: 1 addition & 0 deletions benchmark/requirements.benchmark.txt
@@ -4,3 +4,4 @@ evaluate
datasets
memory_profiler
py3nvml
pytubefix
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
WORKDIR /root
RUN apt-get update -y && apt-get install -y python3-pip
COPY infer.py jfk.flac ./
Binary file removed faster_whisper/assets/pyannote_vad_model.bin
Binary file added faster_whisper/assets/silero_decoder_v5.onnx
Binary file added faster_whisper/assets/silero_encoder_v5.onnx
Binary file removed faster_whisper/assets/silero_vad.onnx
92 changes: 83 additions & 9 deletions faster_whisper/audio.py
@@ -1,7 +1,20 @@
"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV
The advantage of PyAV is that it bundles the FFmpeg libraries, so there are no additional
system dependencies. FFmpeg does not need to be installed on the system.
However, the API is quite low-level so we need to manipulate audio frames directly.
"""

import gc
import io
import itertools

from typing import BinaryIO, Union

import av
import numpy as np
import torch
import torchaudio


def decode_audio(
@@ -17,22 +30,83 @@ def decode_audio(
split_stereo: Return separate left and right channels.
Returns:
A float32 Torch Tensor.
A float32 Numpy array.
If `split_stereo` is enabled, the function returns a 2-tuple with the
separated left and right channels.
"""
resampler = av.audio.resampler.AudioResampler(
format="s16",
layout="mono" if not split_stereo else "stereo",
rate=sampling_rate,
)

waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T
raw_buffer = io.BytesIO()
dtype = None

with av.open(input_file, mode="r", metadata_errors="ignore") as container:
frames = container.decode(audio=0)
frames = _ignore_invalid_frames(frames)
frames = _group_frames(frames, 500000)
frames = _resample_frames(frames, resampler)

for frame in frames:
array = frame.to_ndarray()
dtype = array.dtype
raw_buffer.write(array)

# It appears that some objects related to the resampler are not freed
# unless the garbage collector is manually run.
# https://github.com/SYSTRAN/faster-whisper/issues/390
# note that this slows down loading the audio a little bit
# if that is a concern, please use ffmpeg directly as in here:
# https://github.com/openai/whisper/blob/25639fc/whisper/audio.py#L25-L62
del resampler
gc.collect()

audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)

# Convert s16 back to f32.
audio = audio.astype(np.float32) / 32768.0

if audio_sf != sampling_rate:
waveform = torchaudio.functional.resample(
waveform, orig_freq=audio_sf, new_freq=sampling_rate
)
if split_stereo:
return waveform[0], waveform[1]
left_channel = audio[0::2]
right_channel = audio[1::2]
return torch.from_numpy(left_channel), torch.from_numpy(right_channel)

return torch.from_numpy(audio)


def _ignore_invalid_frames(frames):
iterator = iter(frames)

while True:
try:
yield next(iterator)
except StopIteration:
break
except av.error.InvalidDataError:
continue


def _group_frames(frames, num_samples=None):
fifo = av.audio.fifo.AudioFifo()

for frame in frames:
frame.pts = None # Ignore timestamp check.
fifo.write(frame)

if num_samples is not None and fifo.samples >= num_samples:
yield fifo.read()

if fifo.samples > 0:
yield fifo.read()


return waveform.mean(0)
def _resample_frames(frames, resampler):
# Add None to flush the resampler.
for frame in itertools.chain(frames, [None]):
yield from resampler.resample(frame)


def pad_or_trim(array, length: int, *, axis: int = -1):