Skip to content

Commit

Permalink
feat: update A2T audio conversion (#389)
Browse files Browse the repository at this point in the history
* update audio conversion to convert to numpy ndarray to feed raw audio data to transcription model
  • Loading branch information
ad-astra-video authored Jan 10, 2025
1 parent 5237d5c commit 84924f7
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 10 deletions.
14 changes: 6 additions & 8 deletions runner/app/pipelines/audio_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,11 @@ def __init__(self, model_id: str):

def __call__(self, audio: UploadFile, duration: float, **kwargs) -> List[File]:
audioBytes = audio.file.read()

# Convert M4A/MP4 files for pipeline compatibility.
if (
os.path.splitext(audio.filename)[1].lower().lstrip(".")
in INCOMPATIBLE_EXTENSIONS
):
audioBytes = self._audio_converter.convert(audioBytes, "mp3")
# Re-encode the audio to match the pre-processing done in transformers.
# The pipeline accepts an np.ndarray and does not convert it again, whereas a string file path or bytes is converted to an np.ndarray inside the pipeline.
#https://github.com/huggingface/transformers/blob/47c29ccfaf56947d845971a439cbe75a764b63d7/src/transformers/pipelines/automatic_speech_recognition.py#L353
#https://github.com/huggingface/transformers/blob/47c29ccfaf56947d845971a439cbe75a764b63d7/src/transformers/pipelines/audio_utils.py#L10
audio_array = self._audio_converter.to_ndarray(audioBytes)

# Adjust batch size and chunk length based on timestamps and duration.
# NOTE: Done to prevent CUDA OOM errors for large audio files.
Expand All @@ -150,7 +148,7 @@ def __call__(self, audio: UploadFile, duration: float, **kwargs) -> List[File]:
)

try:
outputs = self.tm(audioBytes, **kwargs)
outputs = self.tm(audio_array, **kwargs)
outputs.setdefault("chunks", [])
except torch.cuda.OutOfMemoryError as e:
raise e
Expand Down
43 changes: 41 additions & 2 deletions runner/app/pipelines/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from io import BytesIO

import av

import numpy as np

class AudioConversionError(Exception):
"""Raised when an audio file cannot be converted."""
Expand All @@ -19,7 +19,46 @@ class AudioConverter:
"""Converts audio files to different formats."""

@staticmethod
def convert(input_bytes: bytes, output_extension: str, output_codec=None) -> bytes:
def to_ndarray(input_bytes: bytes) -> np.ndarray:
    """Decode the audio of a media file into a NumPy array.

    The audio is resampled to signed 16-bit mono at 16 kHz and then scaled
    to ``float32`` by dividing by 32768.0.

    Inspired by
    https://github.com/SYSTRAN/faster-whisper/blob/d889345e071de21a83bdae60ba4b07110cfd0696/faster_whisper/audio.py

    Args:
        input_bytes: The audio file as bytes to convert.

    Returns:
        The decoded audio as a one-dimensional ``float32`` NumPy array.

    Raises:
        AudioConversionError: If the input cannot be opened or decoded, or
            if it yields no audio samples.
    """
    pcm_chunks = []

    def _collect(resampled_frames):
        # ``resample`` returns a list that may hold zero, one or several
        # frames; keep all of them instead of only the first.
        for resampled in resampled_frames:
            if resampled is not None:
                pcm_chunks.append(resampled.to_ndarray().reshape(-1))

    try:
        # The context manager closes the container even when decoding fails,
        # and nothing is closed when ``av.open`` itself raises (previously the
        # ``finally`` clause hit an unbound name and hid the real error).
        with av.open(BytesIO(input_bytes), mode="r") as input_container:
            for stream in input_container.streams.audio:
                # One resampler per stream: a resampler is bound to the format
                # of the first frame it sees and cannot be reused after flush.
                resampler = av.audio.resampler.AudioResampler(
                    format="s16",
                    layout="mono",
                    rate=16000,
                )
                for frame in input_container.decode(stream):
                    _collect(resampler.resample(frame))
                # Passing None flushes samples still buffered in the resampler.
                _collect(resampler.resample(None))
    except Exception as e:
        raise AudioConversionError(
            f"Error during audio conversion to numpy array: {e}"
        ) from e

    if not pcm_chunks:
        raise AudioConversionError(
            "Error during audio conversion to numpy array: "
            "no audio frames could be decoded"
        )

    # Convert s16 samples to float32 in the range [-1.0, 1.0).
    return np.concatenate(pcm_chunks).astype(np.float32) / 32768.0

@staticmethod
def convert(input_bytes: bytes, output_extension: str, output_codec: str) -> bytes:
"""Converts an audio file to a different format.
Args:
Expand Down

0 comments on commit 84924f7

Please sign in to comment.