feat: update A2T audio conversion (#389)

* Update audio conversion to decode the input into a NumPy ndarray so that raw audio data is fed to the transcription model
livepeer · Jan 10, 2025 · 84924f7 · 84924f7
1 parent 5237d5c
commit 84924f7
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 10 deletions.
diff --git a/runner/app/pipelines/audio_to_text.py b/runner/app/pipelines/audio_to_text.py
@@ -120,13 +120,11 @@ def __init__(self, model_id: str):
 
     def __call__(self, audio: UploadFile, duration: float, **kwargs) -> List[File]:
         audioBytes = audio.file.read()
-
-        # Convert M4A/MP4 files for pipeline compatibility.
-        if (
-            os.path.splitext(audio.filename)[1].lower().lstrip(".")
-            in INCOMPATIBLE_EXTENSIONS
-        ):
-            audioBytes = self._audio_converter.convert(audioBytes, "mp3")
+        # Convert the audio to a NumPy array to match the pre-processing done in transformers.
+        # The pipeline accepts an np.ndarray as-is and does not convert it again; a string file path or bytes would be converted to an np.ndarray inside the pipeline.
+        #https://github.com/huggingface/transformers/blob/47c29ccfaf56947d845971a439cbe75a764b63d7/src/transformers/pipelines/automatic_speech_recognition.py#L353
+        #https://github.com/huggingface/transformers/blob/47c29ccfaf56947d845971a439cbe75a764b63d7/src/transformers/pipelines/audio_utils.py#L10
+        audio_array = self._audio_converter.to_ndarray(audioBytes)
 
         # Adjust batch size and chunk length based on timestamps and duration.
         # NOTE: Done to prevent CUDA OOM errors for large audio files.
@@ -150,7 +148,7 @@ def __call__(self, audio: UploadFile, duration: float, **kwargs) -> List[File]:
         )
 
         try:
-            outputs = self.tm(audioBytes, **kwargs)
+            outputs = self.tm(audio_array, **kwargs)
             outputs.setdefault("chunks", [])
         except torch.cuda.OutOfMemoryError as e:
             raise e

diff --git a/runner/app/pipelines/utils/audio.py b/runner/app/pipelines/utils/audio.py
@@ -5,7 +5,7 @@
 from io import BytesIO
 
 import av
-
+import numpy as np
 
 class AudioConversionError(Exception):
     """Raised when an audio file cannot be converted."""
@@ -19,7 +19,46 @@ class AudioConverter:
     """Converts audio files to different formats."""
 
     @staticmethod
-    def convert(input_bytes: bytes, output_extension: str, output_codec=None) -> bytes:
+    def to_ndarray(input_bytes: bytes) -> np.ndarray:
+        """Decodes the audio in a media file into a mono 16 kHz float32 NumPy array.
+
+        Inspired by https://github.com/SYSTRAN/faster-whisper/blob/d889345e071de21a83bdae60ba4b07110cfd0696/faster_whisper/audio.py
+
+        Args:
+            input_bytes: The audio file as bytes to convert.
+
+        Returns:
+            The decoded samples as a 1-D float32 NumPy array scaled by 1 / 32768.
+
+        Raises:
+            AudioConversionError: If the input cannot be decoded or yields no audio.
+        """
+        resampler = av.audio.resampler.AudioResampler(
+            format="s16",
+            layout="mono",
+            rate=16000,
+        )
+        chunks = []
+        try:
+            # `with` closes the container on every path and never touches an
+            # unbound name when av.open() itself fails.
+            with av.open(BytesIO(input_bytes), mode="r") as input_container:
+                for stream in input_container.streams.audio:
+                    for frame in input_container.decode(stream):
+                        # resample() returns a list holding zero or more frames.
+                        chunks.extend(f.to_ndarray() for f in resampler.resample(frame))
+            # Passing None flushes the samples still buffered in the resampler.
+            chunks.extend(f.to_ndarray() for f in resampler.resample(None))
+        except Exception as e:
+            raise AudioConversionError(
+                f"Error during audio conversion to numpy array: {e}"
+            ) from e
+        if not chunks:
+            raise AudioConversionError("No audio frames could be decoded from input.")
+        return np.concatenate([c.reshape(-1) for c in chunks]).astype(np.float32) / 32768.0
+
+    @staticmethod
+    def convert(input_bytes: bytes, output_extension: str, output_codec: str) -> bytes:
         """Converts an audio file to a different format.
 
         Args: