skip transcription if no speech is found

SYSTRAN · Aug 26, 2024 · 64852b5
1 parent 54a5ed2
commit 64852b5
Showing 1 changed file with 13 additions and 9 deletions.
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
@@ -385,9 +385,9 @@ def transcribe(
             all_language_probs,
         ) = self.get_language_and_tokenizer(audio, task, language)
 
-        duration_after_vad = sum(
-            (segment["end"] - segment["start"]) / sampling_rate
-            for segment in clip_timestamps
+        duration_after_vad = (
+            sum((segment["end"] - segment["start"]) for segment in clip_timestamps)
+            / sampling_rate
         )
 
         # batched options: see the difference with default options in WhisperModel
@@ -438,13 +438,17 @@ def transcribe(
         to_cpu = (
             self.model.model.device == "cuda" and len(self.model.model.device_index) > 1
         )
-        features = torch.stack(
-            [
-                self.model.feature_extractor(chunk, to_cpu=to_cpu)[
-                    ..., : self.model.feature_extractor.nb_max_frames
+        features = (
+            torch.stack(
+                [
+                    self.model.feature_extractor(chunk, to_cpu=to_cpu)[
+                        ..., : self.model.feature_extractor.nb_max_frames
+                    ]
+                    for chunk in audio_chunks
                 ]
-                for chunk in audio_chunks
-            ]
+            )
+            if duration_after_vad
+            else []
         )
 
         segments = self._batched_segments_generator(