Add stream transcriber to preview the transcription progress

compustar · compustar · commit d8b53d5e3446 · 2024-10-31T10:02:34.000Z
diff --git a/cli.py b/cli.py
@@ -5,7 +5,7 @@
 
 from utils import to_srt
 
-from transcriber.Transcriber import Transcriber
+from transcriber.StreamTranscriber import StreamTranscriber
 
 # Configure logging first, before any imports
 logging.basicConfig(
@@ -78,7 +78,7 @@ def main():
         check_models()
 
         logger.info("Transcribing %s", args.audio_file)
-        transcriber = Transcriber(
+        transcriber = StreamTranscriber(
             corrector="opencc", use_denoiser=args.denoise, with_punct=args.punct)
         transcribe_results = transcriber.transcribe(args.audio_file)
 
diff --git a/transcriber/StreamTranscriber.py b/transcriber/StreamTranscriber.py
@@ -0,0 +1,111 @@
+import logging
+import os
+import re
+from typing import List, Literal, Union, Generator, Iterator
+import inspect
+
+import librosa
+import numpy as np
+import onnxruntime
+import torch
+from funasr_onnx import Fsmn_vad_online, SenseVoiceSmall
+from funasr_onnx.utils.sentencepiece_tokenizer import SentencepiecesTokenizer
+from resampy.core import resample
+from torchaudio.pipelines import MMS_FA as bundle
+from tqdm.auto import tqdm
+
+from corrector.Corrector import Corrector
+from denoiser import denoiser
+from transcriber.TranscribeResult import TranscribeResult
+from transcriber.Transcriber import Transcriber
+
+logger = logging.getLogger(__name__)
+
+
+class StreamTranscriber(Transcriber):
+    """
+    Transcriber that yields each result as soon as it is available, using
+    the text of the latest result as the progress bar description.
+    """
+
+    def transcribe(
+        self,
+        audio_file: str,
+    ) -> Generator[TranscribeResult, None, None]:
+        """
+        Lazily transcribe an audio file to text with timestamps.
+
+        Args:
+            audio_file (str): Path to audio file
+
+        Yields:
+            TranscribeResult: Corrected results, one at a time
+        """
+        speech, sr = librosa.load(audio_file, sr=self.sr)
+
+        if self.use_denoiser:
+            logger.info("Denoising speech...")
+            speech, _ = denoiser(speech, sr)
+
+        if sr != 16_000:
+            speech = resample(speech, sr, 16_000,
+                              filter="kaiser_best", parallel=True)
+
+        logger.info("Segmenting speech...")
+        vad_segments = self._segment_speech(speech)
+
+        if not vad_segments:
+            # Bare return: a value returned from a generator is not yielded.
+            return
+
+        pgb_vad_segments = tqdm(
+            enumerate(vad_segments),
+            total=len(vad_segments),
+            desc="Transcribing"
+        )
+
+        results = self._process_segments(speech, pgb_vad_segments)
+        try:
+            for result in self._convert_to_traditional_chinese(results):
+                pgb_vad_segments.set_description(result.text)
+                yield result
+        finally:
+            # Close the bar even when the consumer stops iterating early.
+            pgb_vad_segments.close()
+
+    def _process_segments(
+        self,
+        speech: np.ndarray,
+        pgb_vad_segments: Iterator
+    ) -> Generator[TranscribeResult, None, None]:
+        """Yield the ASR results of each (index, segment) pair in turn."""
+        speech_lengths = len(speech)
+
+        for _, segment in pgb_vad_segments:
+            speech_j, _ = self._slice_padding_audio_samples(
+                speech, speech_lengths, [[segment]])
+
+            stt_results = self._asr(speech_j[0])
+            if not stt_results:
+                continue
+
+            # NOTE(review): segment[0] presumably in ms; confirm 0.1 s shift.
+            timestamp_offset = ((segment[0] * 16) / 16_000) - 0.1
+            for result in stt_results:
+                result.start_time += timestamp_offset
+                result.end_time += timestamp_offset
+                yield result
+
+    def _convert_to_traditional_chinese(
+        self,
+        results: Iterator[TranscribeResult]
+    ) -> Generator[TranscribeResult, None, None]:
+        """Yield each result with its text run through the corrector."""
+        # Generators are always truthy, so this only skips None/empty input.
+        if not results:
+            return
+
+        corrector = Corrector(self.corrector)
+        for result in results:
+            result.text = corrector.correct(result.text)
+            yield result
diff --git a/transcriber/__init__.py b/transcriber/__init__.py
@@ -1,4 +1,5 @@
 from .Transcriber import Transcriber
+from .StreamTranscriber import StreamTranscriber
 from .TranscribeResult import TranscribeResult
 
-__all__ = ["Transcriber", "TranscribeResult"]
+__all__ = ["Transcriber", "StreamTranscriber", "TranscribeResult"]
diff --git a/utils.py b/utils.py
@@ -1,7 +1,7 @@
 import logging
 import os
 import tempfile
-from typing import List
+from typing import Iterator
 
 from pysrt import SubRipFile, SubRipItem, SubRipTime
 from pytubefix import YouTube
@@ -46,7 +46,7 @@ def download_youtube_audio(video_id: str) -> str:
         return None
 
 
-def to_srt(results: List["TranscribeResult"]) -> str:
+def to_srt(results: Iterator["TranscribeResult"]) -> str:
     """
     Convert the list of TranscribeResult objects into a SRT file
     """