Skip to content

Commit

Permalink
vad streaming return [beg, -1], [], [-1, end], [beg, end] (#1306)
Browse files Browse the repository at this point in the history
* fix add_file bug (#1296)

Co-authored-by: shixian.shi <[email protected]>

* funasr1.0 uniasr

* funasr1.0 uniasr

* update with main (#1305)

* v1.0.3

* update clients for 2pass

* update download tools

---------

Co-authored-by: 雾聪 <[email protected]>

* vad streaming return [beg, -1], [], [-1, end], [beg, end]

---------

Co-authored-by: shixian.shi <[email protected]>
Co-authored-by: 雾聪 <[email protected]>
  • Loading branch information
3 people authored Jan 26, 2024
1 parent 5d300ae commit 65396ee
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
from funasr import AutoModel
wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav"

chunk_size = 60000 # ms
model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.4")

res = model.generate(input=wav_file, chunk_size=chunk_size, )
res = model.generate(input=wav_file)
print(res)


Expand All @@ -20,6 +19,7 @@
wav_file = os.path.join(model.model_path, "example/vad_example.wav")
speech, sample_rate = soundfile.read(wav_file)

chunk_size = 200 # ms
chunk_stride = int(chunk_size * sample_rate / 1000)

cache = {}
Expand All @@ -32,6 +32,8 @@
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
disable_pbar=True,
)
# print(res)
if len(res[0]["value"]):
print(res)
5 changes: 3 additions & 2 deletions examples/industrial_data_pretraining/uniasr/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

from funasr import AutoModel

model = AutoModel(model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", model_revision="v2.0.4",
)

model = AutoModel(model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", model_revision="v2.0.4",)


res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)
Expand Down
47 changes: 36 additions & 11 deletions funasr/models/fsmn_vad_streaming/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,13 +482,17 @@ def GetFrameState(self, t: int, cache: dict = {}):

return frame_state

def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
is_final: bool = False
def forward(self, feats: torch.Tensor,
waveform: torch.tensor,
cache: dict = {},
is_final: bool = False,
**kwargs,
):
# if len(cache) == 0:
# self.AllResetDetection()
# self.waveform = waveform # compute decibel for each frame
cache["stats"].waveform = waveform
is_streaming_input = kwargs.get("is_streaming_input", True)
self.ComputeDecibel(cache=cache)
self.ComputeScores(feats, cache=cache)
if not is_final:
Expand All @@ -500,13 +504,32 @@ def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
segment_batch = []
if len(cache["stats"].output_data_buf) > 0:
for i in range(cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf)):
if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
cache["stats"].output_data_buf[
i].contain_seg_end_point):
continue
segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
if is_streaming_input: # in this case, return [beg, -1], [], [-1, end], [beg, end]
if not cache["stats"].output_data_buf[i].contain_seg_start_point:
continue
if not cache["stats"].next_seg and not cache["stats"].output_data_buf[i].contain_seg_end_point:
continue
start_ms = cache["stats"].output_data_buf[i].start_ms if cache["stats"].next_seg else -1
if cache["stats"].output_data_buf[i].contain_seg_end_point:
end_ms = cache["stats"].output_data_buf[i].end_ms
cache["stats"].next_seg = True
cache["stats"].output_data_buf_offset += 1
else:
end_ms = -1
cache["stats"].next_seg = False
segment = [start_ms, end_ms]

else: # in this case, return [beg, end]

if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
cache["stats"].output_data_buf[
i].contain_seg_end_point):
continue
segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
cache["stats"].output_data_buf_offset += 1 # need update this parameter

segment_batch.append(segment)
cache["stats"].output_data_buf_offset += 1 # need update this parameter

if segment_batch:
segments.append(segment_batch)
# if is_final:
Expand Down Expand Up @@ -551,7 +574,8 @@ def inference(self,
chunk_stride_samples = int(chunk_size * frontend.fs / 1000)

time1 = time.perf_counter()
cfg = {"is_final": kwargs.get("is_final", False)}
is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True)
cfg = {"is_final": kwargs.get("is_final", False), "is_streaming_input": is_streaming_input}
audio_sample_list = load_audio_text_image_video(data_in,
fs=frontend.fs,
audio_fs=kwargs.get("fs", 16000),
Expand All @@ -560,7 +584,7 @@ def inference(self,
cache=cfg,
)
_is_final = cfg["is_final"] # if data_in is a file or url, set is_final=True

is_streaming_input = cfg["is_streaming_input"]
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
assert len(audio_sample_list) == 1, "batch_size must be set 1"
Expand Down Expand Up @@ -588,7 +612,8 @@ def inference(self,
"feats": speech,
"waveform": cache["frontend"]["waveforms"],
"is_final": kwargs["is_final"],
"cache": cache
"cache": cache,
"is_streaming_input": is_streaming_input
}
segments_i = self.forward(**batch)
if len(segments_i) > 0:
Expand Down
1 change: 1 addition & 0 deletions funasr/utils/load_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs:
# if data_in is a file or url, set is_final=True
if "cache" in kwargs:
kwargs["cache"]["is_final"] = True
kwargs["cache"]["is_streaming_input"] = False
elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
Expand Down

0 comments on commit 65396ee

Please sign in to comment.