Skip to content

Commit

Permalink
vad streaming return [beg, -1], [], [-1, end], [beg, end] (#1306)
Browse files Browse the repository at this point in the history
* fix add_file bug (#1296)

Co-authored-by: shixian.shi <shixian.shi@alibaba-inc.com>

* funasr1.0 uniasr

* funasr1.0 uniasr

* update with main (#1305)

* v1.0.3

* update clients for 2pass

* update download tools

---------

Co-authored-by: 雾聪 <wucong.lyb@alibaba-inc.com>

* vad streaming return [beg, -1], [], [-1, end], [beg, end]

---------

Co-authored-by: shixian.shi <shixian.shi@alibaba-inc.com>
Co-authored-by: 雾聪 <wucong.lyb@alibaba-inc.com>
  • Loading branch information
3 people authored Jan 26, 2024
1 parent 5d300ae commit 65396ee
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
from funasr import AutoModel
wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav"

chunk_size = 60000 # ms
model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.4")

res = model.generate(input=wav_file, chunk_size=chunk_size, )
res = model.generate(input=wav_file)
print(res)


Expand All @@ -20,6 +19,7 @@
wav_file = os.path.join(model.model_path, "example/vad_example.wav")
speech, sample_rate = soundfile.read(wav_file)

chunk_size = 200 # ms
chunk_stride = int(chunk_size * sample_rate / 1000)

cache = {}
Expand All @@ -32,6 +32,8 @@
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
disable_pbar=True,
)
# print(res)
if len(res[0]["value"]):
print(res)
5 changes: 3 additions & 2 deletions examples/industrial_data_pretraining/uniasr/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

from funasr import AutoModel

model = AutoModel(model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", model_revision="v2.0.4",
)

model = AutoModel(model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", model_revision="v2.0.4",)


res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)
Expand Down
47 changes: 36 additions & 11 deletions funasr/models/fsmn_vad_streaming/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,13 +482,17 @@ def GetFrameState(self, t: int, cache: dict = {}):

return frame_state

def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
is_final: bool = False
def forward(self, feats: torch.Tensor,
waveform: torch.tensor,
cache: dict = {},
is_final: bool = False,
**kwargs,
):
# if len(cache) == 0:
# self.AllResetDetection()
# self.waveform = waveform # compute decibel for each frame
cache["stats"].waveform = waveform
is_streaming_input = kwargs.get("is_streaming_input", True)
self.ComputeDecibel(cache=cache)
self.ComputeScores(feats, cache=cache)
if not is_final:
Expand All @@ -500,13 +504,32 @@ def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
segment_batch = []
if len(cache["stats"].output_data_buf) > 0:
for i in range(cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf)):
if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
cache["stats"].output_data_buf[
i].contain_seg_end_point):
continue
segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
if is_streaming_input: # in this case, return [beg, -1], [], [-1, end], [beg, end]
if not cache["stats"].output_data_buf[i].contain_seg_start_point:
continue
if not cache["stats"].next_seg and not cache["stats"].output_data_buf[i].contain_seg_end_point:
continue
start_ms = cache["stats"].output_data_buf[i].start_ms if cache["stats"].next_seg else -1
if cache["stats"].output_data_buf[i].contain_seg_end_point:
end_ms = cache["stats"].output_data_buf[i].end_ms
cache["stats"].next_seg = True
cache["stats"].output_data_buf_offset += 1
else:
end_ms = -1
cache["stats"].next_seg = False
segment = [start_ms, end_ms]

else: # in this case, return [beg, end]

if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
cache["stats"].output_data_buf[
i].contain_seg_end_point):
continue
segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
cache["stats"].output_data_buf_offset += 1 # need update this parameter

segment_batch.append(segment)
cache["stats"].output_data_buf_offset += 1 # need update this parameter

if segment_batch:
segments.append(segment_batch)
# if is_final:
Expand Down Expand Up @@ -551,7 +574,8 @@ def inference(self,
chunk_stride_samples = int(chunk_size * frontend.fs / 1000)

time1 = time.perf_counter()
cfg = {"is_final": kwargs.get("is_final", False)}
is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True)
cfg = {"is_final": kwargs.get("is_final", False), "is_streaming_input": is_streaming_input}
audio_sample_list = load_audio_text_image_video(data_in,
fs=frontend.fs,
audio_fs=kwargs.get("fs", 16000),
Expand All @@ -560,7 +584,7 @@ def inference(self,
cache=cfg,
)
_is_final = cfg["is_final"] # if data_in is a file or url, set is_final=True

is_streaming_input = cfg["is_streaming_input"]
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
assert len(audio_sample_list) == 1, "batch_size must be set 1"
Expand Down Expand Up @@ -588,7 +612,8 @@ def inference(self,
"feats": speech,
"waveform": cache["frontend"]["waveforms"],
"is_final": kwargs["is_final"],
"cache": cache
"cache": cache,
"is_streaming_input": is_streaming_input
}
segments_i = self.forward(**batch)
if len(segments_i) > 0:
Expand Down
1 change: 1 addition & 0 deletions funasr/utils/load_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs:
# if data_in is a file or url, set is_final=True
if "cache" in kwargs:
kwargs["cache"]["is_final"] = True
kwargs["cache"]["is_streaming_input"] = False
elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
Expand Down

0 comments on commit 65396ee

Please sign in to comment.