From d79839aebca2a7e42676b86989a577c4026c65ea Mon Sep 17 00:00:00 2001
From: cronrpc <147160173+cronrpc@users.noreply.github.com>
Date: Sun, 10 Dec 2023 01:50:19 +0800
Subject: [PATCH] support openai-whisper automated annotations

---
 README.md                                 |  5 +-
 README_zh.md                              |  5 +-
 subfix/cli.py                             |  7 ++-
 subfix/models/audio/asr/__init__.py       |  3 +-
 subfix/models/audio/asr/openai_whisper.py | 24 +++++++
 subfix/solution/whisper_multi_lang.py     | 76 +++++++++++++++++++++++
 6 files changed, 115 insertions(+), 5 deletions(-)
 create mode 100644 subfix/models/audio/asr/openai_whisper.py
 create mode 100644 subfix/solution/whisper_multi_lang.py

diff --git a/README.md b/README.md
index 03be325..8b6fb98 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # SubFix
 
 `SubFix` is a web tool designed for easily editing and modifying audio subtitles. Users can see changes in real-time and conveniently **merge, split, delete, and edit subtitles** of audios.
-`SubFix` also supports automated voice annotation, utilizing `modelscope` and `whisper` for multilingual text annotation. Currently, `modelscope` provides automated annotations in languages including Chinese, English, Japanese, German, and Russian.
+`SubFix` also supports automated voice annotation, utilizing `modelscope` and `whisper` for multilingual text annotation. Currently, `modelscope` provides automated annotations in languages including Chinese, English, Japanese, German, and Russian. `whisper` supports almost all languages.
 
 [中文版本](README_zh.md)
 
@@ -49,6 +49,9 @@ subfix create modelscope --source_dir origin --language EN
 subfix create modelscope --source_dir origin --language ZH
 # Japanese
 subfix create modelscope --source_dir origin --language JA
+# OpenAI Whisper Annotation (Supports Almost All Languages)
+subfix create whisper --source_dir origin --language ZH
+subfix create whisper --source_dir origin --language JA
 # diarization (speaker segmentation)
 subfix diarization -h
 subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0
diff --git a/README_zh.md b/README_zh.md
index 9350171..01ab677 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -1,7 +1,7 @@
 # SubFix
 
 `SubFix`是一个用于轻松地编辑修改音频字幕的网页工具。能够实时地看到改动,方便地对音频进行**合并、分割、删除、编辑字幕**。
-`SubFix`同时也支持自动化语音标注,使用`modelscope`和`whisper`对文本进行多语言标注。目前`modelscope`支持中文、英语、日语、德语、德语、俄语的自动化标注。
+`SubFix`同时也支持自动化语音标注,使用`modelscope`和`whisper`对文本进行多语言标注。目前`modelscope`支持中文、英语、日语、德语、俄语的自动化标注。`whisper`支持几乎所有语言。
 
 [English Version](README.md)
 
@@ -50,6 +50,9 @@ subfix create modelscope --source_dir origin --language EN
 subfix create modelscope --source_dir origin --language ZH
 # 日语
 subfix create modelscope --source_dir origin --language JA
+# Openai-Whisper标注 (几乎支持所有语言)
+subfix create whisper --source_dir origin --language ZH
+subfix create whisper --source_dir origin --language JA
 # 说话人确认 (分离不同说话人)
 subfix diarization -h
 subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0
diff --git a/subfix/cli.py b/subfix/cli.py
index 26e427d..b4500e1 100644
--- a/subfix/cli.py
+++ b/subfix/cli.py
@@ -38,6 +38,9 @@ def handle_create(args):
     if args.solution == "modelscope":
         from .solution.modelscope_multi_lang import run_task
         run_task(args)
+    elif args.solution == "whisper":
+        from .solution.whisper_multi_lang import run_whisper_task
+        run_whisper_task(args)
 
 
 def cli():
@@ -73,7 +76,6 @@ def cli():
     modelscope_subparsers.add_argument("--language", type=str, default="ZH", help="Language, Default: ZH|JA|KO|EN|DE|RU")
     modelscope_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list")
     modelscope_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds), Default: 15")
-    modelscope_subparsers.add_argument("--revision", type=str, default="1.0", help="the modelscope sulotions: 1.0; default: 1.0")
     modelscope_subparsers.set_defaults(func=handle_create)
 
     # create whisper
@@ -82,8 +84,9 @@ def cli():
     whisper_subparsers.add_argument("--source_dir", type=str, default="origin", help="Source directory path, Default: origin")
     whisper_subparsers.add_argument("--target_dir", type=str, default="dataset", help="Target directory path, Default: dataset")
     whisper_subparsers.add_argument("--cache_dir", type=str, default="cache", help="cache directory path, Default: cache")
+    whisper_subparsers.add_argument("--model", type=str, default="large-v3", help="whisper model small/medium/large-v3, Default: large-v3")
     whisper_subparsers.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100")
-    whisper_subparsers.add_argument("--language", type=str, default="ZH", help="Language, Default: ZH")
+    whisper_subparsers.add_argument("--language", type=str, default="ZH", help="Any language whisper supports, Default: ZH")
     whisper_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list")
     whisper_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds), Default: 15")
     whisper_subparsers.set_defaults(func=handle_create)
diff --git a/subfix/models/audio/asr/__init__.py b/subfix/models/audio/asr/__init__.py
index 35a346e..1346d7c 100644
--- a/subfix/models/audio/asr/__init__.py
+++ b/subfix/models/audio/asr/__init__.py
@@ -1,2 +1,3 @@
 from .speech_paraformer_large_vad_punc_asr_zh import Speech_Paraformer_Large_Vad_Punc_Asr_zh
-from .speech_uniasr_asr_multilang import Speech_UniASR_Asr_MultiLang
\ No newline at end of file
+from .speech_uniasr_asr_multilang import Speech_UniASR_Asr_MultiLang
+from .openai_whisper import Openai_Whisper
\ No newline at end of file
diff --git a/subfix/models/audio/asr/openai_whisper.py b/subfix/models/audio/asr/openai_whisper.py
new file mode 100644
index 0000000..1be799d
--- /dev/null
+++ b/subfix/models/audio/asr/openai_whisper.py
@@ -0,0 +1,24 @@
+
+from typing import Any
+import librosa
+
+class Openai_Whisper():
+    def __init__(self, language : str, model_name : str = "large-v3") -> None:
+        import whisper
+        self.whisper_model = whisper.load_model(model_name, download_root = None)
+        self.language = language
+
+    def infer(self, audio_in) -> list:
+        print("start asr:", audio_in)
+        segments = self.whisper_model.transcribe(audio_in, word_timestamps=True, language = self.language)['segments']
+        data_list = []  # each item: {'start', 'end', 'text'} with timestamps in seconds
+        for segment in segments:
+            item = {}
+            item['start'] = segment['start']
+            item['end'] = segment['end']
+            item['text'] = segment['text'].strip()
+            data_list.append(item)
+        return data_list
+
+    def __call__(self, *args: Any, **kwds: Any) -> Any:
+        return self.infer(*args, **kwds)
\ No newline at end of file
diff --git a/subfix/solution/whisper_multi_lang.py b/subfix/solution/whisper_multi_lang.py
new file mode 100644
index 0000000..f2b2ec5
--- /dev/null
+++ b/subfix/solution/whisper_multi_lang.py
@@ -0,0 +1,76 @@
+import argparse
+import os
+import re
+import subprocess
+
+import librosa
+import numpy as np
+import soundfile
+
+from subfix.models.audio.asr import Openai_Whisper
+from subfix.utils import convert_files
+from subfix.utils.misc import merge_audio_slice, get_sub_dirs
+
+
+def create_whisper_dataset(source_dir, target_dir, sample_rate, language, infer_model, max_seconds):
+    # Transcribe every speaker's wav files, slice them by whisper timestamps, and collect "path|speaker|language|text" lines
+
+    roles = get_sub_dirs(source_dir)
+    count = 0
+    result = []
+
+    for speaker_name in roles:
+
+        source_audios = [f for f in os.listdir(os.path.join(source_dir, speaker_name)) if f.endswith(".wav")]
+        source_audios = [os.path.join(source_dir, speaker_name, filename) for filename in source_audios]
+        slice_dir = os.path.join(target_dir, speaker_name)
+        os.makedirs(slice_dir, exist_ok=True)
+
+        for audio_path in sorted(source_audios):
+
+            data_list = infer_model(audio_in=audio_path)
+
+            data, count = merge_audio_slice(audio_path, slice_dir, data_list, count, sample_rate, max_seconds, language, speaker_name)
+
+            for item_audio in data:
+                sliced_audio_path = item_audio['sliced_audio_path']
+                speaker_name = item_audio['speaker_name']
+                language = item_audio['language']
+                text = item_audio['text']
+                result.append(f"{sliced_audio_path}|{speaker_name}|{language}|{text}")
+
+    return result
+
+
+def create_whisper_list(source_dir, target_dir, cache_dir, sample_rate, language, output_list, max_seconds, model_name):
+
+    resample_dir = os.path.join(cache_dir,"subfix","origin",f"{sample_rate}")
+
+    convert_files(source_dir, resample_dir, sample_rate)
+
+    lang_map = {
+        "ZH" : "Chinese",
+        "EN" : "English",
+        "JA" : "Japanese",
+        "RU" : "Russian",
+        "DE" : "German",
+        "KO" : "Korean"
+    }
+
+    language_map = lang_map[language] if (language in lang_map.keys()) else language  # unmapped codes are passed to whisper unchanged
+
+    asr_model = Openai_Whisper(language = language_map, model_name = model_name)
+
+    result = create_whisper_dataset(resample_dir, target_dir, sample_rate = sample_rate, language = language, infer_model = asr_model, max_seconds = max_seconds)
+
+    with open(output_list, "w", encoding="utf-8") as file:
+        for line in result:
+            try:
+                file.write(line.strip() + '\n')
+            except UnicodeEncodeError:
+                print("UnicodeEncodeError: could not write line:", line)
+
+
+def run_whisper_task(args):
+    create_whisper_list(args.source_dir, args.target_dir, args.cache_dir, args.sample_rate, args.language, args.output, args.max_seconds, args.model)
+
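For reviewers who want to exercise the new wrapper outside the `subfix create whisper` flow, here is a minimal sketch; the audio path is a placeholder, and it assumes `openai-whisper` is installed and this patch is applied. It mirrors how `create_whisper_list` drives the class: "Chinese" is the `lang_map` entry used for `--language ZH`, and `infer()` returns the segment dicts that `merge_audio_slice` consumes.

# Standalone sketch of the annotation flow added by this patch (paths are hypothetical).
from subfix.models.audio.asr import Openai_Whisper

# "Chinese" matches the lang_map entry that create_whisper_list uses for --language ZH.
asr = Openai_Whisper(language="Chinese", model_name="large-v3")

# infer() (also reachable via __call__) returns a list of {'start', 'end', 'text'} dicts,
# one per whisper segment, with timestamps in seconds.
for seg in asr("origin/speaker1/example.wav"):
    print(f"{seg['start']:.2f}-{seg['end']:.2f}|{seg['text']}")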