Skip to content

Commit

Permalink
support openai-whisper automated annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
cronrpc committed Dec 9, 2023
1 parent 48e4b14 commit d79839a
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 5 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SubFix
`SubFix` is a web tool designed for easily editing and modifying audio subtitles. Users can see changes in real-time and conveniently **merge, split, delete, and edit subtitles** of audios.

`SubFix` also supports automated voice annotation, utilizing `modelscope` and `whisper` for multilingual text annotation. Currently, `modelscope` provides automated annotations in languages including Chinese, English, Japanese, German, and Russian.
`SubFix` also supports automated voice annotation, utilizing `modelscope` and `whisper` for multilingual text annotation. Currently, `modelscope` provides automated annotations in languages including Chinese, English, Japanese, German, and Russian. `whisper` supports almost all languages.

[中文版本](README_zh.md)

Expand Down Expand Up @@ -49,6 +49,9 @@ subfix create modelscope --source_dir origin --language EN
subfix create modelscope --source_dir origin --language ZH
# Japanese
subfix create modelscope --source_dir origin --language JA
# OpenAI Whisper Annotation (Supports Almost All Languages)
subfix create whisper --source_dir origin --language ZH
subfix create whisper --source_dir origin --language JA
# diarization (speaker segmentation)
subfix diarization -h
subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0
Expand Down
5 changes: 4 additions & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SubFix
`SubFix`是一个用于轻松地编辑修改音频字幕的网页工具。能够实时地看到改动,方便地对音频进行**合并、分割、删除、编辑字幕**

`SubFix`同时也支持自动化语音标注,使用`modelscope``whisper`对文本进行多语言标注。目前`modelscope`支持中文、英语、日语、德语、德语、俄语的自动化标注。
`SubFix`同时也支持自动化语音标注,使用`modelscope`和`whisper`对文本进行多语言标注。目前`modelscope`支持中文、英语、日语、德语、俄语的自动化标注。`whisper`支持几乎所有语言。

[English Version](README.md)

Expand Down Expand Up @@ -50,6 +50,9 @@ subfix create modelscope --source_dir origin --language EN
subfix create modelscope --source_dir origin --language ZH
# 日语
subfix create modelscope --source_dir origin --language JA
# Openai-Whisper标注 (几乎支持所有语言)
subfix create whisper --source_dir origin --language ZH
subfix create whisper --source_dir origin --language JA
# 说话人确认 (分离不同说话人)
subfix diarization -h
subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0
Expand Down
7 changes: 5 additions & 2 deletions subfix/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def handle_create(args):
if args.solution == "modelscope":
from .solution.modelscope_multi_lang import run_task
run_task(args)
elif args.solution == "whisper":
from .solution.whisper_multi_lang import run_whisper_task
run_whisper_task(args)


def cli():
Expand Down Expand Up @@ -73,7 +76,6 @@ def cli():
modelscope_subparsers.add_argument("--language", type=str, default="ZH", help="Language, Default: ZH|JA|KO|EN|DE|RU")
modelscope_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list")
modelscope_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds), Default: 15")
modelscope_subparsers.add_argument("--revision", type=str, default="1.0", help="the modelscope sulotions: 1.0; default: 1.0")
modelscope_subparsers.set_defaults(func=handle_create)

# create whisper
Expand All @@ -82,8 +84,9 @@ def cli():
whisper_subparsers.add_argument("--source_dir", type=str, default="origin", help="Source directory path, Default: origin")
whisper_subparsers.add_argument("--target_dir", type=str, default="dataset", help="Target directory path, Default: dataset")
whisper_subparsers.add_argument("--cache_dir", type=str, default="cache", help="cache directory path, Default: cache")
whisper_subparsers.add_argument("--model", type=str, default="large-v3", help="whisper model small/medium/large-v3, Default: small")
whisper_subparsers.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100")
whisper_subparsers.add_argument("--language", type=str, default="ZH", help="Language, Default: ZH")
whisper_subparsers.add_argument("--language", type=str, default="ZH", help="Any Language whisper support, Default: ZH")
whisper_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list")
whisper_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds), Default: 15")
whisper_subparsers.set_defaults(func=handle_create)
Expand Down
3 changes: 2 additions & 1 deletion subfix/models/audio/asr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .speech_paraformer_large_vad_punc_asr_zh import Speech_Paraformer_Large_Vad_Punc_Asr_zh
from .speech_uniasr_asr_multilang import Speech_UniASR_Asr_MultiLang
from .speech_uniasr_asr_multilang import Speech_UniASR_Asr_MultiLang
from .openai_whisper import Openai_Whisper
24 changes: 24 additions & 0 deletions subfix/models/audio/asr/openai_whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

from typing import Any
import librosa

class Openai_Whisper():
    """Thin wrapper around the openai-whisper model for segment-level ASR.

    The `whisper` package is imported lazily in `__init__` so it is only
    required when whisper-based annotation is actually used.
    """

    def __init__(self, language: str, model_name: str = "large-v3") -> None:
        # Lazy import: keeps `whisper` an optional dependency of the package.
        import whisper
        self.whisper_model = whisper.load_model(model_name, download_root=None)
        self.language = language

    def infer(self, audio_in) -> list:
        """Transcribe `audio_in` (anything `whisper.transcribe` accepts,
        typically a file path) and return a list of dicts, one per segment,
        with keys 'start', 'end' (seconds) and 'text' (whitespace-stripped).

        BUG FIX: the original annotated the return type as ``None`` even
        though it returns the collected segment list.
        """
        print("start asr:", audio_in)
        segments = self.whisper_model.transcribe(
            audio_in, word_timestamps=True, language=self.language
        )["segments"]
        return [
            {"start": seg["start"], "end": seg["end"], "text": seg["text"].strip()}
            for seg in segments
        ]

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        # Allow the instance itself to be used as the inference callable.
        return self.infer(*args, **kwds)
76 changes: 76 additions & 0 deletions subfix/solution/whisper_multi_lang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import argparse
import os
import re
import subprocess

import librosa
import numpy as np
import soundfile

from subfix.models.audio.asr import Openai_Whisper
from subfix.utils import convert_files
from subfix.utils.misc import merge_audio_slice, get_sub_dirs


def create_whisper_dataset(source_dir, target_dir, sample_rate, language, infer_model, max_seconds):
    """Transcribe and slice every speaker's wav files under `source_dir`.

    Each immediate subdirectory of `source_dir` is treated as one speaker.
    Sliced audio is written under `target_dir/<speaker>/` by
    `merge_audio_slice`, and a list of ``path|speaker|language|text`` lines
    is returned for writing to a .list file.

    BUG FIX: the original inner loop rebound the outer loop variable
    `speaker_name` and the `language` parameter from the per-item dict,
    clobbering both for subsequent iterations. The item fields are now read
    inline without shadowing anything.
    """
    roles = get_sub_dirs(source_dir)
    count = 0  # running slice index, threaded through merge_audio_slice
    result = []

    for speaker_name in roles:
        speaker_dir = os.path.join(source_dir, speaker_name)
        source_audios = [
            os.path.join(speaker_dir, filename)
            for filename in os.listdir(speaker_dir)
            if filename.endswith(".wav")
        ]
        slice_dir = os.path.join(target_dir, speaker_name)
        os.makedirs(slice_dir, exist_ok=True)

        for audio_path in sorted(source_audios):
            data_list = infer_model(audio_in=audio_path)

            data, count = merge_audio_slice(audio_path, slice_dir, data_list, count,
                                            sample_rate, max_seconds, language, speaker_name)

            for item_audio in data:
                result.append(
                    f"{item_audio['sliced_audio_path']}|{item_audio['speaker_name']}"
                    f"|{item_audio['language']}|{item_audio['text']}"
                )

    return result


def create_whisper_list(source_dir, target_dir, cache_dir, sample_rate, language,
                        output_list, max_seconds, model_name):
    """Resample `source_dir`, run whisper ASR over it, and write `output_list`.

    Audio is first resampled into a cache directory, then transcribed and
    sliced by `create_whisper_dataset`; the resulting
    ``path|speaker|language|text`` lines are written to `output_list`.
    """
    resample_dir = os.path.join(cache_dir, "subfix", "origin", f"{sample_rate}")

    convert_files(source_dir, resample_dir, sample_rate)

    # Map subfix language codes to identifiers passed to whisper; codes not in
    # the map are passed through unchanged.
    # NOTE(review): the values mix full names and ISO codes — whisper accepts
    # both, but normalizing would be cleaner; confirm before changing values.
    lang_map = {
        "ZH": "Chinese",
        "EN": "English",
        "JA": "Japanese",
        "RU": "ru",
        "DE": "de",
        "KO": "ko",
    }
    # IDIOM FIX: dict.get replaces the `in lang_map.keys()` membership test.
    language_map = lang_map.get(language, language)

    asr_model = Openai_Whisper(language=language_map, model_name=model_name)

    result = create_whisper_dataset(resample_dir, target_dir, sample_rate=sample_rate,
                                    language=language, infer_model=asr_model,
                                    max_seconds=max_seconds)

    with open(output_list, "w", encoding="utf-8") as file:
        for line in result:
            try:
                file.write(line.strip() + '\n')
            except UnicodeEncodeError:
                # Best-effort: skip lines the target stream cannot encode.
                print("UnicodeEncodeError: Can't encode to ASCII:", line)


def run_whisper_task(args):
    """CLI entry point for `subfix create whisper`: forward parsed argparse
    options to `create_whisper_list`."""
    create_whisper_list(
        args.source_dir,
        args.target_dir,
        args.cache_dir,
        args.sample_rate,
        args.language,
        args.output,
        args.max_seconds,
        args.model,
    )

0 comments on commit d79839a

Please sign in to comment.