######################################################################
# Benchmark OpenAI Whisper automatic speech transcription
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Loads an audio file, transcribes it once with a Whisper model and
# reports the wall-clock latency and the resulting processing speed
# (seconds of input audio handled per second of compute).
#
# NOTE: this file must be saved as plain text. It was previously saved
# as an RTF document, which the Python interpreter cannot parse.
import argparse
import subprocess
from time import sleep, perf_counter

import librosa
import whisper
from whisper.utils import str2bool

# Whisper's transcribe() interprets a raw NumPy waveform as 16 kHz audio,
# so the file has to be resampled to this rate when it is loaded.
WHISPER_SAMPLE_RATE = 16000

parser = argparse.ArgumentParser()
parser.add_argument('--audio_file', default='./test.wav')
parser.add_argument('--modelsize', default='small.en')
parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
args = parser.parse_args()

print('===========================================')

audio_file = args.audio_file
modelsize = args.modelsize
fp16flag = args.fp16

# Load (and resample) once; librosa.load would otherwise default to
# 22050 Hz, which does not match what the model expects.
waveform, sr = librosa.load(audio_file, sr=WHISPER_SAMPLE_RATE)

# Derive the duration from the decoded samples rather than re-reading the
# file; this also avoids the `filename=` keyword deprecated in newer librosa.
audio_len = librosa.get_duration(y=waveform, sr=sr)
print('Audio Length(sec):', audio_len)
# This is the rate the waveform was loaded at (after resampling).
print('Audio sampling rate(hz):', sr)

if fp16flag:
    print('FP16 Half-precision Model Loading')
else:
    print('FP32 Full-precision Model Loading')


def ML_model_load():
    """Load and return the Whisper model named by the ``--modelsize`` option.

    Reads the module-level ``modelsize``. Model names are listed at
    https://github.com/openai/whisper/ (e.g. "tiny", "base", "small",
    "medium", "large").
    """
    print('Loading whisper model:')
    print('Model size:', modelsize)
    model = whisper.load_model(modelsize)
    return model


def ML_inference_task():
    """Transcribe the loaded waveform and return Whisper's result.

    Reads the module-level ``model``, ``waveform`` and ``fp16flag``.
    The language is fixed to English.
    """
    text = model.transcribe(waveform.squeeze(), fp16=fp16flag, language='en')
    return text


# Load the model outside the timed region so only inference is measured.
model = ML_model_load()

start_time = perf_counter()
ML_inference_task()
end_time = perf_counter()

latency = end_time - start_time
process_speed = audio_len / latency

print(f'It took {latency: 0.4f} second(s) to complete.')
print('Audio processing speed:', process_speed, 'seconds of input audio per second')

print('===========================================')