diff --git a/whisper.cpp/common.cpp b/whisper.cpp/common.cpp
index 816edce897..1492c1489a 100644
--- a/whisper.cpp/common.cpp
+++ b/whisper.cpp/common.cpp
@@ -27,206 +27,6 @@
 #include "stb/stb_vorbis.h"
 #include "miniaudio.h"
 
-#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096
-
-static std::string delete_me;
-
-static void on_exit(void) {
-    if (!delete_me.empty()) {
-        unlink(delete_me.c_str());
-    }
-}
-
-static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
-    ma_result rc = MA_SUCCESS;
-    for (;;) {
-        ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-        ma_uint64 framesReadThisIteration;
-        ma_uint64 framesToReadThisIteration;
-        framesToReadThisIteration = sizeof(pRawData) / ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
-        rc = ma_decoder_read_pcm_frames(pDecoder, pRawData, framesToReadThisIteration, &framesReadThisIteration);
-        if (rc != MA_SUCCESS) {
-            break;
-        }
-        ma_encoder_write_pcm_frames(pEncoder, pRawData, framesReadThisIteration, NULL);
-        if (framesReadThisIteration < framesToReadThisIteration) {
-            break;
-        }
-    }
-    return rc;
-}
-
-// converts audio file to signed 16-bit 16000hz wav
-static std::string convert_audio_file(const std::string & fname, bool stereo) {
-
-    // create temporary filename
-    std::string newpath;
-    newpath = __get_tmpdir();
-    newpath += "/whisperfile.";
-    newpath += std::to_string(_rand64());
-    newpath += ".wav";
-
-    // create decoder
-    ma_decoder_config decoderConfig =
-            ma_decoder_config_init(ma_format_s16, 1 + stereo, COMMON_SAMPLE_RATE);
-    decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
-    decoderConfig.resampling.linear.lpfOrder = 8;
-
-    // open input file
-    ma_decoder decoder;
-    ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
-    if (rc != MA_SUCCESS) {
-        fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
-                fname.c_str(), ma_result_description(rc));
-        return "";
-    }
-
-    // create encoder
-    ma_encoder encoder;
-    ma_encoder_config encoderConfig = ma_encoder_config_init(
-            ma_encoding_format_wav,
-            decoder.outputFormat,
-            decoder.outputChannels,
-            decoder.outputSampleRate);
-    rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
-    if (rc != MA_SUCCESS) {
-        ma_decoder_uninit(&decoder);
-        fprintf(stderr, "%s: failed to open output file: %s\n",
-                newpath.c_str(), ma_result_description(rc));
-        return "";
-    }
-
-    // perform the conversion
-    rc = perform_audio_conversion(&decoder, &encoder);
-    ma_encoder_uninit(&encoder);
-    ma_decoder_uninit(&decoder);
-    if (rc != MA_SUCCESS) {
-        fprintf(stderr, "%s: failed to convert audio file: %s\n",
-                fname.c_str(), ma_result_description(rc));
-        return "";
-    }
-
-    // return new path
-    delete_me = newpath;
-    atexit(on_exit);
-    return newpath;
-}
-
-#define TRY_CONVERSION \
-    do { \
-        if (did_conversion) { \
-            fprintf(stderr, "error: failed to open audio file\n"); \
-            return false; \
-        } \
-        std::string fname2 = convert_audio_file(fname, stereo); \
-        if (fname2.empty()) { \
-            return false; \
-        } \
-        fname = fname2; \
-        did_conversion = true; \
-        goto TryAgain; \
-    } while (0)
-
-bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-    std::string fname = fname_;
-    bool did_conversion = false;
-
-TryAgain:
-    if (fname == "-") {
-        {
-            #ifdef _WIN32
-            _setmode(_fileno(stdin), _O_BINARY);
-            #endif
-
-            uint8_t buf[1024];
-            while (true)
-            {
-                const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                if (n == 0) {
-                    break;
-                }
-                wav_data.insert(wav_data.end(), buf, buf + n);
-            }
-        }
-
-        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from stdin\n");
-            return false;
-        }
-
-        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    }
-    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        tinylogf("%s: converting to wav...\n", fname.c_str());
-        TRY_CONVERSION;
-    }
-
-    if (stereo && wav.channels < 2) {
-        fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
-        drwav_uninit(&wav);
-        return false;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-        tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    if (stereo && wav.channels != 2) {
-        tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    if (wav.bitsPerSample != 16) {
-        tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (stereo) {
-        // convert to stereo, float
-        pcmf32s.resize(2);
-
-        pcmf32s[0].resize(n);
-        pcmf32s[1].resize(n);
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-        }
-    }
-
-    return true;
-}
-
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
     const float rc = 1.0f / (2.0f * M_PI * cutoff);
     const float dt = 1.0f / sample_rate;
diff --git a/whisper.cpp/common.h b/whisper.cpp/common.h
index 9b06355710..c4dd288792 100644
--- a/whisper.cpp/common.h
+++ b/whisper.cpp/common.h
@@ -19,16 +19,6 @@
 // Check if a buffer is a WAV audio file
 bool is_wav_buffer(const std::string buf);
 
-// Read WAV audio file and store the PCM data into pcmf32
-// fname can be a buffer of WAV data instead of a filename
-// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
-// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
-bool read_wav(
-    const std::string & fname,
-    std::vector<float> & pcmf32,
-    std::vector<std::vector<float>> & pcmf32s,
-    bool stereo);
-
 // Write PCM data into WAV audio file
 class wav_writer {
 private:
diff --git a/whisper.cpp/main.cpp b/whisper.cpp/main.cpp
index 68275faec6..9a138f8727 100644
--- a/whisper.cpp/main.cpp
+++ b/whisper.cpp/main.cpp
@@ -5,6 +5,7 @@
 #include "llamafile/llamafile.h"
 #include "llama.cpp/cores.h"
 #include "common.h"
+#include "slurp.h"
 #include "whisper.h"
 #include "grammar-parser.h"
 
@@ -1108,8 +1109,8 @@ int main(int argc, char ** argv) {
         std::vector<float> pcmf32; // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
 
-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
+        if (!slurp_audio_file(fname_inp.c_str(), pcmf32, pcmf32s, params.diarize)) {
+            fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
             continue;
         }
 
diff --git a/whisper.cpp/server.cpp b/whisper.cpp/server.cpp
index 2daa4398d6..a4f28dc16e 100644
--- a/whisper.cpp/server.cpp
+++ b/whisper.cpp/server.cpp
@@ -2,6 +2,7 @@
 // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 #include "llamafile/debug.h"
 #include "common.h"
+#include "slurp.h"
 #include "whisper.h"
 #include "httplib.h"
 
@@ -44,8 +45,6 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
-
-    bool ffmpeg_converter = false;
 };
 
 struct whisper_params {
@@ -138,7 +137,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
     fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
     fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
-    fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server\n", sparams.ffmpeg_converter ? "true" : "false");
     fprintf(stderr, " --recompile [%-7s] Force GPU support to be recompiled at runtime if possible.\n", FLAG_recompile ? "true" : "false");
     fprintf(stderr, " --nocompile [%-7s] Never compile GPU support at runtime.", FLAG_nocompile ? "true" : "false");
     fprintf(stderr, "\n");
@@ -224,7 +222,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if ( arg == "--host") { sparams.hostname = argv[++i]; }
         else if ( arg == "--public") { sparams.public_path = argv[++i]; }
         else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
-        else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
         else if ( arg == "--recompile") { FLAG_recompile = true; }
         else if ( arg == "--nocompile") { FLAG_nocompile = true; }
         else if ( arg == "--tinyblas") { FLAG_tinyblas = true; }
@@ -262,45 +259,6 @@ struct whisper_print_user_data {
     int progress_prev;
 };
 
-void check_ffmpeg_availibility() {
-    int result = system("ffmpeg -version");
-
-    if (result == 0) {
-        std::cout << "ffmpeg is available." << std::endl;
-    } else {
-        // ffmpeg is not available
-        std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
-        std::cout << "and that its executable is included in your system's PATH. ";
-        exit(0);
-    }
-}
-
-bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
-    std::ostringstream cmd_stream;
-    std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
-    std::string cmd = cmd_stream.str();
-
-    int status = std::system(cmd.c_str());
-    if (status != 0) {
-        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
-        return false;
-    }
-
-    // Remove the original file
-    if (remove(temp_filename.c_str()) != 0) {
-        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
-        return false;
-    }
-
-    // Rename the temporary file to match the original filename
-    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
-        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
-        return false;
-    }
-    return true;
-}
-
 std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
     std::string speaker = "";
     const int64_t n_samples = pcmf32s[0].size();
@@ -558,9 +516,6 @@ int whisper_server_main(int argc, char ** argv) {
         exit(0);
     }
 
-    if (sparams.ffmpeg_converter) {
-        check_ffmpeg_availibility();
-    }
     // whisper init
     struct whisper_context_params cparams = whisper_context_default_params();
 
@@ -741,36 +696,14 @@ int whisper_server_main(int argc, char ** argv) {
                 temp_file << audio_file.content;
                 temp_file.close();
 
-                if (sparams.ffmpeg_converter) {
-
-                    std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-                    const bool is_converted = convert_to_wav(temp_filename, error_resp);
-                    if (!is_converted) {
-                        res.set_content(error_resp, "application/json");
-                        return;
-                    }
-
-                    // read wav content into pcmf32
-                    if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
-                    {
-                        fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
-                        const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-                        res.set_content(error_resp, "application/json");
-                        std::remove(temp_filename.c_str());
-                        return;
-                    }
-                } else {
-                    if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
-                    {
-                        fprintf(stderr, "error: failed to read WAV file\n");
-                        const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-                        res.set_content(error_resp, "application/json");
-                        return;
-                    }
+                bool ok = slurp_audio_file(temp_filename.c_str(), pcmf32, pcmf32s, params.diarize);
+                unlink(temp_filename.c_str());
+                if (!ok) {
+                    fprintf(stderr, "error: failed to read audio file\n");
+                    const std::string error_resp = "{\"error\":\"failed to read audio file\"}";
+                    res.set_content(error_resp, "application/json");
+                    return;
                 }
-                // remove temp file
-                std::remove(temp_filename.c_str());
-
                 printf("Successfully loaded %s\n", filename.c_str());
 
 
diff --git a/whisper.cpp/slurp.cpp b/whisper.cpp/slurp.cpp
new file mode 100644
index 0000000000..b40b05f963
--- /dev/null
+++ b/whisper.cpp/slurp.cpp
@@ -0,0 +1,132 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+
+#include "slurp.h"
+#include "miniaudio.h"
+#include "llamafile/log.h"
+
+/**
+ * Returns number of channels in audio file, or -1 if it can't be opened.
+ *
+ * The file is probed with a NULL decoder config, i.e. without asking
+ * for any particular output format. Failure to open is logged.
+ */
+static int get_audio_file_channels(const char *fname) {
+    ma_decoder decoder;
+    ma_result rc = ma_decoder_init_file(fname, NULL, &decoder);
+    if (rc != MA_SUCCESS) {
+        tinylogf("%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
+                 fname, ma_result_description(rc));
+        return -1;
+    }
+    int channels = decoder.outputChannels;
+    ma_decoder_uninit(&decoder);
+    return channels;
+}
+
+/**
+ * Reads entire pulse-code modulation content of audio file into memory.
+ *
+ * This function reads raw audio data from an MP3/WAV/OGG/FLAC file into
+ * `pcmf32` as 32-bit floats at 16000 Hz (presumably the same value as
+ * `COMMON_SAMPLE_RATE`). Resampling, channel mixing, and data type
+ * conversions will be performed as necessary.
+ *
+ * If `stereo` is true, then `pcmf32s` will also be populated with two
+ * vectors, holding the left and right audio channels, and `pcmf32` will
+ * receive their average. If the audio file does not have two or more
+ * channels, then an error is returned.
+ *
+ * The output vectors are not cleared. Therefore this function may be
+ * called multiple times to append audio files. If a read fails midway,
+ * frames decoded before the failure remain appended.
+ *
+ * @param fname is path of audio file to decode
+ * @param pcmf32 receives mono samples
+ * @param pcmf32s receives left/right samples when `stereo` is true
+ * @param stereo requests two channel output in addition to mono
+ * @return true on success, or false (with a logged reason) on failure
+ */
+bool slurp_audio_file(const char *fname,
+                      std::vector<float> &pcmf32,
+                      std::vector<std::vector<float>> &pcmf32s,
+                      bool stereo) {
+
+    // validate stereo is stereo
+    if (stereo) {
+        int channels = get_audio_file_channels(fname);
+        if (channels == -1)
+            return false;
+        if (channels < 2) {
+            tinylogf("%s: audio file is mono when stereo is required\n", fname);
+            return false;
+        }
+    }
+
+    // create decoder
+    ma_decoder_config decoderConfig =
+            ma_decoder_config_init(ma_format_f32, 1 + stereo, 16000);
+    decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
+    decoderConfig.resampling.linear.lpfOrder = 8;
+
+    // open input file
+    ma_decoder decoder;
+    ma_result rc = ma_decoder_init_file(fname, &decoderConfig, &decoder);
+    if (rc != MA_SUCCESS) {
+        tinylogf("%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
+                 fname, ma_result_description(rc));
+        return false;
+    }
+
+    // load pulse-code modulation samples
+    //
+    // a read that reports MA_AT_END only means the stream is exhausted,
+    // so it ends the loop without being treated as a failure
+    if (!stereo) {
+        ma_uint64 total = pcmf32.size();
+        const ma_uint64 want = 512;
+        ma_uint64 got;
+        do {
+            got = 0;
+            pcmf32.resize(total + want);
+            rc = ma_decoder_read_pcm_frames(&decoder, &pcmf32[total], want, &got);
+            // trim the unfilled tail before checking rc, so a failed
+            // read doesn't leave zero padding behind
+            pcmf32.resize((total += got));
+            if (rc != MA_SUCCESS && rc != MA_AT_END) {
+                ma_decoder_uninit(&decoder);
+                tinylogf("%s: failed to read pcm frames from audio file: %s\n",
+                         fname, ma_result_description(rc));
+                return false;
+            }
+        } while (rc == MA_SUCCESS && got == want);
+    } else {
+        float frames[512];
+        const ma_uint64 want = sizeof(frames) / sizeof(*frames) / 2;
+        ma_uint64 got;
+        pcmf32s.resize(2);
+        do {
+            got = 0;
+            rc = ma_decoder_read_pcm_frames(&decoder, frames, want, &got);
+            if (rc != MA_SUCCESS && rc != MA_AT_END) {
+                ma_decoder_uninit(&decoder);
+                tinylogf("%s: failed to read pcm frames from audio file: %s\n",
+                         fname, ma_result_description(rc));
+                return false;
+            }
+            for (ma_uint64 i = 0; i < got; ++i) {
+                float left = frames[i*2+0];
+                float right = frames[i*2+1];
+                // average rather than sum, so the mono mix keeps the
+                // same scale as the individual channels
+                pcmf32.push_back((left + right) * 0.5f);
+                pcmf32s[0].push_back(left);
+                pcmf32s[1].push_back(right);
+            }
+        } while (rc == MA_SUCCESS && got == want);
+    }
+
+    // we're done
+    ma_decoder_uninit(&decoder);
+    return true;
+}
diff --git a/whisper.cpp/slurp.h b/whisper.cpp/slurp.h
new file mode 100644
index 0000000000..2902769101
--- /dev/null
+++ b/whisper.cpp/slurp.h
@@ -0,0 +1,8 @@
+#pragma once
+#include <string>
+#include <vector>
+
+bool slurp_audio_file(const char *fname,
+                      std::vector<float> &pcmf32,
+                      std::vector<std::vector<float>> &pcmf32s,
+                      bool stereo);