Skip to content

Commit

Permalink
Rewrite audio file loader code
Browse files Browse the repository at this point in the history
We now have a new function slurp_audio_file() which replaces read_wav().
This function has simpler code, and allows us to avoid a temporary file.

See #568
  • Loading branch information
jart committed Sep 28, 2024
1 parent 79cc41f commit beb2f19
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 287 deletions.
200 changes: 0 additions & 200 deletions whisper.cpp/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,206 +27,6 @@
#include "stb/stb_vorbis.h"
#include "miniaudio.h"

#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096

// Path of the temporary file that should be removed when the process exits.
// Left empty while there is nothing to clean up.
static std::string delete_me;

// Exit handler: deletes the file named by delete_me, if one was recorded.
static void on_exit(void) {
    if (delete_me.empty()) {
        return;
    }
    unlink(delete_me.c_str());
}

// Streams every PCM frame from pDecoder into pEncoder through a fixed-size
// stack buffer.
//
// Returns MA_SUCCESS once the decoder has been drained, otherwise the first
// error reported by either the decoder or the encoder.
static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
    ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];

    // The output layout is fixed for the decoder's lifetime, so the chunk size
    // only has to be computed once.
    const ma_uint64 bytesPerFrame =
        ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
    if (bytesPerFrame == 0) {
        return MA_INVALID_ARGS;  // unknown format: avoid dividing by zero
    }
    const ma_uint64 framesPerChunk = sizeof(pRawData) / bytesPerFrame;

    for (;;) {
        ma_uint64 framesRead = 0;
        const ma_result readResult =
            ma_decoder_read_pcm_frames(pDecoder, pRawData, framesPerChunk, &framesRead);

        // Write whatever was decoded before inspecting the status, so frames
        // that arrive together with an end-of-stream status are not dropped.
        if (framesRead > 0) {
            const ma_result writeResult =
                ma_encoder_write_pcm_frames(pEncoder, pRawData, framesRead, nullptr);
            if (writeResult != MA_SUCCESS) {
                return writeResult;
            }
        }

        // MA_AT_END is miniaudio's normal end-of-stream status, not a failure.
        // It is what the decoder returns when the stream length is an exact
        // multiple of the chunk size and the final read yields zero frames.
        if (readResult == MA_AT_END) {
            return MA_SUCCESS;
        }
        if (readResult != MA_SUCCESS) {
            return readResult;
        }
        if (framesRead < framesPerChunk) {
            return MA_SUCCESS;  // short read: nothing left to decode
        }
    }
}

// Converts an audio file to a signed 16-bit WAV file at COMMON_SAMPLE_RATE.
//
// fname   path of the input file, decoded by miniaudio
// stereo  when true the output has two channels, otherwise one
//
// Returns the path of a newly written temporary WAV file, or "" on failure
// (a message is printed to stderr). The returned file is recorded in delete_me
// so that it is removed when the process exits.
static std::string convert_audio_file(const std::string & fname, bool stereo) {

    // create temporary filename
    std::string newpath;
    newpath = __get_tmpdir();
    newpath += "/whisperfile.";
    newpath += std::to_string(_rand64());
    newpath += ".wav";

    // create decoder
    ma_decoder_config decoderConfig =
        ma_decoder_config_init(ma_format_s16, stereo ? 2 : 1, COMMON_SAMPLE_RATE);
    decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
    decoderConfig.resampling.linear.lpfOrder = 8;

    // open input file
    ma_decoder decoder;
    ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
    if (rc != MA_SUCCESS) {
        fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
                fname.c_str(), ma_result_description(rc));
        return "";
    }

    // create encoder
    ma_encoder encoder;
    ma_encoder_config encoderConfig = ma_encoder_config_init(
        ma_encoding_format_wav,
        decoder.outputFormat,
        decoder.outputChannels,
        decoder.outputSampleRate);
    rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
    if (rc != MA_SUCCESS) {
        ma_decoder_uninit(&decoder);
        // the output may already have been created before init failed
        unlink(newpath.c_str());
        fprintf(stderr, "%s: failed to open output file: %s\n",
                newpath.c_str(), ma_result_description(rc));
        return "";
    }

    // perform the conversion
    rc = perform_audio_conversion(&decoder, &encoder);
    ma_encoder_uninit(&encoder);
    ma_decoder_uninit(&decoder);
    if (rc != MA_SUCCESS) {
        // don't leave a partially written file behind in the temp directory
        unlink(newpath.c_str());
        fprintf(stderr, "%s: failed to convert audio file: %s\n",
                fname.c_str(), ma_result_description(rc));
        return "";
    }

    // Only one temporary file is tracked for cleanup. read_wav() has finished
    // with the previous one by the time another conversion starts, so remove
    // it now instead of leaking it when delete_me is overwritten.
    if (!delete_me.empty()) {
        unlink(delete_me.c_str());
    }
    delete_me = newpath;

    // register the exit handler once; atexit() slots are a limited resource
    static bool registered = false;
    if (!registered) {
        registered = (atexit(on_exit) == 0);
    }

    // return new path
    return newpath;
}

// Fallback used by read_wav(): converts the input with convert_audio_file()
// and jumps back to reload the converted file.
//
// Must be expanded inside a function returning bool that provides the locals
// `fname`, `stereo` and `did_conversion` plus a `TryAgain` label.
//
// Only one conversion is attempted per load. If the file is still unusable
// after it was converted (did_conversion already set), or the conversion
// itself fails, the enclosing function returns false.
#define TRY_CONVERSION \
do { \
if (did_conversion) { \
fprintf(stderr, "error: failed to open audio file\n"); \
return false; \
} \
std::string fname2 = convert_audio_file(fname, stereo); \
if (fname2.empty()) { \
return false; \
} \
fname = fname2; \
did_conversion = true; \
goto TryAgain; \
} while (0)

// Loads audio into floating point PCM.
//
// fname_   path of the file to load, or "-" to read WAV data from stdin
// pcmf32   receives the mono signal (a stereo input is averaged down)
// pcmf32s  when `stereo` is set, receives the left and right channels as
//          pcmf32s[0] and pcmf32s[1]; left untouched otherwise
// stereo   request per-channel output; the input must have at least 2 channels
//
// A file that drwav cannot open, or that is not 16-bit mono/stereo at
// COMMON_SAMPLE_RATE, is converted once through TRY_CONVERSION and reloaded.
// Returns true on success, false on failure.
bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin
    std::string fname = fname_;
    bool did_conversion = false;

TryAgain:
    // Re-evaluated on every attempt so that a retry which loads a converted
    // file is not mistaken for stdin input.
    const bool from_stdin = (fname == "-");

    if (from_stdin) {
        {
#ifdef _WIN32
            _setmode(_fileno(stdin), _O_BINARY);
#endif

            uint8_t buf[1024];
            while (true)
            {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }

        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        tinylogf("%s: converting to wav...\n", fname.c_str());
        TRY_CONVERSION;
    }

    // Diarization needs two real channels, so mono input is rejected outright.
    // Together with the mono/stereo check below, this guarantees exactly two
    // channels whenever `stereo` is set.
    if (stereo && wav.channels < 2) {
        fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (wav.channels != 1 && wav.channels != 2) {
        tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
        drwav_uninit(&wav);
        TRY_CONVERSION;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
        drwav_uninit(&wav);
        TRY_CONVERSION;
    }

    if (wav.bitsPerSample != 16) {
        tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
        drwav_uninit(&wav);
        TRY_CONVERSION;
    }

    // Copy what we need out of the decoder before it is torn down.
    const uint64_t n_channels = wav.channels;

    // Upper bound on the number of frames. For stdin it is derived from the
    // byte count instead of the header's frame count, which means it also
    // counts the header bytes and therefore overestimates.
    const uint64_t n_max = from_stdin
        ? wav_data.size()/(n_channels*wav.bitsPerSample/8)
        : wav.totalPCMFrameCount;

    std::vector<int16_t> pcm16;
    pcm16.resize(n_max*n_channels);

    // Use the number of frames actually decoded, so that an overestimate or a
    // truncated file does not turn into trailing silence in the output.
    const uint64_t n = drwav_read_pcm_frames_s16(&wav, n_max, pcm16.data());
    drwav_uninit(&wav);

    // convert to mono, float
    pcmf32.resize(n);
    if (n_channels == 1) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }

    if (stereo) {
        // convert to stereo, float
        pcmf32s.resize(2);

        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }

    return true;
}

void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
const float rc = 1.0f / (2.0f * M_PI * cutoff);
const float dt = 1.0f / sample_rate;
Expand Down
10 changes: 0 additions & 10 deletions whisper.cpp/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@
// Check if a buffer is a WAV audio file
bool is_wav_buffer(const std::string buf);

// Read WAV audio file and store the PCM data into pcmf32
// fname is the path of the audio file, or "-" to read WAV data from stdin
// Audio that is not 16-bit mono/stereo at COMMON_SAMPLE_RATE is converted before loading
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
// Returns false if the audio could not be loaded
bool read_wav(
const std::string & fname,
std::vector<float> & pcmf32,
std::vector<std::vector<float>> & pcmf32s,
bool stereo);

// Write PCM data into WAV audio file
class wav_writer {
private:
Expand Down
5 changes: 3 additions & 2 deletions whisper.cpp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "llamafile/llamafile.h"
#include "llama.cpp/cores.h"
#include "common.h"
#include "slurp.h"

#include "whisper.h"
#include "grammar-parser.h"
Expand Down Expand Up @@ -1108,8 +1109,8 @@ int main(int argc, char ** argv) {
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
if (!slurp_audio_file(fname_inp.c_str(), pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
continue;
}

Expand Down
83 changes: 8 additions & 75 deletions whisper.cpp/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#include "llamafile/debug.h"
#include "common.h"
#include "slurp.h"

#include "whisper.h"
#include "httplib.h"
Expand Down Expand Up @@ -44,8 +45,6 @@ struct server_params
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;

bool ffmpeg_converter = false;
};

struct whisper_params {
Expand Down Expand Up @@ -138,7 +137,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server\n", sparams.ffmpeg_converter ? "true" : "false");
fprintf(stderr, " --recompile [%-7s] Force GPU support to be recompiled at runtime if possible.\n", FLAG_recompile ? "true" : "false");
fprintf(stderr, " --nocompile [%-7s] Never compile GPU support at runtime.", FLAG_nocompile ? "true" : "false");
fprintf(stderr, "\n");
Expand Down Expand Up @@ -224,7 +222,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
else if ( arg == "--recompile") { FLAG_recompile = true; }
else if ( arg == "--nocompile") { FLAG_nocompile = true; }
else if ( arg == "--tinyblas") { FLAG_tinyblas = true; }
Expand Down Expand Up @@ -262,45 +259,6 @@ struct whisper_print_user_data {
int progress_prev;
};

// Checks that the `ffmpeg` command can be run by invoking `ffmpeg -version`.
//
// Returns normally when the command succeeds. Otherwise prints a hint and
// terminates the process with a non-zero exit status, so that scripts and
// service managers can tell that startup failed.
void check_ffmpeg_availibility() {
    const int result = system("ffmpeg -version");

    if (result == 0) {
        std::cout << "ffmpeg is available." << std::endl;
        return;
    }

    // ffmpeg is not available
    std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
    std::cout << "and that its executable is included in your system's PATH. " << std::endl;
    exit(1);
}

// Converts the file at temp_filename, in place, to 16 kHz mono signed 16-bit
// PCM WAV by running the `ffmpeg` command line tool.
//
// temp_filename  path of the file to convert; on success the same path holds
//                the converted audio
// error_resp     set to a JSON error body when the function returns false
//
// Returns true on success. On every failure path the intermediate
// "<temp_filename>_temp.wav" file is removed again.
//
// NOTE(review): both paths are pasted into a shell command inside double
// quotes without escaping. Only pass server-generated file names here, never
// a name supplied by a client.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
    const std::string converted_filename_temp = temp_filename + "_temp.wav";

    // -y: overwrite a stale output file instead of stopping to ask, which
    // would stall a non-interactive server.
    const std::string cmd =
        "ffmpeg -y -i \"" + temp_filename + "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" +
        converted_filename_temp + "\" 2>&1";

    if (std::system(cmd.c_str()) != 0) {
        // ffmpeg may have written a partial output before failing
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
        return false;
    }

    // Remove the original file
    if (remove(temp_filename.c_str()) != 0) {
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
        return false;
    }

    // Rename the temporary file to match the original filename
    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
        remove(converted_filename_temp.c_str());
        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
        return false;
    }
    return true;
}

std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size();
Expand Down Expand Up @@ -558,9 +516,6 @@ int whisper_server_main(int argc, char ** argv) {
exit(0);
}

if (sparams.ffmpeg_converter) {
check_ffmpeg_availibility();
}
// whisper init
struct whisper_context_params cparams = whisper_context_default_params();

Expand Down Expand Up @@ -741,36 +696,14 @@ int whisper_server_main(int argc, char ** argv) {
temp_file << audio_file.content;
temp_file.close();

if (sparams.ffmpeg_converter) {

std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
const bool is_converted = convert_to_wav(temp_filename, error_resp);
if (!is_converted) {
res.set_content(error_resp, "application/json");
return;
}

// read wav content into pcmf32
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
{
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
std::remove(temp_filename.c_str());
return;
}
} else {
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
{
fprintf(stderr, "error: failed to read WAV file\n");
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
return;
}
bool ok = slurp_audio_file(temp_filename.c_str(), pcmf32, pcmf32s, params.diarize);
unlink(temp_filename.c_str());
if (!ok) {
fprintf(stderr, "error: failed to read audio file\n");
const std::string error_resp = "{\"error\":\"failed to read audio file\"}";
res.set_content(error_resp, "application/json");
return;
}
// remove temp file
std::remove(temp_filename.c_str());


printf("Successfully loaded %s\n", filename.c_str());

Expand Down
Loading

0 comments on commit beb2f19

Please sign in to comment.