diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 7f869d8d0a2..ad0131d5cd3 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -2,7 +2,7 @@ // // A very quick-n-dirty implementation serving mainly as a proof of concept. // - +#include #include "common-sdl.h" #include "common.h" #include "whisper.h" @@ -13,7 +13,60 @@ #include #include #include +#include + +class SimpleWavWriter { +private: + std::ofstream file; + int32_t dataSize = 0; + +public: + SimpleWavWriter(const std::string &filename, int sampleRate, int bitsPerSample, int channels) { + file.open(filename, std::ios::binary); + + file.write("RIFF", 4); + file.write("\0\0\0\0", 4); // Placeholder for file size + file.write("WAVE", 4); + file.write("fmt ", 4); + + int32_t subChunkSize = 16; + int16_t audioFormat = 1; // PCM format + int32_t byteRate = sampleRate * channels * bitsPerSample / 8; + int16_t blockAlign = channels * bitsPerSample / 8; + + file.write(reinterpret_cast(&subChunkSize), 4); + file.write(reinterpret_cast(&audioFormat), 2); + file.write(reinterpret_cast(&channels), 2); + file.write(reinterpret_cast(&sampleRate), 4); + file.write(reinterpret_cast(&byteRate), 4); + file.write(reinterpret_cast(&blockAlign), 2); + file.write(reinterpret_cast(&bitsPerSample), 2); + file.write("data", 4); + file.write("\0\0\0\0", 4); // Placeholder for data size + } + + void writeData(const float *data, size_t length) { + for (size_t i = 0; i < length; ++i) { + int16_t intSample = static_cast(data[i] * 32767); + file.write(reinterpret_cast(&intSample), sizeof(int16_t)); + dataSize += sizeof(int16_t); + } + if (file.is_open()) { + file.seekp(4, std::ios::beg); + int32_t fileSize = 36 + dataSize; + file.write(reinterpret_cast(&fileSize), 4); + file.seekp(40, std::ios::beg); + file.write(reinterpret_cast(&dataSize), 4); + file.seekp(0, std::ios::end); + } + } + ~SimpleWavWriter() { + if (file.is_open()) { + file.close(); + } + } +}; // 500 -> 00:05.000 // 6000 -> 01:00.000 std::string to_timestamp(int64_t t) { @@ -52,6 +105,7 @@ struct whisper_params { std::string language = "en"; std::string model = "models/ggml-base.en.bin"; std::string fname_out; + bool save_audio = false; // save audio to wav file }; void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -82,6 +136,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } + else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -117,6 +172,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); + fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false"); fprintf(stderr, "\n"); } @@ -154,7 +210,6 @@ int main(int argc, char ** argv) { audio.resume(); // whisper init - if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){ fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str()); whisper_print_usage(argc, argv, params); @@ -211,15 +266,28 @@ int main(int argc, char ** argv) { return 1; } } - - printf("[Start speaking]"); + // save wav file + SimpleWavWriter *wavWriter = nullptr; + if (params.save_audio) { + // Get current date/time for filename + time_t now = time(0); + char buffer[80]; + strftime(buffer, sizeof(buffer), "%Y%m%d%H%M%S", localtime(&now)); + std::string filename = std::string(buffer) + ".wav"; + + wavWriter = new SimpleWavWriter(filename, WHISPER_SAMPLE_RATE, 16, 1); + } + printf("[Start speaking]\n"); fflush(stdout); - auto t_last = std::chrono::high_resolution_clock::now(); + auto t_last = std::chrono::high_resolution_clock::now(); const auto t_start = t_last; // main audio loop while (is_running) { + if (params.save_audio && wavWriter) { + wavWriter->writeData(pcmf32_new.data(), pcmf32_new.size()); + } // handle Ctrl + C is_running = sdl_poll_events(); @@ -371,7 +439,7 @@ int main(int argc, char ** argv) { fout << std::endl; } - if (use_vad){ + if (use_vad) { printf("\n"); printf("### Transcription %d END\n", n_iter); } @@ -408,4 +476,4 @@ int main(int argc, char ** argv) { whisper_free(ctx); return 0; -} +} \ No newline at end of file