Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

save the recorded audio to a file #1310

Merged
merged 13 commits into from
Sep 22, 2023
83 changes: 76 additions & 7 deletions examples/stream/stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// A very quick-n-dirty implementation serving mainly as a proof of concept.
//

#include <fstream>
bobqianic marked this conversation as resolved.
Show resolved Hide resolved
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
Expand All @@ -13,7 +13,61 @@
#include <thread>
#include <vector>
#include <fstream>
#include <ctime>

class SimpleWavWriter {
bobqianic marked this conversation as resolved.
Show resolved Hide resolved
private:
std::ofstream file;
int32_t dataSize = 0;

public:
SimpleWavWriter(const std::string &filename, int sampleRate, int bitsPerSample, int channels) {
file.open(filename, std::ios::binary);

file.write("RIFF", 4);
file.write("\0\0\0\0", 4); // Placeholder for file size
file.write("WAVE", 4);
file.write("fmt ", 4);

int32_t subChunkSize = 16;
int16_t audioFormat = 1; // PCM format
int32_t byteRate = sampleRate * channels * bitsPerSample / 8;
int16_t blockAlign = channels * bitsPerSample / 8;

file.write(reinterpret_cast<char *>(&subChunkSize), 4);
file.write(reinterpret_cast<char *>(&audioFormat), 2);
file.write(reinterpret_cast<char *>(&channels), 2);
file.write(reinterpret_cast<char *>(&sampleRate), 4);
file.write(reinterpret_cast<char *>(&byteRate), 4);
file.write(reinterpret_cast<char *>(&blockAlign), 2);
file.write(reinterpret_cast<char *>(&bitsPerSample), 2);
file.write("data", 4);
file.write("\0\0\0\0", 4); // Placeholder for data size
}

void writeData(const float *data, size_t length) {
for (size_t i = 0; i < length; ++i) {
int16_t intSample = static_cast<int16_t>(data[i] * 32767);
file.write(reinterpret_cast<char *>(&intSample), sizeof(int16_t));
dataSize += sizeof(int16_t);
}
bobqianic marked this conversation as resolved.
Show resolved Hide resolved
if (file.is_open()) {
file.seekp(4, std::ios::beg);
int32_t fileSize = 36 + dataSize;
file.write(reinterpret_cast<char *>(&fileSize), 4);
file.seekp(40, std::ios::beg);
file.write(reinterpret_cast<char *>(&dataSize), 4);
file.seekp(0, std::ios::end);
}
}
}
bobqianic marked this conversation as resolved.
Show resolved Hide resolved

~SimpleWavWriter() {
if (file.is_open()) {
file.close();
}
}
};
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t) {
Expand Down Expand Up @@ -52,6 +106,7 @@ struct whisper_params {
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;
bool save_audio = false; // save audio to wav file
};

void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
Expand Down Expand Up @@ -82,6 +137,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }

else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
Expand Down Expand Up @@ -117,6 +173,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
fprintf(stderr, "\n");
}

Expand Down Expand Up @@ -154,7 +211,6 @@ int main(int argc, char ** argv) {
audio.resume();

// whisper init

if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
Expand Down Expand Up @@ -211,15 +267,28 @@ int main(int argc, char ** argv) {
return 1;
}
}

printf("[Start speaking]");
// save wav file
SimpleWavWriter *wavWriter = nullptr;
if (params.save_audio) {
// Get current date/time for filename
time_t now = time(0);
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y%m%d%H%M%S", localtime(&now));
std::string filename = std::string(buffer) + ".wav";

wavWriter = new SimpleWavWriter(filename, WHISPER_SAMPLE_RATE, 16, 1);
}
printf("[Start speaking]\n");
fflush(stdout);

auto t_last = std::chrono::high_resolution_clock::now();
auto t_last = std::chrono::high_resolution_clock::now();
const auto t_start = t_last;

// main audio loop
while (is_running) {
if (params.save_audio && wavWriter) {
wavWriter->writeData(pcmf32_new.data(), pcmf32_new.size());
}
// handle Ctrl + C
is_running = sdl_poll_events();
bobqianic marked this conversation as resolved.
Show resolved Hide resolved

Expand Down Expand Up @@ -371,7 +440,7 @@ int main(int argc, char ** argv) {
fout << std::endl;
}

if (use_vad){
if (use_vad) {
printf("\n");
printf("### Transcription %d END\n", n_iter);
}
Expand Down Expand Up @@ -408,4 +477,4 @@ int main(int argc, char ** argv) {
whisper_free(ctx);

return 0;
}
}
Loading