Skip to content

Commit

Permalink
Added processEOS and resolved issue #335 with a unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
kushaljain-apra committed Mar 1, 2024
1 parent 5358310 commit 4ab2e39
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 4 deletions.
2 changes: 2 additions & 0 deletions base/include/AudioToTextXForm.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class AudioToTextXForm : public Module
bool validateOutputPins();
void addInputPin(framemetadata_sp& metadata, string& pinId);
bool handlePropsChange(frame_sp& frame);
bool processEOS(string &pinId);
bool handleFlushingBuffer();

private:
void setMetadata(framemetadata_sp& metadata);
Expand Down
17 changes: 15 additions & 2 deletions base/src/AudioToTextXForm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,7 @@ bool AudioToTextXForm::process(frame_container& frames)
}

if (mDetail->mInputAudioBuffer.size() < mDetail->mProps.bufferSize) {
sendEOS();
return true;
return handleFlushingBuffer();
}
whisper_full(
mDetail->mWhisperContext,
Expand Down Expand Up @@ -221,4 +220,18 @@ void AudioToTextXForm::setProps(AudioToTextXFormProps& props)
throw AIPException(AIP_FATAL, "Model Path dynamic change not handled");
}
Module::addPropsToQueue(props);
}

// Drops every sample currently held in the module's input audio buffer.
// The pinId argument is not consulted by this implementation.
// Always returns true.
bool AudioToTextXForm::processEOS(string &pinId)
{
    auto &pendingAudio = mDetail->mInputAudioBuffer;
    pendingAudio.clear();
    return true;
}

bool AudioToTextXForm::handleFlushingBuffer()
{
mDetail->mInputAudioBuffer.clear();
LOG_ERROR << "Flushed Buffer Successfully...\n";
Module::sendEOS();
return true;
}
63 changes: 61 additions & 2 deletions base/test/audioToTextXform_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,9 @@ BOOST_AUTO_TEST_CASE(change_unsupported_prop_asr)
BOOST_CHECK_THROW(asr->setProps(propschange), std::runtime_error);
}

BOOST_AUTO_TEST_CASE(checkEOS_asr)
BOOST_AUTO_TEST_CASE(check_eos_frame_asr)
{
std::vector<std::string> asrOutText = { "./data/asr_out.txt" };
std::vector<std::string> asrOutText = { "./data/asr_check_eos_frame.txt" };
Test_Utils::FileCleaner f(asrOutText);

Logger::setLogLevel(boost::log::trivial::severity_level::info);
Expand Down Expand Up @@ -273,4 +273,63 @@ BOOST_AUTO_TEST_CASE(checkEOS_asr)
in_file_text.close();
}

// Checks that a partially filled audio buffer is flushed when an EOS is
// emitted, so that a later transcription is produced from fresh input only.
//
// Flow:
//   1. Run one reader/ASR step with the ASR constructed with 160000
//      (presumably the bufferSize - confirm against AudioToTextXFormProps)
//      and expect an EOS frame on the sink.
//   2. Change bufferSize to 18000 via setProps, run two more steps, and
//      compare the text written to the output file against the reference.
BOOST_AUTO_TEST_CASE(check_flushed_buffer_asr)
{
    std::vector<std::string> asrOutText = { "./data/asr_flushed_buffer.txt" };
    Test_Utils::FileCleaner f(asrOutText);

    Logger::setLogLevel(boost::log::trivial::severity_level::info);

    // This is a PCM file without WAV header
    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
    fileReaderProps.readLoop = true;
    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
    auto pinId = fileReader->addOutputPin(metadata);

    auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps(
        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY
        ,"./data/whisper/models/ggml-tiny.en-q8_0.bin",160000)));
    fileReader->setNext(asr);

    auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
    asr->setNext(outputFile);

    auto sink = boost::shared_ptr<ExternalSinkModule>(new ExternalSinkModule());
    asr->setNext(sink);

    BOOST_TEST(fileReader->init());
    BOOST_TEST(asr->init());
    BOOST_TEST(outputFile->init());
    BOOST_TEST(sink->init());

    // First pass: a single step is expected to yield an EOS frame.
    fileReader->step();
    asr->step();

    auto frames = sink->pop();
    auto eosframe = frames.begin()->second;
    BOOST_TEST(eosframe->isEOS());

    outputFile->step();

    // Second pass: switch to a smaller buffer size and feed two more reads.
    AudioToTextXFormProps propschange = asr->getProps();
    propschange.bufferSize = 18000;
    asr->setProps(propschange);

    for (int i = 0; i < 2; i++) {
        fileReader->step();
        asr->step();
    }
    outputFile->step();

    std::ifstream in_file_text(asrOutText[0]);
    std::ostringstream buffer;
    buffer << in_file_text.rdbuf();
    std::string output = " The Matic speech recognition also known as ASR is the use of machine learning or artificial intelligence technology to process human speech into readable text.";
    // The transcript is compared by cosine similarity against a threshold
    // rather than by exact string equality.
    double thres = 0.95;
    BOOST_TEST(cosineSimilarity(buffer.str(), output) >= thres);
    // in_file_text is closed by its destructor when the test case ends.
}

BOOST_AUTO_TEST_SUITE_END()

0 comments on commit 4ab2e39

Please sign in to comment.