From 5358310a48ada5ba3c81e41652bcacb7a4acd102 Mon Sep 17 00:00:00 2001 From: Kashyap Jois Date: Wed, 28 Feb 2024 14:41:13 +0400 Subject: [PATCH] Integrate Whisper CPP and write a wrapper module in Aprapipes (#324) * Add custom port vcpkg for whisper * Add whisper stream * Add whisper stream header * Add whisper cpp to Cmake list * Add test frame type and minor changes * Add whisper to vcpkg * Add vcpkg custom overlay ports to thirdparty * Modify with whisper option * Send whisper output as text frames * revert changes to sound record test * Add whisper UT * Fix PS to remove whisper from vcpkg json * Revert changes to OPTIONS section, remove WHISPER option, rename Whisper source files to generic AudioToTextXForm * Move pcm to git lfs * Add pcm and model bin file to lfs * Fix UT name * Throw AIP exception for unknown strategy * Revert sound_record_tests.cpp changes * Revert changes to vcpkg indentation and remove Whisper option * Linux -> OFF to ON Windows ON -> OFF * Add reserve statement for vector Move constructor impl * update submodule for pipeline to run * Update whisper port with install fix * update submodule * Update vcpkg version * Add changes to handle props change * Improve UT and refactor for changing sample strategy during run time. * Add apt-get install libx11-dev libgles2-mesa-dev for libepoxy error * Add memory type check in validate input pins and throw exception if model path changes. 
* update submodule * update vcpkg mysys2 * update submodule * Address nits * Export env variable overlay port for building in arm64 * added fix-for-arm64.patch for whisper * update fix-vcpkg-json.ps1 * update CMakeLists.txt * update vcpkg url for build * update whisper tests threshold * update code formatting * update whisper test * added EOS for small buffer size --------- Co-authored-by: Kushal Jain Co-authored-by: Vinayak Y-B --- .github/workflows/CI-Linux-ARM64.yml | 2 +- .../workflows/build-test-lin-container.yml | 2 +- .github/workflows/build-test-lin-wsl.yml | 2 +- .github/workflows/build-test-lin.yml | 2 +- base/CMakeLists.txt | 16 +- base/fix-vcpkg-json.ps1 | 4 + base/fix-vcpkg-json.sh | 4 + base/include/AudioToTextXForm.h | 57 ++++ base/include/FrameMetadata.h | 3 +- base/src/AudioToTextXForm.cpp | 224 ++++++++++++++ base/test/audioToTextXform_tests.cpp | 276 ++++++++++++++++++ base/vcpkg.json | 9 +- build_jetson.sh | 2 +- data/.gitattributes | 1 + data/audioToTextXform_test.pcm | 3 + data/whisper/models/.gitattributes | 1 + data/whisper/models/ggml-tiny.en-q8_0.bin | 3 + .../whisper/fix-for-arm64.patch | 127 ++++++++ .../custom-overlay/whisper/portfile.cmake | 40 +++ thirdparty/custom-overlay/whisper/usage | 4 + thirdparty/custom-overlay/whisper/vcpkg.json | 28 ++ vcpkg | 2 +- 22 files changed, 797 insertions(+), 15 deletions(-) create mode 100644 base/include/AudioToTextXForm.h create mode 100644 base/src/AudioToTextXForm.cpp create mode 100644 base/test/audioToTextXform_tests.cpp create mode 100644 data/.gitattributes create mode 100644 data/audioToTextXform_test.pcm create mode 100644 data/whisper/models/.gitattributes create mode 100644 data/whisper/models/ggml-tiny.en-q8_0.bin create mode 100644 thirdparty/custom-overlay/whisper/fix-for-arm64.patch create mode 100644 thirdparty/custom-overlay/whisper/portfile.cmake create mode 100644 thirdparty/custom-overlay/whisper/usage create mode 100644 thirdparty/custom-overlay/whisper/vcpkg.json diff --git 
a/.github/workflows/CI-Linux-ARM64.yml b/.github/workflows/CI-Linux-ARM64.yml index 438ef51a2..383963a94 100644 --- a/.github/workflows/CI-Linux-ARM64.yml +++ b/.github/workflows/CI-Linux-ARM64.yml @@ -19,7 +19,7 @@ jobs: cuda: 'ON' prep-cmd: 'echo skipping builder prep as I can not sudo' cache-path: './none' - cmake-conf-cmd: 'export VCPKG_FORCE_SYSTEM_BINARIES=1 && cmake -B . -DENABLE_ARM64=ON ../base' + cmake-conf-cmd: 'export VCPKG_FORCE_SYSTEM_BINARIES=1 && export VCPKG_OVERLAY_PORTS=../thirdparty/custom-overlay && cmake -B . -DENABLE_ARM64=ON ../base' nProc: 6 jetson-publish: needs: jetson-build-test diff --git a/.github/workflows/build-test-lin-container.yml b/.github/workflows/build-test-lin-container.yml index 3bd29c054..53158fc99 100644 --- a/.github/workflows/build-test-lin-container.yml +++ b/.github/workflows/build-test-lin-container.yml @@ -30,7 +30,7 @@ on: prep-cmd: type: string description: 'commands required to be run on a builder to prep it for build' - default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar autoconf automake autopoint build-essential flex git-core libass-dev libfreetype6-dev libgnutls28-dev libmp3lame-dev libsdl2-dev libtool libsoup-gnome2.4-dev libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev ninja-build pkg-config texinfo wget yasm zlib1g-dev nasm gperf bison python3 python3-pip dos2unix && pip3 install meson' + default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar autoconf automake autopoint build-essential flex git-core libass-dev libfreetype6-dev libgnutls28-dev libmp3lame-dev libsdl2-dev libtool libsoup-gnome2.4-dev libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev ninja-build pkg-config texinfo wget yasm zlib1g-dev nasm gperf bison python3 python3-pip dos2unix libx11-dev libgles2-mesa-dev && pip3 install 
meson' required: false prep-check-cmd: type: string diff --git a/.github/workflows/build-test-lin-wsl.yml b/.github/workflows/build-test-lin-wsl.yml index a193b781a..d74aa1b0c 100644 --- a/.github/workflows/build-test-lin-wsl.yml +++ b/.github/workflows/build-test-lin-wsl.yml @@ -30,7 +30,7 @@ on: prep-cmd: type: string description: 'commands required to be run on a builder to prep it for build' - default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar autoconf automake autopoint build-essential flex git-core libass-dev libfreetype6-dev libgnutls28-dev libmp3lame-dev libsdl2-dev libtool libsoup-gnome2.4-dev libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev ninja-build pkg-config texinfo wget yasm zlib1g-dev nasm gperf bison python3 python3-pip dos2unix && pip3 install meson' + default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar autoconf automake autopoint build-essential flex git-core libass-dev libfreetype6-dev libgnutls28-dev libmp3lame-dev libsdl2-dev libtool libsoup-gnome2.4-dev libva-dev libvdpau-dev libvorbis-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev ninja-build pkg-config texinfo wget yasm zlib1g-dev nasm gperf bison python3 python3-pip dos2unix libx11-dev libgles2-mesa-dev && pip3 install meson' required: false prep-check-cmd: type: string diff --git a/.github/workflows/build-test-lin.yml b/.github/workflows/build-test-lin.yml index e97f3ce9a..3098a0adb 100644 --- a/.github/workflows/build-test-lin.yml +++ b/.github/workflows/build-test-lin.yml @@ -30,7 +30,7 @@ on: prep-cmd: type: string description: 'commands required to be run on a builder to prep it for build' - default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar autoconf automake autopoint build-essential flex git-core libass-dev libfreetype6-dev libgnutls28-dev 
libmp3lame-dev libsdl2-dev libtool libsoup-gnome2.4-dev libva-dev libvdpau-dev libvorbis-dev libxdamage-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev ninja-build pkg-config texinfo wget yasm zlib1g-dev nasm gperf bison python3 python3-pip dos2unix && pip3 install meson' + default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar autoconf automake autopoint build-essential flex git-core libass-dev libfreetype6-dev libgnutls28-dev libmp3lame-dev libsdl2-dev libtool libsoup-gnome2.4-dev libva-dev libvdpau-dev libvorbis-dev libxdamage-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev ninja-build pkg-config texinfo wget yasm zlib1g-dev nasm gperf bison python3 python3-pip dos2unix libx11-dev libgles2-mesa-dev && pip3 install meson' required: false prep-check-cmd: type: string diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index f9d14b705..9f2cd1470 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -6,6 +6,8 @@ OPTION(ENABLE_ARM64 "Use this switch to enable ARM64" OFF) OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" OFF) set(VCPKG_INSTALL_OPTIONS "--clean-after-build") +set(VCPKG_OVERLAY_PORTS "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty/custom-overlay") + IF(ENABLE_CUDA) add_compile_definitions(APRA_CUDA_ENABLED) ENDIF(ENABLE_CUDA) @@ -23,6 +25,7 @@ IF(ENABLE_ARM64) add_compile_definitions(ARM64) set(VCPKG_OVERLAY_PORTS ../vcpkg/ports/cudnn) set(VCPKG_OVERLAY_TRIPLETS ../vcpkg/triplets/community/arm64-linux.cmake) + set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) ENDIF(ENABLE_ARM64) #use /MP only for language CXX (and not CUDA) and MSVC for both targets @@ -38,8 +41,6 @@ project(APRAPIPES) message(STATUS $ENV{PKG_CONFIG_PATH}">>>>>> PKG_CONFIG_PATH") find_package(PkgConfig REQUIRED) - - find_package(Boost COMPONENTS system thread filesystem serialization log chrono unit_test_framework REQUIRED) find_package(JPEG REQUIRED) 
find_package(OpenCV CONFIG REQUIRED) @@ -50,6 +51,7 @@ find_package(FFMPEG REQUIRED) find_package(ZXing CONFIG REQUIRED) find_package(bigint CONFIG REQUIRED) find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED) +find_package(whisper CONFIG REQUIRED) IF(ENABLE_CUDA) if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL "")) @@ -280,10 +282,9 @@ SET(IP_FILES src/OverlayFactory.h src/OverlayFactory.cpp src/TestSignalGeneratorSrc.cpp + src/AudioToTextXForm.cpp ) - - - + SET(IP_FILES_H include/HistogramOverlay.h include/CalcHistogramCV.h @@ -306,10 +307,9 @@ SET(IP_FILES_H include/TextOverlayXForm.h include/ColorConversionXForm.h include/Overlay.h + include/AudioToTextXForm.h ) - - SET(CUDA_CORE_FILES src/apra_cudamalloc_allocator.cu src/apra_cudamallochost_allocator.cu @@ -561,6 +561,7 @@ SET(UT_FILES test/mp4_dts_strategy_tests.cpp test/overlaymodule_tests.cpp test/testSignalGeneratorSrc_tests.cpp + test/audioToTextXform_tests.cpp ${ARM64_UT_FILES} ${CUDA_UT_FILES} ) @@ -607,6 +608,7 @@ target_link_libraries(aprapipesut liblzma::liblzma bigint::bigint sfml-audio + whisper::whisper ) IF(ENABLE_WINDOWS) diff --git a/base/fix-vcpkg-json.ps1 b/base/fix-vcpkg-json.ps1 index f2f8bcde9..634ef9c83 100644 --- a/base/fix-vcpkg-json.ps1 +++ b/base/fix-vcpkg-json.ps1 @@ -8,6 +8,10 @@ if ($removeCUDA.IsPresent) $v.dependencies | Where-Object { $_.name -eq 'opencv4' } | ForEach-Object { $_.features = $_.features -ne 'cuda' -ne 'cudnn' } + + $v.dependencies | + Where-Object { $_.name -eq 'whisper' } | + ForEach-Object { $_.features = $_.features -ne 'cuda' } } if($removeOpenCV.IsPresent) diff --git a/base/fix-vcpkg-json.sh b/base/fix-vcpkg-json.sh index e4b63c59a..74bfc0aa3 100644 --- a/base/fix-vcpkg-json.sh +++ b/base/fix-vcpkg-json.sh @@ -21,6 +21,10 @@ if $removeCUDA; then # Remove "cuda" and "cudnn" features for this "opencv4" instance v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\" and . 
!= \"cudnn\"))") fi + if [ "$name" == "whisper" ]; then + # Remove the "cuda" feature for this "whisper" instance + v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\"))") + fi done fi diff --git a/base/include/AudioToTextXForm.h b/base/include/AudioToTextXForm.h new file mode 100644 index 000000000..c95cf796f --- /dev/null +++ b/base/include/AudioToTextXForm.h @@ -0,0 +1,57 @@ +#pragma once + +#include "Module.h" + +// size of audio to process should be a parameter. +// Cache variable to collect frames for processing + +class AudioToTextXFormProps : public ModuleProps +{ +public: + enum DecoderSamplingStrategy { + GREEDY, + BEAM_SEARCH + }; + + DecoderSamplingStrategy samplingStrategy; + std::string modelPath; + int bufferSize; + + AudioToTextXFormProps( + DecoderSamplingStrategy _samplingStrategy, + std::string _modelPath, + int _bufferSize); + size_t getSerializeSize(); + + +private: + friend class boost::serialization::access; + + template + void serialize(Archive& ar, const unsigned int version); +}; + +class AudioToTextXForm : public Module +{ + +public: + AudioToTextXForm(AudioToTextXFormProps _props); + virtual ~AudioToTextXForm(); + bool init(); + bool term(); + void setProps(AudioToTextXFormProps& props); + AudioToTextXFormProps getProps(); + +protected: + bool process(frame_container& frames); + bool processSOS(frame_sp& frame); + bool validateInputPins(); + bool validateOutputPins(); + void addInputPin(framemetadata_sp& metadata, string& pinId); + bool handlePropsChange(frame_sp& frame); + +private: + void setMetadata(framemetadata_sp& metadata); + class Detail; + boost::shared_ptr mDetail; +}; diff --git a/base/include/FrameMetadata.h b/base/include/FrameMetadata.h index ca8e5f646..ebddf592b 100755 --- a/base/include/FrameMetadata.h +++ b/base/include/FrameMetadata.h @@ -50,7 +50,8 @@ class FrameMetadata { HEVC_DATA, //H265 MOTION_VECTOR_DATA, OVERLAY_INFO_IMAGE, - FACE_LANDMARKS_INFO + FACE_LANDMARKS_INFO, + TEXT }; enum MemType
diff --git a/base/src/AudioToTextXForm.cpp b/base/src/AudioToTextXForm.cpp new file mode 100644 index 000000000..d84a13073 --- /dev/null +++ b/base/src/AudioToTextXForm.cpp @@ -0,0 +1,224 @@ +#include "AudioToTextXForm.h" +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "Utils.h" +#include "whisper.h" +#include "SFML/Config.hpp" + +AudioToTextXFormProps::AudioToTextXFormProps( + DecoderSamplingStrategy _samplingStrategy, + std::string _modelPath, + int _bufferSize) : samplingStrategy(_samplingStrategy), + modelPath(_modelPath), + bufferSize(_bufferSize) +{} + +size_t AudioToTextXFormProps::getSerializeSize() { + return ModuleProps::getSerializeSize() + + sizeof(samplingStrategy) + + sizeof(modelPath) + + sizeof(bufferSize); +} + +template +void AudioToTextXFormProps::serialize(Archive& ar, const unsigned int version) { + ar& boost::serialization::base_object(*this); + ar& samplingStrategy; + ar& modelPath; + ar& bufferSize; +} + +class AudioToTextXForm::Detail +{ +public: + Detail(AudioToTextXFormProps& _props) : mProps(_props) + { + } + ~Detail() {} + + whisper_full_params fetchDefaultParams() { + auto samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY; + switch (mProps.samplingStrategy) + { + case AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY: + samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY; + break; + case AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH: + samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH; + break; + default: + throw AIPException(AIP_FATAL, "Unknown Sampling Strategy"); + } + return whisper_full_default_params(samplingStrategy); + } + + void setProps(AudioToTextXFormProps& props) + { + mProps = props; + } + +public: + framemetadata_sp mOutputMetadata; + std::string mOutputPinId; + std::vector mInputAudioBuffer; + AudioToTextXFormProps mProps; + int mFrameType; + whisper_context *mWhisperContext 
= NULL; + whisper_full_params mWhisperFullParams; + whisper_context_params mWhisperContextParams; + int toleranceBufferSize = 16000; //16000 is 1 second worth of samples since data is captured at 16KHz +}; + +AudioToTextXForm::AudioToTextXForm(AudioToTextXFormProps _props) : Module(TRANSFORM, "AudioToTextXForm", _props) +{ + mDetail.reset(new Detail(_props)); +} + +AudioToTextXForm::~AudioToTextXForm() {} + +bool AudioToTextXForm::validateInputPins() +{ + //TODO: Reject any audio pin that has a samplingRate!=16KHz + //https://github.com/Apra-Labs/ApraPipes/issues/325 + if (getNumberOfInputPins() != 1) + { + LOG_ERROR << "<" << getId() << ">::validateInputPins size is expected to be 1. Actual<" << getNumberOfInputPins() << ">"; + return false; + } + + framemetadata_sp metadata = getFirstInputMetadata(); + + FrameMetadata::FrameType frameType = metadata->getFrameType(); + + if (frameType != FrameMetadata::AUDIO) + { + LOG_ERROR << "<" << getId() << ">::validateInputPins input frameType is expected to be Audio. Actual<" << frameType << ">"; + return false; + } + + FrameMetadata::MemType memType = metadata->getMemType(); + if (memType != FrameMetadata::MemType::HOST) + { + LOG_ERROR << "<" << getId() << ">::validateInputPins input memType is expected to be HOST. Actual<" << memType << ">"; + return false; + } + return true; +} + +bool AudioToTextXForm::validateOutputPins() +{ + if (getNumberOfOutputPins() != 1) + { + LOG_ERROR << "<" << getId() << ">::validateOutputPins size is expected to be 1. Actual<" << getNumberOfOutputPins() << ">"; + return false; + } + + framemetadata_sp metadata = getFirstOutputMetadata(); + FrameMetadata::FrameType frameType = metadata->getFrameType(); + if (frameType != FrameMetadata::TEXT) + { + LOG_ERROR << "<" << getId() << ">::validateOutputPins input frameType is expected to be TEXT. 
Actual<" << frameType << ">"; + return false; + } + + return true; +} + +void AudioToTextXForm::addInputPin(framemetadata_sp& metadata, string& pinId) +{ + Module::addInputPin(metadata, pinId); + mDetail->mOutputMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::TEXT)); + mDetail->mOutputMetadata->copyHint(*metadata.get()); + mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata); +} + +bool AudioToTextXForm::init() +{ + mDetail->mInputAudioBuffer.reserve(mDetail->mProps.bufferSize + mDetail->toleranceBufferSize); + mDetail->mWhisperFullParams = mDetail->fetchDefaultParams(); + mDetail->mWhisperContextParams = whisper_context_default_params(); + mDetail->mWhisperContext = whisper_init_from_file_with_params(mDetail->mProps.modelPath.c_str(), mDetail->mWhisperContextParams); + return (mDetail->mWhisperContext != NULL) && Module::init(); // a NULL context means the model was not loaded, so fail init instead of crashing in process() +} + +bool AudioToTextXForm::term() +{ + // The params structs are by-value members of Detail, so only the context returned by whisper_init_from_file_with_params() needs freeing. + whisper_free(mDetail->mWhisperContext); + mDetail->mWhisperContext = NULL; + return Module::term(); +} + +bool AudioToTextXForm::process(frame_container& frames) +{ + auto frame = frames.begin()->second; + sf::Int16* constFloatPointer = static_cast(frame->data()); + int numberOfSamples = frame->size() / 2; + //TODO: Modify to use NPP/ IPP + for (int index = 0; index < numberOfSamples; index++) { + mDetail->mInputAudioBuffer.push_back((float)constFloatPointer[index]/ 32768.0f); + } + + if (mDetail->mInputAudioBuffer.size() < mDetail->mProps.bufferSize) { + sendEOS(); + return true; + } + whisper_full( + mDetail->mWhisperContext, + mDetail->mWhisperFullParams, + mDetail->mInputAudioBuffer.data(), + mDetail->mInputAudioBuffer.size() + ); + std::string output = ""; + const int n_segments = whisper_full_n_segments(mDetail->mWhisperContext); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(mDetail->mWhisperContext, i); + output += text; + } + mDetail->mInputAudioBuffer.clear(); + auto
outFrame = makeFrame(output.length()); + memcpy(outFrame->data(), output.c_str(), output.length()); + frames.insert(make_pair(mDetail->mOutputPinId, outFrame)); + send(frames); + return true; +} + +void AudioToTextXForm::setMetadata(framemetadata_sp& metadata) +{ + if (!metadata->isSet()) + { + return; + } +} + +bool AudioToTextXForm::processSOS(frame_sp& frame) +{ + auto metadata = frame->getMetadata(); + setMetadata(metadata); + return true; +} + +AudioToTextXFormProps AudioToTextXForm::getProps() +{ + fillProps(mDetail->mProps); + return mDetail->mProps; +} + +bool AudioToTextXForm::handlePropsChange(frame_sp& frame) +{ + AudioToTextXFormProps props(mDetail->mProps.samplingStrategy, mDetail->mProps.modelPath, mDetail->mProps.bufferSize); + auto ret = Module::handlePropsChange(frame, props); + mDetail->setProps(props); + mDetail->mWhisperFullParams = mDetail->fetchDefaultParams(); + return ret; +} + +void AudioToTextXForm::setProps(AudioToTextXFormProps& props) +{ + if (props.modelPath != mDetail->mProps.modelPath) { + throw AIPException(AIP_FATAL, "Model Path dynamic change not handled"); + } + Module::addPropsToQueue(props); +} \ No newline at end of file diff --git a/base/test/audioToTextXform_tests.cpp b/base/test/audioToTextXform_tests.cpp new file mode 100644 index 000000000..b566f5bb2 --- /dev/null +++ b/base/test/audioToTextXform_tests.cpp @@ -0,0 +1,276 @@ +#include +#include "stdafx.h" +#include +#include + +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "test_utils.h" +#include "PipeLine.h" +#include "FileWriterModule.h" +#include "FileReaderModule.h" +#include "FileWriterModule.h" +#include "AudioToTextXForm.h" +#include "Module.h" +#include "ExternalSinkModule.h" + +#include +#include +#include + +// Function to calculate the frequency of each word in a string +std::unordered_map calculateWordFrequency(const std::string &str) +{ + std::unordered_map frequencyMap; + std::string word = 
""; + for (char c : str) + { + if (c == ' ' || c == '.' || c == ',' || c == ';' || c == ':' || c == '!' || c == '?') + { + if (!word.empty()) + { + frequencyMap[word]++; + word = ""; + } + } + else + { + word += std::tolower(c); + } + } + if (!word.empty()) + { + frequencyMap[word]++; + } + return frequencyMap; +} + +// Function to calculate dot product of two vectors +double dotProduct(const std::unordered_map &vec1, const std::unordered_map &vec2) +{ + double dotProduct = 0.0; + for (const auto &pair : vec1) + { + if (vec2.count(pair.first) > 0) + { + dotProduct += pair.second * vec2.at(pair.first); + } + } + return dotProduct; +} + +// Function to calculate magnitude of a vector +double magnitude(const std::unordered_map &vec) +{ + double mag = 0.0; + for (const auto &pair : vec) + { + mag += std::pow(pair.second, 2); + } + return std::sqrt(mag); +} + +// Function to calculate cosine similarity between two strings +double cosineSimilarity(const std::string &str1, const std::string &str2) +{ + unordered_map vec1 = calculateWordFrequency(str1); + unordered_map vec2 = calculateWordFrequency(str2); + + double dotProd = dotProduct(vec1, vec2); + double magVec1 = magnitude(vec1); + double magVec2 = magnitude(vec2); + + if (magVec1 == 0 || magVec2 == 0) + { + return 0; // Handle division by zero + } + + return dotProd / (magVec1 * magVec2); +} + +BOOST_AUTO_TEST_SUITE(audioToTextXform_test) + +BOOST_AUTO_TEST_CASE(test_asr) +{ + std::vector asrOutText = { "./data/asr_out.txt" }; + Test_Utils::FileCleaner f(asrOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + // This is a PCM file without WAV header + auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm"); + fileReaderProps.readLoop = false; + auto fileReader = boost::shared_ptr(new FileReaderModule(fileReaderProps)); + auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO)); + auto pinId = fileReader->addOutputPin(metadata); + + auto asr = 
boost::shared_ptr(new AudioToTextXForm(AudioToTextXFormProps( + AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY + ,"./data/whisper/models/ggml-tiny.en-q8_0.bin",18000))); + fileReader->setNext(asr); + + auto outputFile = boost::shared_ptr(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false))); + asr->setNext(outputFile); + + BOOST_TEST(fileReader->init()); + BOOST_TEST(asr->init()); + BOOST_TEST(outputFile->init()); + + fileReader->step(); + asr->step(); + outputFile->step(); + + std::ifstream in_file_text(asrOutText[0]); + std::ostringstream buffer; + buffer << in_file_text.rdbuf(); + std:string output = " The Matic speech recognition also known as ASR is the use of machine learning or artificial intelligence technology to process human speech into readable text."; + double thres = 0.95; + BOOST_TEST(cosineSimilarity(buffer.str(), output) >= thres); + // BOOST_TEST(buffer.str() == output); + in_file_text.close(); +} + +BOOST_AUTO_TEST_CASE(changeprop_asr) +{ + std::vector<std::string> asrOutText = { "./data/asr_change_props_out.txt" }; + Test_Utils::FileCleaner f(asrOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + // This is a PCM file without WAV header + auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm"); + fileReaderProps.readLoop = true; + auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps)); + auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO)); + auto pinId = fileReader->addOutputPin(metadata); + + auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps( + AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY + , "./data/whisper/models/ggml-tiny.en-q8_0.bin", 18000))); + fileReader->setNext(asr); + + auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false))); + asr->setNext(outputFile); + + BOOST_TEST(fileReader->init()); + BOOST_TEST(asr->init()); + BOOST_TEST(outputFile->init()); + +
AudioToTextXFormProps propschange = asr->getProps(); + propschange.bufferSize = 20000; + propschange.samplingStrategy = AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH; + fileReader->step(); + asr->step(); + outputFile->step(); + + asr->setProps(propschange); + for (int i = 0; i < 2; i++) { + fileReader->step(); + asr->step(); + } + outputFile->step(); + propschange = asr->getProps(); // re-read the props so the assertions below check what the module now reports + std::ifstream in_file_text(asrOutText[0]); + std::ostringstream buffer; + buffer << in_file_text.rdbuf(); + std::string output = " Metex speech recognition, also known as ASR, is the use of machine learning or artificial intelligence technology to process human speech into readable text."; + // TODO: This test fails on Linux CUDA, possibly because beam search or the changed buffer size behaves differently than on Windows. + double thres = 0.95; + BOOST_TEST(cosineSimilarity(buffer.str(), output) >= thres); + // BOOST_TEST(buffer.str() == output); + + in_file_text.close(); + + BOOST_TEST( + (propschange.bufferSize == 20000)); + BOOST_TEST( + (propschange.samplingStrategy == AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH)); +} + +BOOST_AUTO_TEST_CASE(change_unsupported_prop_asr) +{ + std::vector<std::string> asrOutText = { "./data/asr_change_props_out.txt" }; + Test_Utils::FileCleaner f(asrOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + // This is a PCM file without WAV header + auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm"); + fileReaderProps.readLoop = true; + auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps)); + auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO)); + auto pinId = fileReader->addOutputPin(metadata); + + auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps( + AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY + , "./data/whisper/models/ggml-tiny.en-q8_0.bin", 18000))); + fileReader->setNext(asr); + +
auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false))); + asr->setNext(outputFile); + + BOOST_TEST(fileReader->init()); + BOOST_TEST(asr->init()); + BOOST_TEST(outputFile->init()); + + AudioToTextXFormProps propschange = asr->getProps(); + propschange.modelPath = "./newpath.bin"; // differs from the model path passed at construction + fileReader->step(); + asr->step(); + outputFile->step(); + + BOOST_CHECK_THROW(asr->setProps(propschange), std::runtime_error); +} + +BOOST_AUTO_TEST_CASE(checkEOS_asr) +{ + std::vector<std::string> asrOutText = { "./data/asr_out.txt" }; + Test_Utils::FileCleaner f(asrOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + // This is a PCM file without WAV header + auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm"); + fileReaderProps.readLoop = false; + auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps)); + auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO)); + auto pinId = fileReader->addOutputPin(metadata); + + auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps( + AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY + ,"./data/whisper/models/ggml-tiny.en-q8_0.bin",160000))); + fileReader->setNext(asr); + + auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false))); + asr->setNext(outputFile); + + auto sink = boost::shared_ptr<ExternalSinkModule>(new ExternalSinkModule()); + asr->setNext(sink); + + BOOST_TEST(fileReader->init()); + BOOST_TEST(asr->init()); + BOOST_TEST(outputFile->init()); + BOOST_TEST(sink->init()); + + fileReader->step(); + asr->step(); + + auto frames = sink->pop(); + auto eosframe = frames.begin()->second; + BOOST_TEST(eosframe->isEOS()); + + outputFile->step(); + + std::ifstream in_file_text(asrOutText[0]); + std::ostringstream buffer; + buffer << in_file_text.rdbuf(); + std::string output = " The Matic speech recognition also known as ASR is the use of machine learning or artificial
intelligence technology to process human speech into readable text."; + double thres = 0; // this case expects zero similarity with the reference text; NOTE(review): presumably nothing is transcribed before EOS - confirm + BOOST_TEST(cosineSimilarity(buffer.str(), output) == thres); + // BOOST_TEST(buffer.str() == output); + in_file_text.close(); +} + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/base/vcpkg.json b/base/vcpkg.json index e27033817..4df4664c0 100644 --- a/base/vcpkg.json +++ b/base/vcpkg.json @@ -2,8 +2,15 @@ "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg/master/scripts/vcpkg.schema.json", "name": "apra-pipes-cuda", "version": "0.0.1", - "builtin-baseline": "46cf263b3d4bfab6d322b47ab40222db167c28b1", + "builtin-baseline": "eac79fc7bda260819c646d10c97dec825305aecd", "dependencies": [ + { + "name": "whisper", + "default-features": false, + "features": [ + "cuda" + ] + }, { "name": "opencv4", "default-features": false, diff --git a/build_jetson.sh b/build_jetson.sh index aca502de1..cafbd407a 100755 --- a/build_jetson.sh +++ b/build_jetson.sh @@ -9,5 +9,5 @@ cd .. CMAKE_THCOUNT=$(sh ./checkProc.sh) mkdir -p _build cd _build -cmake -B . -DENABLE_ARM64=ON -DENABLE_WINDOWS=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo ../base -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake +export VCPKG_FORCE_SYSTEM_BINARIES=1 && export VCPKG_OVERLAY_PORTS=../thirdparty/custom-overlay && cmake -B . -DENABLE_ARM64=ON -DENABLE_WINDOWS=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo ../base -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake cmake --build .
-- -j "$(($(nproc) - 1))" diff --git a/data/.gitattributes b/data/.gitattributes new file mode 100644 index 000000000..478d8b918 --- /dev/null +++ b/data/.gitattributes @@ -0,0 +1 @@ +*.pcm filter=lfs diff=lfs merge=lfs -text diff --git a/data/audioToTextXform_test.pcm b/data/audioToTextXform_test.pcm new file mode 100644 index 000000000..ec5454952 --- /dev/null +++ b/data/audioToTextXform_test.pcm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20575eb6d3d5199b1d56d3171c137e84559e0aea73a573846aa287133c6259d1 +size 315200 diff --git a/data/whisper/models/.gitattributes b/data/whisper/models/.gitattributes new file mode 100644 index 000000000..96c9e36b1 --- /dev/null +++ b/data/whisper/models/.gitattributes @@ -0,0 +1 @@ +ggml-tiny.en-q8_0.bin filter=lfs diff=lfs merge=lfs -text diff --git a/data/whisper/models/ggml-tiny.en-q8_0.bin b/data/whisper/models/ggml-tiny.en-q8_0.bin new file mode 100644 index 000000000..959468eb9 --- /dev/null +++ b/data/whisper/models/ggml-tiny.en-q8_0.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94 +size 43550795 diff --git a/thirdparty/custom-overlay/whisper/fix-for-arm64.patch b/thirdparty/custom-overlay/whisper/fix-for-arm64.patch new file mode 100644 index 000000000..8c9c897f0 --- /dev/null +++ b/thirdparty/custom-overlay/whisper/fix-for-arm64.patch @@ -0,0 +1,127 @@ +diff --git a/ggml-cuda.cu b/ggml-cuda.cu +index 2db5043..c799e32 100644 +--- a/ggml-cuda.cu ++++ b/ggml-cuda.cu +@@ -12,9 +12,6 @@ + #include + #include + #include +-#include "ggml-cuda.h" +-#include "ggml.h" +-#include "ggml-backend-impl.h" + + #if defined(GGML_USE_HIPBLAS) + #include +@@ -108,6 +105,10 @@ + #include + #include + ++#include "ggml-cuda.h" ++#include "ggml.h" ++#include "ggml-backend-impl.h" ++ + #if CUDART_VERSION < 11020 + #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED 
CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED + #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +diff --git a/ggml-quants.c b/ggml-quants.c +index 601d155..01921c6 100644 +--- a/ggml-quants.c ++++ b/ggml-quants.c +@@ -425,17 +425,86 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { + + #else + +-#define ggml_int16x8x2_t int16x8x2_t +-#define ggml_uint8x16x2_t uint8x16x2_t +-#define ggml_uint8x16x4_t uint8x16x4_t +-#define ggml_int8x16x2_t int8x16x2_t +-#define ggml_int8x16x4_t int8x16x4_t +- +-#define ggml_vld1q_s16_x2 vld1q_s16_x2 +-#define ggml_vld1q_u8_x2 vld1q_u8_x2 +-#define ggml_vld1q_u8_x4 vld1q_u8_x4 +-#define ggml_vld1q_s8_x2 vld1q_s8_x2 +-#define ggml_vld1q_s8_x4 vld1q_s8_x4 ++typedef struct ggml_int16x8x2_t { ++ int16x8_t val[2]; ++} ggml_int16x8x2_t; ++ ++inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { ++ ggml_int16x8x2_t res; ++ ++ res.val[0] = vld1q_s16(ptr + 0); ++ res.val[1] = vld1q_s16(ptr + 8); ++ ++ return res; ++} ++ ++typedef struct ggml_uint8x16x2_t { ++ uint8x16_t val[2]; ++} ggml_uint8x16x2_t; ++ ++inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { ++ ggml_uint8x16x2_t res; ++ ++ res.val[0] = vld1q_u8(ptr + 0); ++ res.val[1] = vld1q_u8(ptr + 16); ++ ++ return res; ++} ++ ++typedef struct ggml_uint8x16x4_t { ++ uint8x16_t val[4]; ++} ggml_uint8x16x4_t; ++ ++inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { ++ ggml_uint8x16x4_t res; ++ ++ res.val[0] = vld1q_u8(ptr + 0); ++ res.val[1] = vld1q_u8(ptr + 16); ++ res.val[2] = vld1q_u8(ptr + 32); ++ res.val[3] = vld1q_u8(ptr + 48); ++ ++ return res; ++} ++ ++typedef struct ggml_int8x16x2_t { ++ int8x16_t val[2]; ++} ggml_int8x16x2_t; ++ ++inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { ++ ggml_int8x16x2_t res; ++ ++ res.val[0] = vld1q_s8(ptr + 0); ++ res.val[1] = vld1q_s8(ptr + 16); ++ ++ return res; ++} ++ ++typedef struct ggml_int8x16x4_t { ++ int8x16_t val[4]; ++} 
ggml_int8x16x4_t; ++ ++inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { ++ ggml_int8x16x4_t res; ++ ++ res.val[0] = vld1q_s8(ptr + 0); ++ res.val[1] = vld1q_s8(ptr + 16); ++ res.val[2] = vld1q_s8(ptr + 32); ++ res.val[3] = vld1q_s8(ptr + 48); ++ ++ return res; ++} ++ ++// #define ggml_int16x8x2_t int16x8x2_t ++// #define ggml_uint8x16x2_t uint8x16x2_t ++// #define ggml_uint8x16x4_t uint8x16x4_t ++// #define ggml_int8x16x2_t int8x16x2_t ++// #define ggml_int8x16x4_t int8x16x4_t ++ ++// #define ggml_vld1q_s16_x2 vld1q_s16_x2 ++// #define ggml_vld1q_u8_x2 vld1q_u8_x2 ++// #define ggml_vld1q_u8_x4 vld1q_u8_x4 ++// #define ggml_vld1q_s8_x2 vld1q_s8_x2 ++// #define ggml_vld1q_s8_x4 vld1q_s8_x4 + + #endif + diff --git a/thirdparty/custom-overlay/whisper/portfile.cmake b/thirdparty/custom-overlay/whisper/portfile.cmake new file mode 100644 index 000000000..1ff52d319 --- /dev/null +++ b/thirdparty/custom-overlay/whisper/portfile.cmake @@ -0,0 +1,40 @@ +# Overlay port that builds the Apra-Labs fork of whisper.cpp as a static library. +vcpkg_check_linkage(ONLY_STATIC_LIBRARY) + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO Apra-Labs/whisper.cpp + REF c3bff0d121e2af823344939643d64a27e4a76ea2 #v1.5.4 + SHA512 d51a32c91340d2b9f18bf5221e134e57a0259bc3a1c803ef427adc6e3de5f54c556232cd4ef070b9c07f93968efd942a61cfe311c2cbca013a928f0eb8055e6f
+ HEAD_REF kj/add-Config-for-vcpkg + PATCHES "fix-for-arm64.patch" +) + +# Translate the "cuda" port feature into -DWHISPER_CUBLAS=ON/OFF, collected in FEATURE_OPTIONS. +vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS + FEATURES + "cuda" WHISPER_CUBLAS +) + +# FEATURE_OPTIONS has to be forwarded through OPTIONS: variables set in this +# portfile are not visible to the whisper.cpp configure step. +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + DISABLE_PARALLEL_CONFIGURE + OPTIONS + ${FEATURE_OPTIONS} +) + +vcpkg_cmake_install() +vcpkg_cmake_config_fixup( + CONFIG_PATH lib/cmake/whisper + PACKAGE_NAME whisper + ) +vcpkg_copy_pdbs() + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) +configure_file("${CMAKE_CURRENT_LIST_DIR}/usage" "${CURRENT_PACKAGES_DIR}/share/${PORT}/usage" COPYONLY) + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") \ No newline at end of file diff --git a/thirdparty/custom-overlay/whisper/usage b/thirdparty/custom-overlay/whisper/usage new file mode 100644 index 000000000..b997cd604 --- /dev/null +++ b/thirdparty/custom-overlay/whisper/usage @@ -0,0 +1,4 @@ +whisper provides CMake targets: + +find_package(whisper CONFIG REQUIRED) +target_link_libraries(main PRIVATE whisper::whisper) \ No newline at end of file diff --git a/thirdparty/custom-overlay/whisper/vcpkg.json b/thirdparty/custom-overlay/whisper/vcpkg.json new file mode 100644 index 000000000..0290a42fe --- /dev/null +++ b/thirdparty/custom-overlay/whisper/vcpkg.json @@ -0,0 +1,25 @@ +{ + "name": "whisper", + "version": "1.5.4", + "homepage": "https://github.com/Apra-Labs/whisper.cpp", + "description": "Fork of whisper.cpp a High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model in cpp.", + "license": "MIT", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ], + "features": { + "cuda": { + "description": "Build Whisper with CUDA support", + "dependencies":
[ + "cuda" + ] + } + } +} \ No newline at end of file diff --git a/vcpkg b/vcpkg index 0fd721ec0..7754d62d1 160000 --- a/vcpkg +++ b/vcpkg @@ -1 +1 @@ -Subproject commit 0fd721ec00106e7e249dfbf40d5dc6e67e12f38c +Subproject commit 7754d62d19501a3bb4e2d4f2eab80e8de9703e41