From 5358310a48ada5ba3c81e41652bcacb7a4acd102 Mon Sep 17 00:00:00 2001
From: Kashyap Jois <kjois@iprdgroup.com>
Date: Wed, 28 Feb 2024 14:41:13 +0400
Subject: [PATCH] Integrate Whisper CPP and write a wrapper module in Aprapipes
 (#324)

* Add custom port vcpkg for whisper

* Add whisper stream

* Add whisper stream header

* Add whisper cpp to Cmake list

* Add test frame type and minor changes

* Add whisper to vcpkg

* Add vcpkg custom overlay ports to thirdparty

* Modify with whisper option

* Send whisper output as text frames

* revert changes to sound record test

* Add whisper UT

* Fix PS to remove whisper from vcpkg json

* Revert changes to OPTIONS section, remove WHISPER option, rename Whisper source files to generic AudioToTextXForm

* Move pcm to git lfs

* Add pcm and model bin file to lfs

* Fix UT name

* Throw AIP exception for unknown strategy

* Revert sound_record_tests.cpp changes

* Revert changes to vcpkg indentation and remove Whisper option

* Linux -> OFF to ON Windows ON -> OFF

* Add reserve statement for vector
Move constructor impl

* update submodule for pipeline to run

* Update whisper port with install fix

* update submodule

* Update vcpkg version

* Add changes to handle props change

* Improve UT and refactor for changing sample strategy during run time.

* Add apt-get install libx11-dev libgles2-mesa-dev for libepoxy error

* Add memory type check in validate input pins and throw exception if model path changes.

* update submodule

* update vcpkg msys2

* update submodule

* Address nits

* Export env variable overlay port for building in arm64

* added fix-for-arm64.patch for whisper

* update fix-vcpkg-json.ps1

* update CMakeLists.txt

* update vcpkg url for build

* update whisper tests threshold

* update code formatting

* update whisper test

* added EOS for small buffer size

---------

Co-authored-by: Kushal Jain <kushalj@apra.in>
Co-authored-by: Vinayak Y-B <vinayakb@apra.in>
---
 .github/workflows/CI-Linux-ARM64.yml          |   2 +-
 .../workflows/build-test-lin-container.yml    |   2 +-
 .github/workflows/build-test-lin-wsl.yml      |   2 +-
 .github/workflows/build-test-lin.yml          |   2 +-
 base/CMakeLists.txt                           |  16 +-
 base/fix-vcpkg-json.ps1                       |   4 +
 base/fix-vcpkg-json.sh                        |   4 +
 base/include/AudioToTextXForm.h               |  57 ++++
 base/include/FrameMetadata.h                  |   3 +-
 base/src/AudioToTextXForm.cpp                 | 224 ++++++++++++++
 base/test/audioToTextXform_tests.cpp          | 276 ++++++++++++++++++
 base/vcpkg.json                               |   9 +-
 build_jetson.sh                               |   2 +-
 data/.gitattributes                           |   1 +
 data/audioToTextXform_test.pcm                |   3 +
 data/whisper/models/.gitattributes            |   1 +
 data/whisper/models/ggml-tiny.en-q8_0.bin     |   3 +
 .../whisper/fix-for-arm64.patch               | 127 ++++++++
 .../custom-overlay/whisper/portfile.cmake     |  40 +++
 thirdparty/custom-overlay/whisper/usage       |   4 +
 thirdparty/custom-overlay/whisper/vcpkg.json  |  28 ++
 vcpkg                                         |   2 +-
 22 files changed, 797 insertions(+), 15 deletions(-)
 create mode 100644 base/include/AudioToTextXForm.h
 create mode 100644 base/src/AudioToTextXForm.cpp
 create mode 100644 base/test/audioToTextXform_tests.cpp
 create mode 100644 data/.gitattributes
 create mode 100644 data/audioToTextXform_test.pcm
 create mode 100644 data/whisper/models/.gitattributes
 create mode 100644 data/whisper/models/ggml-tiny.en-q8_0.bin
 create mode 100644 thirdparty/custom-overlay/whisper/fix-for-arm64.patch
 create mode 100644 thirdparty/custom-overlay/whisper/portfile.cmake
 create mode 100644 thirdparty/custom-overlay/whisper/usage
 create mode 100644 thirdparty/custom-overlay/whisper/vcpkg.json

diff --git a/.github/workflows/CI-Linux-ARM64.yml b/.github/workflows/CI-Linux-ARM64.yml
index 438ef51a2..383963a94 100644
--- a/.github/workflows/CI-Linux-ARM64.yml
+++ b/.github/workflows/CI-Linux-ARM64.yml
@@ -19,7 +19,7 @@ jobs:
       cuda: 'ON'
       prep-cmd: 'echo skipping builder prep as I can not sudo'
       cache-path: './none'
-      cmake-conf-cmd: 'export VCPKG_FORCE_SYSTEM_BINARIES=1 && cmake -B . -DENABLE_ARM64=ON ../base'
+      cmake-conf-cmd: 'export VCPKG_FORCE_SYSTEM_BINARIES=1 && export VCPKG_OVERLAY_PORTS=../thirdparty/custom-overlay && cmake -B . -DENABLE_ARM64=ON ../base'
       nProc: 6
   jetson-publish:
     needs: jetson-build-test
diff --git a/.github/workflows/build-test-lin-container.yml b/.github/workflows/build-test-lin-container.yml
index 3bd29c054..53158fc99 100644
--- a/.github/workflows/build-test-lin-container.yml
+++ b/.github/workflows/build-test-lin-container.yml
@@ -30,7 +30,7 @@ on:
       prep-cmd:
         type: string
         description: 'commands required to be run on a builder to prep it for build'
-        default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar  autoconf   automake  autopoint build-essential  flex git-core   libass-dev   libfreetype6-dev   libgnutls28-dev   libmp3lame-dev   libsdl2-dev   libtool   libsoup-gnome2.4-dev   libva-dev   libvdpau-dev   libvorbis-dev   libxcb1-dev   libxcb-shm0-dev   libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev  ninja-build   pkg-config   texinfo   wget   yasm   zlib1g-dev   nasm   gperf  bison python3 python3-pip dos2unix && pip3 install meson'
+        default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar  autoconf   automake  autopoint build-essential  flex git-core   libass-dev   libfreetype6-dev   libgnutls28-dev   libmp3lame-dev   libsdl2-dev   libtool   libsoup-gnome2.4-dev   libva-dev   libvdpau-dev   libvorbis-dev   libxcb1-dev   libxcb-shm0-dev   libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev  ninja-build   pkg-config   texinfo   wget   yasm   zlib1g-dev   nasm   gperf  bison python3 python3-pip dos2unix libx11-dev libgles2-mesa-dev && pip3 install meson'
         required: false
       prep-check-cmd:
         type: string
diff --git a/.github/workflows/build-test-lin-wsl.yml b/.github/workflows/build-test-lin-wsl.yml
index a193b781a..d74aa1b0c 100644
--- a/.github/workflows/build-test-lin-wsl.yml
+++ b/.github/workflows/build-test-lin-wsl.yml
@@ -30,7 +30,7 @@ on:
       prep-cmd:
         type: string
         description: 'commands required to be run on a builder to prep it for build'
-        default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar  autoconf   automake  autopoint build-essential  flex git-core   libass-dev   libfreetype6-dev   libgnutls28-dev   libmp3lame-dev   libsdl2-dev   libtool   libsoup-gnome2.4-dev   libva-dev   libvdpau-dev   libvorbis-dev   libxcb1-dev   libxcb-shm0-dev   libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev  ninja-build   pkg-config   texinfo   wget   yasm   zlib1g-dev   nasm   gperf  bison python3 python3-pip dos2unix && pip3 install meson'
+        default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar  autoconf   automake  autopoint build-essential  flex git-core   libass-dev   libfreetype6-dev   libgnutls28-dev   libmp3lame-dev   libsdl2-dev   libtool   libsoup-gnome2.4-dev   libva-dev   libvdpau-dev   libvorbis-dev   libxcb1-dev   libxcb-shm0-dev   libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev  ninja-build   pkg-config   texinfo   wget   yasm   zlib1g-dev   nasm   gperf  bison python3 python3-pip dos2unix libx11-dev libgles2-mesa-dev && pip3 install meson'
         required: false
       prep-check-cmd:
         type: string
diff --git a/.github/workflows/build-test-lin.yml b/.github/workflows/build-test-lin.yml
index e97f3ce9a..3098a0adb 100644
--- a/.github/workflows/build-test-lin.yml
+++ b/.github/workflows/build-test-lin.yml
@@ -30,7 +30,7 @@ on:
       prep-cmd:
         type: string
         description: 'commands required to be run on a builder to prep it for build'
-        default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar  autoconf   automake  autopoint build-essential  flex git-core   libass-dev   libfreetype6-dev   libgnutls28-dev   libmp3lame-dev   libsdl2-dev   libtool   libsoup-gnome2.4-dev   libva-dev   libvdpau-dev   libvorbis-dev   libxdamage-dev   libxcb1-dev   libxcb-shm0-dev   libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev  ninja-build   pkg-config   texinfo   wget   yasm   zlib1g-dev   nasm   gperf  bison python3 python3-pip dos2unix && pip3 install meson'
+        default: 'sudo apt-get update -qq && sudo apt-get -y install ca-certificates curl zip unzip tar  autoconf   automake  autopoint build-essential  flex git-core   libass-dev   libfreetype6-dev   libgnutls28-dev   libmp3lame-dev   libsdl2-dev   libtool   libsoup-gnome2.4-dev   libva-dev   libvdpau-dev   libvorbis-dev   libxdamage-dev   libxcb1-dev   libxcb-shm0-dev   libxcb-xfixes0-dev libncurses5-dev libncursesw5-dev  ninja-build   pkg-config   texinfo   wget   yasm   zlib1g-dev   nasm   gperf  bison python3 python3-pip dos2unix libx11-dev libgles2-mesa-dev && pip3 install meson'
         required: false
       prep-check-cmd:
         type: string
diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
index f9d14b705..9f2cd1470 100755
--- a/base/CMakeLists.txt
+++ b/base/CMakeLists.txt
@@ -6,6 +6,8 @@ OPTION(ENABLE_ARM64 "Use this switch to enable ARM64" OFF)
 OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" OFF)
 
 set(VCPKG_INSTALL_OPTIONS "--clean-after-build")
+set(VCPKG_OVERLAY_PORTS "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty/custom-overlay")
+
 IF(ENABLE_CUDA)
 	add_compile_definitions(APRA_CUDA_ENABLED)
 ENDIF(ENABLE_CUDA)
@@ -23,6 +25,7 @@ IF(ENABLE_ARM64)
 	add_compile_definitions(ARM64)
 	set(VCPKG_OVERLAY_PORTS ../vcpkg/ports/cudnn)	
 	set(VCPKG_OVERLAY_TRIPLETS ../vcpkg/triplets/community/arm64-linux.cmake)
+	set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
 ENDIF(ENABLE_ARM64)
 
 #use /MP only for language CXX (and not CUDA) and MSVC for both targets
@@ -38,8 +41,6 @@ project(APRAPIPES)
 message(STATUS $ENV{PKG_CONFIG_PATH}">>>>>> PKG_CONFIG_PATH")
 
 find_package(PkgConfig REQUIRED)
-        
-
 find_package(Boost COMPONENTS system thread filesystem serialization log chrono unit_test_framework REQUIRED)
 find_package(JPEG REQUIRED)
 find_package(OpenCV CONFIG REQUIRED)
@@ -50,6 +51,7 @@ find_package(FFMPEG REQUIRED)
 find_package(ZXing CONFIG REQUIRED)
 find_package(bigint CONFIG REQUIRED)
 find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
+find_package(whisper CONFIG REQUIRED)
 
 IF(ENABLE_CUDA)
 	if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL ""))
@@ -280,10 +282,9 @@ SET(IP_FILES
 	src/OverlayFactory.h
 	src/OverlayFactory.cpp
 	src/TestSignalGeneratorSrc.cpp
+	src/AudioToTextXForm.cpp 
 )
-
-
-
+	
 SET(IP_FILES_H
 	include/HistogramOverlay.h
 	include/CalcHistogramCV.h
@@ -306,10 +307,9 @@ SET(IP_FILES_H
 	include/TextOverlayXForm.h
 	include/ColorConversionXForm.h
 	include/Overlay.h
+	include/AudioToTextXForm.h
 )
 
-
-
 SET(CUDA_CORE_FILES
 	src/apra_cudamalloc_allocator.cu
 	src/apra_cudamallochost_allocator.cu
@@ -561,6 +561,7 @@ SET(UT_FILES
 	test/mp4_dts_strategy_tests.cpp
 	test/overlaymodule_tests.cpp
 	test/testSignalGeneratorSrc_tests.cpp
+	test/audioToTextXform_tests.cpp
 	${ARM64_UT_FILES}
 	${CUDA_UT_FILES}
 )
@@ -607,6 +608,7 @@ target_link_libraries(aprapipesut
   liblzma::liblzma
   bigint::bigint
   sfml-audio
+  whisper::whisper
   )
 
 IF(ENABLE_WINDOWS)
diff --git a/base/fix-vcpkg-json.ps1 b/base/fix-vcpkg-json.ps1
index f2f8bcde9..634ef9c83 100644
--- a/base/fix-vcpkg-json.ps1
+++ b/base/fix-vcpkg-json.ps1
@@ -8,6 +8,10 @@ if ($removeCUDA.IsPresent)
     $v.dependencies |
         Where-Object { $_.name -eq 'opencv4' } |
         ForEach-Object { $_.features = $_.features -ne 'cuda' -ne 'cudnn' }
+    
+    $v.dependencies |
+        Where-Object { $_.name -eq 'whisper' } |
+        ForEach-Object { $_.features = $_.features -ne 'cuda' }
 }
 
 if($removeOpenCV.IsPresent)
diff --git a/base/fix-vcpkg-json.sh b/base/fix-vcpkg-json.sh
index e4b63c59a..74bfc0aa3 100644
--- a/base/fix-vcpkg-json.sh
+++ b/base/fix-vcpkg-json.sh
@@ -21,6 +21,10 @@ if $removeCUDA; then
             # Remove "cuda" and "cudnn" features for this "opencv4" instance
             v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\" and . != \"cudnn\"))")
         fi
+        if [ "$name" == "whisper" ]; then
+            # Strip the "cuda" feature from the "whisper" dependency (the closing ] of [ must be its own word)
+            v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\"))")
+        fi
     done
 fi
 
diff --git a/base/include/AudioToTextXForm.h b/base/include/AudioToTextXForm.h
new file mode 100644
index 000000000..c95cf796f
--- /dev/null
+++ b/base/include/AudioToTextXForm.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "Module.h"
+
+// size of audio to process should be a parameter. 
+// Cache variable to collect frames for processing
+
+class AudioToTextXFormProps : public ModuleProps
+{
+public:
+	enum DecoderSamplingStrategy {
+		GREEDY,  
+		BEAM_SEARCH
+	};
+	
+	DecoderSamplingStrategy samplingStrategy;
+	std::string modelPath;
+	int bufferSize;
+
+	AudioToTextXFormProps(
+		DecoderSamplingStrategy _samplingStrategy,
+		std::string _modelPath,
+		int _bufferSize);
+	size_t getSerializeSize();
+
+
+private:
+	friend class boost::serialization::access;
+	
+	template <class Archive>
+	void serialize(Archive& ar, const unsigned int version);
+};
+
+class AudioToTextXForm  : public Module
+{
+
+public:
+	AudioToTextXForm(AudioToTextXFormProps _props);
+	virtual ~AudioToTextXForm();
+	bool init();
+	bool term();
+	void setProps(AudioToTextXFormProps& props);
+	AudioToTextXFormProps getProps();
+
+protected:
+	bool process(frame_container& frames);
+	bool processSOS(frame_sp& frame);
+	bool validateInputPins();
+	bool validateOutputPins();
+	void addInputPin(framemetadata_sp& metadata, string& pinId);
+	bool handlePropsChange(frame_sp& frame);
+
+private:
+	void setMetadata(framemetadata_sp& metadata);
+	class Detail;
+	boost::shared_ptr<Detail> mDetail;
+};
diff --git a/base/include/FrameMetadata.h b/base/include/FrameMetadata.h
index ca8e5f646..ebddf592b 100755
--- a/base/include/FrameMetadata.h
+++ b/base/include/FrameMetadata.h
@@ -50,7 +50,8 @@ class FrameMetadata {
 		HEVC_DATA, //H265
 		MOTION_VECTOR_DATA,
 		OVERLAY_INFO_IMAGE,
-		FACE_LANDMARKS_INFO
+		FACE_LANDMARKS_INFO,
+		TEXT
 	};
 
 	enum MemType
diff --git a/base/src/AudioToTextXForm.cpp b/base/src/AudioToTextXForm.cpp
new file mode 100644
index 000000000..d84a13073
--- /dev/null
+++ b/base/src/AudioToTextXForm.cpp
@@ -0,0 +1,224 @@
+#include "AudioToTextXForm.h"
+#include "FrameMetadata.h"
+#include "FrameMetadataFactory.h"
+#include "Frame.h"
+#include "Logger.h"
+#include "Utils.h"
+#include "whisper.h"
+#include "SFML/Config.hpp"
+
+AudioToTextXFormProps::AudioToTextXFormProps(
+	DecoderSamplingStrategy _samplingStrategy,
+	std::string _modelPath,
+	int _bufferSize) : samplingStrategy(_samplingStrategy),
+	modelPath(_modelPath),
+	bufferSize(_bufferSize)
+{}
+
+size_t AudioToTextXFormProps::getSerializeSize() {
+	return ModuleProps::getSerializeSize() + // upper bound on the bytes needed to serialize these props
+		sizeof(samplingStrategy) +
+		sizeof(modelPath) + modelPath.length() + // sizeof() covers only the std::string object, not its characters
+		sizeof(bufferSize);
+}
+
+template <class Archive>
+void AudioToTextXFormProps::serialize(Archive& ar, const unsigned int version) {
+	ar& boost::serialization::base_object<ModuleProps>(*this);
+	ar& samplingStrategy;
+	ar& modelPath;
+	ar& bufferSize;
+}
+
+class AudioToTextXForm::Detail
+{
+public:
+	Detail(AudioToTextXFormProps& _props) : mProps(_props)
+	{
+	}
+	~Detail() {}
+
+	whisper_full_params fetchDefaultParams() {
+		auto samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
+		switch (mProps.samplingStrategy)
+		{
+		case AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY:
+			samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
+			break;
+		case AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH:
+			samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH;
+			break;
+		default:
+			throw AIPException(AIP_FATAL, "Unknown Sampling Strategy");
+		}
+		return whisper_full_default_params(samplingStrategy);
+	}
+
+	void setProps(AudioToTextXFormProps& props)
+	{
+		mProps = props;
+	}
+
+public:
+	framemetadata_sp mOutputMetadata;
+	std::string mOutputPinId;
+	std::vector<float> mInputAudioBuffer;
+	AudioToTextXFormProps mProps;
+	int mFrameType;
+	whisper_context *mWhisperContext = NULL;
+	whisper_full_params mWhisperFullParams;
+	whisper_context_params mWhisperContextParams;
+	int toleranceBufferSize = 16000; //16000 is 1 second worth of samples since data is captured at 16KHz
+};
+
+AudioToTextXForm::AudioToTextXForm(AudioToTextXFormProps _props) : Module(TRANSFORM, "AudioToTextXForm", _props)
+{
+	mDetail.reset(new Detail(_props));
+}
+
+AudioToTextXForm::~AudioToTextXForm() {}
+
+bool AudioToTextXForm::validateInputPins()
+{
+	//TODO: Reject any audio pin that has a samplingRate!=16KHz 
+	//https://github.com/Apra-Labs/ApraPipes/issues/325
+	if (getNumberOfInputPins() != 1)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateInputPins size is expected to be 1. Actual<" << getNumberOfInputPins() << ">";
+		return false;
+	}
+
+	framemetadata_sp metadata = getFirstInputMetadata();
+
+	FrameMetadata::FrameType frameType = metadata->getFrameType();
+
+	if (frameType != FrameMetadata::AUDIO)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateInputPins input frameType is expected to be Audio. Actual<" << frameType << ">";
+		return false;
+	}
+
+	FrameMetadata::MemType memType = metadata->getMemType();
+	if (memType != FrameMetadata::MemType::HOST)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateInputPins input memType is expected to be HOST. Actual<" << memType << ">";
+		return false;
+	}
+	return true;
+}
+
+bool AudioToTextXForm::validateOutputPins()
+{
+	if (getNumberOfOutputPins() != 1)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateOutputPins size is expected to be 1. Actual<" << getNumberOfOutputPins() << ">";
+		return false;
+	}
+	// Exactly one output pin is accepted, and it must carry TEXT frames.
+	framemetadata_sp metadata = getFirstOutputMetadata();
+	FrameMetadata::FrameType frameType = metadata->getFrameType();
+	if (frameType != FrameMetadata::TEXT)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateOutputPins output frameType is expected to be TEXT. Actual<" << frameType << ">";
+		return false;
+	}
+
+	return true;
+}
+
+void AudioToTextXForm::addInputPin(framemetadata_sp& metadata, string& pinId)
+{
+	Module::addInputPin(metadata, pinId);
+	mDetail->mOutputMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::TEXT));
+	mDetail->mOutputMetadata->copyHint(*metadata.get());
+	mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata);
+}
+
+bool AudioToTextXForm::init()
+{
+	mDetail->mInputAudioBuffer.reserve(mDetail->mProps.bufferSize + mDetail->toleranceBufferSize);
+	mDetail->mWhisperFullParams = mDetail->fetchDefaultParams();
+	mDetail->mWhisperContextParams = whisper_context_default_params();
+	mDetail->mWhisperContext = whisper_init_from_file_with_params(mDetail->mProps.modelPath.c_str(), mDetail->mWhisperContextParams);
+	return Module::init() && mDetail->mWhisperContext != NULL; // report failure when whisper could not load the model (NULL context)
+}
+
+bool AudioToTextXForm::term()
+{
+	// Only the context is heap-owned. The two params structs are by-value members of Detail, and
+	// whisper_free_params()/whisper_free_context_params() `delete` their argument, so they must not be called here.
+	whisper_free(mDetail->mWhisperContext);
+	return Module::term();
+}
+
+bool AudioToTextXForm::process(frame_container& frames)
+{
+	auto frame = frames.begin()->second;
+	sf::Int16* constFloatPointer = static_cast<sf::Int16*>(frame->data());
+	int numberOfSamples = frame->size() / 2;
+	//TODO: Modify to use NPP/ IPP
+	for (int index = 0; index < numberOfSamples; index++) {
+		mDetail->mInputAudioBuffer.push_back((float)constFloatPointer[index]/ 32768.0f);
+	}
+
+	if (mDetail->mInputAudioBuffer.size() < mDetail->mProps.bufferSize) {
+		sendEOS();
+		return true;
+	}
+	whisper_full(
+		mDetail->mWhisperContext,
+		mDetail->mWhisperFullParams,
+		mDetail->mInputAudioBuffer.data(),
+		mDetail->mInputAudioBuffer.size()
+	);
+	std::string output = "";
+	const int n_segments = whisper_full_n_segments(mDetail->mWhisperContext);
+	for (int i = 0; i < n_segments; ++i) {
+		const char* text = whisper_full_get_segment_text(mDetail->mWhisperContext, i);
+		output += text;
+	}
+	mDetail->mInputAudioBuffer.clear();
+	auto outFrame = makeFrame(output.length());
+	memcpy(outFrame->data(), output.c_str(), output.length());
+	frames.insert(make_pair(mDetail->mOutputPinId, outFrame));
+	send(frames);
+	return true;
+}
+
+void AudioToTextXForm::setMetadata(framemetadata_sp& metadata)
+{
+	if (!metadata->isSet())
+	{
+		return;
+	}
+}
+
+bool AudioToTextXForm::processSOS(frame_sp& frame)
+{
+	auto metadata = frame->getMetadata();
+	setMetadata(metadata);
+	return true;
+}
+
+AudioToTextXFormProps AudioToTextXForm::getProps()
+{
+	fillProps(mDetail->mProps);
+	return mDetail->mProps;
+}
+
+bool AudioToTextXForm::handlePropsChange(frame_sp& frame)
+{
+	AudioToTextXFormProps props(mDetail->mProps.samplingStrategy, mDetail->mProps.modelPath, mDetail->mProps.bufferSize);
+	auto ret = Module::handlePropsChange(frame, props);
+	mDetail->setProps(props);
+	mDetail->mWhisperFullParams = mDetail->fetchDefaultParams();
+	return ret;
+}
+
+void AudioToTextXForm::setProps(AudioToTextXFormProps& props)
+{
+	if (props.modelPath != mDetail->mProps.modelPath) {
+		throw AIPException(AIP_FATAL, "Model Path dynamic change not handled");
+	}
+	Module::addPropsToQueue(props);
+}
\ No newline at end of file
diff --git a/base/test/audioToTextXform_tests.cpp b/base/test/audioToTextXform_tests.cpp
new file mode 100644
index 000000000..b566f5bb2
--- /dev/null
+++ b/base/test/audioToTextXform_tests.cpp
@@ -0,0 +1,276 @@
+#include <boost/test/unit_test.hpp>
+#include "stdafx.h"
+#include<fstream>
+#include<vector>
+
+#include "FrameMetadata.h"
+#include "FrameMetadataFactory.h"
+#include "Frame.h"
+#include "Logger.h"
+#include "test_utils.h"
+#include "PipeLine.h"
+#include "FileWriterModule.h"
+#include "FileReaderModule.h"
+#include "FileWriterModule.h"
+#include "AudioToTextXForm.h"
+#include "Module.h"
+#include "ExternalSinkModule.h"
+
+#include <unordered_map>
+#include <string>
+#include <cmath>
+
+// Counts lower-cased words in str; words are split on space and . , ; : ! ? (other characters stay part of the word)
+std::unordered_map<std::string, int> calculateWordFrequency(const std::string &str)
+{
+    std::unordered_map<std::string, int> frequencyMap;
+    std::string word;
+    for (char c : str)
+    {
+        if (c == ' ' || c == '.' || c == ',' || c == ';' || c == ':' || c == '!' || c == '?')
+        {
+            if (!word.empty())
+            {
+                frequencyMap[word]++;
+                word.clear();
+            }
+        }
+        else
+        {
+            word += static_cast<char>(std::tolower(static_cast<unsigned char>(c))); // tolower is undefined for negative char values
+        }
+    }
+    if (!word.empty()) // flush the trailing word when str does not end in a delimiter
+    {
+        frequencyMap[word]++;
+    }
+    return frequencyMap;
+}
+
+// Function to calculate dot product of two vectors
+double dotProduct(const std::unordered_map<std::string, int> &vec1, const std::unordered_map<std::string, int> &vec2)
+{
+    double dotProduct = 0.0;
+    for (const auto &pair : vec1)
+    {
+        if (vec2.count(pair.first) > 0)
+        {
+            dotProduct += pair.second * vec2.at(pair.first);
+        }
+    }
+    return dotProduct;
+}
+
+// Function to calculate magnitude of a vector
+double magnitude(const std::unordered_map<std::string, int> &vec)
+{
+    double mag = 0.0;
+    for (const auto &pair : vec)
+    {
+        mag += std::pow(pair.second, 2);
+    }
+    return std::sqrt(mag);
+}
+
+// Cosine similarity of the word-frequency vectors of str1 and str2; returns 0 when either string has no words
+double cosineSimilarity(const std::string &str1, const std::string &str2)
+{
+    const std::unordered_map<std::string, int> vec1 = calculateWordFrequency(str1);
+    const std::unordered_map<std::string, int> vec2 = calculateWordFrequency(str2);
+
+    double dotProd = dotProduct(vec1, vec2);
+    double magVec1 = magnitude(vec1);
+    double magVec2 = magnitude(vec2);
+
+    if (magVec1 == 0 || magVec2 == 0)
+    {
+        return 0; // Handle division by zero
+    }
+
+    return dotProd / (magVec1 * magVec2);
+}
+
+BOOST_AUTO_TEST_SUITE(audioToTextXform_test)
+
+BOOST_AUTO_TEST_CASE(test_asr)
+{
+    std::vector<std::string> asrOutText = { "./data/asr_out.txt" };
+    Test_Utils::FileCleaner f(asrOutText);
+
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+
+    // This is a PCM file without WAV header
+    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
+    fileReaderProps.readLoop = false;
+    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
+    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
+    auto pinId = fileReader->addOutputPin(metadata);
+   
+    auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps(
+        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY
+        ,"./data/whisper/models/ggml-tiny.en-q8_0.bin",18000)));
+    fileReader->setNext(asr);
+
+    auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
+    asr->setNext(outputFile);
+
+    BOOST_TEST(fileReader->init());
+    BOOST_TEST(asr->init());
+    BOOST_TEST(outputFile->init());
+
+    fileReader->step();
+    asr->step();
+    outputFile->step();
+
+    std::ifstream in_file_text(asrOutText[0]);
+    std::ostringstream  buffer;
+    buffer << in_file_text.rdbuf();
+    std::string output = " The Matic speech recognition also known as ASR is the use of machine learning or artificial intelligence technology to process human speech into readable text.";
+    double thres = 0.95;
+    BOOST_TEST(cosineSimilarity(buffer.str(), output) >= thres);
+    // BOOST_TEST(buffer.str() == output);
+    in_file_text.close();
+}
+
+BOOST_AUTO_TEST_CASE(changeprop_asr)
+{
+    std::vector<std::string> asrOutText = { "./data/asr_change_props_out.txt" };
+    Test_Utils::FileCleaner f(asrOutText);
+
+    Logger::setLogLevel(boost::log::trivial::severity_level::info);
+
+    // This is a PCM file without WAV header
+    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
+    fileReaderProps.readLoop = true;
+    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
+    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
+    auto pinId = fileReader->addOutputPin(metadata);
+
+    auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps(
+        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY
+        , "./data/whisper/models/ggml-tiny.en-q8_0.bin", 18000)));
+    fileReader->setNext(asr);
+
+    auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
+    asr->setNext(outputFile);
+
+    BOOST_TEST(fileReader->init());
+    BOOST_TEST(asr->init());
+    BOOST_TEST(outputFile->init());
+
+    AudioToTextXFormProps propschange = asr->getProps();
+    propschange.bufferSize = 20000;
+    propschange.samplingStrategy = AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH;
+    fileReader->step();
+    asr->step();
+    outputFile->step();
+
+    asr->setProps(propschange);
+    for (int i = 0; i < 2; i++) {
+        fileReader->step();
+        asr->step();
+    }
+    outputFile->step();
+    propschange = asr->getProps();
+    std::ifstream in_file_text(asrOutText[0]);
+    std::ostringstream  buffer;
+    buffer << in_file_text.rdbuf();
+    std::string output = " Metex speech recognition, also known as ASR, is the use of machine learning or artificial intelligence technology to process human speech into readable text.";
+    //TODO: This test fails in Linux Cuda. Maybe Something to do with the Beam Search / change in props size that makes the behaviour different from windows
+    double thres = 0.95;
+    BOOST_TEST(cosineSimilarity(buffer.str(), output) >= thres);
+    // BOOST_TEST(buffer.str() == output);
+
+    in_file_text.close();
+    
+    BOOST_TEST(
+        (propschange.bufferSize == 20000));
+    BOOST_TEST(
+        (propschange.samplingStrategy == AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH));
+}
+
+BOOST_AUTO_TEST_CASE(change_unsupported_prop_asr)
+{
+    std::vector<std::string> asrOutText = { "./data/asr_change_props_out.txt" };
+    Test_Utils::FileCleaner f(asrOutText);
+
+    Logger::setLogLevel(boost::log::trivial::severity_level::info);
+
+    // This is a PCM file without WAV header
+    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
+    fileReaderProps.readLoop = true;
+    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
+    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
+    auto pinId = fileReader->addOutputPin(metadata);
+
+    auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps(
+        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY
+        , "./data/whisper/models/ggml-tiny.en-q8_0.bin", 18000)));
+    fileReader->setNext(asr);
+
+    auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
+    asr->setNext(outputFile);
+
+    BOOST_TEST(fileReader->init());
+    BOOST_TEST(asr->init());
+    BOOST_TEST(outputFile->init());
+
+    AudioToTextXFormProps propschange = asr->getProps();
+    propschange.modelPath = "./newpath.bin";
+    fileReader->step();
+    asr->step();
+    outputFile->step();
+
+    BOOST_CHECK_THROW(asr->setProps(propschange), std::runtime_error);
+}
+
+BOOST_AUTO_TEST_CASE(checkEOS_asr)
+{
+    std::vector<std::string> asrOutText = { "./data/asr_out.txt" };
+    Test_Utils::FileCleaner f(asrOutText);
+
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+
+    // This is a PCM file without WAV header
+    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
+    fileReaderProps.readLoop = false;
+    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
+    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
+    auto pinId = fileReader->addOutputPin(metadata);
+   
+    auto asr = boost::shared_ptr<AudioToTextXForm>(new AudioToTextXForm(AudioToTextXFormProps(
+        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY
+        ,"./data/whisper/models/ggml-tiny.en-q8_0.bin",160000)));
+    fileReader->setNext(asr);
+
+    auto outputFile = boost::shared_ptr<FileWriterModule>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
+    asr->setNext(outputFile);
+
+    auto sink = boost::shared_ptr<ExternalSinkModule>(new ExternalSinkModule());
+    asr->setNext(sink);
+
+    BOOST_TEST(fileReader->init());
+    BOOST_TEST(asr->init());
+    BOOST_TEST(outputFile->init());
+    BOOST_TEST(sink->init());
+
+    fileReader->step();
+    asr->step();
+
+    auto frames = sink->pop();
+    auto eosframe = frames.begin()->second;
+    BOOST_TEST(eosframe->isEOS());
+    
+    outputFile->step();
+
+    std::ifstream in_file_text(asrOutText[0]);
+    std::ostringstream  buffer;
+    buffer << in_file_text.rdbuf();
+    std::string output = " The Matic speech recognition also known as ASR is the use of machine learning or artificial intelligence technology to process human speech into readable text.";
+    double thres = 0;
+    BOOST_TEST(cosineSimilarity(buffer.str(), output) == thres);
+    // BOOST_TEST(buffer.str() == output);
+    in_file_text.close();
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/base/vcpkg.json b/base/vcpkg.json
index e27033817..4df4664c0 100644
--- a/base/vcpkg.json
+++ b/base/vcpkg.json
@@ -2,8 +2,15 @@
   "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg/master/scripts/vcpkg.schema.json",
   "name": "apra-pipes-cuda",
   "version": "0.0.1",
-  "builtin-baseline": "46cf263b3d4bfab6d322b47ab40222db167c28b1",
+  "builtin-baseline": "eac79fc7bda260819c646d10c97dec825305aecd",
   "dependencies": [
+    {
+      "name": "whisper",
+      "default-features": false,
+      "features": [
+          "cuda"
+      ]
+    },
     {
       "name": "opencv4",
       "default-features": false,
diff --git a/build_jetson.sh b/build_jetson.sh
index aca502de1..cafbd407a 100755
--- a/build_jetson.sh
+++ b/build_jetson.sh
@@ -9,5 +9,5 @@ cd ..
 CMAKE_THCOUNT=$(sh ./checkProc.sh)
 mkdir -p _build
 cd _build
-cmake -B . -DENABLE_ARM64=ON -DENABLE_WINDOWS=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo ../base -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake
+export VCPKG_FORCE_SYSTEM_BINARIES=1 && export VCPKG_OVERLAY_PORTS=../thirdparty/custom-overlay && cmake -B . -DENABLE_ARM64=ON -DENABLE_WINDOWS=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo ../base -DCMAKE_TOOLCHAIN_FILE=../vcpkg/scripts/buildsystems/vcpkg.cmake
 cmake --build . -- -j "$(($(nproc) - 1))"
diff --git a/data/.gitattributes b/data/.gitattributes
new file mode 100644
index 000000000..478d8b918
--- /dev/null
+++ b/data/.gitattributes
@@ -0,0 +1 @@
+*.pcm filter=lfs diff=lfs merge=lfs -text
diff --git a/data/audioToTextXform_test.pcm b/data/audioToTextXform_test.pcm
new file mode 100644
index 000000000..ec5454952
--- /dev/null
+++ b/data/audioToTextXform_test.pcm
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20575eb6d3d5199b1d56d3171c137e84559e0aea73a573846aa287133c6259d1
+size 315200
diff --git a/data/whisper/models/.gitattributes b/data/whisper/models/.gitattributes
new file mode 100644
index 000000000..96c9e36b1
--- /dev/null
+++ b/data/whisper/models/.gitattributes
@@ -0,0 +1 @@
+ggml-tiny.en-q8_0.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/data/whisper/models/ggml-tiny.en-q8_0.bin b/data/whisper/models/ggml-tiny.en-q8_0.bin
new file mode 100644
index 000000000..959468eb9
--- /dev/null
+++ b/data/whisper/models/ggml-tiny.en-q8_0.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94
+size 43550795
diff --git a/thirdparty/custom-overlay/whisper/fix-for-arm64.patch b/thirdparty/custom-overlay/whisper/fix-for-arm64.patch
new file mode 100644
index 000000000..8c9c897f0
--- /dev/null
+++ b/thirdparty/custom-overlay/whisper/fix-for-arm64.patch
@@ -0,0 +1,127 @@
+diff --git a/ggml-cuda.cu b/ggml-cuda.cu
+index 2db5043..c799e32 100644
+--- a/ggml-cuda.cu
++++ b/ggml-cuda.cu
+@@ -12,9 +12,6 @@
+ #include <vector>
+ #include <map>
+ #include <array>
+-#include "ggml-cuda.h"
+-#include "ggml.h"
+-#include "ggml-backend-impl.h"
+ 
+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+@@ -108,6 +105,10 @@
+ #include <cublas_v2.h>
+ #include <cuda_fp16.h>
+ 
++#include "ggml-cuda.h"
++#include "ggml.h"
++#include "ggml-backend-impl.h"
++
+ #if CUDART_VERSION < 11020
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+ #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+diff --git a/ggml-quants.c b/ggml-quants.c
+index 601d155..01921c6 100644
+--- a/ggml-quants.c
++++ b/ggml-quants.c
+@@ -425,17 +425,86 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+ 
+ #else
+ 
+-#define ggml_int16x8x2_t  int16x8x2_t
+-#define ggml_uint8x16x2_t uint8x16x2_t
+-#define ggml_uint8x16x4_t uint8x16x4_t
+-#define ggml_int8x16x2_t  int8x16x2_t
+-#define ggml_int8x16x4_t  int8x16x4_t
+-
+-#define ggml_vld1q_s16_x2 vld1q_s16_x2
+-#define ggml_vld1q_u8_x2  vld1q_u8_x2
+-#define ggml_vld1q_u8_x4  vld1q_u8_x4
+-#define ggml_vld1q_s8_x2  vld1q_s8_x2
+-#define ggml_vld1q_s8_x4  vld1q_s8_x4
++typedef struct ggml_int16x8x2_t {
++    int16x8_t val[2];
++} ggml_int16x8x2_t;
++
++inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
++    ggml_int16x8x2_t res;
++
++    res.val[0] = vld1q_s16(ptr + 0);
++    res.val[1] = vld1q_s16(ptr + 8);
++
++    return res;
++}
++
++typedef struct ggml_uint8x16x2_t {
++    uint8x16_t val[2];
++} ggml_uint8x16x2_t;
++
++inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
++    ggml_uint8x16x2_t res;
++
++    res.val[0] = vld1q_u8(ptr + 0);
++    res.val[1] = vld1q_u8(ptr + 16);
++
++    return res;
++}
++
++typedef struct ggml_uint8x16x4_t {
++    uint8x16_t val[4];
++} ggml_uint8x16x4_t;
++
++inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
++    ggml_uint8x16x4_t res;
++
++    res.val[0] = vld1q_u8(ptr + 0);
++    res.val[1] = vld1q_u8(ptr + 16);
++    res.val[2] = vld1q_u8(ptr + 32);
++    res.val[3] = vld1q_u8(ptr + 48);
++
++    return res;
++}
++
++typedef struct ggml_int8x16x2_t {
++    int8x16_t val[2];
++} ggml_int8x16x2_t;
++
++inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
++    ggml_int8x16x2_t res;
++
++    res.val[0] = vld1q_s8(ptr + 0);
++    res.val[1] = vld1q_s8(ptr + 16);
++
++    return res;
++}
++
++typedef struct ggml_int8x16x4_t {
++    int8x16_t val[4];
++} ggml_int8x16x4_t;
++
++inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
++    ggml_int8x16x4_t res;
++
++    res.val[0] = vld1q_s8(ptr + 0);
++    res.val[1] = vld1q_s8(ptr + 16);
++    res.val[2] = vld1q_s8(ptr + 32);
++    res.val[3] = vld1q_s8(ptr + 48);
++
++    return res;
++}
++
++// #define ggml_int16x8x2_t  int16x8x2_t
++// #define ggml_uint8x16x2_t uint8x16x2_t
++// #define ggml_uint8x16x4_t uint8x16x4_t
++// #define ggml_int8x16x2_t  int8x16x2_t
++// #define ggml_int8x16x4_t  int8x16x4_t
++
++// #define ggml_vld1q_s16_x2 vld1q_s16_x2
++// #define ggml_vld1q_u8_x2  vld1q_u8_x2
++// #define ggml_vld1q_u8_x4  vld1q_u8_x4
++// #define ggml_vld1q_s8_x2  vld1q_s8_x2
++// #define ggml_vld1q_s8_x4  vld1q_s8_x4
+ 
+ #endif
+ 
diff --git a/thirdparty/custom-overlay/whisper/portfile.cmake b/thirdparty/custom-overlay/whisper/portfile.cmake
new file mode 100644
index 000000000..1ff52d319
--- /dev/null
+++ b/thirdparty/custom-overlay/whisper/portfile.cmake
@@ -0,0 +1,40 @@
+vcpkg_check_linkage(ONLY_STATIC_LIBRARY)
+# Fetch Apra-Labs/whisper.cpp at the pinned REF and apply fix-for-arm64.patch on top of it.
+vcpkg_from_github(
+    OUT_SOURCE_PATH SOURCE_PATH
+    REPO Apra-Labs/whisper.cpp
+    REF c3bff0d121e2af823344939643d64a27e4a76ea2 #v1.5.4
+    SHA512 d51a32c91340d2b9f18bf5221e134e57a0259bc3a1c803ef427adc6e3de5f54c556232cd4ef070b9c07f93968efd942a61cfe311c2cbca013a928f0eb8055e6f  # Hash of the archive for REF; must be updated whenever REF changes.
+    HEAD_REF kj/add-Config-for-vcpkg
+    PATCHES "fix-for-arm64.patch"
+)
+
+# Map the port's "cuda" feature to -DWHISPER_CUBLAS=ON/OFF in FEATURE_OPTIONS.
+vcpkg_check_features(
+    OUT_FEATURE_OPTIONS FEATURE_OPTIONS
+    FEATURES
+        cuda WHISPER_CUBLAS
+)
+
+# FEATURE_OPTIONS has to be forwarded explicitly: a variable set in the portfile
+# is not visible to the whisper.cpp CMake project being configured.
+vcpkg_cmake_configure(
+    SOURCE_PATH "${SOURCE_PATH}"
+    DISABLE_PARALLEL_CONFIGURE
+    OPTIONS
+        ${FEATURE_OPTIONS}
+)
+
+vcpkg_cmake_install()
+vcpkg_cmake_config_fixup(PACKAGE_NAME whisper CONFIG_PATH lib/cmake/whisper)
+vcpkg_copy_pdbs()
+
+# Drop the debug copies of include/ and share/ from the package tree.
+file(REMOVE_RECURSE
+    "${CURRENT_PACKAGES_DIR}/debug/include"
+    "${CURRENT_PACKAGES_DIR}/debug/share"
+)
+
+# Install the license as share/${PORT}/copyright and copy the usage text next to it.
+file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright)
+configure_file("${CMAKE_CURRENT_LIST_DIR}/usage" "${CURRENT_PACKAGES_DIR}/share/${PORT}/usage" COPYONLY)
\ No newline at end of file
diff --git a/thirdparty/custom-overlay/whisper/usage b/thirdparty/custom-overlay/whisper/usage
new file mode 100644
index 000000000..b997cd604
--- /dev/null
+++ b/thirdparty/custom-overlay/whisper/usage
@@ -0,0 +1,4 @@
+whisper provides CMake targets:
+
+find_package(whisper CONFIG REQUIRED)
+target_link_libraries(main PRIVATE whisper::whisper)
\ No newline at end of file
diff --git a/thirdparty/custom-overlay/whisper/vcpkg.json b/thirdparty/custom-overlay/whisper/vcpkg.json
new file mode 100644
index 000000000..0290a42fe
--- /dev/null
+++ b/thirdparty/custom-overlay/whisper/vcpkg.json
@@ -0,0 +1,26 @@
+{
+  "name": "whisper",
+  "version": "1.5.4",
+  "homepage": "https://github.com/Apra-Labs/whisper.cpp",
+  "description": "Fork of whisper.cpp a High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model in cpp.",
+  "license": "MIT",
+  "dependencies": [
+    {
+      "name": "vcpkg-cmake",
+      "host": true
+    },
+    {
+      "name": "vcpkg-cmake-config",
+      "host": true
+    }
+  ],
+  "default-features": [],
+  "features": {
+    "cuda": {
+      "description": "Build Whisper with CUDA support",
+      "dependencies": [
+        "cuda"
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/vcpkg b/vcpkg
index 0fd721ec0..7754d62d1 160000
--- a/vcpkg
+++ b/vcpkg
@@ -1 +1 @@
-Subproject commit 0fd721ec00106e7e249dfbf40d5dc6e67e12f38c
+Subproject commit 7754d62d19501a3bb4e2d4f2eab80e8de9703e41